diff options
Diffstat (limited to 'src/third_party/wiredtiger/src')
260 files changed, 88711 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/async/async_api.c b/src/third_party/wiredtiger/src/async/async_api.c new file mode 100644 index 00000000000..3cb78e80b09 --- /dev/null +++ b/src/third_party/wiredtiger/src/async/async_api.c @@ -0,0 +1,604 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __async_get_format -- + * Find or allocate the uri/config/format structure. + */ +static int +__async_get_format(WT_CONNECTION_IMPL *conn, const char *uri, + const char *config, WT_ASYNC_OP_IMPL *op) +{ + WT_ASYNC *async; + WT_ASYNC_FORMAT *af; + WT_CURSOR *c; + WT_DECL_RET; + WT_SESSION *wt_session; + WT_SESSION_IMPL *session; + uint64_t cfg_hash, uri_hash; + + async = conn->async; + c = NULL; + op->format = NULL; + + if (uri != NULL) + uri_hash = __wt_hash_city64(uri, strlen(uri)); + else + uri_hash = 0; + if (config != NULL) + cfg_hash = __wt_hash_city64(config, strlen(config)); + else + cfg_hash = 0; + + /* + * We don't need to hold a lock around this walk. The list is + * permanent and always valid. We might race an insert and there + * is a possibility a duplicate entry might be inserted, but + * that is not harmful. + */ + STAILQ_FOREACH(af, &async->formatqh, q) { + if (af->uri_hash == uri_hash && af->cfg_hash == cfg_hash) + goto setup; + } + /* + * We didn't find one in the cache. Allocate and initialize one. + * Insert it at the head expecting LRU usage. We need a real session + * for the cursor. 
+ */ + WT_RET( + __wt_open_internal_session(conn, "async-cursor", 1, 1, &session)); + __wt_spin_lock(session, &async->ops_lock); + WT_ERR(__wt_calloc_def(session, 1, &af)); + WT_ERR(__wt_strdup(session, uri, &af->uri)); + WT_ERR(__wt_strdup(session, config, &af->config)); + af->uri_hash = uri_hash; + af->cfg_hash = cfg_hash; + /* + * Get the key_format and value_format for this URI and store + * it in the structure so that async->set_key/value work. + */ + wt_session = &session->iface; + WT_ERR(wt_session->open_cursor(wt_session, uri, NULL, NULL, &c)); + WT_ERR(__wt_strdup(session, c->key_format, &af->key_format)); + WT_ERR(__wt_strdup(session, c->value_format, &af->value_format)); + WT_ERR(c->close(c)); + c = NULL; + + STAILQ_INSERT_HEAD(&async->formatqh, af, q); + __wt_spin_unlock(session, &async->ops_lock); + WT_ERR(wt_session->close(wt_session, NULL)); + +setup: op->format = af; + /* + * Copy the pointers for the formats. Items in the async format + * queue remain there until the connection is closed. We must + * initialize the format fields in the async_op, which are publicly + * visible, and its internal cursor used by internal key/value + * functions. + */ + op->iface.c.key_format = op->iface.key_format = af->key_format; + op->iface.c.value_format = op->iface.value_format = af->value_format; + return (0); + +err: + if (c != NULL) + (void)c->close(c); + __wt_free(session, af->uri); + __wt_free(session, af->config); + __wt_free(session, af->key_format); + __wt_free(session, af->value_format); + __wt_free(session, af); + return (ret); +} + +/* + * __async_new_op_alloc -- + * Find and allocate the next available async op handle. 
+ */ +static int +__async_new_op_alloc(WT_SESSION_IMPL *session, const char *uri, + const char *config, WT_ASYNC_OP_IMPL **opp) +{ + WT_ASYNC *async; + WT_ASYNC_OP_IMPL *op; + WT_CONNECTION_IMPL *conn; + uint32_t i, save_i, view; + + conn = S2C(session); + async = conn->async; + WT_STAT_FAST_CONN_INCR(session, async_op_alloc); + *opp = NULL; + +retry: + op = NULL; + WT_ORDERED_READ(save_i, async->ops_index); + /* + * Look after the last one allocated for a free one. We'd expect + * ops to be freed mostly FIFO so we should quickly find one. + */ + for (view = 1, i = save_i; i < conn->async_size; i++, view++) { + op = &async->async_ops[i]; + if (op->state == WT_ASYNCOP_FREE) + break; + } + + /* + * Loop around back to the beginning if we need to. + */ + if (op == NULL || op->state != WT_ASYNCOP_FREE) + for (i = 0; i < save_i; i++, view++) { + op = &async->async_ops[i]; + if (op->state == WT_ASYNCOP_FREE) + break; + } + + /* + * We still haven't found one. Return an error. + */ + if (op == NULL || op->state != WT_ASYNCOP_FREE) { + WT_STAT_FAST_CONN_INCR(session, async_full); + WT_RET(EBUSY); + } + /* + * Set the state of this op handle as READY for the user to use. + * If we can set the state then the op entry is ours. + * Start the next search at the next entry after this one. + */ + if (!WT_ATOMIC_CAS4(op->state, WT_ASYNCOP_FREE, WT_ASYNCOP_READY)) { + WT_STAT_FAST_CONN_INCR(session, async_alloc_race); + goto retry; + } + WT_STAT_FAST_CONN_INCRV(session, async_alloc_view, view); + WT_RET(__async_get_format(conn, uri, config, op)); + op->unique_id = WT_ATOMIC_ADD8(async->op_id, 1); + op->optype = WT_AOP_NONE; + (void)WT_ATOMIC_STORE4(async->ops_index, (i + 1) % conn->async_size); + *opp = op; + return (0); +} + +/* + * __async_config -- + * Parse and setup the async API options. 
+ */ +static int +__async_config(WT_SESSION_IMPL *session, + WT_CONNECTION_IMPL *conn, const char **cfg, int *runp) +{ + WT_CONFIG_ITEM cval; + + /* + * The async configuration is off by default. + */ + WT_RET(__wt_config_gets(session, cfg, "async.enabled", &cval)); + *runp = cval.val != 0; + + /* + * Even if async is turned off, we want to parse and store the + * default values so that reconfigure can just enable them. + */ + WT_RET(__wt_config_gets(session, cfg, "async.ops_max", &cval)); + conn->async_size = (uint32_t)cval.val; + + WT_RET(__wt_config_gets(session, cfg, "async.threads", &cval)); + conn->async_workers = (uint32_t)cval.val; + /* Sanity check that api_data.py is in sync with async.h */ + WT_ASSERT(session, conn->async_workers <= WT_ASYNC_MAX_WORKERS); + + return (0); +} + +/* + * __wt_async_stats_update -- + * Update the async stats for return to the application. + */ +void +__wt_async_stats_update(WT_SESSION_IMPL *session) +{ + WT_ASYNC *async; + WT_CONNECTION_IMPL *conn; + WT_CONNECTION_STATS *stats; + + conn = S2C(session); + async = conn->async; + if (async == NULL) + return; + stats = &conn->stats; + WT_STAT_SET(stats, async_cur_queue, async->cur_queue); + WT_STAT_SET(stats, async_max_queue, async->max_queue); + F_SET(conn, WT_CONN_SERVER_ASYNC); +} + +/* + * __async_start -- + * Start the async subsystem. All configuration processing has + * already been done by the caller. + */ +static int +__async_start(WT_SESSION_IMPL *session) +{ + WT_ASYNC *async; + WT_CONNECTION_IMPL *conn; + uint32_t i; + + conn = S2C(session); + conn->async_cfg = 1; + /* + * Async is on, allocate the WT_ASYNC structure and initialize the ops. 
+ */ + WT_RET(__wt_calloc(session, 1, sizeof(WT_ASYNC), &conn->async)); + async = conn->async; + STAILQ_INIT(&async->formatqh); + WT_RET(__wt_spin_init(session, &async->ops_lock, "ops")); + WT_RET(__wt_cond_alloc(session, "async flush", 0, &async->flush_cond)); + WT_RET(__wt_async_op_init(session)); + + /* + * Start up the worker threads. + */ + F_SET(conn, WT_CONN_SERVER_ASYNC); + for (i = 0; i < conn->async_workers; i++) { + /* + * Each worker has its own session. We set both a general + * server flag in the connection and an individual flag + * in the session. The user may reconfigure the number of + * workers and we may want to selectively stop some workers + * while leaving the rest running. + */ + WT_RET(__wt_open_internal_session( + conn, "async-worker", 1, 1, &async->worker_sessions[i])); + F_SET(async->worker_sessions[i], WT_SESSION_SERVER_ASYNC); + } + for (i = 0; i < conn->async_workers; i++) { + /* + * Start the threads. + */ + WT_RET(__wt_thread_create(session, &async->worker_tids[i], + __wt_async_worker, async->worker_sessions[i])); + } + __wt_async_stats_update(session); + return (0); +} + +/* + * __wt_async_create -- + * Start the async subsystem and worker threads. + */ +int +__wt_async_create(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONNECTION_IMPL *conn; + int run; + + conn = S2C(session); + + /* Handle configuration. */ + run = 0; + WT_RET(__async_config(session, conn, cfg, &run)); + + /* If async is not configured, we're done. */ + if (!run) + return (0); + return (__async_start(session)); +} + +/* + * __wt_async_reconfig -- + * Start the async subsystem and worker threads. 
+ */ +int +__wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_ASYNC *async; + WT_CONNECTION_IMPL *conn, tmp_conn; + WT_DECL_RET; + WT_SESSION *wt_session; + int run; + uint32_t i; + + conn = S2C(session); + async = conn->async; + memset(&tmp_conn, 0, sizeof(tmp_conn)); + tmp_conn.async_cfg = conn->async_cfg; + tmp_conn.async_workers = conn->async_workers; + tmp_conn.async_size = conn->async_size; + + /* Handle configuration. */ + run = conn->async_cfg; + WT_RET(__async_config(session, &tmp_conn, cfg, &run)); + + /* + * There are some restrictions on the live reconfiguration of async. + * Unlike other subsystems where we simply destroy anything existing + * and restart with the new configuration, async is not so easy. + * If the user is just changing the number of workers, we want to + * allow the existing op handles and other information to remain in + * existence. So we must handle various combinations of changes + * individually. + * + * One restriction is that if async is currently on, the user cannot + * change the number of async op handles available. The user can try + * but we do nothing with it. However we must allow the ops_max config + * string so that a user can completely start async via reconfigure. + */ + + /* + * Easy cases: + * 1. If async is on and the user wants it off, shut it down. + * 2. If async is off, and the user wants it on, start it. + * 3. If not a toggle and async is off, we're done. + */ + if (conn->async_cfg > 0 && !run) { + /* Case 1 */ + WT_TRET(__wt_async_flush(session)); + ret = __wt_async_destroy(session); + conn->async_cfg = 0; + return (ret); + } else if (conn->async_cfg == 0 && run) + /* Case 2 */ + return (__async_start(session)); + else if (conn->async_cfg == 0) + /* Case 3 */ + return (0); + + /* + * Running async worker modification cases: + * 4. If number of workers didn't change, we're done. + * 5. If more workers, start new ones. + * 6. If fewer workers, kill some. 
+ */ + if (conn->async_workers == tmp_conn.async_workers) + /* No change in the number of workers. */ + return (0); + if (conn->async_workers < tmp_conn.async_workers) { + /* Case 5 */ + /* + * The worker_sessions array is allocated for the maximum + * allowed number of workers, so starting more is easy. + */ + for (i = conn->async_workers; i < tmp_conn.async_workers; i++) { + /* + * Each worker has its own session. + */ + WT_RET(__wt_open_internal_session(conn, + "async-worker", 1, 1, &async->worker_sessions[i])); + F_SET(async->worker_sessions[i], + WT_SESSION_SERVER_ASYNC); + } + for (i = conn->async_workers; i < tmp_conn.async_workers; i++) { + /* + * Start the threads. + */ + WT_RET(__wt_thread_create(session, + &async->worker_tids[i], __wt_async_worker, + async->worker_sessions[i])); + } + conn->async_workers = tmp_conn.async_workers; + } + if (conn->async_workers > tmp_conn.async_workers) { + /* Case 6 */ + /* + * Stopping an individual async worker is the most complex case. + * We clear the session async flag on the targeted worker thread + * so that only that thread stops, and the others keep running. + */ + for (i = conn->async_workers - 1; + i >= tmp_conn.async_workers; i--) { + /* + * Join any worker we're stopping. + * After the thread is stopped, close its session. + */ + WT_ASSERT(session, async->worker_tids[i] != 0); + WT_ASSERT(session, async->worker_sessions[i] != NULL); + F_CLR(async->worker_sessions[i], + WT_SESSION_SERVER_ASYNC); + WT_TRET(__wt_thread_join( + session, async->worker_tids[i])); + async->worker_tids[i] = 0; + wt_session = &async->worker_sessions[i]->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + async->worker_sessions[i] = NULL; + } + conn->async_workers = tmp_conn.async_workers; + } + + return (0); +} + +/* + * __wt_async_destroy -- + * Destroy the async worker threads and async subsystem. 
+ */ +int +__wt_async_destroy(WT_SESSION_IMPL *session) +{ + WT_ASYNC *async; + WT_ASYNC_FORMAT *af, *afnext; + WT_ASYNC_OP *op; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION *wt_session; + uint32_t i; + + conn = S2C(session); + async = conn->async; + + if (!conn->async_cfg) + return (0); + + F_CLR(conn, WT_CONN_SERVER_ASYNC); + for (i = 0; i < conn->async_workers; i++) + if (async->worker_tids[i] != 0) { + WT_TRET(__wt_thread_join( + session, async->worker_tids[i])); + async->worker_tids[i] = 0; + } + WT_TRET(__wt_cond_destroy(session, &async->flush_cond)); + + /* Close the server threads' sessions. */ + for (i = 0; i < conn->async_workers; i++) + if (async->worker_sessions[i] != NULL) { + wt_session = &async->worker_sessions[i]->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + async->worker_sessions[i] = NULL; + } + /* Free any op key/value buffers. */ + for (i = 0; i < conn->async_size; i++) { + op = (WT_ASYNC_OP *)&async->async_ops[i]; + if (op->c.key.data != NULL) + __wt_buf_free(session, &op->c.key); + if (op->c.value.data != NULL) + __wt_buf_free(session, &op->c.value); + } + + /* Free format resources */ + af = STAILQ_FIRST(&async->formatqh); + while (af != NULL) { + afnext = STAILQ_NEXT(af, q); + __wt_free(session, af->uri); + __wt_free(session, af->config); + __wt_free(session, af->key_format); + __wt_free(session, af->value_format); + __wt_free(session, af); + af = afnext; + } + __wt_free(session, async->async_queue); + __wt_free(session, async->async_ops); + __wt_spin_destroy(session, &async->ops_lock); + __wt_free(session, conn->async); + + return (ret); +} + +/* + * __wt_async_flush -- + * Implementation of the WT_CONN->async_flush method. + */ +int +__wt_async_flush(WT_SESSION_IMPL *session) +{ + WT_ASYNC *async; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + conn = S2C(session); + if (!conn->async_cfg) + return (0); + + async = conn->async; + WT_STAT_FAST_CONN_INCR(session, async_flush); + /* + * We have to do several things. 
First we have to prevent + * other callers from racing with us so that only one + * flush is happening at a time. Next we have to wait for + * the worker threads to notice the flush and indicate + * that the flush is complete on their side. Then we + * clear the flush flags and return. + */ +retry: + while (async->flush_state != WT_ASYNC_FLUSH_NONE) + /* + * We're racing an in-progress flush. We need to wait + * our turn to start our own. We need to convoy the + * racing calls because a later call may be waiting for + * specific enqueued ops to be complete before this returns. + */ + __wt_sleep(0, 100000); + + if (!WT_ATOMIC_CAS4(async->flush_state, WT_ASYNC_FLUSH_NONE, + WT_ASYNC_FLUSH_IN_PROGRESS)) + goto retry; + /* + * We're the owner of this flush operation. Set the + * WT_ASYNC_FLUSH_IN_PROGRESS to block other callers. + * We're also preventing all worker threads from taking + * things off the work queue with the lock. + */ + async->flush_count = 0; + (void)WT_ATOMIC_ADD8(async->flush_gen, 1); + WT_ASSERT(session, async->flush_op.state == WT_ASYNCOP_FREE); + async->flush_op.state = WT_ASYNCOP_READY; + WT_ERR(__wt_async_op_enqueue(session, &async->flush_op)); + while (async->flush_state != WT_ASYNC_FLUSH_COMPLETE) + WT_ERR(__wt_cond_wait(NULL, async->flush_cond, 100000)); + /* + * Flush is done. Clear the flags. + */ + async->flush_op.state = WT_ASYNCOP_FREE; + WT_PUBLISH(async->flush_state, WT_ASYNC_FLUSH_NONE); +err: + return (ret); +} + +/* + * __async_runtime_config -- + * Configure runtime fields at allocation. 
+ */ +static int +__async_runtime_config(WT_ASYNC_OP_IMPL *op, const char *cfg[]) +{ + WT_ASYNC_OP *asyncop; + WT_CONFIG_ITEM cval; + WT_SESSION_IMPL *session; + + session = O2S(op); + asyncop = (WT_ASYNC_OP *)op; + WT_RET(__wt_config_gets_def(session, cfg, "append", 0, &cval)); + if (cval.val) + F_SET(&asyncop->c, WT_CURSTD_APPEND); + else + F_CLR(&asyncop->c, WT_CURSTD_APPEND); + WT_RET(__wt_config_gets_def(session, cfg, "overwrite", 1, &cval)); + if (cval.val) + F_SET(&asyncop->c, WT_CURSTD_OVERWRITE); + else + F_CLR(&asyncop->c, WT_CURSTD_OVERWRITE); + WT_RET(__wt_config_gets_def(session, cfg, "raw", 0, &cval)); + if (cval.val) + F_SET(&asyncop->c, WT_CURSTD_RAW); + else + F_CLR(&asyncop->c, WT_CURSTD_RAW); + return (0); + +} + +/* + * __wt_async_new_op -- + * Implementation of the WT_CONN->async_new_op method. + */ +int +__wt_async_new_op(WT_SESSION_IMPL *session, const char *uri, + const char *config, const char *cfg[], WT_ASYNC_CALLBACK *cb, + WT_ASYNC_OP_IMPL **opp) +{ + WT_ASYNC_OP_IMPL *op; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + *opp = NULL; + + conn = S2C(session); + if (!conn->async_cfg) + return (ENOTSUP); + + op = NULL; + WT_ERR(__async_new_op_alloc(session, uri, config, &op)); + WT_ERR(__async_runtime_config(op, cfg)); + op->cb = cb; + *opp = op; + return (0); + +err: + /* + * If we get an error after allocating op, set its state to free. + */ + if (op != NULL) + op->state = WT_ASYNCOP_FREE; + return (ret); +} diff --git a/src/third_party/wiredtiger/src/async/async_op.c b/src/third_party/wiredtiger/src/async/async_op.c new file mode 100644 index 00000000000..9dba2b2b5f3 --- /dev/null +++ b/src/third_party/wiredtiger/src/async/async_op.c @@ -0,0 +1,359 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" +/* + * __async_get_key -- + * WT_ASYNC_OP->get_key implementation for op handles. 
+ */ +static int +__async_get_key(WT_ASYNC_OP *asyncop, ...) +{ + WT_DECL_RET; + va_list ap; + + va_start(ap, asyncop); + ret = __wt_cursor_get_keyv(&asyncop->c, asyncop->c.flags, ap); + va_end(ap); + return (ret); +} + +/* + * __async_set_key -- + * WT_ASYNC_OP->set_key implementation for op handles. + */ +static void +__async_set_key(WT_ASYNC_OP *asyncop, ...) +{ + WT_CURSOR *c; + WT_DECL_RET; + va_list ap; + + c = &asyncop->c; + va_start(ap, asyncop); + __wt_cursor_set_keyv(c, c->flags, ap); + if (!WT_DATA_IN_ITEM(&c->key) && !WT_CURSOR_RECNO(c)) + WT_ERR(__wt_buf_set(O2S((WT_ASYNC_OP_IMPL *)asyncop), &c->key, + c->key.data, c->key.size)); + va_end(ap); + if (0) +err: c->saved_err = ret; +} + +/* + * __async_get_value -- + * WT_ASYNC_OP->get_value implementation for op handles. + */ +static int +__async_get_value(WT_ASYNC_OP *asyncop, ...) +{ + WT_DECL_RET; + va_list ap; + + va_start(ap, asyncop); + ret = __wt_cursor_get_valuev(&asyncop->c, ap); + va_end(ap); + return (ret); +} + +/* + * __async_set_value -- + * WT_ASYNC_OP->set_value implementation for op handles. + */ +static void +__async_set_value(WT_ASYNC_OP *asyncop, ...) +{ + WT_CURSOR *c; + WT_DECL_RET; + va_list ap; + + c = &asyncop->c; + va_start(ap, asyncop); + __wt_cursor_set_valuev(c, ap); + /* Copy the data, if it is pointing at data elsewhere. */ + if (!WT_DATA_IN_ITEM(&c->value)) + WT_ERR(__wt_buf_set(O2S((WT_ASYNC_OP_IMPL *)asyncop), + &c->value, c->value.data, c->value.size)); + va_end(ap); + if (0) +err: c->saved_err = ret; +} + +/* + * __async_op_wrap -- + * Common wrapper for all async operations. + */ +static int +__async_op_wrap(WT_ASYNC_OP_IMPL *op, WT_ASYNC_OPTYPE type) +{ + op->optype = type; + return (__wt_async_op_enqueue(O2S(op), op)); +} + +/* + * __async_search -- + * WT_ASYNC_OP->search implementation for op handles. 
+ */ +static int +__async_search(WT_ASYNC_OP *asyncop) +{ + WT_ASYNC_OP_IMPL *op; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + op = (WT_ASYNC_OP_IMPL *)asyncop; + ASYNCOP_API_CALL(O2C(op), session, search); + WT_STAT_FAST_CONN_INCR(O2S(op), async_op_search); + WT_ERR(__async_op_wrap(op, WT_AOP_SEARCH)); +err: API_END_RET(session, ret); +} + +/* + * __async_insert -- + * WT_ASYNC_OP->insert implementation for op handles. + */ +static int +__async_insert(WT_ASYNC_OP *asyncop) +{ + WT_ASYNC_OP_IMPL *op; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + op = (WT_ASYNC_OP_IMPL *)asyncop; + ASYNCOP_API_CALL(O2C(op), session, insert); + WT_STAT_FAST_CONN_INCR(O2S(op), async_op_insert); + WT_ERR(__async_op_wrap(op, WT_AOP_INSERT)); +err: API_END_RET(session, ret); +} + +/* + * __async_update -- + * WT_ASYNC_OP->update implementation for op handles. + */ +static int +__async_update(WT_ASYNC_OP *asyncop) +{ + WT_ASYNC_OP_IMPL *op; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + op = (WT_ASYNC_OP_IMPL *)asyncop; + ASYNCOP_API_CALL(O2C(op), session, update); + WT_STAT_FAST_CONN_INCR(O2S(op), async_op_update); + WT_ERR(__async_op_wrap(op, WT_AOP_UPDATE)); +err: API_END_RET(session, ret); +} + +/* + * __async_remove -- + * WT_ASYNC_OP->remove implementation for op handles. + */ +static int +__async_remove(WT_ASYNC_OP *asyncop) +{ + WT_ASYNC_OP_IMPL *op; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + op = (WT_ASYNC_OP_IMPL *)asyncop; + ASYNCOP_API_CALL(O2C(op), session, remove); + WT_STAT_FAST_CONN_INCR(O2S(op), async_op_remove); + WT_ERR(__async_op_wrap(op, WT_AOP_REMOVE)); +err: API_END_RET(session, ret); +} + +/* + * __async_compact -- + * WT_ASYNC_OP->compact implementation for op handles. 
+ */ +static int +__async_compact(WT_ASYNC_OP *asyncop) +{ + WT_ASYNC_OP_IMPL *op; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + op = (WT_ASYNC_OP_IMPL *)asyncop; + ASYNCOP_API_CALL(O2C(op), session, compact); + WT_STAT_FAST_CONN_INCR(O2S(op), async_op_compact); + WT_ERR(__async_op_wrap(op, WT_AOP_COMPACT)); +err: API_END_RET(session, ret); +} + +/* + * __async_get_id -- + * WT_ASYNC_OP->get_id implementation for op handles. + */ +static uint64_t +__async_get_id(WT_ASYNC_OP *asyncop) +{ + return (((WT_ASYNC_OP_IMPL *)asyncop)->unique_id); +} + +/* + * __async_get_type -- + * WT_ASYNC_OP->get_type implementation for op handles. + */ +static WT_ASYNC_OPTYPE +__async_get_type(WT_ASYNC_OP *asyncop) +{ + return (((WT_ASYNC_OP_IMPL *)asyncop)->optype); +} + +/* + * __async_op_init -- + * Initialize all the op handle fields. + */ +static int +__async_op_init(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op, uint32_t id) +{ + WT_ASYNC_OP *asyncop; + + asyncop = (WT_ASYNC_OP *)op; + asyncop->connection = (WT_CONNECTION *)conn; + asyncop->key_format = asyncop->value_format = NULL; + asyncop->c.key_format = asyncop->c.value_format = NULL; + asyncop->get_key = __async_get_key; + asyncop->get_value = __async_get_value; + asyncop->set_key = __async_set_key; + asyncop->set_value = __async_set_value; + asyncop->search = __async_search; + asyncop->insert = __async_insert; + asyncop->update = __async_update; + asyncop->remove = __async_remove; + asyncop->compact = __async_compact; + asyncop->get_id = __async_get_id; + asyncop->get_type = __async_get_type; + /* + * The cursor needs to have the get/set key/value functions initialized. + * It also needs the key/value related fields set up. 
+ */ + asyncop->c.get_key = __wt_cursor_get_key; + asyncop->c.set_key = __wt_cursor_set_key; + asyncop->c.get_value = __wt_cursor_get_value; + asyncop->c.set_value = __wt_cursor_set_value; + asyncop->c.recno = 0; + memset(asyncop->c.raw_recno_buf, 0, sizeof(asyncop->c.raw_recno_buf)); + memset(&asyncop->c.key, 0, sizeof(asyncop->c.key)); + memset(&asyncop->c.value, 0, sizeof(asyncop->c.value)); + asyncop->c.session = (WT_SESSION *)conn->default_session; + asyncop->c.saved_err = 0; + asyncop->c.flags = 0; + + op->internal_id = id; + op->state = WT_ASYNCOP_FREE; + return (0); +} + +/* + * __wt_async_op_enqueue -- + * Enqueue an operation onto the work queue. + */ +int +__wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op) +{ + WT_ASYNC *async; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + uint64_t cur_head, cur_tail, my_alloc, my_slot; +#ifdef HAVE_DIAGNOSTIC + WT_ASYNC_OP_IMPL *my_op; +#endif + + conn = S2C(session); + async = conn->async; + /* + * Enqueue op at the tail of the work queue. + */ + WT_ASSERT(session, op->state == WT_ASYNCOP_READY); + /* + * We get our slot in the ring buffer to use. + */ + my_alloc = WT_ATOMIC_ADD8(async->alloc_head, 1); + my_slot = my_alloc % async->async_qsize; + + /* + * Make sure we haven't wrapped around the queue. + * If so, wait for the tail to advance off this slot. + */ + WT_ORDERED_READ(cur_tail, async->tail_slot); + while (cur_tail == my_slot) { + __wt_yield(); + WT_ORDERED_READ(cur_tail, async->tail_slot); + } + +#ifdef HAVE_DIAGNOSTIC + WT_ORDERED_READ(my_op, async->async_queue[my_slot]); + if (my_op != NULL) + return (__wt_panic(session)); +#endif + WT_PUBLISH(async->async_queue[my_slot], op); + op->state = WT_ASYNCOP_ENQUEUED; + if (WT_ATOMIC_ADD4(async->cur_queue, 1) > async->max_queue) + WT_PUBLISH(async->max_queue, async->cur_queue); + /* + * Multiple threads may be adding ops to the queue. We need to wait + * our turn to make our slot visible to workers. 
+ */ + WT_ORDERED_READ(cur_head, async->head); + while (cur_head != (my_alloc - 1)) { + __wt_yield(); + WT_ORDERED_READ(cur_head, async->head); + } + WT_PUBLISH(async->head, my_alloc); + return (ret); +} + +/* + * __wt_async_op_init -- + * Initialize all the op handles. + */ +int +__wt_async_op_init(WT_SESSION_IMPL *session) +{ + WT_ASYNC *async; + WT_ASYNC_OP_IMPL *op; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + uint32_t i; + + conn = S2C(session); + async = conn->async; + + /* + * Initialize the flush op structure. + */ + WT_RET(__async_op_init(conn, &async->flush_op, OPS_INVALID_INDEX)); + + /* + * Allocate and initialize the work queue. This is sized so that + * the ring buffer is known to be big enough such that the head + * can never overlap the tail. Include extra for the flush op. + */ + async->async_qsize = conn->async_size + 2; + WT_RET(__wt_calloc_def( + session, async->async_qsize, &async->async_queue)); + /* + * Allocate and initialize all the user ops. + */ + WT_ERR(__wt_calloc_def(session, conn->async_size, &async->async_ops)); + for (i = 0; i < conn->async_size; i++) { + op = &async->async_ops[i]; + WT_ERR(__async_op_init(conn, op, i)); + } + return (0); +err: + if (async->async_ops != NULL) { + __wt_free(session, async->async_ops); + async->async_ops = NULL; + } + if (async->async_queue != NULL) { + __wt_free(session, async->async_queue); + async->async_queue = NULL; + } + return (ret); +} diff --git a/src/third_party/wiredtiger/src/async/async_worker.c b/src/third_party/wiredtiger/src/async/async_worker.c new file mode 100644 index 00000000000..74ee2dd2f86 --- /dev/null +++ b/src/third_party/wiredtiger/src/async/async_worker.c @@ -0,0 +1,359 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __async_op_dequeue -- + * Wait for work to be available. Then atomically take it off + * the work queue. 
+ */ +static int +__async_op_dequeue(WT_CONNECTION_IMPL *conn, WT_SESSION_IMPL *session, + WT_ASYNC_OP_IMPL **op) +{ + WT_ASYNC *async; + long sleep_usec; + uint64_t cur_tail, last_consume, my_consume, my_slot, prev_slot; + uint32_t tries; + + async = conn->async; + *op = NULL; + /* + * Wait for work to do. Work is available when async->head moves. + * Then grab the slot containing the work. If we lose, try again. + */ +retry: + tries = 0; + sleep_usec = 100; + WT_ORDERED_READ(last_consume, async->alloc_tail); + /* + * We stay in this loop until there is work to do. + */ + while (last_consume == async->head && + async->flush_state != WT_ASYNC_FLUSHING) { + WT_STAT_FAST_CONN_INCR(session, async_nowork); + if (++tries < MAX_ASYNC_YIELD) + /* + * Initially when we find no work, allow other + * threads to run. + */ + __wt_yield(); + else { + /* + * If we haven't found work in a while, start sleeping + * to wait for work to arrive instead of spinning. + */ + __wt_sleep(0, sleep_usec); + sleep_usec = WT_MIN(sleep_usec * 2, + MAX_ASYNC_SLEEP_USECS); + } + if (!F_ISSET(session, WT_SESSION_SERVER_ASYNC)) + return (0); + if (!F_ISSET(conn, WT_CONN_SERVER_ASYNC)) + return (0); + if (F_ISSET(conn, WT_CONN_PANIC)) + return (__wt_panic(session)); + WT_ORDERED_READ(last_consume, async->alloc_tail); + } + if (async->flush_state == WT_ASYNC_FLUSHING) + return (0); + /* + * Try to increment the tail to claim this slot. If we lose + * a race, try again. + */ + my_consume = last_consume + 1; + if (!WT_ATOMIC_CAS8(async->alloc_tail, last_consume, my_consume)) + goto retry; + /* + * This item of work is ours to process. Clear it out of the + * queue and return. 
+ */ + my_slot = my_consume % async->async_qsize; + prev_slot = last_consume % async->async_qsize; + *op = WT_ATOMIC_STORE8(async->async_queue[my_slot], NULL); + + WT_ASSERT(session, async->cur_queue > 0); + WT_ASSERT(session, *op != NULL); + WT_ASSERT(session, (*op)->state == WT_ASYNCOP_ENQUEUED); + (void)WT_ATOMIC_SUB4(async->cur_queue, 1); + (*op)->state = WT_ASYNCOP_WORKING; + + if (*op == &async->flush_op) + /* + * We're the worker to take the flush op off the queue. + */ + WT_PUBLISH(async->flush_state, WT_ASYNC_FLUSHING); + WT_ORDERED_READ(cur_tail, async->tail_slot); + while (cur_tail != prev_slot) { + __wt_yield(); + WT_ORDERED_READ(cur_tail, async->tail_slot); + } + WT_PUBLISH(async->tail_slot, my_slot); + return (0); +} + +/* + * __async_flush_wait -- + * Wait for the final worker to finish flushing. + */ +static int +__async_flush_wait(WT_SESSION_IMPL *session, WT_ASYNC *async, uint64_t my_gen) +{ + WT_DECL_RET; + + while (async->flush_state == WT_ASYNC_FLUSHING && + async->flush_gen == my_gen) + WT_ERR(__wt_cond_wait(session, async->flush_cond, 10000)); +err: return (ret); +} + +/* + * __async_worker_cursor -- + * Return a cursor for the worker thread to use for its op. + * The worker thread caches cursors. So first search for one + * with the same config/uri signature. Otherwise open a new + * cursor and cache it. + */ +static int +__async_worker_cursor(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op, + WT_ASYNC_WORKER_STATE *worker, WT_CURSOR **cursorp) +{ + WT_ASYNC_CURSOR *ac; + WT_CURSOR *c; + WT_DECL_RET; + WT_SESSION *wt_session; + + wt_session = (WT_SESSION *)session; + *cursorp = NULL; + /* + * Compact doesn't need a cursor. + */ + if (op->optype == WT_AOP_COMPACT) + return (0); + WT_ASSERT(session, op->format != NULL); + STAILQ_FOREACH(ac, &worker->cursorqh, q) { + if (op->format->cfg_hash == ac->cfg_hash && + op->format->uri_hash == ac->uri_hash) { + /* + * If one of our cached cursors has a matching + * signature, use it and we're done. 
+ */ + *cursorp = ac->c; + return (0); + } + } + /* + * We didn't find one in our cache. Open one and cache it. + * Insert it at the head expecting LRU usage. + */ + WT_RET(__wt_calloc_def(session, 1, &ac)); + WT_ERR(wt_session->open_cursor( + wt_session, op->format->uri, NULL, op->format->config, &c)); + ac->cfg_hash = op->format->cfg_hash; + ac->uri_hash = op->format->uri_hash; + ac->c = c; + STAILQ_INSERT_HEAD(&worker->cursorqh, ac, q); + worker->num_cursors++; + *cursorp = c; + return (0); + +err: __wt_free(session, ac); + return (ret); +} + +/* + * __async_worker_execop -- + * A worker thread executes an individual op with a cursor. + */ +static int +__async_worker_execop(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op, + WT_CURSOR *cursor) +{ + WT_ASYNC_OP *asyncop; + WT_ITEM val; + WT_SESSION *wt_session; + + asyncop = (WT_ASYNC_OP *)op; + /* + * Set the key of our local cursor from the async op handle. + * If needed, also set the value. + */ + if (op->optype != WT_AOP_COMPACT) { + WT_RET(__wt_cursor_get_raw_key(&asyncop->c, &val)); + __wt_cursor_set_raw_key(cursor, &val); + if (op->optype == WT_AOP_INSERT || + op->optype == WT_AOP_UPDATE) { + WT_RET(__wt_cursor_get_raw_value(&asyncop->c, &val)); + __wt_cursor_set_raw_value(cursor, &val); + } + } + switch (op->optype) { + case WT_AOP_COMPACT: + wt_session = &session->iface; + WT_RET(wt_session->compact(wt_session, + op->format->uri, op->format->config)); + break; + case WT_AOP_INSERT: + WT_RET(cursor->insert(cursor)); + break; + case WT_AOP_UPDATE: + WT_RET(cursor->update(cursor)); + break; + case WT_AOP_REMOVE: + WT_RET(cursor->remove(cursor)); + break; + case WT_AOP_SEARCH: + WT_RET(cursor->search(cursor)); + /* + * Get the value from the cursor and put it into + * the op for op->get_value. 
+		 */
+		WT_RET(__wt_cursor_get_raw_value(cursor, &val));
+		__wt_cursor_set_raw_value(&asyncop->c, &val);
+		break;
+	case WT_AOP_NONE:
+	default:
+		WT_RET_MSG(session, EINVAL, "Unknown async optype %d\n",
+		    op->optype);
+	}
+	return (0);
+}
+
+/*
+ * __async_worker_op --
+ *	A worker thread handles an individual op.
+ */
+static int
+__async_worker_op(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
+    WT_ASYNC_WORKER_STATE *worker)
+{
+	WT_ASYNC_OP *asyncop;
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	WT_SESSION *wt_session;
+	int cb_ret;
+
+	asyncop = (WT_ASYNC_OP *)op;
+
+	cb_ret = 0;
+
+	wt_session = &session->iface;
+	if (op->optype != WT_AOP_COMPACT)
+		WT_RET(wt_session->begin_transaction(wt_session, NULL));
+	WT_ASSERT(session, op->state == WT_ASYNCOP_WORKING);
+	WT_RET(__async_worker_cursor(session, op, worker, &cursor));
+	/*
+	 * Perform op and invoke the callback.
+	 */
+	ret = __async_worker_execop(session, op, cursor);
+	/*
+	 * Notify the application even on failure; its return value feeds
+	 * the commit/rollback decision below.
+	 */
+	if (op->cb != NULL && op->cb->notify != NULL)
+		cb_ret = op->cb->notify(op->cb, asyncop, ret, 0);
+
+	/*
+	 * If the operation succeeded and the user callback returned
+	 * zero then commit. Otherwise rollback.
+	 */
+	if (op->optype != WT_AOP_COMPACT) {
+		if ((ret == 0 || ret == WT_NOTFOUND) && cb_ret == 0)
+			WT_TRET(wt_session->commit_transaction(
+			    wt_session, NULL));
+		else
+			WT_TRET(wt_session->rollback_transaction(
+			    wt_session, NULL));
+		F_CLR(&asyncop->c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+		WT_TRET(cursor->reset(cursor));
+	}
+	/*
+	 * After the callback returns, and the transaction resolved release
+	 * the op back to the free pool. We do this regardless of
+	 * success or failure.
+	 */
+	WT_PUBLISH(op->state, WT_ASYNCOP_FREE);
+	return (ret);
+}
+
+/*
+ * __async_worker --
+ *	The async worker threads.
+ */
+void *
+__wt_async_worker(void *arg)
+{
+	WT_ASYNC *async;
+	WT_ASYNC_CURSOR *ac, *acnext;
+	WT_ASYNC_OP_IMPL *op;
+	WT_ASYNC_WORKER_STATE worker;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	uint64_t flush_gen;
+
+	session = arg;
+	conn = S2C(session);
+	async = conn->async;
+
+	/* Per-thread cursor cache, emptied at thread exit. */
+	worker.num_cursors = 0;
+	STAILQ_INIT(&worker.cursorqh);
+	while (F_ISSET(conn, WT_CONN_SERVER_ASYNC) &&
+	    F_ISSET(session, WT_SESSION_SERVER_ASYNC)) {
+		WT_ERR(__async_op_dequeue(conn, session, &op));
+		if (op != NULL && op != &async->flush_op) {
+			/*
+			 * If an operation fails, we want the worker thread to
+			 * keep running, unless there is a panic.
+			 */
+			(void)__async_worker_op(session, op, &worker);
+			if (F_ISSET(conn, WT_CONN_PANIC))
+				WT_ERR(__wt_panic(session));
+		} else if (async->flush_state == WT_ASYNC_FLUSHING) {
+			/*
+			 * Worker flushing going on. Last worker to the party
+			 * needs to clear the FLUSHING flag and signal the cond.
+			 * If FLUSHING is going on, we do not take anything off
+			 * the queue.
+			 */
+			WT_ORDERED_READ(flush_gen, async->flush_gen);
+			if (WT_ATOMIC_ADD4(async->flush_count, 1) ==
+			    conn->async_workers) {
+				/*
+				 * We're last. All workers accounted for so
+				 * signal the condition and clear the FLUSHING
+				 * flag to release the other worker threads.
+				 * Set the FLUSH_COMPLETE flag so that the
+				 * caller can return to the application.
+				 */
+				WT_PUBLISH(async->flush_state,
+				    WT_ASYNC_FLUSH_COMPLETE);
+				WT_ERR(__wt_cond_signal(session,
+				    async->flush_cond));
+			} else
+				/*
+				 * We need to wait for the last worker to
+				 * signal the condition.
+				 */
+				WT_ERR(__async_flush_wait(
+				    session, async, flush_gen));
+		}
+	}
+
+	if (0) {
+err:		__wt_err(session, ret, "async worker error");
+	}
+	/*
+	 * Worker thread cleanup, close our cached cursors and
+	 * free all the WT_ASYNC_CURSOR structures.
+ */
+	/* The cursor cache is thread-private: no locking is required. */
+	ac = STAILQ_FIRST(&worker.cursorqh);
+	while (ac != NULL) {
+		acnext = STAILQ_NEXT(ac, q);
+		WT_TRET(ac->c->close(ac->c));
+		__wt_free(session, ac);
+		ac = acnext;
+	}
+	return (NULL);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_addr.c b/src/third_party/wiredtiger/src/block/block_addr.c
new file mode 100644
index 00000000000..bbd52359157
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_addr.c
@@ -0,0 +1,202 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __block_buffer_to_addr --
+ *	Convert a filesystem address cookie into its components, UPDATING the
+ * caller's buffer reference so it can be called repeatedly to load a buffer.
+ */
+static int
+__block_buffer_to_addr(WT_BLOCK *block,
+    const uint8_t **pp, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
+{
+	uint64_t o, s, c;
+
+	WT_RET(__wt_vunpack_uint(pp, 0, &o));
+	WT_RET(__wt_vunpack_uint(pp, 0, &s));
+	WT_RET(__wt_vunpack_uint(pp, 0, &c));
+
+	/*
+	 * To avoid storing large offsets, we minimize the value by subtracting
+	 * a block for description information, then storing a count of block
+	 * allocation units. That implies there is no such thing as an
+	 * "invalid" offset though, they could all be valid (other than very
+	 * large numbers), which is what we didn't want to store in the first
+	 * place. Use the size: writing a block of size 0 makes no sense, so
+	 * that's the out-of-band value. Once we're out of this function and
+	 * are working with a real file offset, size and checksum triplet, there
+	 * can be invalid offsets, that's simpler than testing sizes of 0 all
+	 * over the place.
+	 */
+	if (s == 0) {
+		*offsetp = 0;
+		*sizep = *cksump = 0;
+	} else {
+		*offsetp = (wt_off_t)(o + 1) * block->allocsize;
+		*sizep = (uint32_t)s * block->allocsize;
+		*cksump = (uint32_t)c;
+	}
+	return (0);
+}
+
+/*
+ * __wt_block_addr_to_buffer --
+ *	Convert the filesystem components into its address cookie.
+ */
+int
+__wt_block_addr_to_buffer(WT_BLOCK *block,
+    uint8_t **pp, wt_off_t offset, uint32_t size, uint32_t cksum)
+{
+	uint64_t o, s, c;
+
+	/* See the comment above: this is the reverse operation. */
+	if (size == 0) {
+		o = WT_BLOCK_INVALID_OFFSET;
+		s = c = 0;
+	} else {
+		o = (uint64_t)offset / block->allocsize - 1;
+		s = size / block->allocsize;
+		c = cksum;
+	}
+	WT_RET(__wt_vpack_uint(pp, 0, o));
+	WT_RET(__wt_vpack_uint(pp, 0, s));
+	WT_RET(__wt_vpack_uint(pp, 0, c));
+	return (0);
+}
+
+/*
+ * __wt_block_buffer_to_addr --
+ *	Convert a filesystem address cookie into its components NOT UPDATING
+ * the caller's buffer reference.
+ */
+int
+__wt_block_buffer_to_addr(WT_BLOCK *block,
+    const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
+{
+	return (__block_buffer_to_addr(block, &p, offsetp, sizep, cksump));
+}
+
+/*
+ * __wt_block_addr_valid --
+ *	Return if an address cookie is valid.
+ */
+int
+__wt_block_addr_valid(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int live)
+{
+	wt_off_t offset;
+	uint32_t cksum, size;
+
+	WT_UNUSED(session);
+	WT_UNUSED(addr_size);
+	WT_UNUSED(live);
+
+	/* Crack the cookie. */
+	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+#ifdef HAVE_DIAGNOSTIC
+	/*
+	 * In diagnostic mode, verify the address isn't on the available list,
+	 * or for live systems, the discard list.
+	 */
+	WT_RET(__wt_block_misplaced(
+	    session, block, "addr-valid", offset, size, live));
+#endif
+
+	/* Check if it's past the end of the file. */
+	/* Returns 0 if the extent runs past EOF (invalid), else 1. */
+	return (offset + size > block->fh->size ? 0 : 1);
+}
+
+/*
+ * __wt_block_addr_string --
+ *	Return a printable string representation of an address cookie.
+ */
+int
+__wt_block_addr_string(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+	wt_off_t offset;
+	uint32_t cksum, size;
+
+	WT_UNUSED(addr_size);
+
+	/* Crack the cookie. */
+	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+	/* Printable representation. */
+	WT_RET(__wt_buf_fmt(session, buf,
+	    "[%" PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+	    (uintmax_t)offset, (uintmax_t)offset + size, size, cksum));
+
+	return (0);
+}
+
+/*
+ * __wt_block_buffer_to_ckpt --
+ *	Convert a checkpoint cookie into its components.
+ */
+int
+__wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci)
+{
+	uint64_t a;
+	const uint8_t **pp;
+
+	ci->version = *p++;
+	if (ci->version != WT_BM_CHECKPOINT_VERSION)
+		WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version");
+
+	pp = &p;
+	WT_RET(__block_buffer_to_addr(block, pp,
+	    &ci->root_offset, &ci->root_size, &ci->root_cksum));
+	WT_RET(__block_buffer_to_addr(block, pp,
+	    &ci->alloc.offset, &ci->alloc.size, &ci->alloc.cksum));
+	WT_RET(__block_buffer_to_addr(block, pp,
+	    &ci->avail.offset, &ci->avail.size, &ci->avail.cksum));
+	WT_RET(__block_buffer_to_addr(block, pp,
+	    &ci->discard.offset, &ci->discard.size, &ci->discard.cksum));
+	WT_RET(__wt_vunpack_uint(pp, 0, &a));
+	ci->file_size = (wt_off_t)a;
+	WT_RET(__wt_vunpack_uint(pp, 0, &a));
+	ci->ckpt_size = a;
+
+	return (0);
+}
+
+/*
+ * __wt_block_ckpt_to_buffer --
+ *	Convert the components into its checkpoint cookie.
+ */
+int
+__wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci)
+{
+	uint64_t a;
+
+	if (ci->version != WT_BM_CHECKPOINT_VERSION)
+		WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version");
+
+	(*pp)[0] = ci->version;
+	(*pp)++;
+
+	/* Field order must match __wt_block_buffer_to_ckpt's read order. */
+	WT_RET(__wt_block_addr_to_buffer(block, pp,
+	    ci->root_offset, ci->root_size, ci->root_cksum));
+	WT_RET(__wt_block_addr_to_buffer(block, pp,
+	    ci->alloc.offset, ci->alloc.size, ci->alloc.cksum));
+	WT_RET(__wt_block_addr_to_buffer(block, pp,
+	    ci->avail.offset, ci->avail.size, ci->avail.cksum));
+	WT_RET(__wt_block_addr_to_buffer(block, pp,
+	    ci->discard.offset, ci->discard.size, ci->discard.cksum));
+	a = (uint64_t)ci->file_size;
+	WT_RET(__wt_vpack_uint(pp, 0, a));
+	a = (uint64_t)ci->ckpt_size;
+	WT_RET(__wt_vpack_uint(pp, 0, a));
+
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c
new file mode 100644
index 00000000000..83c3a40e8e1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_ckpt.c
@@ -0,0 +1,842 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
+static int __ckpt_string(
+    WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *);
+static int __ckpt_update(
+    WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_BLOCK_CKPT *, int);
+
+/*
+ * __wt_block_ckpt_init --
+ *	Initialize a checkpoint structure.
+ */
+int
+__wt_block_ckpt_init(
+    WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
+{
+	/* WT_CLEAR zeroes the structure; lists not initialized below start empty. */
+	WT_CLEAR(*ci);
+
+	ci->version = WT_BM_CHECKPOINT_VERSION;
+	ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+
+	WT_RET(__wt_block_extlist_init(session, &ci->alloc, name, "alloc", 0));
+	WT_RET(__wt_block_extlist_init(session, &ci->avail, name, "avail", 1));
+	WT_RET(__wt_block_extlist_init(
+	    session, &ci->discard, name, "discard", 0));
+	WT_RET(__wt_block_extlist_init(
+	    session, &ci->ckpt_avail, name, "ckpt_avail", 1));
+
+	return (0);
+}
+
+/*
+ * __wt_block_checkpoint_load --
+ *	Load a checkpoint.
+ */
+int
+__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
+    const uint8_t *addr, size_t addr_size,
+    uint8_t *root_addr, size_t *root_addr_sizep, int checkpoint)
+{
+	WT_BLOCK_CKPT *ci, _ci;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	uint8_t *endp;
+
+	WT_UNUSED(addr_size);
+	ci = NULL;
+
+	/*
+	 * Sometimes we don't find a root page (we weren't given a checkpoint,
+	 * or the checkpoint was empty). In that case we return an empty root
+	 * address, set that up now.
+	 */
+	*root_addr_sizep = 0;
+
+	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+		if (addr != NULL) {
+			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+			WT_ERR(__ckpt_string(session, block, addr, tmp));
+		}
+		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+		    "%s: load-checkpoint: %s", block->name,
+		    addr == NULL ? "[Empty]" : (const char *)tmp->data));
+	}
+
+	/*
+	 * There's a single checkpoint in the file that can be written, all of
+	 * the others are read-only. We use the same initialization calls for
+	 * readonly checkpoints, but the information doesn't persist.
+	 */
+	if (checkpoint) {
+		ci = &_ci;
+		WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
+	} else {
+		/*
+		 * We depend on the btree level for locking: things will go
+		 * bad fast should we open the live system in two handles, or
+		 * if we create, salvage, truncate or verify the live/running
+		 * file, for that matter.
+		 */
+		ci = &block->live;
+		WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
+	}
+
+	/*
+	 * If the checkpoint has an on-disk root page, load it. Otherwise, size
+	 * the file past the description information.
+	 */
+	if (addr == NULL || addr_size == 0)
+		/* No checkpoint: size the file to just past the description block. */
+		ci->file_size = block->allocsize;
+	else {
+		/* Crack the checkpoint cookie. */
+		WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));
+
+		/* Verify sets up next. */
+		if (block->verify)
+			WT_ERR(__wt_verify_ckpt_load(session, block, ci));
+
+		/* Read any root page. */
+		if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
+			endp = root_addr;
+			WT_ERR(__wt_block_addr_to_buffer(block, &endp,
+			    ci->root_offset, ci->root_size, ci->root_cksum));
+			*root_addr_sizep = WT_PTRDIFF(endp, root_addr);
+		}
+
+		/*
+		 * Rolling a checkpoint forward requires the avail list, the
+		 * blocks from which we can allocate.
+		 */
+		if (!checkpoint)
+			WT_ERR(__wt_block_extlist_read_avail(
+			    session, block, &ci->avail, ci->file_size));
+	}
+
+	/*
+	 * If the checkpoint can be written, that means anything written after
+	 * the checkpoint is no longer interesting, truncate the file. Don't
+	 * bother checking the avail list for a block at the end of the file,
+	 * that was done when the checkpoint was first written (re-writing the
+	 * checkpoint might possibly make it relevant here, but it's unlikely
+	 * enough I don't bother).
+	 */
+	if (!checkpoint) {
+		/*
+		 * The truncate might fail if there's a file mapping (if there's
+		 * an open checkpoint on the file), that's OK.
+		 */
+		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+		    "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size));
+		WT_ERR_BUSY_OK(
+		    __wt_ftruncate(session, block->fh, ci->file_size));
+	}
+
+	if (0) {
+err:		/*
+		 * Don't call checkpoint-unload: unload does real work including
+		 * file truncation. If we fail early enough that the checkpoint
+		 * information isn't correct, bad things would happen. The only
+		 * allocated memory was in the service of verify, clean that up.
+		 */
+		if (block->verify)
+			WT_TRET(__wt_verify_ckpt_unload(session, block));
+	}
+
+	/* Checkpoints don't need the original information, discard it. */
+	/* Read-only checkpoint: its state was the local _ci, free its lists. */
+	if (checkpoint && ci != NULL)
+		__wt_block_ckpt_destroy(session, ci);
+
+	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __wt_block_checkpoint_unload --
+ *	Unload a checkpoint.
+ */
+int
+__wt_block_checkpoint_unload(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, int checkpoint)
+{
+	WT_DECL_RET;
+
+	/* Verify cleanup. */
+	if (block->verify)
+		WT_TRET(__wt_verify_ckpt_unload(session, block));
+
+	/*
+	 * If it's the live system, truncate to discard any extended blocks and
+	 * discard the active extent lists. Hold the lock even though we're
+	 * unloading the live checkpoint, there could be readers active in
+	 * other checkpoints.
+	 */
+	if (!checkpoint) {
+		/*
+		 * The truncate might fail if there's a file mapping (if there's
+		 * an open checkpoint on the file), that's OK.
+		 */
+		WT_TRET_BUSY_OK(
+		    __wt_ftruncate(session, block->fh, block->fh->size));
+
+		__wt_spin_lock(session, &block->live_lock);
+		__wt_block_ckpt_destroy(session, &block->live);
+		__wt_spin_unlock(session, &block->live_lock);
+	}
+
+	return (ret);
+}
+
+/*
+ * __wt_block_ckpt_destroy --
+ *	Clear a checkpoint structure.
+ */
+void
+__wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
+{
+	/* Discard the extent lists. */
+	__wt_block_extlist_free(session, &ci->alloc);
+	__wt_block_extlist_free(session, &ci->avail);
+	__wt_block_extlist_free(session, &ci->discard);
+	__wt_block_extlist_free(session, &ci->ckpt_alloc);
+	__wt_block_extlist_free(session, &ci->ckpt_avail);
+	__wt_block_extlist_free(session, &ci->ckpt_discard);
+}
+
+/*
+ * __wt_block_checkpoint --
+ *	Create a new checkpoint.
+ */
+int
+__wt_block_checkpoint(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, int data_cksum)
+{
+	WT_BLOCK_CKPT *ci;
+	WT_DECL_RET;
+
+	ci = &block->live;
+
+	/*
+	 * Write the root page: it's possible for there to be a checkpoint of
+	 * an empty tree, in which case, we store an illegal root offset.
+	 *
+	 * !!!
+	 * We happen to know that checkpoints are single-threaded above us in
+	 * the btree engine. That's probably something we want to guarantee
+	 * for any WiredTiger block manager.
+	 */
+	if (buf == NULL) {
+		ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+		ci->root_size = ci->root_cksum = 0;
+	} else
+		WT_RET(__wt_block_write_off(session, block, buf,
+		    &ci->root_offset, &ci->root_size, &ci->root_cksum,
+		    data_cksum, 0));
+
+	/*
+	 * Checkpoints are potentially reading/writing/merging lots of blocks,
+	 * pre-allocate structures for this thread's use.
+	 */
+	/* Pre-allocate a batch of extent structures; released below. */
+	WT_RET(__wt_block_ext_prealloc(session, 250));
+
+	/* Process the checkpoint list, deleting and updating as required. */
+	ret = __ckpt_process(session, block, ckptbase);
+
+	/* Discard any excessive memory we've allocated. */
+	WT_TRET(__wt_block_ext_discard(session, 250));
+
+	return (ret);
+}
+
+/*
+ * __ckpt_extlist_read --
+ *	Read a checkpoints extent lists and copy
+ */
+static int
+__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
+{
+	WT_BLOCK_CKPT *ci;
+
+	/*
+	 * Allocate a checkpoint structure, crack the cookie and read the
+	 * checkpoint's extent lists.
+	 *
+	 * Ignore the avail list: checkpoint avail lists are only useful if we
+	 * are rolling forward from the particular checkpoint and they represent
+	 * our best understanding of what blocks can be allocated. If we are
+	 * not operating on the live checkpoint, subsequent checkpoints might
+	 * have allocated those blocks, and the avail list is useless. We don't
+	 * discard it, because it is useful as part of verification, but we
+	 * don't re-write it either.
+	 */
+	WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
+
+	ci = ckpt->bpriv;
+	WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
+	WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
+	WT_RET(__wt_block_extlist_read(
+	    session, block, &ci->alloc, ci->file_size));
+	WT_RET(__wt_block_extlist_read(
+	    session, block, &ci->discard, ci->file_size));
+
+	return (0);
+}
+
+/*
+ * __ckpt_extlist_fblocks --
+ *	If a checkpoint's extent list is going away, free its blocks.
+ */
+static int
+__ckpt_extlist_fblocks(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
+{
+	/* A list that was never written has no backing blocks to free. */
+	if (el->offset == WT_BLOCK_INVALID_OFFSET)
+		return (0);
+
+	/*
+	 * Free blocks used to write checkpoint extents into the live system's
+	 * checkpoint avail list (they were never on any alloc list). Do not
+	 * use the live system's avail list because that list is used to decide
+	 * if the file can be truncated, and we can't truncate any part of the
+	 * file that contains a previous checkpoint's extents.
+	 */
+	return (__wt_block_insert_ext(
+	    session, &block->live.ckpt_avail, el->offset, el->size));
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __ckpt_verify --
+ *	Diagnostic code, confirm we get what we expect in the checkpoint array.
+ */
+static int
+__ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
+{
+	WT_CKPT *ckpt;
+
+	/*
+	 * Fast check that we're seeing what we expect to see: some number of
+	 * checkpoints to add, delete or ignore, terminated by a new checkpoint.
+	 */
+	WT_CKPT_FOREACH(ckptbase, ckpt)
+		switch (ckpt->flags) {
+		case 0:
+		case WT_CKPT_DELETE:
+		case WT_CKPT_DELETE | WT_CKPT_FAKE:
+		case WT_CKPT_FAKE:
+			break;
+		case WT_CKPT_ADD:
+			if (ckpt[1].name == NULL)
+				break;
+			/* FALLTHROUGH */
+		default:
+			return (
+			    __wt_illegal_value(session, "checkpoint array"));
+		}
+	return (0);
+}
+#endif
+
+/*
+ * __ckpt_process --
+ *	Process the list of checkpoints.
+ */
+static int
+__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
+{
+	WT_BLOCK_CKPT *a, *b, *ci;
+	WT_CKPT *ckpt, *next_ckpt;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	uint64_t ckpt_size;
+	int deleting, locked;
+
+	ci = &block->live;
+	/* 'locked' tracks live_lock ownership for the error path. */
+	locked = 0;
+
+#ifdef HAVE_DIAGNOSTIC
+	WT_RET(__ckpt_verify(session, ckptbase));
+#endif
+
+	/*
+	 * Checkpoints are a two-step process: first, write a new checkpoint to
+	 * disk (including all the new extent lists for modified checkpoints
+	 * and the live system). As part of this, create a list of file blocks
+	 * newly available for reallocation, based on checkpoints being deleted.
+	 * We then return the locations of the new checkpoint information to our
+	 * caller. Our caller has to write that information into some kind of
+	 * stable storage, and once that's done, we can actually allocate from
+	 * that list of newly available file blocks. (We can't allocate from
+	 * that list immediately because the allocation might happen before our
+	 * caller saves the new checkpoint information, and if we crashed before
+	 * the new checkpoint location was saved, we'd have overwritten blocks
+	 * still referenced by checkpoints in the system.) In summary, there is
+	 * a second step: after our caller saves the checkpoint information, we
+	 * are called to add the newly available blocks into the live system's
+	 * available list.
+	 *
+	 * This function is the first step, the second step is in the resolve
+	 * function.
+	 *
+	 * If we're called to checkpoint the same file twice, without the second
+	 * resolution step, it's an error at an upper level and our choices are
+	 * all bad: either leak blocks or risk crashing with our caller not
+	 * having saved the checkpoint information to stable storage. Leaked
+	 * blocks are a safer choice, but that means file verify will fail for
+	 * the rest of "forever", and the chance of us allocating a block and
+	 * then crashing such that it matters is reasonably low: don't leak the
+	 * blocks.
+	 */
+	if (block->ckpt_inprogress) {
+		__wt_errx(session,
+		    "%s: checkpointed without the checkpoint being resolved",
+		    block->name);
+
+		/* Resolve on the caller's behalf rather than leaking blocks. */
+		WT_RET(__wt_block_checkpoint_resolve(session, block));
+	}
+
+	/*
+	 * Extents newly available as a result of deleting previous checkpoints
+	 * are added to a list of extents. The list should be empty, but as
+	 * described above, there is no "free the checkpoint information" call
+	 * into the block manager; if there was an error in an upper level that
+	 * resulted in some previous checkpoint never being resolved, the list
+	 * may not be empty. We should have caught that with the "checkpoint
+	 * in progress" test, but it doesn't cost us anything to be cautious.
+	 *
+	 * We free the checkpoint's allocation and discard extent lists as part
+	 * of the resolution step, not because they're needed at that time, but
+	 * because it's potentially a lot of work, and waiting allows the btree
+	 * layer to continue eviction sooner. As for the checkpoint-available
+	 * list, make sure they get cleaned out.
+	 */
+	__wt_block_extlist_free(session, &ci->ckpt_avail);
+	WT_RET(__wt_block_extlist_init(
+	    session, &ci->ckpt_avail, "live", "ckpt_avail", 1));
+	__wt_block_extlist_free(session, &ci->ckpt_alloc);
+	__wt_block_extlist_free(session, &ci->ckpt_discard);
+
+	/*
+	 * To delete a checkpoint, we'll need checkpoint information for it and
+	 * the subsequent checkpoint into which it gets rolled; read them from
+	 * disk before we lock things down.
+	 */
+	deleting = 0;
+	WT_CKPT_FOREACH(ckptbase, ckpt) {
+		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
+		    !F_ISSET(ckpt, WT_CKPT_DELETE))
+			continue;
+		deleting = 1;
+
+		/*
+		 * Read the checkpoint and next checkpoint extent lists if we
+		 * haven't already read them (we may have already read these
+		 * extent blocks if there is more than one deleted checkpoint).
+		 */
+		if (ckpt->bpriv == NULL)
+			WT_ERR(__ckpt_extlist_read(session, block, ckpt));
+
+		/* Skip forward to the next real (non-fake) checkpoint. */
+		for (next_ckpt = ckpt + 1;; ++next_ckpt)
+			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
+				break;
+
+		/*
+		 * The "next" checkpoint may be the live tree which has no
+		 * extent blocks to read.
+		 */
+		if (next_ckpt->bpriv == NULL &&
+		    !F_ISSET(next_ckpt, WT_CKPT_ADD))
+			WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
+	}
+
+	/*
+	 * Hold a lock so the live extent lists and the file size can't change
+	 * underneath us. I suspect we'll tighten this if checkpoints take too
+	 * much time away from real work: we read the historic checkpoint
+	 * information without a lock, but we could also merge and re-write the
+	 * deleted and merged checkpoint information without a lock, except for
+	 * the final merge of ranges into the live tree.
+	 */
+	__wt_spin_lock(session, &block->live_lock);
+	locked = 1;
+
+	/*
+	 * We've allocated our last page, update the checkpoint size. We need
+	 * to calculate the live system's checkpoint size before merging
+	 * checkpoint allocation and discard information from the checkpoints
+	 * we're deleting, those operations change the underlying byte counts.
+	 */
+	ckpt_size = ci->ckpt_size;
+	ckpt_size += ci->alloc.bytes;
+	ckpt_size -= ci->discard.bytes;
+
+	/* Skip the additional processing if we aren't deleting checkpoints. */
+	if (!deleting)
+		goto live_update;
+
+	/*
+	 * Delete any no-longer-needed checkpoints: we do this first as it frees
+	 * blocks to the live lists, and the freed blocks will then be included
+	 * when writing the live extent lists.
+	 */
+	WT_CKPT_FOREACH(ckptbase, ckpt) {
+		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
+		    !F_ISSET(ckpt, WT_CKPT_DELETE))
+			continue;
+
+		if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+			/* tmp is lazily allocated, for verbose output only. */
+			if (tmp == NULL)
+				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+			WT_ERR(__ckpt_string(
+			    session, block, ckpt->raw.data, tmp));
+			WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+			    "%s: delete-checkpoint: %s: %s",
+			    block->name, ckpt->name, (const char *)tmp->data));
+		}
+
+		/*
+		 * Find the checkpoint into which we'll roll this checkpoint's
+		 * blocks: it's the next real checkpoint in the list, and it
+		 * better have been read in (if it's not the add slot).
+		 */
+		for (next_ckpt = ckpt + 1;; ++next_ckpt)
+			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
+				break;
+
+		/*
+		 * Set the from/to checkpoint structures, where the "to" value
+		 * may be the live tree.
+		 */
+		a = ckpt->bpriv;
+		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
+			b = &block->live;
+		else
+			b = next_ckpt->bpriv;
+
+		/*
+		 * Free the root page: there's nothing special about this free,
+		 * the root page is allocated using normal rules, that is, it
+		 * may have been taken from the avail list, and was entered on
+		 * the live system's alloc list at that time. We free it into
+		 * the checkpoint's discard list, however, not the live system's
+		 * list because it appears on the checkpoint's alloc list and so
+		 * must be paired in the checkpoint.
+		 */
+		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
+			WT_ERR(__wt_block_insert_ext(session,
+			    &a->discard, a->root_offset, a->root_size));
+
+		/*
+		 * Free the blocks used to hold the "from" checkpoint's extent
+		 * lists, including the avail list.
+		 */
+		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
+		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
+		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));
+
+		/*
+		 * Roll the "from" alloc and discard extent lists into the "to"
+		 * checkpoint's lists.
+		 */
+		if (a->alloc.entries != 0)
+			WT_ERR(__wt_block_extlist_merge(
+			    session, &a->alloc, &b->alloc));
+		if (a->discard.entries != 0)
+			WT_ERR(__wt_block_extlist_merge(
+			    session, &a->discard, &b->discard));
+
+		/*
+		 * If the "to" checkpoint is also being deleted, we're done with
+		 * it, it's merged into some other checkpoint in the next loop.
+		 * This means the extent lists may aggregate over a number of
+		 * checkpoints, but that's OK, they're disjoint sets of ranges.
+		 */
+		if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
+			continue;
+
+		/*
+		 * Find blocks for re-use: wherever the "to" checkpoint's
+		 * allocate and discard lists overlap, move the range to
+		 * the live system's checkpoint available list.
+		 */
+		WT_ERR(__wt_block_extlist_overlap(session, block, b));
+
+		/*
+		 * If we're updating the live system's information, we're done.
+		 */
+		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
+			continue;
+
+		/*
+		 * We have to write the "to" checkpoint's extent lists out in
+		 * new blocks, and update its cookie.
+		 *
+		 * Free the blocks used to hold the "to" checkpoint's extent
+		 * lists; don't include the avail list, it's not changing.
+		 */
+		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
+		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));
+
+		/* Mark the "to" checkpoint for rewrite in the next pass. */
+		F_SET(next_ckpt, WT_CKPT_UPDATE);
+	}
+
+	/* Update checkpoints marked for update. */
+	WT_CKPT_FOREACH(ckptbase, ckpt)
+		if (F_ISSET(ckpt, WT_CKPT_UPDATE))
+			WT_ERR(__ckpt_update(
+			    session, block, ckpt, ckpt->bpriv, 0));
+
+live_update:
+	/* Truncate the file if that's possible. */
+	WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));
+
+	/* Update the final, added checkpoint based on the live system. */
+	WT_CKPT_FOREACH(ckptbase, ckpt)
+		if (F_ISSET(ckpt, WT_CKPT_ADD)) {
+			/*
+			 * Set the checkpoint size for the live system.
+			 *
+			 * !!!
+			 * Our caller wants the final checkpoint size.
Setting
+			 * the size here violates layering, but the alternative
+			 * is a call for the btree layer to crack the checkpoint
+			 * cookie into its components, and that's a fair amount
+			 * of work.
+			 */
+			ckpt->ckpt_size = ci->ckpt_size = ckpt_size;
+
+			WT_ERR(__ckpt_update(session, block, ckpt, ci, 1));
+		}
+
+	/*
+	 * Reset the live system's alloc and discard extent lists, leave the
+	 * avail list alone. This includes freeing a lot of extents, so do it
+	 * outside of the system's lock by copying and resetting the original,
+	 * then doing the work later.
+	 */
+	ci->ckpt_alloc = ci->alloc;
+	WT_ERR(__wt_block_extlist_init(
+	    session, &ci->alloc, "live", "alloc", 0));
+	ci->ckpt_discard = ci->discard;
+	WT_ERR(__wt_block_extlist_init(
+	    session, &ci->discard, "live", "discard", 0));
+
+#ifdef HAVE_DIAGNOSTIC
+	/*
+	 * The first checkpoint in the system should always have an empty
+	 * discard list. If we've read that checkpoint and/or created it,
+	 * check.
+	 */
+	WT_CKPT_FOREACH(ckptbase, ckpt)
+		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+			break;
+	if ((a = ckpt->bpriv) == NULL)
+		a = &block->live;
+	if (a->discard.entries != 0) {
+		__wt_errx(session,
+		    "first checkpoint incorrectly has blocks on the discard "
+		    "list");
+		WT_ERR(WT_ERROR);
+	}
+#endif
+
+	/* Success: a checkpoint now awaits resolution. */
+	block->ckpt_inprogress = 1;
+
+err:	if (locked)
+		__wt_spin_unlock(session, &block->live_lock);
+
+	/* Discard any checkpoint information we loaded. */
+	WT_CKPT_FOREACH(ckptbase, ckpt)
+		if ((ci = ckpt->bpriv) != NULL)
+			__wt_block_ckpt_destroy(session, ci);
+
+	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __ckpt_update --
+ *	Update a checkpoint.
+ */
+static int
+__ckpt_update(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci, int is_live)
+{
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	uint8_t *endp;
+
+#ifdef HAVE_DIAGNOSTIC
+	/* Check the extent list combinations for overlaps.
 */
+	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
+	WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
+	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
+#endif
+	/*
+	 * Write the checkpoint's alloc and discard extent lists. After each
+	 * write, remove any allocated blocks from the system's allocation
+	 * list, checkpoint extent blocks don't appear on any extent lists.
+	 */
+	/*
+	 * NOTE(review): the final argument is NULL here but &ci->ckpt_avail
+	 * for the live avail write below — confirm its meaning against
+	 * __wt_block_extlist_write.
+	 */
+	WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
+	WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));
+
+	/*
+	 * We only write an avail list for the live system, other checkpoint's
+	 * avail lists are static and never change.
+	 *
+	 * Write the avail list last so it reflects changes due to allocating
+	 * blocks for the alloc and discard lists. Second, when we write the
+	 * live system's avail list, it's two lists: the current avail list
+	 * plus the list of blocks to be made available when the new checkpoint
+	 * completes. We can't merge that second list into the real list yet,
+	 * it's not truly available until the new checkpoint locations have been
+	 * saved to the metadata.
+	 */
+	if (is_live)
+		WT_RET(__wt_block_extlist_write(
+		    session, block, &ci->avail, &ci->ckpt_avail));
+
+	/*
+	 * Set the file size for the live system.
+	 *
+	 * !!!
+	 * We do NOT set the file size when re-writing checkpoints because we
+	 * want to test the checkpoint's blocks against a reasonable maximum
+	 * file size during verification. This is bad: imagine a checkpoint
+	 * appearing early in the file, re-written, and then the checkpoint
+	 * requires blocks at the end of the file, blocks after the listed file
+	 * size. If the application opens that checkpoint for writing
+	 * (discarding subsequent checkpoints), we would truncate the file to
+	 * the early chunk, discarding the re-written checkpoint information.
+ * The alternative, updating the file size has its own problems, in + * that case we'd work correctly, but we'd lose all of the blocks + * between the original checkpoint and the re-written checkpoint. + * Currently, there's no API to roll-forward intermediate checkpoints, + * if there ever is, this will need to be fixed. + */ + if (is_live) + ci->file_size = block->fh->size; + + /* + * Copy the checkpoint information into the checkpoint array's address + * cookie. + */ + WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE)); + endp = ckpt->raw.mem; + WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci)); + ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem); + + if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp)); + WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT, + "%s: create-checkpoint: %s: %s", + block->name, ckpt->name, (const char *)tmp->data)); + } + +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * __wt_block_checkpoint_resolve -- + * Resolve a checkpoint. + */ +int +__wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block) +{ + WT_BLOCK_CKPT *ci; + WT_DECL_RET; + + ci = &block->live; + + /* + * Resolve the checkpoint after our caller has written the checkpoint + * information to stable storage. + */ + if (!block->ckpt_inprogress) + WT_RET_MSG(session, WT_ERROR, + "%s: checkpoint resolved, but no checkpoint in progress", + block->name); + block->ckpt_inprogress = 0; + + __wt_spin_lock(session, &block->live_lock); + ret = __wt_block_extlist_merge(session, &ci->ckpt_avail, &ci->avail); + __wt_spin_unlock(session, &block->live_lock); + + /* Discard the lists remaining after the checkpoint call. 
*/ + __wt_block_extlist_free(session, &ci->ckpt_avail); + __wt_block_extlist_free(session, &ci->ckpt_alloc); + __wt_block_extlist_free(session, &ci->ckpt_discard); + + return (ret); +} + +/* + * __ckpt_string -- + * Return a printable string representation of a checkpoint address cookie. + */ +static int +__ckpt_string(WT_SESSION_IMPL *session, + WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf) +{ + WT_BLOCK_CKPT *ci, _ci; + + /* Initialize the checkpoint, crack the cookie. */ + ci = &_ci; + WT_RET(__wt_block_ckpt_init(session, ci, "string")); + WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci)); + + WT_RET(__wt_buf_fmt(session, buf, + "version=%d", + ci->version)); + if (ci->root_offset == WT_BLOCK_INVALID_OFFSET) + WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]")); + else + WT_RET(__wt_buf_catfmt(session, buf, + ", root=[%" + PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", + (uintmax_t)ci->root_offset, + (uintmax_t)(ci->root_offset + ci->root_size), + ci->root_size, ci->root_cksum)); + if (ci->alloc.offset == WT_BLOCK_INVALID_OFFSET) + WT_RET(__wt_buf_catfmt(session, buf, ", alloc=[Empty]")); + else + WT_RET(__wt_buf_catfmt(session, buf, + ", alloc=[%" + PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", + (uintmax_t)ci->alloc.offset, + (uintmax_t)(ci->alloc.offset + ci->alloc.size), + ci->alloc.size, ci->alloc.cksum)); + if (ci->avail.offset == WT_BLOCK_INVALID_OFFSET) + WT_RET(__wt_buf_catfmt(session, buf, ", avail=[Empty]")); + else + WT_RET(__wt_buf_catfmt(session, buf, + ", avail=[%" + PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", + (uintmax_t)ci->avail.offset, + (uintmax_t)(ci->avail.offset + ci->avail.size), + ci->avail.size, ci->avail.cksum)); + if (ci->discard.offset == WT_BLOCK_INVALID_OFFSET) + WT_RET(__wt_buf_catfmt(session, buf, ", discard=[Empty]")); + else + WT_RET(__wt_buf_catfmt(session, buf, + ", discard=[%" + PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", + (uintmax_t)ci->discard.offset, + 
(uintmax_t)(ci->discard.offset + ci->discard.size), + ci->discard.size, ci->discard.cksum)); + WT_RET(__wt_buf_catfmt(session, buf, + ", file size=%" PRIuMAX, (uintmax_t)ci->file_size)); + + __wt_block_ckpt_destroy(session, ci); + + return (0); +} diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c new file mode 100644 index 00000000000..007c77f3291 --- /dev/null +++ b/src/third_party/wiredtiger/src/block/block_compact.c @@ -0,0 +1,221 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *); + +/* + * __wt_block_compact_start -- + * Start compaction of a file. + */ +int +__wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block) +{ + WT_UNUSED(session); + + /* + * Save the current allocation plan, switch to first-fit allocation. + * We don't need the lock, but it's not a performance question and + * might avoid bugs in the future. + */ + __wt_spin_lock(session, &block->live_lock); + block->allocfirst_save = block->allocfirst; + block->allocfirst = 1; + __wt_spin_unlock(session, &block->live_lock); + + return (0); +} + +/* + * __wt_block_compact_end -- + * End compaction of a file. + */ +int +__wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block) +{ + WT_UNUSED(session); + + /* + * Restore the previous allocation plan. + * We don't need the lock, but it's not a performance question and + * might avoid bugs in the future. + */ + __wt_spin_lock(session, &block->live_lock); + block->allocfirst = block->allocfirst_save; + __wt_spin_unlock(session, &block->live_lock); + + return (0); +} + +/* + * __wt_block_compact_skip -- + * Return if compaction will shrink the file. 
+ */
+int
+__wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp)
+{
+	WT_DECL_RET;
+	WT_EXT *ext;
+	WT_EXTLIST *el;
+	WT_FH *fh;
+	wt_off_t avail, ninety;
+
+	*skipp = 1;				/* Return a default skip. */
+
+	fh = block->fh;
+
+	/*
+	 * We do compaction by copying blocks from the end of the file to the
+	 * beginning of the file, and we need some metrics to decide if it's
+	 * worth doing.  Ignore small files, and files where we are unlikely
+	 * to recover 10% of the file.
+	 */
+	/*
+	 * NOTE(review): fh->size is read here before acquiring live_lock,
+	 * and again below while holding it -- presumably callers serialize
+	 * compaction against file extension; confirm against the caller.
+	 */
+	if (fh->size <= 10 * 1024)
+		return (0);
+
+	__wt_spin_lock(session, &block->live_lock);
+
+	if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT))
+		WT_ERR(__block_dump_avail(session, block));
+
+	/* Sum the number of available bytes in the first 90% of the file. */
+	avail = 0;
+	ninety = fh->size - fh->size / 10;
+
+	el = &block->live.avail;
+	WT_EXT_FOREACH(ext, el->off)
+		if (ext->off < ninety)
+			avail += ext->size;
+
+	/*
+	 * If at least 10% of the total file is available and in the first 90%
+	 * of the file, we'll try compaction.
+	 */
+	if (avail >= fh->size / 10)
+		*skipp = 0;
+
+	WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
+	    "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
+	    "90%% of the file, require 10%% or %" PRIuMAX "MB (%" PRIuMAX
+	    ") to perform compaction, compaction %s",
+	    block->name,
+	    (uintmax_t)avail / WT_MEGABYTE, (uintmax_t)avail,
+	    (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10,
+	    *skipp ? "skipped" : "proceeding"));
+
+err:	__wt_spin_unlock(session, &block->live_lock);
+
+	return (ret);
+}
+
+/*
+ * __wt_block_compact_page_skip --
+ *	Return if writing a particular page will shrink the file.
+ */
+int
+__wt_block_compact_page_skip(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int *skipp)
+{
+	WT_DECL_RET;
+	WT_EXT *ext;
+	WT_EXTLIST *el;
+	WT_FH *fh;
+	wt_off_t ninety, offset;
+	uint32_t size, cksum;
+
+	/* The address cookie is self-describing, the length isn't needed. */
+	WT_UNUSED(addr_size);
+	*skipp = 1;				/* Return a default skip.
*/ + + fh = block->fh; + + /* Crack the cookie. */ + WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum)); + + __wt_spin_lock(session, &block->live_lock); + + /* + * If this block is in the last 10% of the file and there's a block on + * the available list that's in the first 90% of the file, rewrite the + * block. Checking the available list is necessary (otherwise writing + * the block would extend the file), but there's an obvious race if the + * file is sufficiently busy. + */ + ninety = fh->size - fh->size / 10; + if (offset > ninety) { + el = &block->live.avail; + WT_EXT_FOREACH(ext, el->off) + if (ext->off < ninety && ext->size >= size) { + *skipp = 0; + break; + } + } + + __wt_spin_unlock(session, &block->live_lock); + + return (ret); +} + +/* + * __block_dump_avail -- + * Dump out the avail list so we can see what compaction will look like. + */ +static int +__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block) +{ + WT_EXTLIST *el; + WT_EXT *ext; + wt_off_t decile[10], percentile[100], size, v; + u_int i; + + el = &block->live.avail; + size = block->fh->size; + + WT_RET(__wt_verbose(session, WT_VERB_BLOCK, + "file size %" PRIuMAX "MB (%" PRIuMAX ") with %" PRIuMAX + "%% space available %" PRIuMAX "MB (%" PRIuMAX ")", + (uintmax_t)size / WT_MEGABYTE, (uintmax_t)size, + ((uintmax_t)el->bytes * 100) / (uintmax_t)size, + (uintmax_t)el->bytes / WT_MEGABYTE, (uintmax_t)el->bytes)); + + if (el->entries == 0) + return (0); + + /* + * Bucket the available memory into file deciles/percentiles. Large + * pieces of memory will cross over multiple buckets, assign to the + * decile/percentile in 512B chunks. 
+	 */
+	memset(decile, 0, sizeof(decile));
+	memset(percentile, 0, sizeof(percentile));
+	/*
+	 * NOTE(review): i is u_int while ext->size / 512 is wt_off_t; an
+	 * extent of 2TB or more would exceed UINT_MAX chunks -- presumably
+	 * out of range in practice, but worth confirming.
+	 */
+	WT_EXT_FOREACH(ext, el->off)
+		for (i = 0; i < ext->size / 512; ++i) {
+			++decile[((ext->off + i * 512) * 10) / size];
+			++percentile[((ext->off + i * 512) * 100) / size];
+		}
+
+#ifdef __VERBOSE_OUTPUT_PERCENTILE
+	for (i = 0; i < WT_ELEMENTS(percentile); ++i) {
+		v = percentile[i] * 512;
+		WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+		    "%2u%%: %12" PRIuMAX "MB, (%" PRIuMAX "B, %"
+		    PRIuMAX "%%)",
+		    i, (uintmax_t)v / WT_MEGABYTE, (uintmax_t)v,
+		    (uintmax_t)((v * 100) / (wt_off_t)el->bytes)));
+	}
+#endif
+	for (i = 0; i < WT_ELEMENTS(decile); ++i) {
+		v = decile[i] * 512;
+		WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+		    "%2u%%: %12" PRIuMAX "MB, (%" PRIuMAX "B, %"
+		    PRIuMAX "%%)",
+		    i * 10, (uintmax_t)v / WT_MEGABYTE, (uintmax_t)v,
+		    (uintmax_t)((v * 100) / (wt_off_t)el->bytes)));
+	}
+
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c
new file mode 100644
index 00000000000..d500f93817a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_ext.c
@@ -0,0 +1,1437 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __block_append(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t);
+static int __block_ext_overlap(WT_SESSION_IMPL *,
+    WT_BLOCK *, WT_EXTLIST *, WT_EXT **, WT_EXTLIST *, WT_EXT **);
+static int __block_extlist_dump(
+    WT_SESSION_IMPL *, const char *, WT_EXTLIST *, int);
+static int __block_merge(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t);
+
+/*
+ * __block_off_srch_last --
+ *	Return the last element in the list, along with a stack for appending.
+ */
+static inline WT_EXT *
+__block_off_srch_last(WT_EXT **head, WT_EXT ***stack)
+{
+	WT_EXT **extp, *last;
+	int i;
+
+	last = NULL;				/* The list may be empty */
+
+	/*
+	 * Start at the highest skip level, then go as far as possible at each
+	 * level before stepping down to the next.
+	 */
+	/*
+	 * stack[i] records the insert position at each level; extp-- drops
+	 * to the level below because the next[] pointers for adjacent levels
+	 * are contiguous array slots.
+	 */
+	for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
+		if (*extp != NULL) {
+			last = *extp;
+			extp = &(*extp)->next[i];
+		} else
+			stack[i--] = extp--;
+	return (last);
+}
+
+/*
+ * __block_off_srch --
+ *	Search a by-offset skiplist (either the primary by-offset list, or the
+ * by-offset list referenced by a size entry), for the specified offset.
+ */
+static inline void
+__block_off_srch(WT_EXT **head, wt_off_t off, WT_EXT ***stack, int skip_off)
+{
+	WT_EXT **extp;
+	int i;
+
+	/*
+	 * Start at the highest skip level, then go as far as possible at each
+	 * level before stepping down to the next.
+	 *
+	 * Return a stack for an exact match or the next-largest item.
+	 *
+	 * The WT_EXT structure contains two skiplists, the primary one and the
+	 * per-size bucket one: if the skip_off flag is set, offset the skiplist
+	 * array by the depth specified in this particular structure.
+	 */
+	for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
+		if (*extp != NULL && (*extp)->off < off)
+			extp =
+			    &(*extp)->next[i + (skip_off ? (*extp)->depth : 0)];
+		else
+			stack[i--] = extp--;
+}
+
+/*
+ * __block_first_srch --
+ *	Search the skiplist for the first available slot.
+ */
+static inline int
+__block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack)
+{
+	WT_EXT *ext;
+
+	/*
+	 * Linear walk of the available chunks in offset order; take the first
+	 * one that's large enough.
+	 */
+	/* O(n) by design: first-fit is only used during compaction. */
+	WT_EXT_FOREACH(ext, head)
+		if (ext->size >= size)
+			break;
+	if (ext == NULL)
+		return (0);
+
+	/* Build a stack for the offset we want. */
+	__block_off_srch(head, ext->off, stack, 0);
+	return (1);
+}
+
+/*
+ * __block_size_srch --
+ *	Search the by-size skiplist for the specified size.
+ */ +static inline void +__block_size_srch(WT_SIZE **head, wt_off_t size, WT_SIZE ***stack) +{ + WT_SIZE **szp; + int i; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. + * + * Return a stack for an exact match or the next-largest item. + */ + for (i = WT_SKIP_MAXDEPTH - 1, szp = &head[i]; i >= 0;) + if (*szp != NULL && (*szp)->size < size) + szp = &(*szp)->next[i]; + else + stack[i--] = szp--; +} + +/* + * __block_off_srch_pair -- + * Search a by-offset skiplist for before/after records of the specified + * offset. + */ +static inline void +__block_off_srch_pair( + WT_EXTLIST *el, wt_off_t off, WT_EXT **beforep, WT_EXT **afterp) +{ + WT_EXT **head, **extp; + int i; + + *beforep = *afterp = NULL; + + head = el->off; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;) { + if (*extp == NULL) { + --i; + --extp; + continue; + } + + if ((*extp)->off < off) { /* Keep going at this level */ + *beforep = *extp; + extp = &(*extp)->next[i]; + } else { /* Drop down a level */ + *afterp = *extp; + --i; + --extp; + } + } +} + +/* + * __block_ext_insert -- + * Insert an extent into an extent list. + */ +static int +__block_ext_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, WT_EXT *ext) +{ + WT_EXT **astack[WT_SKIP_MAXDEPTH]; + WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH]; + u_int i; + + /* + * If we are inserting a new size onto the size skiplist, we'll need a + * new WT_SIZE structure for that skiplist. 
+ */ + if (el->track_size) { + __block_size_srch(el->sz, ext->size, sstack); + szp = *sstack[0]; + if (szp == NULL || szp->size != ext->size) { + WT_RET(__wt_block_size_alloc(session, &szp)); + szp->size = ext->size; + szp->depth = ext->depth; + for (i = 0; i < ext->depth; ++i) { + szp->next[i] = *sstack[i]; + *sstack[i] = szp; + } + } + + /* + * Insert the new WT_EXT structure into the size element's + * offset skiplist. + */ + __block_off_srch(szp->off, ext->off, astack, 1); + for (i = 0; i < ext->depth; ++i) { + ext->next[i + ext->depth] = *astack[i]; + *astack[i] = ext; + } + } +#ifdef HAVE_DIAGNOSTIC + if (!el->track_size) + for (i = 0; i < ext->depth; ++i) + ext->next[i + ext->depth] = NULL; +#endif + + /* Insert the new WT_EXT structure into the offset skiplist. */ + __block_off_srch(el->off, ext->off, astack, 0); + for (i = 0; i < ext->depth; ++i) { + ext->next[i] = *astack[i]; + *astack[i] = ext; + } + + ++el->entries; + el->bytes += (uint64_t)ext->size; + + /* Update the cached end-of-list. */ + if (ext->next[0] == NULL) + el->last = ext; + + return (0); +} + +/* + * __block_off_insert -- + * Insert a file range into an extent list. + */ +static int +__block_off_insert( + WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size) +{ + WT_EXT *ext; + + WT_RET(__wt_block_ext_alloc(session, &ext)); + ext->off = off; + ext->size = size; + + return (__block_ext_insert(session, el, ext)); +} + +#ifdef HAVE_DIAGNOSTIC +/* + * __block_off_match -- + * Return if any part of a specified range appears on a specified extent + * list. + */ +static int +__block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size) +{ + WT_EXT *before, *after; + + /* Search for before and after entries for the offset. */ + __block_off_srch_pair(el, off, &before, &after); + + /* If "before" or "after" overlaps, we have a winner. 
*/ + if (before != NULL && before->off + before->size > off) + return (1); + if (after != NULL && off + size > after->off) + return (1); + return (0); +} + +/* + * __wt_block_misplaced -- + * Complain if a block appears on the available or discard lists. + */ +int +__wt_block_misplaced(WT_SESSION_IMPL *session, + WT_BLOCK *block, const char *tag, wt_off_t offset, uint32_t size, int live) +{ + const char *name; + + name = NULL; + + /* + * Don't check during the salvage read phase, we might be reading an + * already freed overflow page. + */ + if (F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK)) + return (0); + + /* + * Verify a block the btree engine thinks it "owns" doesn't appear on + * the available or discard lists (it might reasonably be on the alloc + * list, if it was allocated since the last checkpoint). The engine + * "owns" a block if it's trying to read or free the block, and those + * functions make this check. + * + * Any block being read or freed should not be "available". + * + * Any block being read or freed in the live system should not be on the + * discard list. (A checkpoint handle might be reading a block which is + * on the live system's discard list; any attempt to free a block from a + * checkpoint handle has already failed.) + */ + __wt_spin_lock(session, &block->live_lock); + if (__block_off_match(&block->live.avail, offset, size)) + name = "available"; + else if (live && __block_off_match(&block->live.discard, offset, size)) + name = "discard"; + __wt_spin_unlock(session, &block->live_lock); + if (name != NULL) { + __wt_errx(session, + "%s failed: %" PRIuMAX "/%" PRIu32 " is on the %s list", + tag, (uintmax_t)offset, size, name); + return (__wt_panic(session)); + } + return (0); +} +#endif + +/* + * __block_off_remove -- + * Remove a record from an extent list. 
+ */ +static int +__block_off_remove( + WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, WT_EXT **extp) +{ + WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH]; + WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH]; + u_int i; + + /* Find and remove the record from the by-offset skiplist. */ + __block_off_srch(el->off, off, astack, 0); + ext = *astack[0]; + if (ext == NULL || ext->off != off) + goto corrupt; + for (i = 0; i < ext->depth; ++i) + *astack[i] = ext->next[i]; + + /* + * Find and remove the record from the size's offset skiplist; if that + * empties the by-size skiplist entry, remove it as well. + */ + if (el->track_size) { + __block_size_srch(el->sz, ext->size, sstack); + szp = *sstack[0]; + if (szp == NULL || szp->size != ext->size) + return (EINVAL); + __block_off_srch(szp->off, off, astack, 1); + ext = *astack[0]; + if (ext == NULL || ext->off != off) + goto corrupt; + for (i = 0; i < ext->depth; ++i) + *astack[i] = ext->next[i + ext->depth]; + if (szp->off[0] == NULL) { + for (i = 0; i < szp->depth; ++i) + *sstack[i] = szp->next[i]; + __wt_block_size_free(session, szp); + } + } +#ifdef HAVE_DIAGNOSTIC + if (!el->track_size) { + int not_null; + for (i = 0, not_null = 0; i < ext->depth; ++i) + if (ext->next[i + ext->depth] != NULL) + not_null = 1; + WT_ASSERT(session, not_null == 0); + } +#endif + + --el->entries; + el->bytes -= (uint64_t)ext->size; + + /* Return the record if our caller wants it, otherwise free it. */ + if (extp == NULL) + __wt_block_ext_free(session, ext); + else + *extp = ext; + + /* Update the cached end-of-list. */ + if (el->last == ext) + el->last = NULL; + + return (0); + +corrupt: + WT_PANIC_RET(session, EINVAL, + "attempt to remove non-existent offset from an extent list"); +} + +/* + * __wt_block_off_remove_overlap -- + * Remove a range from an extent list, where the range may be part of a + * overlapping entry. 
+ */ +int +__wt_block_off_remove_overlap( + WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size) +{ + WT_EXT *before, *after, *ext; + wt_off_t a_off, a_size, b_off, b_size; + + WT_ASSERT(session, off != WT_BLOCK_INVALID_OFFSET); + + /* Search for before and after entries for the offset. */ + __block_off_srch_pair(el, off, &before, &after); + + /* If "before" or "after" overlaps, retrieve the overlapping entry. */ + if (before != NULL && before->off + before->size > off) { + WT_RET(__block_off_remove(session, el, before->off, &ext)); + + /* Calculate overlapping extents. */ + a_off = ext->off; + a_size = off - ext->off; + b_off = off + size; + b_size = ext->size - (a_size + size); + } else if (after != NULL && off + size > after->off) { + WT_RET(__block_off_remove(session, el, after->off, &ext)); + + /* + * Calculate overlapping extents. There's no initial overlap + * since the after extent presumably cannot begin before "off". + */ + a_off = WT_BLOCK_INVALID_OFFSET; + a_size = 0; + b_off = off + size; + b_size = ext->size - (b_off - ext->off); + } else + return (WT_NOTFOUND); + + /* + * If there are overlaps, insert the item; re-use the extent structure + * and save the allocation (we know there's no need to merge). + */ + if (a_size != 0) { + ext->off = a_off; + ext->size = a_size; + WT_RET(__block_ext_insert(session, el, ext)); + ext = NULL; + } + if (b_size != 0) { + if (ext == NULL) + WT_RET(__block_off_insert(session, el, b_off, b_size)); + else { + ext->off = b_off; + ext->size = b_size; + WT_RET(__block_ext_insert(session, el, ext)); + ext = NULL; + } + } + if (ext != NULL) + __wt_block_ext_free(session, ext); + return (0); +} + +/* + * __block_extend -- + * Extend the file to allocate space. 
+ */ +static inline int +__block_extend( + WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size) +{ + WT_FH *fh; + + fh = block->fh; + + /* + * Callers of this function are expected to have already acquired any + * locks required to extend the file. + * + * We should never be allocating from an empty file. + */ + if (fh->size < block->allocsize) + WT_RET_MSG(session, EINVAL, + "file has no description information"); + + /* + * Make sure we don't allocate past the maximum file size. There's no + * easy way to know the maximum wt_off_t on a system, limit growth to + * 8B bits (we currently check an wt_off_t is 8B in verify_build.h). I + * don't think we're likely to see anything bigger for awhile. + */ + if (fh->size > (wt_off_t)INT64_MAX - size) + WT_RET_MSG(session, WT_ERROR, + "block allocation failed, file cannot grow further"); + + *offp = fh->size; + fh->size += size; + + WT_STAT_FAST_DATA_INCR(session, block_extension); + WT_RET(__wt_verbose(session, WT_VERB_BLOCK, + "file extend %" PRIdMAX "B @ %" PRIdMAX, + (intmax_t)size, (intmax_t)*offp)); + + return (0); +} + +/* + * __wt_block_alloc -- + * Alloc a chunk of space from the underlying file. + */ +int +__wt_block_alloc( + WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size) +{ + WT_EXT *ext, **estack[WT_SKIP_MAXDEPTH]; + WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH]; + + /* Assert we're maintaining the by-size skiplist. */ + WT_ASSERT(session, block->live.avail.track_size != 0); + + WT_STAT_FAST_DATA_INCR(session, block_alloc); + if (size % block->allocsize != 0) + WT_RET_MSG(session, EINVAL, + "cannot allocate a block size %" PRIdMAX " that is not " + "a multiple of the allocation size %" PRIu32, + (intmax_t)size, block->allocsize); + + /* + * Allocation is either first-fit (lowest offset), or best-fit (best + * size). If it's first-fit, walk the offset list linearly until we + * find an entry that will work. 
+ * + * If it's best-fit by size, search the by-size skiplist for the size + * and take the first entry on the by-size offset list. This means we + * prefer best-fit over lower offset, but within a size we'll prefer an + * offset appearing earlier in the file. + * + * If we don't have anything big enough, extend the file. + */ + if (block->live.avail.bytes < (uint64_t)size) + goto append; + if (block->allocfirst) { + if (!__block_first_srch(block->live.avail.off, size, estack)) + goto append; + ext = *estack[0]; + } else { + __block_size_srch(block->live.avail.sz, size, sstack); + if ((szp = *sstack[0]) == NULL) { +append: WT_RET(__block_extend(session, block, offp, size)); + WT_RET(__block_append(session, + &block->live.alloc, *offp, (wt_off_t)size)); + return (0); + } + + /* Take the first record. */ + ext = szp->off[0]; + } + + /* Remove the record, and set the returned offset. */ + WT_RET(__block_off_remove(session, &block->live.avail, ext->off, &ext)); + *offp = ext->off; + + /* If doing a partial allocation, adjust the record and put it back. */ + if (ext->size > size) { + WT_RET(__wt_verbose(session, WT_VERB_BLOCK, + "allocate %" PRIdMAX " from range %" PRIdMAX "-%" + PRIdMAX ", range shrinks to %" PRIdMAX "-%" PRIdMAX, + (intmax_t)size, + (intmax_t)ext->off, (intmax_t)(ext->off + ext->size), + (intmax_t)(ext->off + size), + (intmax_t)(ext->off + size + ext->size - size))); + + ext->off += size; + ext->size -= size; + WT_RET(__block_ext_insert(session, &block->live.avail, ext)); + } else { + WT_RET(__wt_verbose(session, WT_VERB_BLOCK, + "allocate range %" PRIdMAX "-%" PRIdMAX, + (intmax_t)ext->off, (intmax_t)(ext->off + ext->size))); + + __wt_block_ext_free(session, ext); + } + + /* Add the newly allocated extent to the list of allocations. */ + WT_RET(__block_merge( + session, &block->live.alloc, *offp, (wt_off_t)size)); + return (0); +} + +/* + * __wt_block_free -- + * Free a cookie-referenced chunk of space to the underlying file. 
+ */ +int +__wt_block_free(WT_SESSION_IMPL *session, + WT_BLOCK *block, const uint8_t *addr, size_t addr_size) +{ + WT_DECL_RET; + wt_off_t offset; + uint32_t cksum, size; + + WT_UNUSED(addr_size); + WT_STAT_FAST_DATA_INCR(session, block_free); + + /* Crack the cookie. */ + WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum)); + + WT_RET(__wt_verbose(session, WT_VERB_BLOCK, + "free %" PRIdMAX "/%" PRIdMAX, (intmax_t)offset, (intmax_t)size)); + +#ifdef HAVE_DIAGNOSTIC + WT_RET(__wt_block_misplaced(session, block, "free", offset, size, 1)); +#endif + WT_RET(__wt_block_ext_prealloc(session, 5)); + __wt_spin_lock(session, &block->live_lock); + ret = __wt_block_off_free(session, block, offset, (wt_off_t)size); + __wt_spin_unlock(session, &block->live_lock); + + return (ret); +} + +/* + * __wt_block_off_free -- + * Free a file range to the underlying file. + */ +int +__wt_block_off_free( + WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size) +{ + WT_DECL_RET; + + /* + * Callers of this function are expected to have already acquired any + * locks required to manipulate the extent lists. + * + * We can reuse this extent immediately if it was allocated during this + * checkpoint, merge it into the avail list (which slows file growth in + * workloads including repeated overflow record modification). If this + * extent is referenced in a previous checkpoint, merge into the discard + * list. + */ + if ((ret = __wt_block_off_remove_overlap( + session, &block->live.alloc, offset, size)) == 0) + ret = __block_merge( + session, &block->live.avail, offset, (wt_off_t)size); + else if (ret == WT_NOTFOUND) + ret = __block_merge( + session, &block->live.discard, offset, (wt_off_t)size); + return (ret); +} + +#ifdef HAVE_DIAGNOSTIC +/* + * __wt_block_extlist_check -- + * Return if the extent lists overlap. 
+ */ +int +__wt_block_extlist_check( + WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *bl) +{ + WT_EXT *a, *b; + + a = al->off[0]; + b = bl->off[0]; + + /* Walk the lists in parallel, looking for overlaps. */ + while (a != NULL && b != NULL) { + /* + * If there's no overlap, move the lower-offset entry to the + * next entry in its list. + */ + if (a->off + a->size <= b->off) { + a = a->next[0]; + continue; + } + if (b->off + b->size <= a->off) { + b = b->next[0]; + continue; + } + WT_PANIC_RET(session, EINVAL, + "checkpoint merge check: %s list overlaps the %s list", + al->name, bl->name); + } + return (0); +} +#endif + +/* + * __wt_block_extlist_overlap -- + * Review a checkpoint's alloc/discard extent lists, move overlaps into the + * live system's checkpoint-avail list. + */ +int +__wt_block_extlist_overlap( + WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci) +{ + WT_EXT *alloc, *discard; + + alloc = ci->alloc.off[0]; + discard = ci->discard.off[0]; + + /* Walk the lists in parallel, looking for overlaps. */ + while (alloc != NULL && discard != NULL) { + /* + * If there's no overlap, move the lower-offset entry to the + * next entry in its list. + */ + if (alloc->off + alloc->size <= discard->off) { + alloc = alloc->next[0]; + continue; + } + if (discard->off + discard->size <= alloc->off) { + discard = discard->next[0]; + continue; + } + + /* Reconcile the overlap. */ + WT_RET(__block_ext_overlap(session, block, + &ci->alloc, &alloc, &ci->discard, &discard)); + } + return (0); +} + +/* + * __block_ext_overlap -- + * Reconcile two overlapping ranges. + */ +static int +__block_ext_overlap(WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_EXTLIST *ael, WT_EXT **ap, WT_EXTLIST *bel, WT_EXT **bp) +{ + WT_EXT *a, *b, **ext; + WT_EXTLIST *avail, *el; + wt_off_t off, size; + + avail = &block->live.ckpt_avail; + + /* + * The ranges overlap, choose the range we're going to take from each. 
+ * + * We can think of the overlap possibilities as 11 different cases: + * + * AAAAAAAAAAAAAAAAAA + * #1 BBBBBBBBBBBBBBBBBB ranges are the same + * #2 BBBBBBBBBBBBB overlaps the beginning + * #3 BBBBBBBBBBBBBBBB overlaps the end + * #4 BBBBB B is a prefix of A + * #5 BBBBBB B is middle of A + * #6 BBBBBBBBBB B is a suffix of A + * + * and: + * + * BBBBBBBBBBBBBBBBBB + * #7 AAAAAAAAAAAAA same as #3 + * #8 AAAAAAAAAAAAAAAA same as #2 + * #9 AAAAA A is a prefix of B + * #10 AAAAAA A is middle of B + * #11 AAAAAAAAAA A is a suffix of B + * + * + * By swapping the arguments so "A" is always the lower range, we can + * eliminate cases #2, #8, #10 and #11, and only handle 7 cases: + * + * AAAAAAAAAAAAAAAAAA + * #1 BBBBBBBBBBBBBBBBBB ranges are the same + * #3 BBBBBBBBBBBBBBBB overlaps the end + * #4 BBBBB B is a prefix of A + * #5 BBBBBB B is middle of A + * #6 BBBBBBBBBB B is a suffix of A + * + * and: + * + * BBBBBBBBBBBBBBBBBB + * #7 AAAAAAAAAAAAA same as #3 + * #9 AAAAA A is a prefix of B + */ + a = *ap; + b = *bp; + if (a->off > b->off) { /* Swap */ + b = *ap; + a = *bp; + ext = ap; ap = bp; bp = ext; + el = ael; ael = bel; bel = el; + } + + if (a->off == b->off) { /* Case #1, #4, #9 */ + if (a->size == b->size) { /* Case #1 */ + /* + * Move caller's A and B to the next element + * Add that A and B range to the avail list + * Delete A and B + */ + *ap = (*ap)->next[0]; + *bp = (*bp)->next[0]; + WT_RET(__block_merge(session, avail, b->off, b->size)); + WT_RET(__block_off_remove(session, ael, a->off, NULL)); + WT_RET(__block_off_remove(session, bel, b->off, NULL)); + } + else if (a->size > b->size) { /* Case #4 */ + /* + * Remove A from its list + * Increment/Decrement A's offset/size by the size of B + * Insert A on its list + */ + WT_RET(__block_off_remove(session, ael, a->off, &a)); + a->off += b->size; + a->size -= b->size; + WT_RET(__block_ext_insert(session, ael, a)); + + /* + * Move caller's B to the next element + * Add B's range to the avail list + * Delete 
B + */ + *bp = (*bp)->next[0]; + WT_RET(__block_merge(session, avail, b->off, b->size)); + WT_RET(__block_off_remove(session, bel, b->off, NULL)); + } else { /* Case #9 */ + /* + * Remove B from its list + * Increment/Decrement B's offset/size by the size of A + * Insert B on its list + */ + WT_RET(__block_off_remove(session, bel, b->off, &b)); + b->off += a->size; + b->size -= a->size; + WT_RET(__block_ext_insert(session, bel, b)); + + /* + * Move caller's A to the next element + * Add A's range to the avail list + * Delete A + */ + *ap = (*ap)->next[0]; + WT_RET(__block_merge(session, avail, a->off, a->size)); + WT_RET(__block_off_remove(session, ael, a->off, NULL)); + } /* Case #6 */ + } else if (a->off + a->size == b->off + b->size) { + /* + * Remove A from its list + * Decrement A's size by the size of B + * Insert A on its list + */ + WT_RET(__block_off_remove(session, ael, a->off, &a)); + a->size -= b->size; + WT_RET(__block_ext_insert(session, ael, a)); + + /* + * Move caller's B to the next element + * Add B's range to the avail list + * Delete B + */ + *bp = (*bp)->next[0]; + WT_RET(__block_merge(session, avail, b->off, b->size)); + WT_RET(__block_off_remove(session, bel, b->off, NULL)); + } else if /* Case #3, #7 */ + (a->off + a->size < b->off + b->size) { + /* + * Add overlap to the avail list + */ + off = b->off; + size = (a->off + a->size) - b->off; + WT_RET(__block_merge(session, avail, off, size)); + + /* + * Remove A from its list + * Decrement A's size by the overlap + * Insert A on its list + */ + WT_RET(__block_off_remove(session, ael, a->off, &a)); + a->size -= size; + WT_RET(__block_ext_insert(session, ael, a)); + + /* + * Remove B from its list + * Increment/Decrement B's offset/size by the overlap + * Insert B on its list + */ + WT_RET(__block_off_remove(session, bel, b->off, &b)); + b->off += size; + b->size -= size; + WT_RET(__block_ext_insert(session, bel, b)); + } else { /* Case #5 */ + /* Calculate the offset/size of the trailing part 
	   of A. */
		off = b->off + b->size;
		size = (a->off + a->size) - off;

		/*
		 * Remove A from its list
		 * Decrement A's size by trailing part of A plus B's size
		 * Insert A on its list
		 */
		WT_RET(__block_off_remove(session, ael, a->off, &a));
		a->size = b->off - a->off;
		WT_RET(__block_ext_insert(session, ael, a));

		/* Add trailing part of A to A's list as a new element. */
		WT_RET(__block_merge(session, ael, off, size));

		/*
		 * Move caller's B to the next element
		 * Add B's range to the avail list
		 * Delete B
		 */
		*bp = (*bp)->next[0];
		WT_RET(__block_merge(session, avail, b->off, b->size));
		WT_RET(__block_off_remove(session, bel, b->off, NULL));
	}

	return (0);
}

/*
 * __wt_block_extlist_merge --
 *	Merge one extent list into another.
 *
 *	Every extent on list "a" is inserted (coalescing where possible) into
 *	list "b".  Returns 0 on success or a WiredTiger error code.
 */
int
__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b)
{
	WT_EXT *ext;
	WT_EXTLIST tmp;
	u_int i;

	WT_RET(__wt_verbose(
	    session, WT_VERB_BLOCK, "merging %s into %s", a->name, b->name));

	/*
	 * Sometimes the list we are merging is much bigger than the other: if
	 * so, swap the lists around to reduce the amount of work we need to do
	 * during the merge.  The size lists have to match as well, so this is
	 * only possible if both lists are tracking sizes, or neither are.
	 *
	 * NOTE(review): the swap exchanges the byte/entry counts and skiplist
	 * heads of both lists in place, so "a" is mutated as well as "b" --
	 * callers appear to treat "a" as consumed by this call; confirm
	 * against callers.
	 */
	if (a->track_size == b->track_size && a->entries > b->entries) {
		tmp = *a;
		a->bytes = b->bytes;
		b->bytes = tmp.bytes;
		a->entries = b->entries;
		b->entries = tmp.entries;
		for (i = 0; i < WT_SKIP_MAXDEPTH; i++) {
			a->off[i] = b->off[i];
			b->off[i] = tmp.off[i];
			a->sz[i] = b->sz[i];
			b->sz[i] = tmp.sz[i];
		}
	}

	/* Insert (and possibly coalesce) each of a's extents into b. */
	WT_EXT_FOREACH(ext, a->off)
		WT_RET(__block_merge(session, b, ext->off, ext->size));

	return (0);
}

/*
 * __block_append --
 *	Append a new entry to the allocation list.
 */
static int
__block_append(
    WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
{
	WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
	u_int i;

	/* The fast-path append is only legal for lists not tracking sizes. */
	WT_ASSERT(session, el->track_size == 0);

	/*
	 * Identical to __block_merge, when we know the file is being extended,
	 * that is, the information is either going to be used to extend the
	 * last object on the list, or become a new object ending the list.
	 *
	 * The terminating element of the list is cached, check it; otherwise,
	 * get a stack for the last object in the skiplist, check for a simple
	 * extension, and otherwise append a new structure.
	 */
	if ((ext = el->last) != NULL && ext->off + ext->size == off)
		ext->size += size;
	else {
		ext = __block_off_srch_last(el->off, astack);
		if (ext != NULL && ext->off + ext->size == off)
			ext->size += size;
		else {
			WT_RET(__wt_block_ext_alloc(session, &ext));
			ext->off = off;
			ext->size = size;

			/* Link the new element at every skiplist level. */
			for (i = 0; i < ext->depth; ++i)
				*astack[i] = ext;
			++el->entries;
		}

		/* Update the cached end-of-list */
		el->last = ext;
	}
	/* The byte count is updated on every path, extend or append. */
	el->bytes += (uint64_t)size;

	return (0);
}

/*
 * __wt_block_insert_ext --
 *	Insert an extent into an extent list, merging if possible.
 */
int
__wt_block_insert_ext(
    WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
{
	/*
	 * There are currently two copies of this function (this code is a one-
	 * liner that calls the internal version of the function, which means
	 * the compiler should compress out the function call).  It's that way
	 * because the interface is still fluid, I'm not convinced there won't
	 * be a need for a functional split between the internal and external
	 * versions in the future.
	 *
	 * Callers of this function are expected to have already acquired any
	 * locks required to manipulate the extent list.
	 */
	return (__block_merge(session, el, off, size));
}

/*
 * __block_merge --
 *	Insert an extent into an extent list, merging if possible (internal
 * version).
 */
static int
__block_merge(
    WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
{
	WT_EXT *ext, *after, *before;

	/*
	 * Retrieve the records preceding/following the offset.  If the records
	 * are contiguous with the free'd offset, combine records.
	 *
	 * Any overlap between the inserted range and an existing range is a
	 * fatal inconsistency: panic rather than corrupt the list.
	 */
	__block_off_srch_pair(el, off, &before, &after);
	if (before != NULL) {
		if (before->off + before->size > off)
			WT_PANIC_RET(session, EINVAL,
			    "%s: existing range %" PRIdMAX "-%" PRIdMAX
			    " overlaps with merge range %" PRIdMAX "-%" PRIdMAX,
			    el->name,
			    (intmax_t)before->off,
			    (intmax_t)(before->off + before->size),
			    (intmax_t)off, (intmax_t)(off + size));
		/* Only keep "before" if it immediately abuts the new range. */
		if (before->off + before->size != off)
			before = NULL;
	}
	if (after != NULL) {
		if (off + size > after->off)
			WT_PANIC_RET(session, EINVAL,
			    "%s: merge range %" PRIdMAX "-%" PRIdMAX
			    " overlaps with existing range %" PRIdMAX
			    "-%" PRIdMAX,
			    el->name,
			    (intmax_t)off, (intmax_t)(off + size),
			    (intmax_t)after->off,
			    (intmax_t)(after->off + after->size));
		/* Only keep "after" if it immediately abuts the new range. */
		if (off + size != after->off)
			after = NULL;
	}
	if (before == NULL && after == NULL) {
		WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
		    "%s: insert range %" PRIdMAX "-%" PRIdMAX,
		    el->name, (intmax_t)off, (intmax_t)(off + size)));

		return (__block_off_insert(session, el, off, size));
	}

	/*
	 * If the "before" offset range abuts, we'll use it as our new record;
	 * if the "after" offset range also abuts, include its size and remove
	 * it from the system.  Else, only the "after" offset range abuts, use
	 * the "after" offset range as our new record.  In either case, remove
	 * the record we're going to use, adjust it and re-insert it.
	 */
	if (before == NULL) {
		WT_RET(__block_off_remove(session, el, after->off, &ext));

		WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
		    "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
		    PRIdMAX "-%" PRIdMAX,
		    el->name,
		    (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
		    (intmax_t)off, (intmax_t)(off + ext->size + size)));

		ext->off = off;
		ext->size += size;
	} else {
		if (after != NULL) {
			size += after->size;
			WT_RET(
			    __block_off_remove(session, el, after->off, NULL));
		}
		WT_RET(__block_off_remove(session, el, before->off, &ext));

		WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
		    "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
		    PRIdMAX "-%" PRIdMAX,
		    el->name,
		    (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
		    (intmax_t)ext->off,
		    (intmax_t)(ext->off + ext->size + size)));

		ext->size += size;
	}
	return (__block_ext_insert(session, el, ext));
}

/*
 * __wt_block_extlist_read_avail --
 *	Read an avail extent list, includes minor special handling.
 */
int
__wt_block_extlist_read_avail(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size)
{
	WT_DECL_RET;

	/* If there isn't a list, we're done. */
	if (el->offset == WT_BLOCK_INVALID_OFFSET)
		return (0);

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, reads are checked against the available and
	 * discard lists (a block being read should never appear on either).
	 * Checkpoint threads may be running in the file, don't race with
	 * them.
	 */
	__wt_spin_lock(session, &block->live_lock);
#endif

	WT_ERR(__wt_block_extlist_read(session, block, el, ckpt_size));

	/*
	 * Extent blocks are allocated from the available list: if reading the
	 * avail list, the extent blocks might be included, remove them.
	 */
	WT_ERR_NOTFOUND_OK(
	    __wt_block_off_remove_overlap(session, el, el->offset, el->size));

err:
#ifdef HAVE_DIAGNOSTIC
	__wt_spin_unlock(session, &block->live_lock);
#endif

	return (ret);
}

/*
 * __wt_block_extlist_read --
 *	Read an extent list.
 */
int
__wt_block_extlist_read(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	wt_off_t off, size;
	int (*func)(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t);
	const uint8_t *p;

	/* If there isn't a list, we're done. */
	if (el->offset == WT_BLOCK_INVALID_OFFSET)
		return (0);

	WT_RET(__wt_scr_alloc(session, el->size, &tmp));
	WT_ERR(__wt_block_read_off(
	    session, block, tmp, el->offset, el->size, el->cksum));

	/* Unpack a variable-length uint64_t and convert it to an offset. */
#define	WT_EXTLIST_READ(p, v) do {					\
	uint64_t _v;							\
	WT_ERR(__wt_vunpack_uint(&(p), 0, &_v));			\
	(v) = (wt_off_t)_v;						\
} while (0)

	/* The list must begin with a magic/0 pair. */
	p = WT_BLOCK_HEADER_BYTE(tmp->mem);
	WT_EXTLIST_READ(p, off);
	WT_EXTLIST_READ(p, size);
	if (off != WT_BLOCK_EXTLIST_MAGIC || size != 0)
		goto corrupted;

	/*
	 * If we're not creating both offset and size skiplists, use the simpler
	 * append API, otherwise do a full merge.  There are two reasons for the
	 * test: first, checkpoint "available" lists are NOT sorted (checkpoints
	 * write two separate lists, both of which are sorted but they're not
	 * merged).  Second, the "available" list is sorted by size as well as
	 * by offset, and the fast-path append code doesn't support that, it's
	 * limited to offset.  The test of "track size" is short-hand for "are
	 * we reading the "available" list.
	 */
	func = el->track_size == 0 ? __block_append : __block_merge;
	for (;;) {
		WT_EXTLIST_READ(p, off);
		WT_EXTLIST_READ(p, size);
		if (off == WT_BLOCK_INVALID_OFFSET)
			break;

		/*
		 * We check the offset/size pairs represent valid file ranges,
		 * then insert them into the list.  We don't necessarily have
		 * to check for offsets past the end of the checkpoint, but it's
		 * a cheap test to do here and we'd have to do the check as part
		 * of file verification, regardless.
		 *
		 * Note the "corrupted" label below is also the forward target
		 * of the magic-pair check above.
		 */
		if (off < block->allocsize ||
		    off % block->allocsize != 0 ||
		    size % block->allocsize != 0 ||
		    off + size > ckpt_size)
corrupted:		WT_PANIC_RET(session, WT_ERROR,
			    "file contains a corrupted %s extent list, range %"
			    PRIdMAX "-%" PRIdMAX " past end-of-file",
			    el->name,
			    (intmax_t)off, (intmax_t)(off + size));

		WT_ERR(func(session, el, off, size));
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_BLOCK))
		WT_ERR(__block_extlist_dump(session, "read extlist", el, 0));

err:	__wt_scr_free(&tmp);
	return (ret);
}

/*
 * __wt_block_extlist_write --
 *	Write an extent list at the tail of the file.
 */
int
__wt_block_extlist_write(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_EXT *ext;
	WT_PAGE_HEADER *dsk;
	size_t size;
	uint32_t entries;
	uint8_t *p;

	if (WT_VERBOSE_ISSET(session, WT_VERB_BLOCK))
		WT_RET(__block_extlist_dump(session, "write extlist", el, 0));

	/*
	 * Figure out how many entries we're writing -- if there aren't any
	 * entries, we're done.
	 */
	entries = el->entries + (additional == NULL ? 0 : additional->entries);
	if (entries == 0) {
		el->offset = WT_BLOCK_INVALID_OFFSET;
		el->cksum = el->size = 0;
		return (0);
	}

	/*
	 * Get a scratch buffer, clear the page's header and data, initialize
	 * the header.
	 *
	 * Allocate memory for the extent list entries plus two additional
	 * entries: the initial WT_BLOCK_EXTLIST_MAGIC/0 pair and the list-
	 * terminating WT_BLOCK_INVALID_OFFSET/0 pair.
	 */
	size = (entries + 2) * 2 * WT_INTPACK64_MAXSIZE;
	WT_RET(__wt_block_write_size(session, block, &size));
	WT_RET(__wt_scr_alloc(session, size, &tmp));
	dsk = tmp->mem;
	memset(dsk, 0, WT_BLOCK_HEADER_BYTE_SIZE);
	dsk->type = WT_PAGE_BLOCK_MANAGER;

	/* Pack a file offset into the buffer as a variable-length uint64_t. */
#define	WT_EXTLIST_WRITE(p, v)						\
	WT_ERR(__wt_vpack_uint(&(p), 0, (uint64_t)(v)))

	/* Fill the page's data. */
	p = WT_BLOCK_HEADER_BYTE(dsk);
	WT_EXTLIST_WRITE(p, WT_BLOCK_EXTLIST_MAGIC);	/* Initial value */
	WT_EXTLIST_WRITE(p, 0);
	WT_EXT_FOREACH(ext, el->off) {			/* Free ranges */
		WT_EXTLIST_WRITE(p, ext->off);
		WT_EXTLIST_WRITE(p, ext->size);
	}
	if (additional != NULL)
		WT_EXT_FOREACH(ext, additional->off) {	/* Free ranges */
			WT_EXTLIST_WRITE(p, ext->off);
			WT_EXTLIST_WRITE(p, ext->size);
		}
	WT_EXTLIST_WRITE(p, WT_BLOCK_INVALID_OFFSET);	/* Ending value */
	WT_EXTLIST_WRITE(p, 0);

	dsk->u.datalen = WT_PTRDIFF32(p, WT_BLOCK_HEADER_BYTE(dsk));
	tmp->size = dsk->mem_size = WT_PTRDIFF32(p, dsk);

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The extent list is written as a valid btree page because the salvage
	 * functionality might move into the btree layer some day, besides, we
	 * don't need another format and this way the page format can be easily
	 * verified.
	 */
	WT_ERR(__wt_verify_dsk(session, "[extent list check]", tmp));
#endif

	/* Write the extent list to disk. */
	WT_ERR(__wt_block_write_off(
	    session, block, tmp, &el->offset, &el->size, &el->cksum, 1, 1));

	/*
	 * Remove the allocated blocks from the system's allocation list, extent
	 * blocks never appear on any allocation list.
	 */
	WT_TRET(__wt_block_off_remove_overlap(
	    session, &block->live.alloc, el->offset, el->size));

	WT_ERR(__wt_verbose(session, WT_VERB_BLOCK,
	    "%s written %" PRIdMAX "/%" PRIu32,
	    el->name, (intmax_t)el->offset, el->size));

err:	__wt_scr_free(&tmp);
	return (ret);
}

/*
 * __wt_block_extlist_truncate --
 *	Truncate the file based on the last available extent in the list.
+ */ +int +__wt_block_extlist_truncate( + WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el) +{ + WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH]; + WT_FH *fh; + wt_off_t orig, size; + + fh = block->fh; + + /* + * Check if the last available extent is at the end of the file, and if + * so, truncate the file and discard the extent. + */ + if ((ext = __block_off_srch_last(el->off, astack)) == NULL) + return (0); + WT_ASSERT(session, ext->off + ext->size <= fh->size); + if (ext->off + ext->size < fh->size) + return (0); + + /* + * Remove the extent list entry. (Save the value, we need it to reset + * the cached file size, and that can't happen until after the extent + * list removal succeeds.) + */ + orig = fh->size; + size = ext->off; + WT_RET(__block_off_remove(session, el, size, NULL)); + fh->size = size; + + /* + * Truncate the file. The truncate might fail if there's a file mapping + * (if there's an open checkpoint on the file), that's OK, we'll ignore + * those blocks. + */ + WT_RET(__wt_verbose(session, WT_VERB_BLOCK, + "truncate file from %" PRIdMAX " to %" PRIdMAX, + (intmax_t)orig, (intmax_t)size)); + WT_RET_BUSY_OK(__wt_ftruncate(session, block->fh, size)); + + return (0); +} + +/* + * __wt_block_extlist_init -- + * Initialize an extent list. + */ +int +__wt_block_extlist_init(WT_SESSION_IMPL *session, + WT_EXTLIST *el, const char *name, const char *extname, int track_size) +{ + size_t size; + + WT_CLEAR(*el); + + size = (name == NULL ? 0 : strlen(name)) + + strlen(".") + (extname == NULL ? 0 : strlen(extname) + 1); + WT_RET(__wt_calloc_def(session, size, &el->name)); + (void)snprintf(el->name, size, "%s.%s", + name == NULL ? "" : name, extname == NULL ? "" : extname); + + el->offset = WT_BLOCK_INVALID_OFFSET; + el->track_size = track_size; + return (0); +} + +/* + * __wt_block_extlist_free -- + * Discard an extent list. 
 */
void
__wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el)
{
	WT_EXT *ext, *next;
	WT_SIZE *szp, *nszp;

	__wt_free(session, el->name);

	/* Walk level 0 of each skiplist, freeing every node. */
	for (ext = el->off[0]; ext != NULL; ext = next) {
		next = ext->next[0];
		__wt_free(session, ext);
	}
	for (szp = el->sz[0]; szp != NULL; szp = nszp) {
		nszp = szp->next[0];
		__wt_free(session, szp);
	}

	/* Extent lists are re-used, clear them. */
	WT_CLEAR(*el);
}

/*
 * __block_extlist_dump --
 *	Dump an extent list as verbose messages.
 */
static int
__block_extlist_dump(
    WT_SESSION_IMPL *session, const char *tag, WT_EXTLIST *el, int show_size)
{
	WT_EXT *ext;
	WT_SIZE *szp;

	WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
	    "%s: %s: %" PRIu64 " bytes, by offset:%s",
	    tag, el->name, el->bytes, el->entries == 0 ? " [Empty]" : ""));
	if (el->entries == 0)
		return (0);

	WT_EXT_FOREACH(ext, el->off)
		WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
		    "\t{%" PRIuMAX "/%" PRIuMAX "}",
		    (uintmax_t)ext->off, (uintmax_t)ext->size));

	/* The by-size dump is optional, the list may not track sizes. */
	if (!show_size)
		return (0);

	WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
	    "%s: %s: by size:%s",
	    tag, el->name, el->entries == 0 ? " [Empty]" : ""));
	if (el->entries == 0)
		return (0);

	WT_EXT_FOREACH(szp, el->sz) {
		WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
		    "\t{%" PRIuMAX "}", (uintmax_t)szp->size));
		WT_EXT_FOREACH_OFF(ext, szp->off)
			WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
			    "\t\t{%" PRIuMAX "/%" PRIuMAX "}",
			    (uintmax_t)ext->off, (uintmax_t)ext->size));
	}
	return (0);
}
diff --git a/src/third_party/wiredtiger/src/block/block_map.c b/src/third_party/wiredtiger/src/block/block_map.c
new file mode 100644
index 00000000000..68fb75179d9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_map.c
@@ -0,0 +1,65 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_block_map --
 *	Map a segment of the file in, if possible.
 *
 *	On return *mapp/*maplenp describe the mapping, or are NULL/0 when
 *	mapping is disabled or fails; mapping failure is not an error, the
 *	caller falls back to reading through the cache.
 */
int
__wt_block_map(
    WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp,
    void **mappingcookie)
{
	*(void **)mapp = NULL;
	*maplenp = 0;

	/*
	 * Turn off mapping when verifying the file, because we can't perform
	 * checksum validation of mapped segments, and verify has to checksum
	 * pages.
	 */
	if (block->verify)
		return (0);

	/*
	 * Turn off mapping when direct I/O is configured for the file, the
	 * Linux open(2) documentation says applications should avoid mixing
	 * mmap(2) of files with direct I/O to the same files.
	 */
	if (block->fh->direct_io)
		return (0);

	/*
	 * Turn off mapping if the application configured a cache size maximum,
	 * we can't control how much of the cache size we use in that case.
	 */
	if (block->os_cache_max != 0)
		return (0);

	/*
	 * Map the file into memory.
	 * Ignore errors, we'll read the file through the cache if map fails.
	 */
	(void)__wt_mmap(session, block->fh, mapp, maplenp, mappingcookie);

	return (0);
}

/*
 * __wt_block_unmap --
 *	Unmap any mapped-in segment of the file.
 */
int
__wt_block_unmap(
    WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen,
    void **mappingcookie)
{
	/* Unmap the file from memory. */
	return (__wt_munmap(session, block->fh, map, maplen, mappingcookie));
}
diff --git a/src/third_party/wiredtiger/src/block/block_mgr.c b/src/third_party/wiredtiger/src/block/block_mgr.c
new file mode 100644
index 00000000000..4f7f2898de5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_mgr.c
@@ -0,0 +1,433 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static void __bm_method_set(WT_BM *, int);

/*
 * __bm_readonly --
 *	General-purpose "writes not supported on this handle" function.
 *
 *	Always fails with ENOTSUP; installed in place of every mutating
 *	method on read-only (checkpoint) handles.
 */
static int
__bm_readonly(WT_BM *bm, WT_SESSION_IMPL *session)
{
	WT_RET_MSG(session, ENOTSUP,
	    "%s: write operation on read-only checkpoint handle",
	    bm->block->name);
}

/*
 * __bm_addr_string --
 *	Return a printable string representation of an address cookie.
 */
static int
__bm_addr_string(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	return (
	    __wt_block_addr_string(session, bm->block, buf, addr, addr_size));
}

/*
 * __bm_addr_valid --
 *	Return if an address cookie is valid.
 */
static int
__bm_addr_valid(WT_BM *bm,
    WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
	return (__wt_block_addr_valid(
	    session, bm->block, addr, addr_size, bm->is_live));
}

/*
 * __bm_block_header --
 *	Return the size of the block header.
 */
static u_int
__bm_block_header(WT_BM *bm)
{
	return (__wt_block_header(bm->block));
}

/*
 * __bm_checkpoint --
 *	Write a buffer into a block, creating a checkpoint.
 */
static int
__bm_checkpoint(WT_BM *bm,
    WT_SESSION_IMPL *session, WT_ITEM *buf, WT_CKPT *ckptbase, int data_cksum)
{
	return (__wt_block_checkpoint(
	    session, bm->block, buf, ckptbase, data_cksum));
}

/*
 * __bm_sync --
 *	Flush a file to disk.
 */
static int
__bm_sync(WT_BM *bm, WT_SESSION_IMPL *session, int async)
{
	return (async ?
	    __wt_fsync_async(session, bm->block->fh) :
	    __wt_fsync(session, bm->block->fh));
}

/*
 * __bm_checkpoint_load --
 *	Load a checkpoint.
 */
static int
__bm_checkpoint_load(WT_BM *bm, WT_SESSION_IMPL *session,
    const uint8_t *addr, size_t addr_size,
    uint8_t *root_addr, size_t *root_addr_sizep, int checkpoint)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);

	/* If not opening a checkpoint, we're opening the live system. */
	bm->is_live = !checkpoint;
	WT_RET(__wt_block_checkpoint_load(session, bm->block,
	    addr, addr_size, root_addr, root_addr_sizep, checkpoint));

	if (checkpoint) {
		/*
		 * Read-only objects are optionally mapped into memory instead
		 * of being read into cache buffers.
		 */
		if (conn->mmap)
			WT_RET(__wt_block_map(session, bm->block,
			    &bm->map, &bm->maplen, &bm->mappingcookie));

		/*
		 * If this handle is for a checkpoint, that is, read-only, there
		 * isn't a lot you can do with it.  Although the btree layer
		 * prevents attempts to write a checkpoint reference, paranoia
		 * is healthy.
		 */
		__bm_method_set(bm, 1);
	}

	return (0);
}

/*
 * __bm_checkpoint_resolve --
 *	Resolve the checkpoint.
 */
static int
__bm_checkpoint_resolve(WT_BM *bm, WT_SESSION_IMPL *session)
{
	return (__wt_block_checkpoint_resolve(session, bm->block));
}

/*
 * __bm_checkpoint_unload --
 *	Unload a checkpoint point.
 */
static int
__bm_checkpoint_unload(WT_BM *bm, WT_SESSION_IMPL *session)
{
	WT_DECL_RET;

	/* Unmap any mapped segment. */
	if (bm->map != NULL)
		WT_TRET(__wt_block_unmap(session,
		    bm->block, bm->map, bm->maplen, &bm->mappingcookie));

	/* Unload the checkpoint. */
	WT_TRET(__wt_block_checkpoint_unload(session, bm->block, !bm->is_live));

	return (ret);
}

/*
 * __bm_close --
 *	Close a file.
 */
static int
__bm_close(WT_BM *bm, WT_SESSION_IMPL *session)
{
	WT_DECL_RET;

	if (bm == NULL)				/* Safety check */
		return (0);

	ret = __wt_block_close(session, bm->block);

	__wt_overwrite_and_free(session, bm);
	return (ret);
}

/*
 * __bm_compact_start --
 *	Start a block manager compaction.
 */
static int
__bm_compact_start(WT_BM *bm, WT_SESSION_IMPL *session)
{
	return (__wt_block_compact_start(session, bm->block));
}

/*
 * __bm_compact_page_skip --
 *	Return if a page is useful for compaction.
 */
static int
__bm_compact_page_skip(WT_BM *bm, WT_SESSION_IMPL *session,
    const uint8_t *addr, size_t addr_size, int *skipp)
{
	return (__wt_block_compact_page_skip(
	    session, bm->block, addr, addr_size, skipp));
}

/*
 * __bm_compact_skip --
 *	Return if a file can be compacted.
 */
static int
__bm_compact_skip(WT_BM *bm, WT_SESSION_IMPL *session, int *skipp)
{
	return (__wt_block_compact_skip(session, bm->block, skipp));
}

/*
 * __bm_compact_end --
 *	End a block manager compaction.
 */
static int
__bm_compact_end(WT_BM *bm, WT_SESSION_IMPL *session)
{
	return (__wt_block_compact_end(session, bm->block));
}

/*
 * __bm_free --
 *	Free a block of space to the underlying file.
 */
static int
__bm_free(WT_BM *bm,
    WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
	return (__wt_block_free(session, bm->block, addr, addr_size));
}

/*
 * __bm_stat --
 *	Block-manager statistics.
 */
static int
__bm_stat(WT_BM *bm, WT_SESSION_IMPL *session, WT_DSRC_STATS *stats)
{
	__wt_block_stat(session, bm->block, stats);
	return (0);
}

/*
 * __bm_write --
 *	Write a buffer into a block, returning the block's address cookie.
 */
static int
__bm_write(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int data_cksum)
{
	return (__wt_block_write(
	    session, bm->block, buf, addr, addr_sizep, data_cksum));
}

/*
 * __bm_write_size --
 *	Return the buffer size required to write a block.
 */
static int
__bm_write_size(WT_BM *bm, WT_SESSION_IMPL *session, size_t *sizep)
{
	return (__wt_block_write_size(session, bm->block, sizep));
}

/*
 * __bm_salvage_start --
 *	Start a block manager salvage.
 */
static int
__bm_salvage_start(WT_BM *bm, WT_SESSION_IMPL *session)
{
	return (__wt_block_salvage_start(session, bm->block));
}

/*
 * __bm_salvage_valid --
 *	Inform salvage a block is valid.
 */
static int
__bm_salvage_valid(WT_BM *bm,
    WT_SESSION_IMPL *session, uint8_t *addr, size_t addr_size, int valid)
{
	return (__wt_block_salvage_valid(
	    session, bm->block, addr, addr_size, valid));
}

/*
 * __bm_salvage_next --
 *	Return the next block from the file.
 */
static int
__bm_salvage_next(WT_BM *bm,
    WT_SESSION_IMPL *session, uint8_t *addr, size_t *addr_sizep, int *eofp)
{
	return (__wt_block_salvage_next(
	    session, bm->block, addr, addr_sizep, eofp));
}

/*
 * __bm_salvage_end --
 *	End a block manager salvage.
 */
static int
__bm_salvage_end(WT_BM *bm, WT_SESSION_IMPL *session)
{
	return (__wt_block_salvage_end(session, bm->block));
}

/*
 * __bm_verify_start --
 *	Start a block manager verify.
 */
static int
__bm_verify_start(WT_BM *bm, WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
{
	return (__wt_block_verify_start(session, bm->block, ckptbase));
}

/*
 * __bm_verify_addr --
 *	Verify an address.
 */
static int
__bm_verify_addr(WT_BM *bm,
    WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
	return (__wt_block_verify_addr(session, bm->block, addr, addr_size));
}

/*
 * __bm_verify_end --
 *	End a block manager verify.
 */
static int
__bm_verify_end(WT_BM *bm, WT_SESSION_IMPL *session)
{
	return (__wt_block_verify_end(session, bm->block));
}

/*
 * __bm_method_set --
 *	Set up the legal methods.
 */
static void
__bm_method_set(WT_BM *bm, int readonly)
{
	/*
	 * For read-only handles, every mutating method is replaced with
	 * __bm_readonly, cast to the method's type.
	 *
	 * NOTE(review): calling __bm_readonly through a function pointer of a
	 * different prototype is technically undefined behavior in C; it only
	 * works because __bm_readonly never touches the extra arguments and
	 * the platform ABIs in use tolerate the mismatch -- consider
	 * per-signature readonly stubs instead.
	 */
	if (readonly) {
		bm->addr_string = __bm_addr_string;
		bm->addr_valid = __bm_addr_valid;
		bm->block_header = __bm_block_header;
		bm->checkpoint = (int (*)(WT_BM *,
		    WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, int))__bm_readonly;
		bm->checkpoint_load = __bm_checkpoint_load;
		bm->checkpoint_resolve =
		    (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
		bm->checkpoint_unload = __bm_checkpoint_unload;
		bm->close = __bm_close;
		bm->compact_end =
		    (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
		bm->compact_page_skip = (int (*)(WT_BM *, WT_SESSION_IMPL *,
		    const uint8_t *, size_t, int *))__bm_readonly;
		bm->compact_skip = (int (*)
		    (WT_BM *, WT_SESSION_IMPL *, int *))__bm_readonly;
		bm->compact_start =
		    (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
		bm->free = (int (*)(WT_BM *,
		    WT_SESSION_IMPL *, const uint8_t *, size_t))__bm_readonly;
		bm->preload = __wt_bm_preload;
		bm->read = __wt_bm_read;
		bm->salvage_end = (int (*)
		    (WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
		bm->salvage_next = (int (*)(WT_BM *, WT_SESSION_IMPL *,
		    uint8_t *, size_t *, int *))__bm_readonly;
		bm->salvage_start = (int (*)
		    (WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
		bm->salvage_valid = (int (*)(WT_BM *,
		    WT_SESSION_IMPL *, uint8_t *, size_t, int))__bm_readonly;
		bm->stat = __bm_stat;
		bm->sync =
		    (int (*)(WT_BM *, WT_SESSION_IMPL *, int))__bm_readonly;
		bm->verify_addr = __bm_verify_addr;
		bm->verify_end = __bm_verify_end;
		bm->verify_start = __bm_verify_start;
		bm->write = (int (*)(WT_BM *, WT_SESSION_IMPL *,
		    WT_ITEM *, uint8_t *, size_t *, int))__bm_readonly;
		bm->write_size = (int (*)
		    (WT_BM *, WT_SESSION_IMPL *, size_t *))__bm_readonly;
	} else {
		bm->addr_string = __bm_addr_string;
		bm->addr_valid = __bm_addr_valid;
		bm->block_header = __bm_block_header;
		bm->checkpoint = __bm_checkpoint;
		bm->checkpoint_load = __bm_checkpoint_load;
		bm->checkpoint_resolve = __bm_checkpoint_resolve;
		bm->checkpoint_unload = __bm_checkpoint_unload;
		bm->close = __bm_close;
		bm->compact_end = __bm_compact_end;
		bm->compact_page_skip = __bm_compact_page_skip;
		bm->compact_skip = __bm_compact_skip;
		bm->compact_start = __bm_compact_start;
		bm->free = __bm_free;
		bm->preload = __wt_bm_preload;
		bm->read = __wt_bm_read;
		bm->salvage_end = __bm_salvage_end;
		bm->salvage_next = __bm_salvage_next;
		bm->salvage_start = __bm_salvage_start;
		bm->salvage_valid = __bm_salvage_valid;
		bm->stat = __bm_stat;
		bm->sync = __bm_sync;
		bm->verify_addr = __bm_verify_addr;
		bm->verify_end = __bm_verify_end;
		bm->verify_start = __bm_verify_start;
		bm->write = __bm_write;
		bm->write_size = __bm_write_size;
	}
}

/*
 * __wt_block_manager_open --
 *	Open a file.
 */
int
__wt_block_manager_open(WT_SESSION_IMPL *session,
    const char *filename, const char *cfg[],
    int forced_salvage, int readonly, uint32_t allocsize, WT_BM **bmp)
{
	WT_BM *bm;
	WT_DECL_RET;

	*bmp = NULL;

	WT_RET(__wt_calloc_def(session, 1, &bm));
	__bm_method_set(bm, 0);

	WT_ERR(__wt_block_open(session, filename, cfg,
	    forced_salvage, readonly, allocsize, &bm->block));

	*bmp = bm;
	return (0);

	/* The close method also discards the WT_BM structure itself. */
err:	WT_TRET(bm->close(bm, session));
	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c
new file mode 100644
index 00000000000..2fbaa0fe331
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_open.c
@@ -0,0 +1,330 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int __desc_read(WT_SESSION_IMPL *, WT_BLOCK *);

/*
 * __wt_block_manager_truncate --
 *	Truncate a file.
 */
int
__wt_block_manager_truncate(
    WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize)
{
	WT_DECL_RET;
	WT_FH *fh;

	/* Open the underlying file handle. */
	WT_RET(__wt_open(session, filename, 0, 0, WT_FILE_TYPE_DATA, &fh));

	/* Truncate the file. */
	WT_ERR(__wt_ftruncate(session, fh, (wt_off_t)0));

	/* Write out the file's meta-data. */
	ret = __wt_desc_init(session, fh, allocsize);

	/* Close the file handle. */
err:	WT_TRET(__wt_close(session, fh));

	return (ret);
}

/*
 * __wt_block_manager_create --
 *	Create a file.
 */
int
__wt_block_manager_create(
    WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize)
{
	WT_DECL_RET;
	WT_FH *fh;

	/* Create the underlying file and open a handle. */
	WT_RET(__wt_open(session, filename, 1, 1, WT_FILE_TYPE_DATA, &fh));

	/* Write out the file's meta-data. */
	ret = __wt_desc_init(session, fh, allocsize);

	/* Close the file handle. */
	WT_TRET(__wt_close(session, fh));

	/* Undo any create on error. */
	if (ret != 0)
		WT_TRET(__wt_remove(session, filename));

	return (ret);
}

/*
 * __block_destroy --
 *	Destroy a block handle.
 *
 *	Removes the handle from the connection's block queue and discards all
 *	of its resources; the caller must hold the connection's block lock.
 */
static int
__block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	conn = S2C(session);
	TAILQ_REMOVE(&conn->blockqh, block, q);

	if (block->name != NULL)
		__wt_free(session, block->name);

	if (block->fh != NULL)
		WT_TRET(__wt_close(session, block->fh));

	__wt_spin_destroy(session, &block->live_lock);

	__wt_overwrite_and_free(session, block);

	return (ret);
}

/*
 * __wt_block_open --
 *	Open a block handle.
+ */ +int +__wt_block_open(WT_SESSION_IMPL *session, + const char *filename, const char *cfg[], + int forced_salvage, int readonly, uint32_t allocsize, WT_BLOCK **blockp) +{ + WT_BLOCK *block; + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + WT_TRET(__wt_verbose(session, WT_VERB_BLOCK, "open: %s", filename)); + + conn = S2C(session); + *blockp = NULL; + + __wt_spin_lock(session, &conn->block_lock); + TAILQ_FOREACH(block, &conn->blockqh, q) + if (strcmp(filename, block->name) == 0) { + ++block->ref; + *blockp = block; + __wt_spin_unlock(session, &conn->block_lock); + return (0); + } + + /* Basic structure allocation, initialization. */ + WT_ERR(__wt_calloc_def(session, 1, &block)); + block->ref = 1; + TAILQ_INSERT_HEAD(&conn->blockqh, block, q); + + WT_ERR(__wt_strdup(session, filename, &block->name)); + block->allocsize = allocsize; + + WT_ERR(__wt_config_gets(session, cfg, "block_allocation", &cval)); + block->allocfirst = + WT_STRING_MATCH("first", cval.str, cval.len) ? 1 : 0; + + /* Configuration: optional OS buffer cache maximum size. */ + WT_ERR(__wt_config_gets(session, cfg, "os_cache_max", &cval)); + block->os_cache_max = (size_t)cval.val; +#ifdef HAVE_POSIX_FADVISE + if (conn->direct_io && block->os_cache_max) + WT_ERR_MSG(session, EINVAL, + "os_cache_max not supported in combination with direct_io"); +#else + if (block->os_cache_max) + WT_ERR_MSG(session, EINVAL, + "os_cache_max not supported if posix_fadvise not " + "available"); +#endif + + /* Configuration: optional immediate write scheduling flag. */ + WT_ERR(__wt_config_gets(session, cfg, "os_cache_dirty_max", &cval)); + block->os_cache_dirty_max = (size_t)cval.val; +#ifdef HAVE_SYNC_FILE_RANGE + if (conn->direct_io && block->os_cache_dirty_max) + WT_ERR_MSG(session, EINVAL, + "os_cache_dirty_max not supported in combination with " + "direct_io"); +#else + if (block->os_cache_dirty_max) { + /* + * Ignore any setting if it is not supported. 
+ */ + block->os_cache_dirty_max = 0; + WT_ERR(__wt_verbose(session, WT_VERB_BLOCK, + "os_cache_dirty_max ignored when sync_file_range not " + "available")); + } +#endif + + /* Open the underlying file handle. */ + WT_ERR(__wt_open(session, filename, 0, 0, + readonly ? WT_FILE_TYPE_CHECKPOINT : WT_FILE_TYPE_DATA, + &block->fh)); + + /* Initialize the live checkpoint's lock. */ + WT_ERR(__wt_spin_init(session, &block->live_lock, "block manager")); + + /* + * Read the description information from the first block. + * + * Salvage is a special case: if we're forcing the salvage, we don't + * look at anything, including the description information. + */ + if (!forced_salvage) + WT_ERR(__desc_read(session, block)); + + *blockp = block; + __wt_spin_unlock(session, &conn->block_lock); + return (0); + +err: WT_TRET(__block_destroy(session, block)); + __wt_spin_unlock(session, &conn->block_lock); + return (ret); +} + +/* + * __wt_block_close -- + * Close a block handle. + */ +int +__wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + if (block == NULL) /* Safety check */ + return (0); + + conn = S2C(session); + + WT_TRET(__wt_verbose(session, WT_VERB_BLOCK, + "close: %s", block->name == NULL ? "" : block->name )); + + __wt_spin_lock(session, &conn->block_lock); + + /* Reference count is initialized to 1. */ + if (block->ref == 0 || --block->ref == 0) + WT_TRET(__block_destroy(session, block)); + + __wt_spin_unlock(session, &conn->block_lock); + + return (ret); +} + +/* + * __wt_desc_init -- + * Write a file's initial descriptor structure. + */ +int +__wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize) +{ + WT_BLOCK_DESC *desc; + WT_DECL_ITEM(buf); + WT_DECL_RET; + + /* Use a scratch buffer to get correct alignment for direct I/O. 
*/ + WT_RET(__wt_scr_alloc(session, allocsize, &buf)); + memset(buf->mem, 0, allocsize); + + desc = buf->mem; + desc->magic = WT_BLOCK_MAGIC; + desc->majorv = WT_BLOCK_MAJOR_VERSION; + desc->minorv = WT_BLOCK_MINOR_VERSION; + + /* Update the checksum. */ + desc->cksum = 0; + desc->cksum = __wt_cksum(desc, allocsize); + + ret = __wt_write(session, fh, (wt_off_t)0, (size_t)allocsize, desc); + + __wt_scr_free(&buf); + return (ret); +} + +/* + * __desc_read -- + * Read and verify the file's metadata. + */ +static int +__desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block) +{ + WT_BLOCK_DESC *desc; + WT_DECL_ITEM(buf); + WT_DECL_RET; + uint32_t cksum; + + /* Use a scratch buffer to get correct alignment for direct I/O. */ + WT_RET(__wt_scr_alloc(session, block->allocsize, &buf)); + + /* Read the first allocation-sized block and verify the file format. */ + WT_ERR(__wt_read(session, + block->fh, (wt_off_t)0, (size_t)block->allocsize, buf->mem)); + + desc = buf->mem; + WT_ERR(__wt_verbose(session, WT_VERB_BLOCK, + "%s: magic %" PRIu32 + ", major/minor: %" PRIu32 "/%" PRIu32 + ", checksum %#" PRIx32, + block->name, desc->magic, + desc->majorv, desc->minorv, + desc->cksum)); + + /* + * We fail the open if the checksum fails, or the magic number is wrong + * or the major/minor numbers are unsupported for this version. This + * test is done even if the caller is verifying or salvaging the file: + * it makes sense for verify, and for salvage we don't overwrite files + * without some reason to believe they are WiredTiger files. The user + * may have entered the wrong file name, and is now frantically pounding + * their interrupt key. 
+ */ + cksum = desc->cksum; + desc->cksum = 0; + if (desc->magic != WT_BLOCK_MAGIC || + cksum != __wt_cksum(desc, block->allocsize)) + WT_ERR_MSG(session, WT_ERROR, + "%s does not appear to be a WiredTiger file", block->name); + + if (desc->majorv > WT_BLOCK_MAJOR_VERSION || + (desc->majorv == WT_BLOCK_MAJOR_VERSION && + desc->minorv > WT_BLOCK_MINOR_VERSION)) + WT_ERR_MSG(session, WT_ERROR, + "unsupported WiredTiger file version: this build only " + "supports major/minor versions up to %d/%d, and the file " + "is version %d/%d", + WT_BLOCK_MAJOR_VERSION, WT_BLOCK_MINOR_VERSION, + desc->majorv, desc->minorv); + +err: __wt_scr_free(&buf); + return (ret); +} + +/* + * __wt_block_stat -- + * Block statistics + */ +void +__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats) +{ + /* + * We're looking inside the live system's structure, which normally + * requires locking: the chances of a corrupted read are probably + * non-existent, and it's statistics information regardless, but it + * isn't like this is a common function for an application to call. + */ + __wt_spin_lock(session, &block->live_lock); + WT_STAT_SET(stats, allocation_size, block->allocsize); + WT_STAT_SET(stats, block_checkpoint_size, block->live.ckpt_size); + WT_STAT_SET(stats, block_magic, WT_BLOCK_MAGIC); + WT_STAT_SET(stats, block_major, WT_BLOCK_MAJOR_VERSION); + WT_STAT_SET(stats, block_minor, WT_BLOCK_MINOR_VERSION); + WT_STAT_SET(stats, block_reuse_bytes, block->live.avail.bytes); + WT_STAT_SET(stats, block_size, block->fh->size); + __wt_spin_unlock(session, &block->live_lock); +} diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c new file mode 100644 index 00000000000..c528ee4a6aa --- /dev/null +++ b/src/third_party/wiredtiger/src/block/block_read.c @@ -0,0 +1,212 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
 */

#include "wt_internal.h"

/*
 * __wt_bm_preload --
 *	Pre-load a page.
 */
int
__wt_bm_preload(WT_BM *bm,
    WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
    WT_BLOCK *block;
    WT_DECL_RET;
    wt_off_t offset;
    uint32_t cksum, size;
    int mapped;

    WT_UNUSED(addr_size);
    block = bm->block;
    ret = EINVAL;        /* Play games due to conditional compilation */

    /* Crack the cookie: decode offset/size/checksum from the address. */
    WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));

    /* Check for a mapped block. */
    mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
    if (mapped)
        WT_RET(__wt_mmap_preload(
            session, (uint8_t *)bm->map + offset, size));
    else {
#ifdef HAVE_POSIX_FADVISE
        ret = posix_fadvise(block->fh->fd,
            (wt_off_t)offset, (wt_off_t)size, POSIX_FADV_WILLNEED);
#endif
        /*
         * Fall back to an ordinary read if posix_fadvise is missing
         * or failed (ret stays EINVAL when not compiled in).
         */
        if (ret != 0) {
            WT_DECL_ITEM(tmp);
            WT_RET(__wt_scr_alloc(session, size, &tmp));
            ret = __wt_block_read_off(
                session, block, tmp, offset, size, cksum);
            __wt_scr_free(&tmp);
            WT_RET(ret);
        }
    }

    WT_STAT_FAST_CONN_INCR(session, block_preload);

    return (0);
}

/*
 * __wt_bm_read --
 *	Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
    WT_BLOCK *block;
    int mapped;
    wt_off_t offset;
    uint32_t cksum, size;

    WT_UNUSED(addr_size);
    block = bm->block;

    /* Crack the cookie. */
    WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));

    /*
     * Map the block if it's possible: point the caller's buffer directly
     * into the mapped region, avoiding a copy.
     */
    mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
    if (mapped) {
        buf->data = (uint8_t *)bm->map + offset;
        buf->size = size;
        WT_RET(__wt_mmap_preload(session, buf->data, buf->size));

        WT_STAT_FAST_CONN_INCR(session, block_map_read);
        WT_STAT_FAST_CONN_INCRV(session, block_byte_map_read, size);
        return (0);
    }

#ifdef HAVE_DIAGNOSTIC
    /*
     * In diagnostic mode, verify the block we're about to read isn't on
     * the available list, or for live systems, the discard list.
     */
    WT_RET(__wt_block_misplaced(
        session, block, "read", offset, size, bm->is_live));
#endif
    /* Read the block. */
    WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));

#ifdef HAVE_POSIX_FADVISE
    /* Optionally discard blocks from the system's buffer cache. */
    if (block->os_cache_max != 0 &&
        (block->os_cache += size) > block->os_cache_max) {
        WT_DECL_RET;

        block->os_cache = 0;
        /* Ignore EINVAL - some file systems don't support the flag. */
        if ((ret = posix_fadvise(block->fh->fd,
            (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0 &&
            ret != EINVAL)
            WT_RET_MSG(
                session, ret, "%s: posix_fadvise", block->name);
    }
#endif
    return (0);
}

#ifdef HAVE_DIAGNOSTIC
/*
 * __wt_block_read_off_blind --
 *	Read the block at an offset, try to figure out what it looks like,
 *	debugging only.
 */
int
__wt_block_read_off_blind(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset)
{
    WT_BLOCK_HEADER *blk;
    uint32_t cksum, size;

    /*
     * Make sure the buffer is large enough for the header and read the
     * the first allocation-size block.
     */
    WT_RET(__wt_buf_init(session, buf, block->allocsize));
    WT_RET(__wt_read(
        session, block->fh, offset, (size_t)block->allocsize, buf->mem));
    blk = WT_BLOCK_HEADER_REF(buf->mem);

    /*
     * Copy out the size and checksum (we're about to re-use the buffer),
     * and if the size isn't insane, read the rest of the block.
     */
    size = blk->disk_size;
    cksum = blk->cksum;
    if (__wt_block_offset_invalid(block, offset, size))
        WT_RET_MSG(session, EINVAL,
            "block at offset %" PRIuMAX " cannot be a valid block, no "
            "read attempted",
            (uintmax_t)offset);
    return (__wt_block_read_off(session, block, buf, offset, size, cksum));
}
#endif

/*
 * __wt_block_read_off --
 *	Read an addr/size pair referenced block into a buffer, verifying the
 *	checksum.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum)
{
    WT_BLOCK_HEADER *blk;
    size_t bufsize;
    uint32_t page_cksum;

    WT_RET(__wt_verbose(session, WT_VERB_READ,
        "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
        (uintmax_t)offset, size, cksum));

    /*
     * Grow the buffer as necessary and read the block.  Buffers should be
     * aligned for reading, but there are lots of buffers (for example, file
     * cursors have two buffers each, key and value), and it's difficult to
     * be sure we've found all of them.  If the buffer isn't aligned, it's
     * an easy fix: set the flag and guarantee we reallocate it.  (Most of
     * the time on reads, the buffer memory has not yet been allocated, so
     * we're not adding any additional processing time.)
     */
    if (F_ISSET(buf, WT_ITEM_ALIGNED))
        bufsize = size;
    else {
        F_SET(buf, WT_ITEM_ALIGNED);
        bufsize = WT_MAX(size, buf->memsize + 10);
    }
    WT_RET(__wt_buf_init(session, buf, bufsize));
    WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
    buf->size = size;

    /*
     * Zero the on-page checksum field before checksumming, matching the
     * state the page was in when its checksum was computed.
     */
    blk = WT_BLOCK_HEADER_REF(buf->mem);
    blk->cksum = 0;
    page_cksum = __wt_cksum(buf->mem,
        F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP);
    if (cksum != page_cksum) {
        if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
            __wt_errx(session,
                "read checksum error [%"
                PRIu32 "B @ %" PRIuMAX ", %"
                PRIu32 " != %" PRIu32 "]",
                size, (uintmax_t)offset, cksum, page_cksum);

        /* Panic if a checksum fails during an ordinary read. */
        return (block->verify ||
            F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
            WT_ERROR :
            __wt_illegal_value(session, block->name));
    }

    WT_STAT_FAST_CONN_INCR(session, block_read);
    WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size);
    return (0);
}
diff --git a/src/third_party/wiredtiger/src/block/block_session.c b/src/third_party/wiredtiger/src/block/block_session.c
new file mode 100644
index 00000000000..fa56b72f49b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_session.c
@@ -0,0 +1,305 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * Per session handle cached block manager information.
 */
typedef struct {
    WT_EXT  *ext_cache;            /* List of WT_EXT handles */
    u_int    ext_cache_cnt;        /* Count */

    WT_SIZE *sz_cache;             /* List of WT_SIZE handles */
    u_int    sz_cache_cnt;         /* Count */
} WT_BLOCK_MGR_SESSION;

/*
 * __block_ext_alloc --
 *	Allocate a new WT_EXT structure.
 */
static int
__block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp)
{
    WT_EXT *ext;

    u_int skipdepth;

    /*
     * WT_EXT carries a variable-depth skiplist; allocate room for twice
     * the chosen depth of forward pointers (two lists share the array).
     */
    skipdepth = __wt_skip_choose_depth(session);
    WT_RET(__wt_calloc(session, 1,
        sizeof(WT_EXT) + skipdepth * 2 * sizeof(WT_EXT *), &ext));
    ext->depth = (uint8_t)skipdepth;
    (*extp) = ext;

    return (0);
}

/*
 * __wt_block_ext_alloc --
 *	Return a WT_EXT structure for use.
 */
int
__wt_block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp)
{
    WT_EXT *ext;
    WT_BLOCK_MGR_SESSION *bms;
    u_int i;

    bms = session->block_manager;

    /* Return a WT_EXT structure for use from a cached list. */
    if (bms != NULL && bms->ext_cache != NULL) {
        ext = bms->ext_cache;
        bms->ext_cache = ext->next[0];

        /* Clear any left-over references. */
        for (i = 0; i < ext->depth; ++i)
            ext->next[i] = ext->next[i + ext->depth] = NULL;

        /*
         * The count is advisory to minimize our exposure to bugs, but
         * don't let it go negative.
         */
        if (bms->ext_cache_cnt > 0)
            --bms->ext_cache_cnt;

        *extp = ext;
        return (0);
    }

    return (__block_ext_alloc(session, extp));
}

/*
 * __block_ext_prealloc --
 *	Pre-allocate WT_EXT structures.
 */
static int
__block_ext_prealloc(WT_SESSION_IMPL *session, u_int max)
{
    WT_BLOCK_MGR_SESSION *bms;
    WT_EXT *ext;

    bms = session->block_manager;

    /* Push newly allocated structures onto the cache until max cached. */
    for (; bms->ext_cache_cnt < max; ++bms->ext_cache_cnt) {
        WT_RET(__block_ext_alloc(session, &ext));

        ext->next[0] = bms->ext_cache;
        bms->ext_cache = ext;
    }
    return (0);
}

/*
 * __wt_block_ext_free --
 *	Add a WT_EXT structure to the cached list.
 */
void
__wt_block_ext_free(WT_SESSION_IMPL *session, WT_EXT *ext)
{
    WT_BLOCK_MGR_SESSION *bms;

    /* No per-session cache: free immediately. */
    if ((bms = session->block_manager) == NULL)
        __wt_free(session, ext);
    else {
        ext->next[0] = bms->ext_cache;
        bms->ext_cache = ext;

        ++bms->ext_cache_cnt;
    }
}

/*
 * __block_ext_discard --
 *	Discard some or all of the WT_EXT structure cache.
 */
static int
__block_ext_discard(WT_SESSION_IMPL *session, u_int max)
{
    WT_BLOCK_MGR_SESSION *bms;
    WT_EXT *ext, *next;

    bms = session->block_manager;
    /* max == 0 means discard everything; otherwise trim down to max. */
    if (max != 0 && bms->ext_cache_cnt <= max)
        return (0);

    for (ext = bms->ext_cache; ext != NULL;) {
        next = ext->next[0];
        __wt_free(session, ext);
        ext = next;

        --bms->ext_cache_cnt;
        if (max != 0 && bms->ext_cache_cnt <= max)
            break;
    }
    bms->ext_cache = ext;

    /* A full discard with a non-zero remaining count indicates a bug. */
    if (max == 0 && bms->ext_cache_cnt != 0)
        WT_RET_MSG(session, WT_ERROR,
            "incorrect count in session handle's block manager cache");
    return (0);
}

/*
 * __block_size_alloc --
 *	Allocate a new WT_SIZE structure.
+ */ +static int +__block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp) +{ + return (__wt_calloc(session, 1, sizeof(WT_SIZE), szp)); +} + +/* + * __wt_block_size_alloc -- + * Return a WT_SIZE structure for use. + */ +int +__wt_block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp) +{ + WT_BLOCK_MGR_SESSION *bms; + + bms = session->block_manager; + + /* Return a WT_SIZE structure for use from a cached list. */ + if (bms != NULL && bms->sz_cache != NULL) { + (*szp) = bms->sz_cache; + bms->sz_cache = bms->sz_cache->next[0]; + + /* + * The count is advisory to minimize our exposure to bugs, but + * don't let it go negative. + */ + if (bms->sz_cache_cnt > 0) + --bms->sz_cache_cnt; + return (0); + } + + return (__block_size_alloc(session, szp)); +} + +/* + * __block_size_prealloc -- + * Pre-allocate WT_SIZE structures. + */ +static int +__block_size_prealloc(WT_SESSION_IMPL *session, u_int max) +{ + WT_BLOCK_MGR_SESSION *bms; + WT_SIZE *sz; + + bms = session->block_manager; + + for (; bms->sz_cache_cnt < max; ++bms->sz_cache_cnt) { + WT_RET(__block_size_alloc(session, &sz)); + + sz->next[0] = bms->sz_cache; + bms->sz_cache = sz; + } + return (0); +} + +/* + * __wt_block_size_free -- + * Add a WT_SIZE structure to the cached list. + */ +void +__wt_block_size_free(WT_SESSION_IMPL *session, WT_SIZE *sz) +{ + WT_BLOCK_MGR_SESSION *bms; + + if ((bms = session->block_manager) == NULL) + __wt_free(session, sz); + else { + sz->next[0] = bms->sz_cache; + bms->sz_cache = sz; + + ++bms->sz_cache_cnt; + } +} + +/* + * __block_size_discard -- + * Discard some or all of the WT_SIZE structure cache. 
 */
static int
__block_size_discard(WT_SESSION_IMPL *session, u_int max)
{
    WT_BLOCK_MGR_SESSION *bms;
    WT_SIZE *sz, *nsz;

    bms = session->block_manager;
    /* max == 0 means discard everything; otherwise trim down to max. */
    if (max != 0 && bms->sz_cache_cnt <= max)
        return (0);

    for (sz = bms->sz_cache; sz != NULL;) {
        nsz = sz->next[0];
        __wt_free(session, sz);
        sz = nsz;

        --bms->sz_cache_cnt;
        if (max != 0 && bms->sz_cache_cnt <= max)
            break;
    }
    bms->sz_cache = sz;

    /* A full discard with a non-zero remaining count indicates a bug. */
    if (max == 0 && bms->sz_cache_cnt != 0)
        WT_RET_MSG(session, WT_ERROR,
            "incorrect count in session handle's block manager cache");
    return (0);
}

/*
 * __block_manager_session_cleanup --
 *	Clean up the session handle's block manager information: discard
 *	both caches entirely, then free the cache structure itself.
 */
static int
__block_manager_session_cleanup(WT_SESSION_IMPL *session)
{
    WT_DECL_RET;

    if (session->block_manager == NULL)
        return (0);

    WT_TRET(__block_ext_discard(session, 0));
    WT_TRET(__block_size_discard(session, 0));

    __wt_free(session, session->block_manager);

    return (ret);
}

/*
 * __wt_block_ext_prealloc --
 *	Pre-allocate WT_EXT and WT_SIZE structures, lazily creating the
 *	per-session cache and registering its cleanup handler on first use.
 */
int
__wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max)
{
    if (session->block_manager == NULL) {
        WT_RET(__wt_calloc(session, 1,
            sizeof(WT_BLOCK_MGR_SESSION), &session->block_manager));
        session->block_manager_cleanup =
            __block_manager_session_cleanup;
    }
    WT_RET(__block_ext_prealloc(session, max));
    WT_RET(__block_size_prealloc(session, max));
    return (0);
}

/*
 * __wt_block_ext_discard --
 *	Discard WT_EXT and WT_SIZE structures after checkpoint runs.
 */
int
__wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max)
{
    WT_RET(__block_ext_discard(session, max));
    WT_RET(__block_size_discard(session, max));
    return (0);
}
diff --git a/src/third_party/wiredtiger/src/block/block_slvg.c b/src/third_party/wiredtiger/src/block/block_slvg.c
new file mode 100644
index 00000000000..349daa620f5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_slvg.c
@@ -0,0 +1,190 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_block_salvage_start --
 *	Start a file salvage.
 */
int
__wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    wt_off_t len;
    uint32_t allocsize;

    allocsize = block->allocsize;

    /* Reset the description information in the first block. */
    WT_RET(__wt_desc_init(session, block->fh, allocsize));

    /*
     * Salvage creates a new checkpoint when it's finished, set up for
     * rolling an empty file forward.
     */
    WT_RET(__wt_block_ckpt_init(session, &block->live, "live"));

    /*
     * Truncate the file to an allocation-size multiple of blocks (bytes
     * trailing the last block must be garbage, by definition).
     */
    if (block->fh->size > allocsize) {
        len = (block->fh->size / allocsize) * allocsize;
        if (len != block->fh->size)
            WT_RET(__wt_ftruncate(session, block->fh, len));
    } else
        len = allocsize;
    block->live.file_size = len;

    /*
     * The file's first allocation-sized block is description information,
     * skip it when reading through the file.
     */
    block->slvg_off = allocsize;

    /*
     * The only checkpoint extent we care about is the allocation list.
     * Start with the entire file on the allocation list, we'll "free"
     * any blocks we don't want as we process the file.
     */
    WT_RET(__wt_block_insert_ext(
        session, &block->live.alloc, allocsize, len - allocsize));

    return (0);
}

/*
 * __wt_block_salvage_end --
 *	End a file salvage.
 */
int
__wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    /* Discard the checkpoint. */
    return (__wt_block_checkpoint_unload(session, block, 0));
}

/*
 * __wt_block_offset_invalid --
 *	Return if the block offset is insane.  Returns 1 for an invalid
 *	offset/size pair, 0 for a plausible one.
 */
int
__wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size)
{
    if (size == 0)                /* < minimum page size */
        return (1);
    if (size % block->allocsize != 0)    /* not allocation-size units */
        return (1);
    if (size > WT_BTREE_PAGE_SIZE_MAX)    /* > maximum page size */
        return (1);
                        /* past end-of-file */
    if (offset + (wt_off_t)size > block->fh->size)
        return (1);
    return (0);
}

/*
 * __wt_block_salvage_next --
 *	Return the address for the next potential block from the file.
 */
int
__wt_block_salvage_next(WT_SESSION_IMPL *session,
    WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp)
{
    WT_BLOCK_HEADER *blk;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    WT_FH *fh;
    wt_off_t max, offset;
    uint32_t allocsize, cksum, size;
    uint8_t *endp;

    *eofp = 0;

    fh = block->fh;
    allocsize = block->allocsize;
    WT_ERR(__wt_scr_alloc(session, allocsize, &tmp));

    /* Read through the file, looking for pages. */
    for (max = fh->size;;) {
        offset = block->slvg_off;
        if (offset >= max) {        /* Check eof. */
            *eofp = 1;
            goto done;
        }

        /*
         * Read the start of a possible page (an allocation-size block),
         * and get a page length from it.  Move to the next allocation
         * sized boundary, we'll never consider this one again.
         */
        WT_ERR(__wt_read(
            session, fh, offset, (size_t)allocsize, tmp->mem));
        blk = WT_BLOCK_HEADER_REF(tmp->mem);
        size = blk->disk_size;
        cksum = blk->cksum;

        /*
         * Check the block size: if it's not insane, read the block.
         * Reading the block validates any checksum; if reading the
         * block succeeds, return its address as a possible page,
         * otherwise, move past it.
         */
        if (!__wt_block_offset_invalid(block, offset, size) &&
            __wt_block_read_off(
            session, block, tmp, offset, size, cksum) == 0)
            break;

        /* Free the allocation-size block. */
        WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
            "skipping %" PRIu32 "B at file offset %" PRIuMAX,
            allocsize, (uintmax_t)offset));
        WT_ERR(__wt_block_off_free(
            session, block, offset, (wt_off_t)allocsize));
        block->slvg_off += allocsize;
    }

    /* Re-create the address cookie that should reference this block. */
    endp = addr;
    WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
    *addr_sizep = WT_PTRDIFF(endp, addr);

done:
err:    __wt_scr_free(&tmp);
    return (ret);
}

/*
 * __wt_block_salvage_valid --
 *	Let salvage know if a block is valid.
 */
int
__wt_block_salvage_valid(WT_SESSION_IMPL *session,
    WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid)
{
    wt_off_t offset;
    uint32_t size, cksum;

    /*
     * NOTE(review): session is marked unused here but is passed to
     * __wt_block_off_free in the !valid branch below; the WT_UNUSED is
     * stale/contradictory, though harmless ((void) cast).
     */
    WT_UNUSED(session);
    WT_UNUSED(addr_size);

    /*
     * Crack the cookie.
     * If the upper layer took the block, move past it; if the upper layer
     * rejected the block, move past an allocation size chunk and free it.
     */
    WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
    if (valid)
        block->slvg_off = offset + size;
    else {
        WT_RET(__wt_block_off_free(
            session, block, offset, (wt_off_t)block->allocsize));
        block->slvg_off = offset + block->allocsize;
    }

    return (0);
}
diff --git a/src/third_party/wiredtiger/src/block/block_vrfy.c b/src/third_party/wiredtiger/src/block/block_vrfy.c
new file mode 100644
index 00000000000..148b4fa9743
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_vrfy.c
@@ -0,0 +1,514 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int __verify_ckptfrag_add(
    WT_SESSION_IMPL *, WT_BLOCK *, wt_off_t, wt_off_t);
static int __verify_ckptfrag_chk(WT_SESSION_IMPL *, WT_BLOCK *);
static int __verify_filefrag_add(
    WT_SESSION_IMPL *, WT_BLOCK *, const char *, wt_off_t, wt_off_t, int);
static int __verify_filefrag_chk(WT_SESSION_IMPL *, WT_BLOCK *);
static int __verify_last_avail(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
static int __verify_last_truncate(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);

/*
 * The bit list ignores the first block: convert to/from a frag/offset.
 *
 * NOTE(review): the first macro's name appears to be a mechanical
 * rename artifact (likely WT_OFF_TO_FRAG run through an off_t ->
 * wt_off_t substitution); it compiles fine and is used as-is elsewhere
 * in this file, so it is left unchanged here.
 */
#define	WT_wt_off_tO_FRAG(block, off)					\
	((off) / (block)->allocsize - 1)
#define	WT_FRAG_TO_OFF(block, frag)					\
	(((wt_off_t)(frag + 1)) * (block)->allocsize)

/*
 * __wt_block_verify_start --
 *	Start file verification.
 */
int
__wt_block_verify_start(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
    WT_CKPT *ckpt;
    wt_off_t size;

    /*
     * Find the last checkpoint in the list: if there are none, or the only
     * checkpoint we have is fake, there's no work to do.  Don't complain,
     * that's not our problem to solve.
     */
    WT_CKPT_FOREACH(ckptbase, ckpt)
        ;
    for (;; --ckpt) {
        if (ckpt->name != NULL && !F_ISSET(ckpt, WT_CKPT_FAKE))
            break;
        if (ckpt == ckptbase)
            return (0);
    }

    /* Truncate the file to the size of the last checkpoint. */
    WT_RET(__verify_last_truncate(session, block, ckpt));

    /*
     * We're done if the file has no data pages (this happens if we verify
     * a file immediately after creation or the checkpoint doesn't reflect
     * any of the data pages).
     */
    size = block->fh->size;
    if (size <= block->allocsize)
        return (0);

    /* The file size should be a multiple of the allocation size. */
    if (size % block->allocsize != 0)
        WT_RET_MSG(session, WT_ERROR,
            "the file size is not a multiple of the allocation size");

    /*
     * Allocate a bit array, where each bit represents a single allocation
     * size piece of the file (this is how we track the parts of the file
     * we've verified, and check for multiply referenced or unreferenced
     * blocks).  Storing this on the heap seems reasonable, verifying a 1TB
     * file with an 512B allocation size would require a 256MB bit array:
     *
     *	(((1 * 2^40) / 512) / 8) = 256 * 2^20
     *
     * To verify larger files than we can handle in this way, we'd have to
     * write parts of the bit array into a disk file.
     *
     * Alternatively, we could switch to maintaining ranges of the file as
     * we do with the extents, but that has its own failure mode, where we
     * verify many non-contiguous blocks creating too many entries on the
     * list to fit into memory.
     */
    block->frags = (uint64_t)WT_wt_off_tO_FRAG(block, size);
    WT_RET(__bit_alloc(session, block->frags, &block->fragfile));

    /*
     * We maintain an allocation list that is rolled forward through the
     * set of checkpoints.
     */
    WT_RET(__wt_block_extlist_init(
        session, &block->verify_alloc, "verify", "alloc", 0));

    /*
     * The only checkpoint avail list we care about is the last one written;
     * get it now and initialize the list of file fragments.
     */
    WT_RET(__verify_last_avail(session, block, ckpt));

    block->verify = 1;
    return (0);
}

/*
 * __verify_last_avail --
 *	Get the last checkpoint's avail list and load it into the list of file
 * fragments.
 */
static int
__verify_last_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
{
    WT_BLOCK_CKPT *ci, _ci;
    WT_DECL_RET;
    WT_EXT *ext;
    WT_EXTLIST *el;

    /* Crack the checkpoint cookie into a local WT_BLOCK_CKPT. */
    ci = &_ci;
    WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
    WT_ERR(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));

    /* Mark every avail-list extent as a seen, single-reference fragment. */
    el = &ci->avail;
    if (el->offset != WT_BLOCK_INVALID_OFFSET) {
        WT_ERR(__wt_block_extlist_read_avail(
            session, block, el, ci->file_size));
        WT_EXT_FOREACH(ext, el->off)
            if ((ret = __verify_filefrag_add(session, block,
                "avail-list chunk", ext->off, ext->size, 1)) != 0)
                break;
    }

err:    __wt_block_ckpt_destroy(session, ci);
    return (ret);
}

/*
 * __verify_last_truncate --
 *	Truncate the file to the last checkpoint's size.
 */
static int
__verify_last_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
{
    WT_BLOCK_CKPT *ci, _ci;
    WT_DECL_RET;

    ci = &_ci;
    WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
    WT_ERR(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
    WT_ERR(__wt_ftruncate(session, block->fh, ci->file_size));

err:    __wt_block_ckpt_destroy(session, ci);
    return (ret);
}

/*
 * __wt_block_verify_end --
 *	End file verification, checking for unverified file blocks and
 *	releasing all verification state.
 */
int
__wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    WT_DECL_RET;

    /* Confirm we verified every file block. */
    ret = __verify_filefrag_chk(session, block);

    /* Discard the accumulated allocation list. */
    __wt_block_extlist_free(session, &block->verify_alloc);

    /* Discard the fragment tracking lists. */
    __wt_free(session, block->fragfile);
    __wt_free(session, block->fragckpt);

    block->verify = 0;
    return (ret);
}

/*
 * __wt_verify_ckpt_load --
 *	Verify work done when a checkpoint is loaded.
 */
int
__wt_verify_ckpt_load(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci)
{
    WT_EXTLIST *el;
    WT_EXT *ext;
    uint64_t frag, frags;

    /* Set the maximum file size for this checkpoint. */
    block->verify_size = ci->file_size;

    /*
     * Add the root page and disk blocks used to store the extent lists to
     * the list of blocks we've "seen" from the file.
     */
    if (ci->root_offset != WT_BLOCK_INVALID_OFFSET)
        WT_RET(__verify_filefrag_add(session, block, "checkpoint",
            ci->root_offset, (wt_off_t)ci->root_size, 1));
    if (ci->alloc.offset != WT_BLOCK_INVALID_OFFSET)
        WT_RET(__verify_filefrag_add(session, block, "alloc list",
            ci->alloc.offset, (wt_off_t)ci->alloc.size, 1));
    if (ci->avail.offset != WT_BLOCK_INVALID_OFFSET)
        WT_RET(__verify_filefrag_add(session, block, "avail list",
            ci->avail.offset, (wt_off_t)ci->avail.size, 1));
    if (ci->discard.offset != WT_BLOCK_INVALID_OFFSET)
        WT_RET(__verify_filefrag_add(session, block, "discard list",
            ci->discard.offset, (wt_off_t)ci->discard.size, 1));

    /*
     * Checkpoint verification is similar to deleting checkpoints.  As we
     * read each new checkpoint, we merge the allocation lists (accumulating
     * all allocated pages as we move through the system), and then remove
     * any pages found in the discard list.  The result should be a
     * one-to-one mapping to the pages we find in this specific checkpoint.
     */
    el = &ci->alloc;
    if (el->offset != WT_BLOCK_INVALID_OFFSET) {
        WT_RET(__wt_block_extlist_read(
            session, block, el, ci->file_size));
        WT_RET(__wt_block_extlist_merge(
            session, el, &block->verify_alloc));
        __wt_block_extlist_free(session, el);
    }
    el = &ci->discard;
    if (el->offset != WT_BLOCK_INVALID_OFFSET) {
        WT_RET(__wt_block_extlist_read(
            session, block, el, ci->file_size));
        WT_EXT_FOREACH(ext, el->off)
            WT_RET(__wt_block_off_remove_overlap(session,
                &block->verify_alloc, ext->off, ext->size));
        __wt_block_extlist_free(session, el);
    }

    /*
     * The root page of the checkpoint appears on the alloc list, but not,
     * at least until the checkpoint is deleted, on a discard list.  To
     * handle this case, remove the root page from the accumulated list of
     * checkpoint pages, so it doesn't add a new requirement for subsequent
     * checkpoints.
     */
    if (ci->root_offset != WT_BLOCK_INVALID_OFFSET)
        WT_RET(__wt_block_off_remove_overlap(session,
            &block->verify_alloc, ci->root_offset, ci->root_size));

    /*
     * Allocate the per-checkpoint bit map.  The per-checkpoint bit map is
     * the opposite of the per-file bit map, that is, we set all the bits
     * that we expect to be set based on the checkpoint's allocation and
     * discard lists, then clear bits as we verify blocks.  When finished
     * verifying the checkpoint, the bit list should be empty.
     */
    WT_RET(__bit_alloc(session, block->frags, &block->fragckpt));
    el = &block->verify_alloc;
    WT_EXT_FOREACH(ext, el->off) {
        frag = (uint64_t)WT_wt_off_tO_FRAG(block, ext->off);
        frags = (uint64_t)(ext->size / block->allocsize);
        __bit_nset(block->fragckpt, frag, frag + (frags - 1));
    }

    return (0);
}

/*
 * __wt_verify_ckpt_unload --
 *	Verify work done when a checkpoint is unloaded.
 */
int
__wt_verify_ckpt_unload(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    WT_DECL_RET;

    /* Confirm we verified every checkpoint block. */
    ret = __verify_ckptfrag_chk(session, block);

    /* Discard the per-checkpoint fragment list. */
    __wt_free(session, block->fragckpt);

    return (ret);
}

/*
 * __wt_block_verify_addr --
 *	Update an address in a checkpoint as verified.
 */
int
__wt_block_verify_addr(WT_SESSION_IMPL *session,
    WT_BLOCK *block, const uint8_t *addr, size_t addr_size)
{
    wt_off_t offset;
    uint32_t cksum, size;

    WT_UNUSED(addr_size);

    /* Crack the cookie. */
    WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));

    /* Add to the per-file list (duplicates allowed across checkpoints). */
    WT_RET(__verify_filefrag_add(session, block, NULL, offset, size, 0));

    /*
     * It's tempting to try and flag a page as "verified" when we read it.
     * That doesn't work because we may visit a page multiple times when
     * verifying a single checkpoint (for example, when verifying the
     * physical image of a row-store leaf page with overflow keys, the
     * overflow keys are read when checking for key sort issues, and read
     * again when more general overflow item checking is done).  This
     * function is called by the btree verification code, once per logical
     * visit in a checkpoint, so we can detect if a page is referenced
     * multiple times within a single checkpoint.  This doesn't apply to
     * the per-file list, because it is expected for the same btree blocks
     * to appear in multiple checkpoints.
     *
     * Add the block to the per-checkpoint list.
     */
    WT_RET(__verify_ckptfrag_add(session, block, offset, size));

    return (0);
}

/*
 * __verify_filefrag_add --
 *	Add the fragments to the per-file fragment list, optionally complain if
 * we've already verified this chunk of the file.
 */
static int
__verify_filefrag_add(WT_SESSION_IMPL *session, WT_BLOCK *block,
    const char *type, wt_off_t offset, wt_off_t size, int nodup)
{
    uint64_t f, frag, frags, i;

    WT_RET(__wt_verbose(session, WT_VERB_VERIFY,
        "add file block%s%s%s at %" PRIuMAX "-%" PRIuMAX " (%" PRIuMAX ")",
        type == NULL ? "" : " (",
        type == NULL ? "" : type,
        type == NULL ? "" : ")",
        (uintmax_t)offset, (uintmax_t)(offset + size), (uintmax_t)size));

    /* Check each chunk against the total file size. */
    if (offset + size > block->fh->size)
        WT_RET_MSG(session, WT_ERROR,
            "fragment %" PRIuMAX "-%" PRIuMAX " references "
            "non-existent file blocks",
            (uintmax_t)offset, (uintmax_t)(offset + size));

    frag = (uint64_t)WT_wt_off_tO_FRAG(block, offset);
    frags = (uint64_t)(size / block->allocsize);

    /*
     * It may be illegal to reference a particular chunk more than once:
     * callers pass nodup != 0 for metadata blocks (root page, extent
     * lists), which must be unique within the file.
     */
    if (nodup)
        for (f = frag, i = 0; i < frags; ++f, ++i)
            if (__bit_test(block->fragfile, f))
                WT_RET_MSG(session, WT_ERROR,
                    "file fragment at %" PRIuMAX " referenced "
                    "multiple times",
                    (uintmax_t)offset);

    /* Add fragments to the file's fragment list. */
    __bit_nset(block->fragfile, frag, frag + (frags - 1));

    return (0);
}

/*
 * __verify_filefrag_chk --
 *    Verify we've checked all the fragments in the file.
 */
static int
__verify_filefrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    uint64_t count, first, last;

    /* If there's nothing to verify, it was a fast run. */
    if (block->frags == 0)
        return (0);

    /*
     * It's OK if we have not verified blocks at the end of the file: that
     * happens if the file is truncated during a checkpoint or load or was
     * extended after writing a checkpoint.  We should never see unverified
     * blocks anywhere else, though.
     *
     * I'm deliberately testing for a last fragment of 0, it makes no sense
     * there would be no fragments verified, complain if the first fragment
     * in the file wasn't verified.
     */
    for (last = block->frags - 1; last != 0; --last) {
        if (__bit_test(block->fragfile, last))
            break;
        /* Setting the bit marks the trailing fragment as "handled". */
        __bit_set(block->fragfile, last);
    }

    /*
     * Check for any other file fragments we haven't verified -- every time
     * we find a bit that's clear, complain.  We re-start the search each
     * time after setting the clear bit(s) we found: it's simpler and this
     * isn't supposed to happen a lot.
     */
    for (count = 0;; ++count) {
        if (__bit_ffc(block->fragfile, block->frags, &first) != 0)
            break;
        __bit_set(block->fragfile, first);
        for (last = first + 1; last < block->frags; ++last) {
            if (__bit_test(block->fragfile, last))
                break;
            __bit_set(block->fragfile, last);
        }

        /* Per-range messages are only emitted in verbose mode. */
        if (!WT_VERBOSE_ISSET(session, WT_VERB_VERIFY))
            continue;

        __wt_errx(session,
            "file range %" PRIuMAX "-%" PRIuMAX " never verified",
            (uintmax_t)WT_FRAG_TO_OFF(block, first),
            (uintmax_t)WT_FRAG_TO_OFF(block, last));
    }
    if (count == 0)
        return (0);

    __wt_errx(session, "file ranges never verified: %" PRIu64, count);
    return (WT_ERROR);
}

/*
 * __verify_ckptfrag_add --
 *    Clear the fragments in the per-checkpoint fragment list, and complain if
 *    we've already verified this chunk of the checkpoint.
 */
static int
__verify_ckptfrag_add(
    WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size)
{
    uint64_t f, frag, frags, i;

    WT_RET(__wt_verbose(session, WT_VERB_VERIFY,
        "add checkpoint block at %" PRIuMAX "-%" PRIuMAX " (%" PRIuMAX ")",
        (uintmax_t)offset, (uintmax_t)(offset + size), (uintmax_t)size));

    /*
     * Check each chunk against the checkpoint's size, a checkpoint should
     * never reference a block outside of the checkpoint's stored size.
     */
    if (offset + size > block->verify_size)
        WT_RET_MSG(session, WT_ERROR,
            "fragment %" PRIuMAX "-%" PRIuMAX " references "
            "file blocks outside the checkpoint",
            (uintmax_t)offset, (uintmax_t)(offset + size));

    frag = (uint64_t)WT_wt_off_tO_FRAG(block, offset);
    frags = (uint64_t)(size / block->allocsize);

    /*
     * It is illegal to reference a particular chunk more than once: a
     * clear bit means either the chunk was never on the checkpoint's
     * allocation list, or we already verified (and cleared) it.
     */
    for (f = frag, i = 0; i < frags; ++f, ++i)
        if (!__bit_test(block->fragckpt, f))
            WT_RET_MSG(session, WT_ERROR,
                "fragment at %" PRIuMAX " referenced multiple "
                "times in a single checkpoint or found in the "
                "checkpoint but not listed in the checkpoint's "
                "allocation list",
                (uintmax_t)offset);

    /* Remove fragments from the checkpoint's allocation list. */
    __bit_nclr(block->fragckpt, frag, frag + (frags - 1));

    return (0);
}

/*
 * __verify_ckptfrag_chk --
 *    Verify we've checked all the fragments in the checkpoint.
 */
static int
__verify_ckptfrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    uint64_t count, first, last;

    /*
     * The checkpoint fragment memory is only allocated as a checkpoint
     * is successfully loaded; don't check if there's nothing there.
     */
    if (block->fragckpt == NULL)
        return (0);

    /*
     * Check for checkpoint fragments we haven't verified -- every time we
     * find a bit that's set, complain.  We re-start the search each time
     * after clearing the set bit(s) we found: it's simpler and this isn't
     * supposed to happen a lot.
 */
    for (count = 0;; ++count) {
        if (__bit_ffs(block->fragckpt, block->frags, &first) != 0)
            break;
        __bit_clear(block->fragckpt, first);
        for (last = first + 1; last < block->frags; ++last) {
            if (!__bit_test(block->fragckpt, last))
                break;
            __bit_clear(block->fragckpt, last);
        }

        /* Per-range messages are only emitted in verbose mode. */
        if (!WT_VERBOSE_ISSET(session, WT_VERB_VERIFY))
            continue;

        __wt_errx(session,
            "checkpoint range %" PRIuMAX "-%" PRIuMAX " never verified",
            (uintmax_t)WT_FRAG_TO_OFF(block, first),
            (uintmax_t)WT_FRAG_TO_OFF(block, last));
    }

    if (count == 0)
        return (0);

    __wt_errx(session,
        "checkpoint ranges never verified: %" PRIu64, count);
    return (WT_ERROR);
}
diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c
new file mode 100644
index 00000000000..0da6380e61f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_write.c
@@ -0,0 +1,269 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_block_header --
 *    Return the size of the block-specific header.
 */
u_int
__wt_block_header(WT_BLOCK *block)
{
    WT_UNUSED(block);

    return ((u_int)WT_BLOCK_HEADER_SIZE);
}

/*
 * __wt_block_write_size --
 *    Return the buffer size required to write a block.  The sizep argument
 *    is in/out: on entry the payload size, on return the payload plus
 *    block header, aligned up to the allocation size.
 */
int
__wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep)
{
    WT_UNUSED(session);

    /*
     * We write the page size, in bytes, into the block's header as a 4B
     * unsigned value, and it's possible for the engine to accept an item
     * we can't write.  For example, a huge key/value where the allocation
     * size has been set to something large will overflow 4B when it tries
     * to align the write.  We could make this work (for example, writing
     * the page size in units of allocation size or something else), but
     * it's not worth the effort, writing 4GB objects into a btree makes
     * no sense.  Limit the writes to (4GB - 1KB), it gives us potential
     * mode bits, and I'm not interested in debugging corner cases anyway.
     */
    *sizep = (size_t)
        WT_ALIGN(*sizep + WT_BLOCK_HEADER_BYTE_SIZE, block->allocsize);
    return (*sizep > UINT32_MAX - 1024 ? EINVAL : 0);
}

/*
 * __wt_block_write --
 *    Write a buffer into a block, returning the block's address cookie.
 *    Thin wrapper: performs the write, then packs offset/size/checksum
 *    into the caller's address buffer and reports the packed length.
 */
int
__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int data_cksum)
{
    wt_off_t offset;
    uint32_t size, cksum;
    uint8_t *endp;

    WT_RET(__wt_block_write_off(
        session, block, buf, &offset, &size, &cksum, data_cksum, 0));

    endp = addr;
    WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
    *addr_sizep = WT_PTRDIFF(endp, addr);

    return (0);
}

/*
 * __wt_block_write_off --
 *    Write a buffer into a block, returning the block's offset, size and
 *    checksum.  If caller_locked is set, the caller already holds the
 *    block's live lock; otherwise this function takes/releases it as
 *    needed for allocation and file extension.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int caller_locked)
{
    WT_BLOCK_HEADER *blk;
    WT_DECL_RET;
    WT_FH *fh;
    size_t align_size;
    wt_off_t offset;
    int local_locked;

    blk = WT_BLOCK_HEADER_REF(buf->mem);
    fh = block->fh;
    local_locked = 0;

    /* Buffers should be aligned for writing. */
    if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
        WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
        WT_RET_MSG(session, EINVAL,
            "direct I/O check: write buffer incorrectly allocated");
    }

    /*
     * Align the size to an allocation unit.
     *
     * The buffer must be big enough for us to zero to the next allocsize
     * boundary, this is one of the reasons the btree layer must find out
     * from the block-manager layer the maximum size of the eventual write.
     */
    align_size = WT_ALIGN(buf->size, block->allocsize);
    if (align_size > buf->memsize) {
        WT_ASSERT(session, align_size <= buf->memsize);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer incorrectly allocated");
    }
    if (align_size > UINT32_MAX) {
        WT_ASSERT(session, align_size <= UINT32_MAX);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer too large to write");
    }

    /* Zero out any unused bytes at the end of the buffer. */
    memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

    /*
     * Set the disk size so we don't have to incrementally read blocks
     * during salvage.
     */
    blk->disk_size = WT_STORE_SIZE(align_size);

    /*
     * Update the block's checksum: if our caller specifies, checksum the
     * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
     * bytes.  The assumption is applications with good compression support
     * turn off checksums and assume corrupted blocks won't decompress
     * correctly.  However, if compression failed to shrink the block, the
     * block wasn't compressed, in which case our caller will tell us to
     * checksum the data to detect corruption.  If compression succeeded,
     * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
     * because they're not compressed, both to give salvage a quick test
     * of whether a block is useful and to give us a test so we don't lose
     * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
     */
    blk->flags = 0;
    if (data_cksum)
        F_SET(blk, WT_BLOCK_DATA_CKSUM);
    /* Zero the field first so it doesn't participate in its own checksum. */
    blk->cksum = 0;
    blk->cksum = __wt_cksum(
        buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

    if (!caller_locked) {
        WT_RET(__wt_block_ext_prealloc(session, 5));
        __wt_spin_lock(session, &block->live_lock);
        local_locked = 1;
    }
    ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);

    /*
     * Extend the file in chunks.  We want to limit the number of threads
     * extending the file at the same time, so choose the one thread that's
     * crossing the extended boundary.  We don't extend newly created files,
     * and it's theoretically possible we might wait so long our extension
     * of the file is passed by another thread writing single blocks, that's
     * why there's a check in case the extended file size becomes too small:
     * if the file size catches up, every thread tries to extend it.
     *
     * File extension may require locking: some variants of the system call
     * used to extend the file initialize the extended space.  If a writing
     * thread races with the extending thread, the extending thread might
     * overwrite already written data, and that would be very, very bad.
     *
     * Some variants of the system call to extend the file fail at run-time
     * based on the filesystem type, fall back to ftruncate in that case,
     * and remember that ftruncate requires locking.
     */
    if (ret == 0 &&
        fh->extend_len != 0 &&
        (fh->extend_size <= fh->size ||
        (offset + fh->extend_len <= fh->extend_size &&
        offset +
        fh->extend_len + (wt_off_t)align_size >= fh->extend_size))) {
        /*
         * NOTE(review): extend_size is updated here even when we are
         * about to drop the lock below (fallocate path); presumably
         * a benign race since a stale value only causes an extra
         * extension attempt -- confirm against the WT_FH locking rules.
         */
        fh->extend_size = offset + fh->extend_len * 2;
        if (fh->fallocate_available) {
            /*
             * Release any locally acquired lock if it's not needed
             * to extend the file, extending the file might require
             * updating file metadata, which can be slow. (It may be
             * a bad idea to configure for file extension on systems
             * that require locking over the extend call.)
             */
            if (!fh->fallocate_requires_locking && local_locked) {
                __wt_spin_unlock(session, &block->live_lock);
                local_locked = 0;
            }

            /*
             * Extend the file.  On ENOTSUP (filesystem doesn't
             * support fallocate) jump into the else branch below
             * and fall back to ftruncate -- a cross-branch goto,
             * legal C, but easy to misread.
             */
            if ((ret = __wt_fallocate(session,
                fh, offset, fh->extend_len * 2)) == ENOTSUP) {
                ret = 0;
                goto extend_truncate;
            }
        } else {
extend_truncate:    /*
             * We may have a caller lock or a locally acquired lock,
             * but we need a lock to call ftruncate.
             */
            if (!caller_locked && local_locked == 0) {
                __wt_spin_lock(session, &block->live_lock);
                local_locked = 1;
            }
            /*
             * The truncate might fail if there's a file mapping
             * (if there's an open checkpoint on the file), that's
             * OK.
             */
            if ((ret = __wt_ftruncate(
                session, fh, offset + fh->extend_len * 2)) == EBUSY)
                ret = 0;
        }
    }
    /* Release any locally acquired lock. */
    if (local_locked) {
        __wt_spin_unlock(session, &block->live_lock);
        local_locked = 0;
    }
    WT_RET(ret);

    /* Write the block; on failure, return the space to the live extent. */
    if ((ret =
        __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
        if (!caller_locked)
            __wt_spin_lock(session, &block->live_lock);
        WT_TRET(__wt_block_off_free(
            session, block, offset, (wt_off_t)align_size));
        if (!caller_locked)
            __wt_spin_unlock(session, &block->live_lock);
        WT_RET(ret);
    }

#ifdef HAVE_SYNC_FILE_RANGE
    /*
     * Optionally schedule writes for dirty pages in the system buffer
     * cache, but only if the current session can wait.
     */
    if (block->os_cache_dirty_max != 0 &&
        (block->os_cache_dirty += align_size) > block->os_cache_dirty_max &&
        __wt_session_can_wait(session)) {
        block->os_cache_dirty = 0;
        WT_RET(__wt_fsync_async(session, fh));
    }
#endif
#ifdef HAVE_POSIX_FADVISE
    /* Optionally discard blocks from the system buffer cache. */
    if (block->os_cache_max != 0 &&
        (block->os_cache += align_size) > block->os_cache_max) {
        block->os_cache = 0;
        if ((ret = posix_fadvise(fh->fd,
            (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
            WT_RET_MSG(
                session, ret, "%s: posix_fadvise", block->name);
    }
#endif
    WT_STAT_FAST_CONN_INCR(session, block_write);
    WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);

    WT_RET(__wt_verbose(session, WT_VERB_WRITE,
        "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
        (uintmax_t)offset, (uintmax_t)align_size, blk->cksum));

    *offsetp = offset;
    *sizep = WT_STORE_SIZE(align_size);
    *cksump = blk->cksum;

    return (ret);
}
diff --git a/src/third_party/wiredtiger/src/bloom/bloom.c b/src/third_party/wiredtiger/src/bloom/bloom.c
new file mode 100644
index 00000000000..8c8c8bc723e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/bloom/bloom.c
@@ -0,0 +1,351 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

#define WT_BLOOM_TABLE_CONFIG "key_format=r,value_format=1t,exclusive=true"

/*
 * __bloom_init --
 *    Allocate a WT_BLOOM handle.
 */
static int
__bloom_init(WT_SESSION_IMPL *session,
    const char *uri, const char *config, WT_BLOOM **bloomp)
{
    WT_BLOOM *bloom;
    WT_DECL_RET;
    size_t len;

    *bloomp = NULL;

    WT_RET(__wt_calloc_def(session, 1, &bloom));

    WT_ERR(__wt_strdup(session, uri, &bloom->uri));
    /* Space for user config + ',' + table config + terminating NUL. */
    len = strlen(WT_BLOOM_TABLE_CONFIG) + 2;
    if (config != NULL)
        len += strlen(config);
    WT_ERR(__wt_calloc_def(session, len, &bloom->config));
    /* Add the standard config at the end, so it overrides user settings. */
    (void)snprintf(bloom->config, len,
        "%s,%s", config == NULL ?
        "" : config, WT_BLOOM_TABLE_CONFIG);

    bloom->session = session;

    *bloomp = bloom;
    return (0);

err:    __wt_free(session, bloom->uri);
    __wt_free(session, bloom->config);
    __wt_free(session, bloom->bitstring);
    __wt_free(session, bloom);
    return (ret);
}

/*
 * __bloom_setup --
 *    Populate the bloom structure.
 *
 * Setup is passed in either the count of items expected (n), or the length of
 * the bitstring (m).  Depends on whether the function is called via create or
 * open.
 */
static int
__bloom_setup(
    WT_BLOOM *bloom, uint64_t n, uint64_t m, uint32_t factor, uint32_t k)
{
    /* At least two hash functions are required for a useful filter. */
    if (k < 2)
        return (EINVAL);

    /*
     * NOTE(review): factor == 0 would divide by zero in the n == 0 branch
     * below; presumably callers always pass a positive factor -- confirm
     * and consider rejecting 0 here like k < 2 is rejected.
     */
    bloom->k = k;
    bloom->factor = factor;
    if (n != 0) {
        /* Create path: bits = expected items * bits-per-item. */
        bloom->n = n;
        bloom->m = bloom->n * bloom->factor;
    } else {
        /* Open path: derive item count from the stored bit count. */
        bloom->m = m;
        bloom->n = bloom->m / bloom->factor;
    }
    return (0);
}

/*
 * __wt_bloom_create --
 *
 * Creates and configures a WT_BLOOM handle, allocates a bitstring in memory to
 * use while populating the bloom filter.
 *
 * count - is the expected number of inserted items
 * factor - is the number of bits to use per inserted item
 * k - is the number of hash values to set or test per item
 */
int
__wt_bloom_create(
    WT_SESSION_IMPL *session, const char *uri, const char *config,
    uint64_t count, uint32_t factor, uint32_t k, WT_BLOOM **bloomp)
{
    WT_BLOOM *bloom;
    WT_DECL_RET;

    WT_RET(__bloom_init(session, uri, config, &bloom));
    WT_ERR(__bloom_setup(bloom, count, 0, factor, k));

    WT_ERR(__bit_alloc(session, bloom->m, &bloom->bitstring));

    *bloomp = bloom;
    return (0);

err:    (void)__wt_bloom_close(bloom);
    return (ret);
}

/*
 * __bloom_open_cursor --
 *    Open a cursor to read from a Bloom filter.  Idempotent: reuses the
 *    cached cursor if one is already open on the handle.
 */
static int
__bloom_open_cursor(WT_BLOOM *bloom, WT_CURSOR *owner)
{
    WT_CURSOR *c;
    WT_SESSION_IMPL *session;
    const char *cfg[3];

    if ((c = bloom->c) != NULL)
        return (0);

    session = bloom->session;
    cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
    cfg[1] = bloom->config;
    cfg[2] = NULL;
    c = NULL;
    WT_RET(__wt_open_cursor(session, bloom->uri, owner, cfg, &c));

    /* XXX Layering violation: bump the cache priority for Bloom filters. */
    ((WT_CURSOR_BTREE *)c)->btree->evict_priority = WT_EVICT_INT_SKEW;

    bloom->c = c;
    return (0);
}

/*
 * __wt_bloom_open --
 *    Open a Bloom filter object for use by a single session.  The filter must
 *    have been created and finalized.
 */
int
__wt_bloom_open(WT_SESSION_IMPL *session,
    const char *uri, uint32_t factor, uint32_t k,
    WT_CURSOR *owner, WT_BLOOM **bloomp)
{
    WT_BLOOM *bloom;
    WT_CURSOR *c;
    WT_DECL_RET;
    uint64_t size;

    WT_RET(__bloom_init(session, uri, NULL, &bloom));
    WT_ERR(__bloom_open_cursor(bloom, owner));
    c = bloom->c;

    /*
     * Find the largest key, to get the size of the filter: the table is
     * keyed by record number, so the last record number is the bit count.
     */
    WT_ERR(c->prev(c));
    WT_ERR(c->get_key(c, &size));
    WT_ERR(c->reset(c));

    WT_ERR(__bloom_setup(bloom, 0, size, factor, k));

    *bloomp = bloom;
    return (0);

err:    (void)__wt_bloom_close(bloom);
    return (ret);
}

/*
 * __wt_bloom_insert --
 *    Adds the given key to the Bloom filter.  Only valid before finalize,
 *    while the in-memory bitstring still exists.
 */
int
__wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key)
{
    uint64_t h1, h2;
    uint32_t i;

    /* Double hashing: bit index i is (h1 + i * h2) mod m. */
    h1 = __wt_hash_fnv64(key->data, key->size);
    h2 = __wt_hash_city64(key->data, key->size);
    for (i = 0; i < bloom->k; i++, h1 += h2) {
        __bit_set(bloom->bitstring, h1 % bloom->m);
    }
    return (0);
}

/*
 * __wt_bloom_finalize --
 *    Writes the Bloom filter to stable storage.  After calling finalize, only
 *    read operations can be performed on the bloom filter.
 */
int
__wt_bloom_finalize(WT_BLOOM *bloom)
{
    WT_CURSOR *c;
    WT_DECL_RET;
    WT_ITEM values;
    WT_SESSION *wt_session;
    uint64_t i;

    wt_session = (WT_SESSION *)bloom->session;
    WT_CLEAR(values);

    /*
     * Create a bit table to store the bloom filter in.
     * TODO: should this call __wt_schema_create directly?
     */
    WT_RET(wt_session->create(wt_session, bloom->uri, bloom->config));
    WT_RET(wt_session->open_cursor(
        wt_session, bloom->uri, NULL, "bulk=bitmap", &c));

    /*
     * Add the entries from the array into the table.  Note i counts BITS:
     * values.data is advanced in bytes (i >> 3) while values.size is a bit
     * count consumed by the bitmap bulk cursor -- presumably the bulk-load
     * path interprets size in bits here; confirm against the bulk cursor
     * implementation before changing.
     */
    for (i = 0; i < bloom->m; i += values.size) {
        /* Adjust bits to bytes for string offset */
        values.data = bloom->bitstring + (i >> 3);
        /*
         * Shave off some bytes for pure paranoia, in case WiredTiger
         * reserves some special sizes.  Choose a value so that if
         * we do multiple inserts, it will be on an byte boundary.
         */
        values.size = (uint32_t)WT_MIN(bloom->m - i, UINT32_MAX - 127);
        c->set_value(c, &values);
        WT_ERR(c->insert(c));
    }

err:    WT_TRET(c->close(c));
    /* The in-memory bitstring is no longer needed once on disk. */
    __wt_free(bloom->session, bloom->bitstring);
    bloom->bitstring = NULL;

    return (ret);
}

/*
 * __wt_bloom_hash --
 *    Calculate the hash values for a given key.
 */
int
__wt_bloom_hash(WT_BLOOM *bloom, WT_ITEM *key, WT_BLOOM_HASH *bhash)
{
    WT_UNUSED(bloom);

    bhash->h1 = __wt_hash_fnv64(key->data, key->size);
    bhash->h2 = __wt_hash_city64(key->data, key->size);

    return (0);
}

/*
 * __wt_bloom_hash_get --
 *    Tests whether the key (as given by its hash signature) is in the Bloom
 *    filter.  Returns zero if found, WT_NOTFOUND if not.
 */
int
__wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash)
{
    WT_CURSOR *c;
    WT_DECL_RET;
    int result;
    uint32_t i;
    uint64_t h1, h2;
    uint8_t bit;

    /* Get operations are only supported by finalized bloom filters. */
    WT_ASSERT(bloom->session, bloom->bitstring == NULL);

    /* Create a cursor on the first time through. */
    WT_ERR(__bloom_open_cursor(bloom, NULL));
    c = bloom->c;

    h1 = bhash->h1;
    h2 = bhash->h2;

    result = 0;
    for (i = 0; i < bloom->k; i++, h1 += h2) {
        /*
         * Add 1 to the hash because WiredTiger tables are 1 based and
         * the original bitstring array was 0 based.
         */
        c->set_key(c, (h1 % bloom->m) + 1);
        WT_ERR(c->search(c));
        WT_ERR(c->get_value(c, &bit));

        /* Any clear bit means the key was definitely never inserted. */
        if (bit == 0) {
            result = WT_NOTFOUND;
            break;
        }
    }
    WT_ERR(c->reset(c));
    return (result);

err:    /* Don't return WT_NOTFOUND from a failed search. */
    if (ret == WT_NOTFOUND)
        ret = WT_ERROR;
    __wt_err(bloom->session, ret, "Failed lookup in bloom filter.");
    return (ret);
}

/*
 * __wt_bloom_get --
 *    Tests whether the given key is in the Bloom filter.
 *    Returns zero if found, WT_NOTFOUND if not.
 */
int
__wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key)
{
    WT_BLOOM_HASH bhash;

    WT_RET(__wt_bloom_hash(bloom, key, &bhash));
    return (__wt_bloom_hash_get(bloom, &bhash));
}

/*
 * __wt_bloom_close --
 *    Close the Bloom filter, release any resources.
 */
int
__wt_bloom_close(WT_BLOOM *bloom)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    session = bloom->session;

    if (bloom->c != NULL)
        ret = bloom->c->close(bloom->c);
    __wt_free(session, bloom->uri);
    __wt_free(session, bloom->config);
    __wt_free(session, bloom->bitstring);
    __wt_free(session, bloom);

    return (ret);
}

/*
 * __wt_bloom_drop --
 *    Drop a Bloom filter, release any resources.
 */
int
__wt_bloom_drop(WT_BLOOM *bloom, const char *config)
{
    WT_DECL_RET;
    WT_SESSION *wt_session;

    wt_session = (WT_SESSION *)bloom->session;
    /* Close the cursor first: drop fails while the table is in use. */
    if (bloom->c != NULL) {
        ret = bloom->c->close(bloom->c);
        bloom->c = NULL;
    }
    WT_TRET(wt_session->drop(wt_session, bloom->uri, config));
    WT_TRET(__wt_bloom_close(bloom));

    return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
new file mode 100644
index 00000000000..e81c951e9f6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -0,0 +1,215 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __compact_rewrite --
 *    Return if a page needs to be re-written.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
{
    WT_BM *bm;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    size_t addr_size;
    const uint8_t *addr;

    *skipp = 1;            /* Default skip. */

    bm = S2BT(session)->bm;
    page = ref->page;
    mod = page->modify;

    /*
     * Ignore the root: it may not have a replacement address, and besides,
     * if anything else gets written, so will it.
     */
    if (__wt_ref_is_root(ref))
        return (0);

    /* Ignore currently dirty pages, they will be written regardless. */
    if (__wt_page_is_modified(page))
        return (0);

    /*
     * If the page is clean, test the original addresses.
     * If the page is a 1-to-1 replacement, test the replacement addresses.
     * Ignore empty pages, they get merged into the parent.
     */
    if (mod == NULL || F_ISSET(mod, WT_PM_REC_MASK) == 0) {
        WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
        if (addr == NULL)
            return (0);
        WT_RET(
            bm->compact_page_skip(bm, session, addr, addr_size, skipp));
    } else if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_REPLACE) {
        /*
         * The page's modification information can change underfoot if
         * the page is being reconciled, lock the page down.
         */
        WT_PAGE_LOCK(session, page);
        ret = bm->compact_page_skip(bm, session,
            mod->mod_replace.addr, mod->mod_replace.size, skipp);
        WT_PAGE_UNLOCK(session, page);
        WT_RET(ret);
    }
    return (0);
}

/*
 * __wt_compact --
 *    Compact a file.
 */
int
__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_REF *ref;
    int block_manager_begin, skip;

    WT_UNUSED(cfg);

    conn = S2C(session);
    btree = S2BT(session);
    bm = btree->bm;
    ref = NULL;
    block_manager_begin = 0;

    WT_STAT_FAST_DATA_INCR(session, session_compact);

    /*
     * Check if compaction might be useful -- the API layer will quit trying
     * to compact the data source if we make no progress, set a flag if the
     * block layer thinks compaction is possible.
     */
    WT_RET(bm->compact_skip(bm, session, &skip));
    if (skip)
        return (0);

    /*
     * Reviewing in-memory pages requires looking at page reconciliation
     * results, because we care about where the page is stored now, not
     * where the page was stored when we first read it into the cache.
     * We need to ensure we don't race with page reconciliation as it's
     * writing the page modify information.
     *
     * There are three ways we call reconciliation: checkpoints, threads
     * writing leaf pages (usually in preparation for a checkpoint), and
     * eviction.
     *
     * We're holding the schema lock which serializes with checkpoints.
     */
    WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));

    /*
     * Get the tree handle's flush lock which blocks threads writing leaf
     * pages.
     */
    __wt_spin_lock(session, &btree->flush_lock);

    /*
     * That leaves eviction, we don't want to block eviction.  Set a flag
     * so reconciliation knows compaction is running.  If reconciliation
     * sees the flag it locks the page it's writing, we acquire the same
     * lock when reading the page's modify information, serializing access.
     * The same page lock blocks work on the page, but compaction is an
     * uncommon, heavy-weight operation.  If it's ever a problem, there's
     * no reason we couldn't use an entirely separate lock than the page
     * lock.
     *
     * We also need to ensure we don't race with an on-going reconciliation.
     * After we set the flag, wait for eviction of this file to drain, and
     * then let eviction continue.
     */
    conn->compact_in_memory_pass = 1;
    WT_ERR(__wt_evict_file_exclusive_on(session));
    __wt_evict_file_exclusive_off(session);

    /* Start compaction. */
    WT_ERR(bm->compact_start(bm, session));
    block_manager_begin = 1;

    /* Walk the tree reviewing pages to see if they should be re-written. */
    session->compaction = 1;
    for (;;) {
        /*
         * Pages read for compaction aren't "useful"; don't update the
         * read generation of pages already in memory, and if a page is
         * read, set its generation to a low value so it is evicted
         * quickly.
         */
        WT_ERR(__wt_tree_walk(session, &ref,
            WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
        if (ref == NULL)
            break;

        WT_ERR(__compact_rewrite(session, ref, &skip));
        if (skip)
            continue;

        /* Rewrite the page: mark the page and tree dirty. */
        WT_ERR(__wt_page_modify_init(session, ref->page));
        __wt_page_modify_set(session, ref->page);

        WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite);
    }

err:    if (ref != NULL)
        WT_TRET(__wt_page_release(session, ref, 0));

    if (block_manager_begin)
        WT_TRET(bm->compact_end(bm, session));

    /* Unblock threads writing leaf pages. */
    __wt_spin_unlock(session, &btree->flush_lock);

    /* Publish the cleared flag before returning. */
    conn->compact_in_memory_pass = 0;
    WT_FULL_BARRIER();

    return (ret);
}

/*
 * __wt_compact_page_skip --
 *    Return if compaction requires we read this page.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
{
    WT_BM *bm;
    size_t addr_size;
    u_int type;
    const uint8_t *addr;

    *skipp = 0;            /* Default to reading. */
    type = 0;            /* Keep compiler quiet. */

    bm = S2BT(session)->bm;

    /*
     * We aren't holding a hazard pointer, so we can't look at the page
     * itself, all we can look at is the WT_REF information.  If there's no
     * address, the page isn't on disk, but we have to read internal pages
     * to walk the tree regardless; throw up our hands and read it.
     */
    WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, &type));
    if (addr == NULL)
        return (0);

    /*
     * Internal pages must be read to walk the tree; ask the block-manager
     * if it's useful to rewrite leaf pages, don't do the I/O if a rewrite
     * won't help.
     */
    return (type == WT_CELL_ADDR_INT ? 0 :
        bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
new file mode 100644
index 00000000000..0cc79776634
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -0,0 +1,468 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __cursor_fix_append_next --
 *    Return the next entry on the append list.
 */
static inline int
__cursor_fix_append_next(WT_CURSOR_BTREE *cbt, int newpage)
{
    WT_ITEM *val;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    val = &cbt->iface.value;

    if (newpage) {
        if ((cbt->ins = WT_SKIP_FIRST(cbt->ins_head)) == NULL)
            return (WT_NOTFOUND);
    } else
        if (cbt->recno >= WT_INSERT_RECNO(cbt->ins) &&
            (cbt->ins = WT_SKIP_NEXT(cbt->ins)) == NULL)
            return (WT_NOTFOUND);

    /*
     * This code looks different from the cursor-previous code.  The append
     * list appears on the last page of the tree, but it may be preceded by
     * other rows, which means the cursor's recno will be set to a value and
     * we simply want to increment it.  If the cursor's recno is NOT set,
     * we're starting our iteration in a tree that has only appended items.
     * In that case, recno will be 0 and happily enough the increment will
     * set it to 1, which is correct.
     */
    __cursor_set_recno(cbt, cbt->recno + 1);

    /*
     * Fixed-width column store appends are inherently non-transactional.
     * Even a non-visible update by a concurrent or aborted transaction
     * changes the effective end of the data.  The effect is subtle because
     * of the blurring between deleted and empty values, but ideally we
     * would skip all uncommitted changes at the end of the data.  This
     * doesn't apply to variable-width column stores because the implicitly
     * created records written by reconciliation are deleted and so can be
     * never seen by a read.
     *
     * The problem is that we don't know at this point whether there may be
     * multiple uncommitted changes at the end of the data, and it would be
     * expensive to check every time we hit an aborted update.  If an
     * insert is aborted, we simply return zero (empty), regardless of
     * whether we are at the end of the data.
     */
    if (cbt->recno < WT_INSERT_RECNO(cbt->ins) ||
        (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
        cbt->v = 0;
        val->data = &cbt->v;
    } else
        val->data = WT_UPDATE_DATA(upd);
    /* Fixed-length stores always return single-byte values. */
    val->size = 1;
    return (0);
}

/*
 * __cursor_fix_next --
 *    Move to the next, fixed-length column-store item.
 */
static inline int
__cursor_fix_next(WT_CURSOR_BTREE *cbt, int newpage)
{
    WT_BTREE *btree;
    WT_ITEM *val;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    btree = S2BT(session);
    page = cbt->ref->page;
    val = &cbt->iface.value;

    /* Initialize for each new page. */
    if (newpage) {
        cbt->last_standard_recno = __col_fix_last_recno(page);
        if (cbt->last_standard_recno == 0)
            return (WT_NOTFOUND);
        __cursor_set_recno(cbt, page->pg_fix_recno);
        goto new_page;
    }

    /* Move to the next entry and return the item. */
    if (cbt->recno >= cbt->last_standard_recno)
        return (WT_NOTFOUND);
    __cursor_set_recno(cbt, cbt->recno + 1);

new_page:
    /* Check any insert list for a matching record. */
    cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
    cbt->ins = __col_insert_search(
        cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
    if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
        cbt->ins = NULL;
    upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
    if (upd == NULL) {
        /* No visible update: read the bit-field value from the page. */
        cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt);
        val->data = &cbt->v;
    } else
        val->data = WT_UPDATE_DATA(upd);
    val->size = 1;
    return (0);
}

/*
 * __cursor_var_append_next --
 *    Return the next variable-length entry on the append list.
 */
static inline int
__cursor_var_append_next(WT_CURSOR_BTREE *cbt, int newpage)
{
    WT_ITEM *val;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    val = &cbt->iface.value;

    if (newpage) {
        cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
        goto new_page;
    }

    for (;;) {
        cbt->ins = WT_SKIP_NEXT(cbt->ins);
new_page:    if (cbt->ins == NULL)
            return (WT_NOTFOUND);

        __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
        /* Skip invisible and deleted entries. */
        if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL ||
            WT_UPDATE_DELETED_ISSET(upd))
            continue;
        val->data = WT_UPDATE_DATA(upd);
        val->size = upd->size;
        break;
    }
    return (0);
}

/*
 * __cursor_var_next --
 *    Move to the next, variable-length column-store item.
 */
static inline int
__cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage)
{
    WT_CELL *cell;
    WT_CELL_UNPACK unpack;
    WT_COL *cip;
    WT_ITEM *val;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    page = cbt->ref->page;
    val = &cbt->iface.value;

    /* Initialize for each new page. */
    if (newpage) {
        cbt->last_standard_recno = __col_var_last_recno(page);
        if (cbt->last_standard_recno == 0)
            return (WT_NOTFOUND);
        __cursor_set_recno(cbt, page->pg_var_recno);
        goto new_page;
    }

    /* Move to the next entry and return the item. */
    for (;;) {
        if (cbt->recno >= cbt->last_standard_recno)
            return (WT_NOTFOUND);
        __cursor_set_recno(cbt, cbt->recno + 1);

new_page:    /* Find the matching WT_COL slot. */
        if ((cip = __col_var_search(page, cbt->recno)) == NULL)
            return (WT_NOTFOUND);
        cbt->slot = WT_COL_SLOT(page, cip);

        /* Check any insert list for a matching record. */
        cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
        cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
        upd = cbt->ins == NULL ?
+ NULL : __wt_txn_read(session, cbt->ins->upd); + if (upd != NULL) { + if (WT_UPDATE_DELETED_ISSET(upd)) + continue; + + val->data = WT_UPDATE_DATA(upd); + val->size = upd->size; + return (0); + } + + /* + * If we're at the same slot as the last reference and there's + * no matching insert list item, re-use the return information + * (so encoded items with large repeat counts aren't repeatedly + * decoded). Otherwise, unpack the cell and build the return + * information. + */ + if (cbt->cip_saved != cip) { + if ((cell = WT_COL_PTR(page, cip)) == NULL) + continue; + __wt_cell_unpack(cell, &unpack); + if (unpack.type == WT_CELL_DEL) + continue; + WT_RET(__wt_page_cell_data_ref( + session, page, &unpack, &cbt->tmp)); + + cbt->cip_saved = cip; + } + val->data = cbt->tmp.data; + val->size = cbt->tmp.size; + return (0); + } + /* NOTREACHED */ +} + +/* + * __cursor_row_next -- + * Move to the next row-store item. + */ +static inline int +__cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage) +{ + WT_INSERT *ins; + WT_ITEM *key, *val; + WT_PAGE *page; + WT_ROW *rip; + WT_SESSION_IMPL *session; + WT_UPDATE *upd; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + page = cbt->ref->page; + key = &cbt->iface.key; + val = &cbt->iface.value; + + /* + * For row-store pages, we need a single item that tells us the part + * of the page we're walking (otherwise switching from next to prev + * and vice-versa is just too complicated), so we map the WT_ROW and + * WT_INSERT_HEAD insert array slots into a single name space: slot 1 + * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is + * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are + * odd-numbered slots, and WT_ROW array slots are even-numbered slots. + * + * New page configuration. + */ + if (newpage) { + cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); + cbt->ins = WT_SKIP_FIRST(cbt->ins_head); + cbt->row_iteration_slot = 1; + goto new_insert; + } + + /* Move to the next entry and return the item. 
*/ + for (;;) { + /* + * Continue traversing any insert list; maintain the insert list + * head reference and entry count in case we switch to a cursor + * previous movement. + */ + if (cbt->ins != NULL) + cbt->ins = WT_SKIP_NEXT(cbt->ins); + +new_insert: if ((ins = cbt->ins) != NULL) { + if ((upd = __wt_txn_read(session, ins->upd)) == NULL || + WT_UPDATE_DELETED_ISSET(upd)) + continue; + key->data = WT_INSERT_KEY(ins); + key->size = WT_INSERT_KEY_SIZE(ins); + val->data = WT_UPDATE_DATA(upd); + val->size = upd->size; + return (0); + } + + /* Check for the end of the page. */ + if (cbt->row_iteration_slot >= page->pg_row_entries * 2 + 1) + return (WT_NOTFOUND); + ++cbt->row_iteration_slot; + + /* + * Odd-numbered slots configure as WT_INSERT_HEAD entries, + * even-numbered slots configure as WT_ROW entries. + */ + if (cbt->row_iteration_slot & 0x01) { + cbt->ins_head = WT_ROW_INSERT_SLOT( + page, cbt->row_iteration_slot / 2 - 1); + cbt->ins = WT_SKIP_FIRST(cbt->ins_head); + goto new_insert; + } + cbt->ins_head = NULL; + cbt->ins = NULL; + + cbt->slot = cbt->row_iteration_slot / 2 - 1; + rip = &page->pg_row_d[cbt->slot]; + upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); + if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) + continue; + + return (__cursor_row_slot_return(cbt, rip, upd)); + } + /* NOTREACHED */ +} + +/* + * __wt_btcur_iterate_setup -- + * Initialize a cursor for iteration, usually based on a search. + */ +void +__wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next) +{ + WT_PAGE *page; + + WT_UNUSED(next); + + /* + * We don't currently have to do any setup when we switch between next + * and prev calls, but I'm sure we will someday -- I'm leaving support + * here for both flags for that reason. + */ + F_SET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV); + + /* + * If we don't have a search page, then we're done, we're starting at + * the beginning or end of the tree, not as a result of a search. 
+ */ + if (cbt->ref == NULL) + return; + page = cbt->ref->page; + + if (page->type == WT_PAGE_ROW_LEAF) { + /* + * For row-store pages, we need a single item that tells us the + * part of the page we're walking (otherwise switching from next + * to prev and vice-versa is just too complicated), so we map + * the WT_ROW and WT_INSERT_HEAD insert array slots into a + * single name space: slot 1 is the "smallest key insert list", + * slot 2 is WT_ROW[0], slot 3 is WT_INSERT_HEAD[0], and so on. + * This means WT_INSERT lists are odd-numbered slots, and WT_ROW + * array slots are even-numbered slots. + */ + cbt->row_iteration_slot = (cbt->slot + 1) * 2; + if (cbt->ins_head != NULL) { + if (cbt->ins_head == WT_ROW_INSERT_SMALLEST(page)) + cbt->row_iteration_slot = 1; + else + cbt->row_iteration_slot += 1; + } + } else { + /* + * For column-store pages, calculate the largest record on the + * page. + */ + cbt->last_standard_recno = page->type == WT_PAGE_COL_VAR ? + __col_var_last_recno(page) : __col_fix_last_recno(page); + + /* If we're traversing the append list, set the reference. */ + if (cbt->ins_head != NULL && + cbt->ins_head == WT_COL_APPEND(page)) + F_SET(cbt, WT_CBT_ITERATE_APPEND); + } +} + +/* + * __wt_btcur_next -- + * Move to the next record in the tree. + */ +int +__wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating) +{ + WT_DECL_RET; + WT_PAGE *page; + WT_SESSION_IMPL *session; + uint32_t flags; + int newpage; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + + WT_STAT_FAST_CONN_INCR(session, cursor_next); + WT_STAT_FAST_DATA_INCR(session, cursor_next); + + flags = WT_READ_SKIP_INTL; /* Tree walk flags. */ + if (truncating) + LF_SET(WT_READ_TRUNCATE); + + WT_RET(__cursor_func_init(cbt, 0)); + + /* + * If we aren't already iterating in the right direction, there's + * some setup to do. + */ + if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT)) + __wt_btcur_iterate_setup(cbt, 1); + + /* + * Walk any page we're holding until the underlying call returns not- + * found. 
Then, move to the next page, until we reach the end of the + * file. + */ + page = cbt->ref == NULL ? NULL : cbt->ref->page; + for (newpage = 0;; newpage = 1) { + if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { + switch (page->type) { + case WT_PAGE_COL_FIX: + ret = __cursor_fix_append_next(cbt, newpage); + break; + case WT_PAGE_COL_VAR: + ret = __cursor_var_append_next(cbt, newpage); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + if (ret == 0) + break; + F_CLR(cbt, WT_CBT_ITERATE_APPEND); + if (ret != WT_NOTFOUND) + break; + } else if (page != NULL) { + switch (page->type) { + case WT_PAGE_COL_FIX: + ret = __cursor_fix_next(cbt, newpage); + break; + case WT_PAGE_COL_VAR: + ret = __cursor_var_next(cbt, newpage); + break; + case WT_PAGE_ROW_LEAF: + ret = __cursor_row_next(cbt, newpage); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + if (ret != WT_NOTFOUND) + break; + + /* + * The last page in a column-store has appended entries. + * We handle it separately from the usual cursor code: + * it's only that one page and it's in a simple format. + */ + if (page->type != WT_PAGE_ROW_LEAF && + (cbt->ins_head = WT_COL_APPEND(page)) != NULL) { + F_SET(cbt, WT_CBT_ITERATE_APPEND); + continue; + } + } + + WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); + WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); + + page = cbt->ref->page; + WT_ASSERT(session, !WT_PAGE_IS_INTERNAL(page)); + } + +err: if (ret != 0) + WT_TRET(__cursor_reset(cbt)); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c new file mode 100644 index 00000000000..8de784d1f1d --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -0,0 +1,560 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * Walking backwards through skip lists. 
+ * + * The skip list stack is an array of pointers set up by a search. It points + * to the position a node should go in the skip list. In other words, the skip + * list search stack always points *after* the search item (that is, into the + * search item's next array). + * + * Helper macros to go from a stack pointer at level i, pointing into a next + * array, back to the insert node containing that next array. + */ +#undef PREV_ITEM +#define PREV_ITEM(ins_head, insp, i) \ + (((insp) == &(ins_head)->head[i] || (insp) == NULL) ? NULL : \ + (WT_INSERT *)((char *)((insp) - (i)) - offsetof(WT_INSERT, next))) + +#undef PREV_INS +#define PREV_INS(cbt, i) \ + PREV_ITEM((cbt)->ins_head, (cbt)->ins_stack[(i)], (i)) + +/* + * __cursor_skip_prev -- + * Move back one position in a skip list stack (aka "finger"). + */ +static inline int +__cursor_skip_prev(WT_CURSOR_BTREE *cbt) +{ + WT_INSERT *current, *ins; + WT_ITEM key; + WT_SESSION_IMPL *session; + int i; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + +restart: + /* + * If the search stack does not point at the current item, fill it in + * with a search. + */ + while ((current = cbt->ins) != PREV_INS(cbt, 0)) { + if (cbt->btree->type == BTREE_ROW) { + key.data = WT_INSERT_KEY(current); + key.size = WT_INSERT_KEY_SIZE(current); + WT_RET(__wt_search_insert(session, cbt, &key)); + } else + cbt->ins = __col_insert_search(cbt->ins_head, + cbt->ins_stack, cbt->next_stack, + WT_INSERT_RECNO(current)); + } + + /* + * Find the first node up the search stack that does not move. + * + * The depth of the current item must be at least this level, since we + * see it in that many levels of the stack. + * + * !!! Watch these loops carefully: they all rely on the value of i, + * and the exit conditions to end up with the right values are + * non-trivial. 
+ */ + ins = NULL; /* -Wconditional-uninitialized */ + for (i = 0; i < WT_SKIP_MAXDEPTH - 1; i++) + if ((ins = PREV_INS(cbt, i + 1)) != current) + break; + + /* + * Find a starting point for the new search. That is either at the + * non-moving node if we found a valid node, or the beginning of the + * next list down that is not the current node. + * + * Since it is the beginning of a list, and we know the current node is + * has a skip depth at least this high, any node we find must sort + * before the current node. + */ + if (ins == NULL || ins == current) + for (; i >= 0; i--) { + cbt->ins_stack[i] = NULL; + cbt->next_stack[i] = NULL; + ins = cbt->ins_head->head[i]; + if (ins != NULL && ins != current) + break; + } + + /* Walk any remaining levels until just before the current node. */ + while (i >= 0) { + /* + * If we get to the end of a list without finding the current + * item, we must have raced with an insert. Restart the search. + */ + if (ins == NULL) { + cbt->ins_stack[0] = NULL; + cbt->next_stack[0] = NULL; + goto restart; + } + if (ins->next[i] != current) /* Stay at this level */ + ins = ins->next[i]; + else { /* Drop down a level */ + cbt->ins_stack[i] = &ins->next[i]; + cbt->next_stack[i] = ins->next[i]; + --i; + } + } + + /* If we found a previous node, the next one must be current. */ + if (cbt->ins_stack[0] != NULL && *cbt->ins_stack[0] != current) + goto restart; + + cbt->ins = PREV_INS(cbt, 0); + return (0); +} + +/* + * __cursor_fix_append_prev -- + * Return the previous fixed-length entry on the append list. 
+ */ +static inline int +__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, int newpage) +{ + WT_ITEM *val; + WT_PAGE *page; + WT_SESSION_IMPL *session; + WT_UPDATE *upd; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + page = cbt->ref->page; + val = &cbt->iface.value; + + if (newpage) { + if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL) + return (WT_NOTFOUND); + } else { + /* + * Handle the special case of leading implicit records, that is, + * there aren't any records in the tree not on the append list, + * and the first record on the append list isn't record 1. + * + * The "right" place to handle this is probably in our caller. + * The high-level cursor-previous routine would: + * -- call this routine to walk the append list + * -- call the routine to walk the standard page items + * -- call the tree walk routine looking for a previous page + * Each of them returns WT_NOTFOUND, at which point our caller + * checks the cursor record number, and if it's larger than 1, + * returns the implicit records. Instead, I'm trying to detect + * the case here, mostly because I don't want to put that code + * into our caller. Anyway, if this code breaks for any reason, + * that's the way I'd go. + * + * If we're not pointing to a WT_INSERT entry, or we can't find + * a WT_INSERT record that precedes our record name-space, check + * if there are any records on the page. If there aren't, then + * we're in the magic zone, keep going until we get to a record + * number of 1. + */ + if (cbt->ins != NULL && + cbt->recno <= WT_INSERT_RECNO(cbt->ins)) + WT_RET(__cursor_skip_prev(cbt)); + if (cbt->ins == NULL && + (cbt->recno == 1 || __col_fix_last_recno(page) != 0)) + return (WT_NOTFOUND); + } + + /* + * This code looks different from the cursor-next code. The append + * list appears on the last page of the tree and contains the last + * records in the tree. 
If we're iterating through the tree, starting + * at the last record in the tree, by definition we're starting a new + * iteration and we set the record number to the last record found in + * the tree. Otherwise, decrement the record. + */ + if (newpage) + __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); + else + __cursor_set_recno(cbt, cbt->recno - 1); + + /* + * Fixed-width column store appends are inherently non-transactional. + * Even a non-visible update by a concurrent or aborted transaction + * changes the effective end of the data. The effect is subtle because + * of the blurring between deleted and empty values, but ideally we + * would skip all uncommitted changes at the end of the data. This + * doesn't apply to variable-width column stores because the implicitly + * created records written by reconciliation are deleted and so can be + * never seen by a read. + */ + if (cbt->ins == NULL || + cbt->recno > WT_INSERT_RECNO(cbt->ins) || + (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) { + cbt->v = 0; + val->data = &cbt->v; + } else + val->data = WT_UPDATE_DATA(upd); + val->size = 1; + return (0); +} + +/* + * __cursor_fix_prev -- + * Move to the previous, fixed-length column-store item. + */ +static inline int +__cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage) +{ + WT_BTREE *btree; + WT_ITEM *val; + WT_PAGE *page; + WT_SESSION_IMPL *session; + WT_UPDATE *upd; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + page = cbt->ref->page; + btree = S2BT(session); + val = &cbt->iface.value; + + /* Initialize for each new page. */ + if (newpage) { + cbt->last_standard_recno = __col_fix_last_recno(page); + if (cbt->last_standard_recno == 0) + return (WT_NOTFOUND); + __cursor_set_recno(cbt, cbt->last_standard_recno); + goto new_page; + } + + /* Move to the previous entry and return the item. 
*/ + if (cbt->recno == page->pg_fix_recno) + return (WT_NOTFOUND); + __cursor_set_recno(cbt, cbt->recno - 1); + +new_page: + /* Check any insert list for a matching record. */ + cbt->ins_head = WT_COL_UPDATE_SINGLE(page); + cbt->ins = __col_insert_search( + cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno); + if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins)) + cbt->ins = NULL; + upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); + if (upd == NULL) { + cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt); + val->data = &cbt->v; + } else + val->data = WT_UPDATE_DATA(upd); + val->size = 1; + return (0); +} + +/* + * __cursor_var_append_prev -- + * Return the previous variable-length entry on the append list. + */ +static inline int +__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, int newpage) +{ + WT_ITEM *val; + WT_SESSION_IMPL *session; + WT_UPDATE *upd; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + val = &cbt->iface.value; + + if (newpage) { + cbt->ins = WT_SKIP_LAST(cbt->ins_head); + goto new_page; + } + + for (;;) { + WT_RET(__cursor_skip_prev(cbt)); +new_page: if (cbt->ins == NULL) + return (WT_NOTFOUND); + + __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); + if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL || + WT_UPDATE_DELETED_ISSET(upd)) + continue; + val->data = WT_UPDATE_DATA(upd); + val->size = upd->size; + break; + } + return (0); +} + +/* + * __cursor_var_prev -- + * Move to the previous, variable-length column-store item. + */ +static inline int +__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage) +{ + WT_CELL *cell; + WT_CELL_UNPACK unpack; + WT_COL *cip; + WT_ITEM *val; + WT_PAGE *page; + WT_SESSION_IMPL *session; + WT_UPDATE *upd; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + page = cbt->ref->page; + val = &cbt->iface.value; + + /* Initialize for each new page. 
*/ + if (newpage) { + cbt->last_standard_recno = __col_var_last_recno(page); + if (cbt->last_standard_recno == 0) + return (WT_NOTFOUND); + __cursor_set_recno(cbt, cbt->last_standard_recno); + goto new_page; + } + + /* Move to the previous entry and return the item. */ + for (;;) { + __cursor_set_recno(cbt, cbt->recno - 1); + +new_page: if (cbt->recno < page->pg_var_recno) + return (WT_NOTFOUND); + + /* Find the matching WT_COL slot. */ + if ((cip = __col_var_search(page, cbt->recno)) == NULL) + return (WT_NOTFOUND); + cbt->slot = WT_COL_SLOT(page, cip); + + /* Check any insert list for a matching record. */ + cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); + cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); + upd = cbt->ins == NULL ? + NULL : __wt_txn_read(session, cbt->ins->upd); + if (upd != NULL) { + if (WT_UPDATE_DELETED_ISSET(upd)) + continue; + + val->data = WT_UPDATE_DATA(upd); + val->size = upd->size; + return (0); + } + + /* + * If we're at the same slot as the last reference and there's + * no matching insert list item, re-use the return information + * (so encoded items with large repeat counts aren't repeatedly + * decoded). Otherwise, unpack the cell and build the return + * information. + */ + if (cbt->cip_saved != cip) { + if ((cell = WT_COL_PTR(page, cip)) == NULL) + continue; + __wt_cell_unpack(cell, &unpack); + if (unpack.type == WT_CELL_DEL) + continue; + WT_RET(__wt_page_cell_data_ref( + session, page, &unpack, &cbt->tmp)); + + cbt->cip_saved = cip; + } + val->data = cbt->tmp.data; + val->size = cbt->tmp.size; + return (0); + } + /* NOTREACHED */ +} + +/* + * __cursor_row_prev -- + * Move to the previous row-store item. 
+ */ +static inline int +__cursor_row_prev(WT_CURSOR_BTREE *cbt, int newpage) +{ + WT_INSERT *ins; + WT_ITEM *key, *val; + WT_PAGE *page; + WT_ROW *rip; + WT_SESSION_IMPL *session; + WT_UPDATE *upd; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + page = cbt->ref->page; + key = &cbt->iface.key; + val = &cbt->iface.value; + + /* + * For row-store pages, we need a single item that tells us the part + * of the page we're walking (otherwise switching from next to prev + * and vice-versa is just too complicated), so we map the WT_ROW and + * WT_INSERT_HEAD insert array slots into a single name space: slot 1 + * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is + * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are + * odd-numbered slots, and WT_ROW array slots are even-numbered slots. + * + * New page configuration. + */ + if (newpage) { + /* + * If we haven't instantiated keys on this page, do so, else it + * is a very, very slow traversal. + */ + if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) + WT_RET(__wt_row_leaf_keys(session, page)); + + if (page->pg_row_entries == 0) + cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); + else + cbt->ins_head = + WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + cbt->ins = WT_SKIP_LAST(cbt->ins_head); + cbt->row_iteration_slot = page->pg_row_entries * 2 + 1; + goto new_insert; + } + + /* Move to the previous entry and return the item. */ + for (;;) { + /* + * Continue traversing any insert list. Maintain the reference + * to the current insert element in case we switch to a cursor + * next movement. + */ + if (cbt->ins != NULL) + WT_RET(__cursor_skip_prev(cbt)); + +new_insert: if ((ins = cbt->ins) != NULL) { + if ((upd = __wt_txn_read(session, ins->upd)) == NULL || + WT_UPDATE_DELETED_ISSET(upd)) + continue; + key->data = WT_INSERT_KEY(ins); + key->size = WT_INSERT_KEY_SIZE(ins); + val->data = WT_UPDATE_DATA(upd); + val->size = upd->size; + return (0); + } + + /* Check for the beginning of the page. 
*/ + if (cbt->row_iteration_slot == 1) + return (WT_NOTFOUND); + --cbt->row_iteration_slot; + + /* + * Odd-numbered slots configure as WT_INSERT_HEAD entries, + * even-numbered slots configure as WT_ROW entries. + */ + if (cbt->row_iteration_slot & 0x01) { + cbt->ins_head = cbt->row_iteration_slot == 1 ? + WT_ROW_INSERT_SMALLEST(page) : + WT_ROW_INSERT_SLOT( + page, cbt->row_iteration_slot / 2 - 1); + cbt->ins = WT_SKIP_LAST(cbt->ins_head); + goto new_insert; + } + cbt->ins_head = NULL; + cbt->ins = NULL; + + cbt->slot = cbt->row_iteration_slot / 2 - 1; + rip = &page->pg_row_d[cbt->slot]; + upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); + if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) + continue; + + return (__cursor_row_slot_return(cbt, rip, upd)); + } + /* NOTREACHED */ +} + +/* + * __wt_btcur_prev -- + * Move to the previous record in the tree. + */ +int +__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating) +{ + WT_DECL_RET; + WT_PAGE *page; + WT_SESSION_IMPL *session; + uint32_t flags; + int newpage; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + + WT_STAT_FAST_CONN_INCR(session, cursor_prev); + WT_STAT_FAST_DATA_INCR(session, cursor_prev); + + flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* Tree walk flags. */ + if (truncating) + LF_SET(WT_READ_TRUNCATE); + + WT_RET(__cursor_func_init(cbt, 0)); + + /* + * If we aren't already iterating in the right direction, there's + * some setup to do. + */ + if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV)) + __wt_btcur_iterate_setup(cbt, 0); + + /* + * Walk any page we're holding until the underlying call returns not- + * found. Then, move to the previous page, until we reach the start + * of the file. + */ + page = cbt->ref == NULL ? 
NULL : cbt->ref->page; + for (newpage = 0;; newpage = 1) { + if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { + switch (page->type) { + case WT_PAGE_COL_FIX: + ret = __cursor_fix_append_prev(cbt, newpage); + break; + case WT_PAGE_COL_VAR: + ret = __cursor_var_append_prev(cbt, newpage); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + if (ret == 0) + break; + F_CLR(cbt, WT_CBT_ITERATE_APPEND); + if (ret != WT_NOTFOUND) + break; + newpage = 1; + } + if (page != NULL) { + switch (page->type) { + case WT_PAGE_COL_FIX: + ret = __cursor_fix_prev(cbt, newpage); + break; + case WT_PAGE_COL_VAR: + ret = __cursor_var_prev(cbt, newpage); + break; + case WT_PAGE_ROW_LEAF: + ret = __cursor_row_prev(cbt, newpage); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + if (ret != WT_NOTFOUND) + break; + } + + WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); + WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); + + page = cbt->ref->page; + WT_ASSERT(session, !WT_PAGE_IS_INTERNAL(page)); + + /* + * The last page in a column-store has appended entries. + * We handle it separately from the usual cursor code: + * it's only that one page and it's in a simple format. + */ + if (page->type != WT_PAGE_ROW_LEAF && + (cbt->ins_head = WT_COL_APPEND(page)) != NULL) + F_SET(cbt, WT_CBT_ITERATE_APPEND); + } + +err: if (ret != 0) + WT_TRET(__cursor_reset(cbt)); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c new file mode 100644 index 00000000000..5b2d9b055b5 --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -0,0 +1,1025 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __cursor_size_chk -- + * Return if an inserted item is too large. 
+ */ +static inline int +__cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_DECL_RET; + size_t size; + + btree = S2BT(session); + bm = btree->bm; + + if (btree->type == BTREE_COL_FIX) { + /* Fixed-size column-stores take a single byte. */ + if (kv->size != 1) + WT_RET_MSG(session, EINVAL, + "item size of %" WT_SIZET_FMT " does not match " + "fixed-length file requirement of 1 byte", + kv->size); + return (0); + } + + /* Don't waste effort, 1GB is always cool. */ + if (kv->size <= WT_GIGABYTE) + return (0); + + /* + * There are two checks: what we are willing to store in the tree, and + * what the block manager can actually write. + */ + if (kv->size > WT_BTREE_MAX_OBJECT_SIZE) + ret = EINVAL; + else { + size = kv->size; + ret = bm->write_size(bm, session, &size); + } + if (ret != 0) + WT_RET_MSG(session, ret, + "item size of %" WT_SIZET_FMT " exceeds the maximum " + "supported size", + kv->size); + return (0); +} + +/* + * __cursor_fix_implicit -- + * Return if search went past the end of the tree. + */ +static inline int +__cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) +{ + return (btree->type == BTREE_COL_FIX && + !F_ISSET(cbt, WT_CBT_MAX_RECORD) ? 1 : 0); +} + +/* + * __cursor_valid -- + * Return if the cursor references an valid key/value pair. + */ +static inline int +__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_COL *cip; + WT_PAGE *page; + WT_SESSION_IMPL *session; + WT_UPDATE *upd; + + btree = cbt->btree; + page = cbt->ref->page; + session = (WT_SESSION_IMPL *)cbt->iface.session; + if (updp != NULL) + *updp = NULL; + + /* + * We may be pointing to an insert object, and we may have a page with + * existing entries. Insert objects always have associated update + * objects (the value). Any update object may be deleted, or invisible + * to us. 
In the case of an on-page entry, there is by definition a
 * value that is visible to us, the original page cell.
 *
 * If we find a visible update structure, return our caller a reference
 * to it because we don't want to repeatedly search for the update, it
 * might suddenly become invisible (imagine a read-uncommitted session
 * with another session's aborted insert), and we don't want to handle
 * that potential error every time we look at the value.
 *
 * Unfortunately, the objects we might have and their relationships are
 * different for the underlying page types.
 *
 * In the case of row-store, an insert object implies ignoring any page
 * objects, no insert object can have the same key as an on-page object.
 * For row-store:
 *	if there's an insert object:
 *		if there's a visible update:
 *			exact match
 *		else
 *			no exact match
 *	else
 *		use the on-page object (which may have an associated
 *		update object that may or may not be visible to us).
 *
 * Column-store is more complicated because an insert object can have
 * the same key as an on-page object: updates to column-store rows
 * are insert/object pairs, and an invisible update isn't the end as
 * there may be an on-page object that is visible.  This changes the
 * logic to:
 *	if there's an insert object:
 *		if there's a visible update:
 *			exact match
 *		else if the on-page object's key matches the insert key
 *			use the on-page object
 *	else
 *		use the on-page object
 *
 * First, check for an insert object with a visible update (a visible
 * update that's been deleted is not a valid key/value pair).
 */
	if (cbt->ins != NULL &&
	    (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
		if (WT_UPDATE_DELETED_ISSET(upd))
			return (0);
		/* Callers that only want existence may pass updp == NULL. */
		if (updp != NULL)
			*updp = upd;
		return (1);
	}

	/*
	 * If we don't have an insert object, or in the case of column-store,
	 * there's an insert object but no update was visible to us and the key
	 * on the page is the same as the insert object's key, and the slot as
	 * set by the search function is valid, we can use the original page
	 * information.
	 */
	switch (btree->type) {
	case BTREE_COL_FIX:
		/*
		 * If search returned an insert object, there may or may not be
		 * a matching on-page object, we have to check.  Fixed-length
		 * column-store pages don't have slots, but map one-to-one to
		 * keys, check for retrieval past the end of the page.
		 */
		if (cbt->recno >= page->pg_fix_recno + page->pg_fix_entries)
			return (0);

		/*
		 * Updates aren't stored on the page, an update would have
		 * appeared as an "insert" object; no further checks to do.
		 */
		break;
	case BTREE_COL_VAR:
		/*
		 * If search returned an insert object, there may or may not be
		 * a matching on-page object, we have to check.  Variable-length
		 * column-store pages don't map one-to-one to keys, but have
		 * "slots", check if search returned a valid slot.
		 */
		if (cbt->slot >= page->pg_var_entries)
			return (0);

		/*
		 * Updates aren't stored on the page, an update would have
		 * appeared as an "insert" object; however, variable-length
		 * column store deletes are written into the backing store,
		 * check the cell for a record already deleted when read.
		 */
		cip = &page->pg_var_d[cbt->slot];
		if ((cell = WT_COL_PTR(page, cip)) == NULL ||
		    __wt_cell_type(cell) == WT_CELL_DEL)
			return (0);
		break;
	case BTREE_ROW:
		/*
		 * See above: for row-store, no insert object can have the same
		 * key as an on-page object, we're done.
		 */
		if (cbt->ins != NULL)
			return (0);

		/*
		 * Check if the search returned a valid slot (the failure mode
		 * is an empty page, the search function doesn't check, and so
		 * the more exact test is "page->pg_row_entries == 0", but this
		 * test mirrors the column-store test).
		 */
		if (cbt->slot >= page->pg_row_entries)
			return (0);

		/* Updates are stored on the page, check for a delete. */
		if (page->pg_row_upd != NULL && (upd = __wt_txn_read(
		    session, page->pg_row_upd[cbt->slot])) != NULL) {
			if (WT_UPDATE_DELETED_ISSET(upd))
				return (0);
			if (updp != NULL)
				*updp = upd;
		}
		break;
	}
	return (1);
}

/*
 * __cursor_col_search --
 *	Column-store search from an application cursor.
 */
static inline int
__cursor_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
	WT_DECL_RET;

	WT_WITH_PAGE_INDEX(session,
	    ret = __wt_col_search(session, cbt->iface.recno, NULL, cbt));
	return (ret);
}

/*
 * __cursor_row_search --
 *	Row-store search from an application cursor.
 */
static inline int
__cursor_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int insert)
{
	WT_DECL_RET;

	WT_WITH_PAGE_INDEX(session,
	    ret = __wt_row_search(session, &cbt->iface.key, NULL, cbt, insert));
	return (ret);
}

/*
 * __cursor_col_modify --
 *	Column-store delete, insert, and update from an application cursor.
 */
static inline int
__cursor_col_modify(
    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
{
	return (__wt_col_modify(session,
	    cbt, cbt->iface.recno, &cbt->iface.value, NULL, is_remove));
}

/*
 * __cursor_row_modify --
 *	Row-store insert, update and delete from an application cursor.
 */
static inline int
__cursor_row_modify(
    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
{
	return (__wt_row_modify(session,
	    cbt, &cbt->iface.key, &cbt->iface.value, NULL, is_remove));
}

/*
 * __wt_btcur_reset --
 *	Invalidate the cursor position.
+ */ +int +__wt_btcur_reset(WT_CURSOR_BTREE *cbt) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + + WT_STAT_FAST_CONN_INCR(session, cursor_reset); + WT_STAT_FAST_DATA_INCR(session, cursor_reset); + + return (__cursor_reset(cbt)); +} + +/* + * __wt_btcur_search -- + * Search for a matching record in the tree. + */ +int +__wt_btcur_search(WT_CURSOR_BTREE *cbt) +{ + WT_BTREE *btree; + WT_CURSOR *cursor; + WT_DECL_RET; + WT_SESSION_IMPL *session; + WT_UPDATE *upd; + + btree = cbt->btree; + cursor = &cbt->iface; + session = (WT_SESSION_IMPL *)cursor->session; + + WT_STAT_FAST_CONN_INCR(session, cursor_search); + WT_STAT_FAST_DATA_INCR(session, cursor_search); + + if (btree->type == BTREE_ROW) + WT_RET(__cursor_size_chk(session, &cursor->key)); + + WT_RET(__cursor_func_init(cbt, 1)); + + WT_ERR(btree->type == BTREE_ROW ? + __cursor_row_search(session, cbt, 0) : + __cursor_col_search(session, cbt)); + if (cbt->compare == 0 && __cursor_valid(cbt, &upd)) + ret = __wt_kv_return(session, cbt, upd); + else if (__cursor_fix_implicit(btree, cbt)) { + /* + * Creating a record past the end of the tree in a fixed-length + * column-store implicitly fills the gap with empty records. + */ + cbt->recno = cursor->recno; + cbt->v = 0; + cursor->value.data = &cbt->v; + cursor->value.size = 1; + } else + ret = WT_NOTFOUND; + +err: if (ret != 0) + WT_TRET(__cursor_reset(cbt)); + return (ret); +} + +/* + * __wt_btcur_search_near -- + * Search for a record in the tree. 
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	int exact;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	exact = 0;

	WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
	WT_STAT_FAST_DATA_INCR(session, cursor_search_near);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	WT_RET(__cursor_func_init(cbt, 1));

	/*
	 * Set the "insert" flag for the btree row-store search; we may intend
	 * to position our cursor at the end of the tree, rather than match an
	 * existing record.
	 */
	WT_ERR(btree->type == BTREE_ROW ?
	    __cursor_row_search(session, cbt, 1) :
	    __cursor_col_search(session, cbt));

	/*
	 * If we find a valid key, return it.
	 *
	 * Else, creating a record past the end of the tree in a fixed-length
	 * column-store implicitly fills the gap with empty records.  In this
	 * case, we instantiate the empty record, it's an exact match.
	 *
	 * Else, move to the next key in the tree (bias for prefix searches).
	 * Cursor next skips invalid rows, so we don't have to test for them
	 * again.
	 *
	 * Else, redo the search and move to the previous key in the tree.
	 * Cursor previous skips invalid rows, so we don't have to test for
	 * them again.
	 *
	 * If that fails, quit, there's no record to return.
	 */
	if (__cursor_valid(cbt, &upd)) {
		exact = cbt->compare;
		ret = __wt_kv_return(session, cbt, upd);
	} else if (__cursor_fix_implicit(btree, cbt)) {
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
		exact = 0;
	} else if ((ret = __wt_btcur_next(cbt, 0)) != WT_NOTFOUND)
		exact = 1;
	else {
		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, 1) :
		    __cursor_col_search(session, cbt));
		if (__cursor_valid(cbt, &upd)) {
			exact = cbt->compare;
			ret = __wt_kv_return(session, cbt, upd);
		} else if ((ret = __wt_btcur_prev(cbt, 0)) != WT_NOTFOUND)
			exact = -1;
	}

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	/* Only report an exact-match direction on success or not-found. */
	if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
		*exactp = exact;
	return (ret);
}

/*
 * __wt_btcur_insert --
 *	Insert a record into the tree.
 */
int
__wt_btcur_insert(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_FAST_CONN_INCR(session, cursor_insert);
	WT_STAT_FAST_DATA_INCR(session, cursor_insert);
	WT_STAT_FAST_DATA_INCRV(session,
	    cursor_insert_bytes, cursor->key.size + cursor->value.size);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

	/*
	 * The tree is no longer empty: eviction should pay attention to it,
	 * and it's no longer possible to bulk-load into it.
	 */
	if (btree->bulk_load_ok) {
		btree->bulk_load_ok = 0;
		__wt_btree_evictable(session, 1);
	}

retry:	WT_RET(__cursor_func_init(cbt, 1));

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		/*
		 * If WT_CURSTD_APPEND is set, insert a new record (ignoring
		 * the application's record number).  First we search for the
		 * maximum possible record number so the search ends on the
		 * last page.  The real record number is assigned by the
		 * serialized append operation.
		 */
		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cbt->iface.recno = UINT64_MAX;

		WT_ERR(__cursor_col_search(session, cbt));

		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cbt->iface.recno = 0;

		/*
		 * If not overwriting, fail if the key exists.
Creating a
		 * record past the end of the tree in a fixed-length
		 * column-store implicitly fills the gap with empty records.
		 * Fail in that case, the record exists.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) ||
		    (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
			WT_ERR(WT_DUPLICATE_KEY);

		WT_ERR(__cursor_col_modify(session, cbt, 0));
		/* Report the record number chosen by the serialized append. */
		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cbt->iface.recno = cbt->recno;
		break;
	case BTREE_ROW:
		WT_ERR(__cursor_row_search(session, cbt, 1));
		/*
		 * If not overwriting, fail if the key exists, else insert the
		 * key/value pair.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    cbt->compare == 0 && __cursor_valid(cbt, NULL))
			WT_ERR(WT_DUPLICATE_KEY);

		ret = __cursor_row_modify(session, cbt, 0);
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	if (ret == WT_RESTART)
		goto retry;
	/* Insert doesn't maintain a position across calls, clear resources. */
	if (ret == 0)
		WT_TRET(__curfile_leave(cbt));
	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}

/*
 * __wt_btcur_update_check --
 *	Check whether an update would conflict.
 *
 *	This can be used to replace WT_CURSOR::insert or WT_CURSOR::update, so
 *	they only check for conflicts without updating the tree.  It is used to
 *	maintain snapshot isolation for transactions that span multiple chunks
 *	in an LSM tree.
 */
int
__wt_btcur_update_check(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cursor = &cbt->iface;
	btree = cbt->btree;
	session = (WT_SESSION_IMPL *)cursor->session;

retry:	WT_RET(__cursor_func_init(cbt, 1));

	switch (btree->type) {
	case BTREE_ROW:
		WT_ERR(__cursor_row_search(session, cbt, 1));

		/*
		 * We are only interested in checking for conflicts.
		 */
		if (cbt->compare == 0 && cbt->ins != NULL)
			ret = __wt_txn_update_check(session, cbt->ins->upd);
		break;
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	if (ret == WT_RESTART)
		goto retry;
	WT_TRET(__curfile_leave(cbt));
	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}

/*
 * __wt_btcur_remove --
 *	Remove a record from the tree.
 */
int
__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_FAST_CONN_INCR(session, cursor_remove);
	WT_STAT_FAST_DATA_INCR(session, cursor_remove);
	WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

retry:	WT_RET(__cursor_func_init(cbt, 1));

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__cursor_col_search(session, cbt));

		/* Remove the record if it exists. */
		if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) {
			if (!__cursor_fix_implicit(btree, cbt))
				WT_ERR(WT_NOTFOUND);
			/*
			 * Creating a record past the end of the tree in a
			 * fixed-length column-store implicitly fills the
			 * gap with empty records.  Return success in that
			 * case, the record was deleted successfully.
			 *
			 * Correct the btree cursor's location: the search
			 * will have pointed us at the previous/next item,
			 * and that's not correct.
			 */
			cbt->recno = cursor->recno;
		} else
			ret = __cursor_col_modify(session, cbt, 1);
		break;
	case BTREE_ROW:
		/* Remove the record if it exists.
 */
		WT_ERR(__cursor_row_search(session, cbt, 0));
		if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
			WT_ERR(WT_NOTFOUND);

		ret = __cursor_row_modify(session, cbt, 1);
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	if (ret == WT_RESTART)
		goto retry;
	/*
	 * If the cursor is configured to overwrite and the record is not
	 * found, that is exactly what we want.
	 */
	if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND)
		ret = 0;

	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));

	return (ret);
}

/*
 * __wt_btcur_update --
 *	Update a record in the tree.
 */
int
__wt_btcur_update(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_FAST_CONN_INCR(session, cursor_update);
	WT_STAT_FAST_DATA_INCR(session, cursor_update);
	WT_STAT_FAST_DATA_INCRV(
	    session, cursor_update_bytes, cursor->value.size);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

	/*
	 * The tree is no longer empty: eviction should pay attention to it,
	 * and it's no longer possible to bulk-load into it.
	 */
	if (btree->bulk_load_ok) {
		btree->bulk_load_ok = 0;
		__wt_btree_evictable(session, 1);
	}

retry:	WT_RET(__cursor_func_init(cbt, 1));

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__cursor_col_search(session, cbt));

		/*
		 * If not overwriting, fail if the key doesn't exist.  Update
		 * the record if it exists.  Creating a record past the end of
		 * the tree in a fixed-length column-store implicitly fills the
		 * gap with empty records.  Update the record in that case, the
		 * record exists.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) &&
		    !__cursor_fix_implicit(btree, cbt))
			WT_ERR(WT_NOTFOUND);
		ret = __cursor_col_modify(session, cbt, 0);
		break;
	case BTREE_ROW:
		WT_ERR(__cursor_row_search(session, cbt, 1));
		/*
		 * If not overwriting, fail if the key does not exist.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    (cbt->compare != 0 || !__cursor_valid(cbt, NULL)))
			WT_ERR(WT_NOTFOUND);
		ret = __cursor_row_modify(session, cbt, 0);
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	if (ret == WT_RESTART)
		goto retry;

	/*
	 * If successful, point the cursor at internal copies of the data.  We
	 * could shuffle memory in the cursor so the key/value pair are in local
	 * buffer memory, but that's a data copy.  We don't want to do another
	 * search (and we might get a different update structure if we race).
	 * To make this work, we add a field to the btree cursor to pass back a
	 * pointer to the modify function's allocated update structure.
	 */
	if (ret == 0)
		WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update));

	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}

/*
 * __wt_btcur_next_random --
 *	Move to a random record in the tree.
 */
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = cbt->btree;

	/*
	 * Only supports row-store: applications can trivially select a random
	 * value from a column-store, if there were any reason to do so.
+ */ + if (btree->type != BTREE_ROW) + WT_RET(ENOTSUP); + + WT_STAT_FAST_CONN_INCR(session, cursor_next); + WT_STAT_FAST_DATA_INCR(session, cursor_next); + + WT_RET(__cursor_func_init(cbt, 1)); + + WT_ERR(__wt_row_random(session, cbt)); + if (__cursor_valid(cbt, &upd)) + WT_ERR(__wt_kv_return(session, cbt, upd)); + else + WT_ERR(__wt_btcur_search_near(cbt, 0)); + +err: if (ret != 0) + WT_TRET(__cursor_reset(cbt)); + return (ret); +} + +/* + * __wt_btcur_compare -- + * Return a comparison between two cursors. + */ +int +__wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp) +{ + WT_BTREE *btree; + WT_CURSOR *a, *b; + WT_SESSION_IMPL *session; + + a = (WT_CURSOR *)a_arg; + b = (WT_CURSOR *)b_arg; + btree = a_arg->btree; + session = (WT_SESSION_IMPL *)a->session; + + switch (btree->type) { + case BTREE_COL_FIX: + case BTREE_COL_VAR: + /* + * Compare the interface's cursor record, not the underlying + * cursor reference: the interface's cursor reference is the + * one being returned to the application. + */ + if (a->recno < b->recno) + *cmpp = -1; + else if (a->recno == b->recno) + *cmpp = 0; + else + *cmpp = 1; + break; + case BTREE_ROW: + WT_RET(__wt_compare( + session, btree->collator, &a->key, &b->key, cmpp)); + break; + WT_ILLEGAL_VALUE(session); + } + return (0); +} + +/* + * __cursor_equals -- + * Return if two cursors reference the same row. + */ +static int +__cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b) +{ + switch (a->btree->type) { + case BTREE_COL_FIX: + case BTREE_COL_VAR: + /* + * Compare the interface's cursor record, not the underlying + * cursor reference: the interface's cursor reference is the + * one being returned to the application. 
+ */ + if (((WT_CURSOR *)a)->recno == ((WT_CURSOR *)b)->recno) + return (1); + break; + case BTREE_ROW: + if (a->ref != b->ref) + return (0); + if (a->ins != NULL || b->ins != NULL) { + if (a->ins == b->ins) + return (1); + break; + } + if (a->slot == b->slot) + return (1); + break; + } + return (0); +} + +/* + * __cursor_truncate -- + * Discard a cursor range from row-store or variable-width column-store + * tree. + */ +static int +__cursor_truncate(WT_SESSION_IMPL *session, + WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop, + int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, int)) +{ + WT_DECL_RET; + + /* + * First, call the standard cursor remove method to do a full search and + * re-position the cursor because we don't have a saved copy of the + * page's write generation information, which we need to remove records. + * Once that's done, we can delete records without a full search, unless + * we encounter a restart error because the page was modified by some + * other thread of control; in that case, repeat the full search to + * refresh the page's modification information. + * + * If this is a row-store, we delete leaf pages having no overflow items + * without reading them; for that to work, we have to ensure we read the + * page referenced by the ending cursor, since we may be deleting only a + * partial page at the end of the truncation. Our caller already fully + * instantiated the end cursor, so we know that page is pinned in memory + * and we can proceed without concern. 
 */
	if (start == NULL) {
		do {
			WT_RET(__wt_btcur_remove(stop));
			for (;;) {
				if ((ret = __wt_btcur_prev(stop, 1)) != 0)
					break;
				stop->compare = 0;	/* Exact match */
				if ((ret = rmfunc(session, stop, 1)) != 0)
					break;
			}
		} while (ret == WT_RESTART);
	} else {
		do {
			WT_RET(__wt_btcur_remove(start));
			for (;;) {
				if (stop != NULL &&
				    __cursor_equals(start, stop))
					break;
				if ((ret = __wt_btcur_next(start, 1)) != 0)
					break;
				start->compare = 0;	/* Exact match */
				if ((ret = rmfunc(session, start, 1)) != 0)
					break;
			}
		} while (ret == WT_RESTART);
	}

	WT_RET_NOTFOUND_OK(ret);
	return (0);
}

/*
 * __cursor_truncate_fix --
 *	Discard a cursor range from fixed-width column-store tree.
 */
static int
__cursor_truncate_fix(WT_SESSION_IMPL *session,
    WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
    int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, int))
{
	WT_DECL_RET;
	uint8_t *value;

	/*
	 * Handle fixed-length column-store objects separately: for row-store
	 * and variable-length column-store objects we have "deleted" values
	 * and so returned objects actually exist: fixed-length column-store
	 * objects are filled-in if they don't exist, that is, if you create
	 * record 37, records 1-36 magically appear.  Those records can't be
	 * deleted, which means we have to ignore already "deleted" records.
	 *
	 * First, call the standard cursor remove method to do a full search and
	 * re-position the cursor because we don't have a saved copy of the
	 * page's write generation information, which we need to remove records.
	 * Once that's done, we can delete records without a full search, unless
	 * we encounter a restart error because the page was modified by some
	 * other thread of control; in that case, repeat the full search to
	 * refresh the page's modification information.
	 */
	if (start == NULL) {
		do {
			WT_RET(__wt_btcur_remove(stop));
			for (;;) {
				if ((ret = __wt_btcur_prev(stop, 1)) != 0)
					break;
				stop->compare = 0;	/* Exact match */
				/* Skip records already "deleted" (zero). */
				value = (uint8_t *)stop->iface.value.data;
				if (*value != 0 &&
				    (ret = rmfunc(session, stop, 1)) != 0)
					break;
			}
		} while (ret == WT_RESTART);
	} else {
		do {
			WT_RET(__wt_btcur_remove(start));
			for (;;) {
				if (stop != NULL &&
				    __cursor_equals(start, stop))
					break;
				if ((ret = __wt_btcur_next(start, 1)) != 0)
					break;
				start->compare = 0;	/* Exact match */
				/* Skip records already "deleted" (zero). */
				value = (uint8_t *)start->iface.value.data;
				if (*value != 0 &&
				    (ret = rmfunc(session, start, 1)) != 0)
					break;
			}
		} while (ret == WT_RESTART);
	}

	WT_RET_NOTFOUND_OK(ret);
	return (0);
}

/*
 * __wt_btcur_range_truncate --
 *	Discard a cursor range from the tree.
 */
int
__wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
{
	WT_BTREE *btree;
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (start != NULL) ? start : stop;
	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = cbt->btree;

	/*
	 * For recovery, we log the start and stop keys for a truncate
	 * operation, not the individual records removed.  On the other hand,
	 * for rollback we need to keep track of all the in-memory operations.
	 *
	 * We deal with this here by logging the truncate range first, then (in
	 * the logging code) disabling writing of the in-memory remove records
	 * to disk.
+ */ + if (S2C(session)->logging) + WT_RET(__wt_txn_truncate_log(session, start, stop)); + + switch (btree->type) { + case BTREE_COL_FIX: + WT_ERR(__cursor_truncate_fix( + session, start, stop, __cursor_col_modify)); + break; + case BTREE_COL_VAR: + WT_ERR(__cursor_truncate( + session, start, stop, __cursor_col_modify)); + break; + case BTREE_ROW: + /* + * The underlying cursor comparison routine requires cursors be + * fully instantiated when truncating row-store objects because + * it's comparing page and/or skiplist positions, not keys. (Key + * comparison would work, it's only that a key comparison would + * be relatively expensive. Column-store objects have record + * number keys, so the key comparison is cheap.) Cursors may + * have only had their keys set, so we must ensure the cursors + * are positioned in the tree. + */ + if (start != NULL) + WT_ERR(__wt_btcur_search(start)); + if (stop != NULL) + WT_ERR(__wt_btcur_search(stop)); + WT_ERR(__cursor_truncate( + session, start, stop, __cursor_row_modify)); + break; + } + +err: if (S2C(session)->logging) + WT_TRET(__wt_txn_truncate_end(session)); + return (ret); +} + +/* + * __wt_btcur_close -- + * Close a btree cursor. + */ +int +__wt_btcur_close(WT_CURSOR_BTREE *cbt) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + + ret = __curfile_leave(cbt); + __wt_buf_free(session, &cbt->search_key); + __wt_buf_free(session, &cbt->tmp); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c new file mode 100644 index 00000000000..ebbb335d3a8 --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -0,0 +1,1104 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
 */

#include "wt_internal.h"

#ifdef HAVE_DIAGNOSTIC
/*
 * We pass around a session handle and output information, group it together.
 */
typedef struct {
	WT_SESSION_IMPL *session;		/* Enclosing session */

	/*
	 * When using the standard event handlers, the debugging output has to
	 * do its own message handling because its output isn't line-oriented.
	 */
	FILE	*fp;				/* Output file stream */
	WT_ITEM	*msg;				/* Buffered message */

	WT_ITEM	*tmp;				/* Temporary space */
} WT_DBG;

static const			/* Output separator */
    char * const sep = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";

static int  __debug_cell(WT_DBG *, const WT_PAGE_HEADER *, WT_CELL_UNPACK *);
static int  __debug_cell_data(
		WT_DBG *, WT_PAGE *, int type, const char *, WT_CELL_UNPACK *);
static void __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, int);
static int  __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *);
static int  __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *);
static void __debug_dsk_col_fix(WT_DBG *, const WT_PAGE_HEADER *);
static void __debug_item(WT_DBG *, const char *, const void *, size_t);
static int  __debug_page(WT_DBG *, WT_PAGE *, uint32_t);
static void __debug_page_col_fix(WT_DBG *, WT_PAGE *);
static int  __debug_page_col_int(WT_DBG *, WT_PAGE *, uint32_t);
static int  __debug_page_col_var(WT_DBG *, WT_PAGE *);
static int  __debug_page_metadata(WT_DBG *, WT_PAGE *);
static int  __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t);
static int  __debug_page_row_leaf(WT_DBG *, WT_PAGE *);
static int  __debug_ref(WT_DBG *, WT_REF *);
static void __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *);
static int  __debug_tree(WT_SESSION_IMPL *, WT_PAGE *, const char *, uint32_t);
static void __debug_update(WT_DBG *, WT_UPDATE *, int);
static void __dmsg(WT_DBG *, const char *, ...)
    WT_GCC_ATTRIBUTE((format (printf, 2, 3)));
static void __dmsg_wrapup(WT_DBG *);

/*
 * __wt_debug_set_verbose --
 *	Set verbose flags from the debugger.
 */
int
__wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v)
{
	const char *cfg[2] = { NULL, NULL };
	char buf[256];

	snprintf(buf, sizeof(buf), "verbose=[%s]", v);
	cfg[0] = buf;
	return (__wt_verbose_config(session, cfg));
}

/*
 * __debug_hex_byte --
 *	Output a single byte in hex.
 */
static inline void
__debug_hex_byte(WT_DBG *ds, uint8_t v)
{
	static const char hex[] = "0123456789abcdef";

	__dmsg(ds, "#%c%c", hex[(v & 0xf0) >> 4], hex[v & 0x0f]);
}

/*
 * __debug_config --
 *	Configure debugging output.
 */
static int
__debug_config(WT_SESSION_IMPL *session, WT_DBG *ds, const char *ofile)
{
	memset(ds, 0, sizeof(WT_DBG));

	ds->session = session;

	WT_RET(__wt_scr_alloc(session, 512, &ds->tmp));

	/*
	 * If we weren't given a file, we use the default event handler, and
	 * we'll have to buffer messages.
	 */
	if (ofile == NULL)
		return (__wt_scr_alloc(session, 512, &ds->msg));

	/* If we're using a file, flush on each line. */
	if ((ds->fp = fopen(ofile, "w")) == NULL)
		WT_RET_MSG(session, __wt_errno(), "%s", ofile);

	(void)setvbuf(ds->fp, NULL, _IOLBF, 0);
	return (0);
}

/*
 * __dmsg_wrapup --
 *	Flush any remaining output, release resources.
 */
static void
__dmsg_wrapup(WT_DBG *ds)
{
	WT_SESSION_IMPL *session;
	WT_ITEM *msg;

	session = ds->session;
	msg = ds->msg;

	__wt_scr_free(&ds->tmp);

	/*
	 * Discard the buffer -- it shouldn't have anything in it, but might
	 * as well be cautious.
	 */
	if (msg != NULL) {
		if (msg->size != 0)
			(void)__wt_msg(session, "%s", (char *)msg->mem);
		__wt_scr_free(&ds->msg);
	}

	/* Close any file we opened. */
	if (ds->fp != NULL)
		(void)fclose(ds->fp);
}

/*
 * __dmsg --
 *	Debug message.
 */
static void
__dmsg(WT_DBG *ds, const char *fmt, ...)
{
	va_list ap;
	WT_ITEM *msg;
	WT_SESSION_IMPL *session;
	size_t len, space;
	char *p;

	session = ds->session;

	/*
	 * Debug output chunks are not necessarily terminated with a newline
	 * character.  It's easy if we're dumping to a stream, but if we're
	 * dumping to an event handler, which is line-oriented, we must buffer
	 * the output chunk, and pass it to the event handler once we see a
	 * terminating newline.
	 */
	if (ds->fp == NULL) {
		msg = ds->msg;
		for (;;) {
			p = (char *)msg->mem + msg->size;
			space = msg->memsize - msg->size;
			va_start(ap, fmt);
			len = (size_t)vsnprintf(p, space, fmt, ap);
			va_end(ap);

			/* Check if there was enough space. */
			if (len < space) {
				msg->size += len;
				break;
			}

			/*
			 * There's not much to do on error without checking for
			 * an error return on every single printf.  Anyway, it's
			 * pretty unlikely and this is debugging output, I'm not
			 * going to worry about it.
			 */
			if (__wt_buf_grow(
			    session, msg, msg->memsize + len + 128) != 0)
				return;
		}
		/*
		 * NOTE(review): if msg->size is 0 here (a zero-length format
		 * result on an empty buffer), msg->size - 1 underflows and
		 * indexes far out of bounds -- callers appear to always emit
		 * non-empty output, but worth confirming.
		 */
		if (((uint8_t *)msg->mem)[msg->size - 1] == '\n') {
			((uint8_t *)msg->mem)[msg->size - 1] = '\0';
			(void)__wt_msg(session, "%s", (char *)msg->mem);
			msg->size = 0;
		}
	} else {
		va_start(ap, fmt);
		(void)vfprintf(ds->fp, fmt, ap);
		va_end(ap);
	}
}

/*
 * __wt_debug_addr_print --
 *	Print out an address.
 */
int
__wt_debug_addr_print(
    WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
	WT_DECL_ITEM(buf);

	WT_RET(__wt_scr_alloc(session, 128, &buf));
	fprintf(stderr, "%s\n",
	    __wt_addr_string(session, addr, addr_size, buf));
	__wt_scr_free(&buf);

	return (0);
}

/*
 * __wt_debug_addr --
 *	Read and dump a disk page in debugging mode, using an addr/size pair.
 */
int
__wt_debug_addr(WT_SESSION_IMPL *session,
    const uint8_t *addr, size_t addr_size, const char *ofile)
{
	WT_BM *bm;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;

	bm = S2BT(session)->bm;

	WT_RET(__wt_scr_alloc(session, 1024, &buf));
	WT_ERR(bm->read(bm, session, buf, addr, addr_size));
	ret = __wt_debug_disk(session, buf->mem, ofile);

err:	__wt_scr_free(&buf);
	return (ret);
}

/*
 * __wt_debug_offset_blind --
 *	Read and dump a disk page in debugging mode, using a file offset.
 */
int
__wt_debug_offset_blind(
    WT_SESSION_IMPL *session, wt_off_t offset, const char *ofile)
{
	WT_DECL_ITEM(buf);
	WT_DECL_RET;

	/*
	 * This routine depends on the default block manager's view of files,
	 * where an address consists of a file offset, length, and checksum.
	 * This is for debugging only.  Other block managers might not see a
	 * file or address the same way, that's why there's no block manager
	 * method.
	 */
	WT_RET(__wt_scr_alloc(session, 1024, &buf));
	WT_ERR(__wt_block_read_off_blind(
	    session, S2BT(session)->bm->block, buf, offset));
	ret = __wt_debug_disk(session, buf->mem, ofile);

err:	__wt_scr_free(&buf);
	return (ret);
}

/*
 * __wt_debug_offset --
 *	Read and dump a disk page in debugging mode, using a file
 * offset/size/checksum triplet.
 */
int
__wt_debug_offset(WT_SESSION_IMPL *session,
    wt_off_t offset, uint32_t size, uint32_t cksum, const char *ofile)
{
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE], *endp;

	/*
	 * This routine depends on the default block manager's view of files,
	 * where an address consists of a file offset, length, and checksum.
	 * This is for debugging only: other block managers might not see a
	 * file or address the same way, that's why there's no block manager
	 * method.
	 *
	 * Convert the triplet into an address structure.
	 */
	endp = addr;
	WT_RET(__wt_block_addr_to_buffer(
	    S2BT(session)->bm->block, &endp, offset, size, cksum));

	/*
	 * Read the address through the btree I/O functions (so the block is
	 * decompressed as necessary).
	 */
	WT_RET(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_bt_read(session, buf, addr, WT_PTRDIFF(endp, addr)));
	ret = __wt_debug_disk(session, buf->mem, ofile);

err:	__wt_scr_free(&buf);
	return (ret);
}

/*
 * __wt_debug_disk --
 *	Dump a disk page in debugging mode.
 */
int
__wt_debug_disk(
    WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile)
{
	WT_DBG *ds, _ds;
	WT_DECL_RET;

	ds = &_ds;
	WT_RET(__debug_config(session, ds, ofile));

	__dmsg(ds, "%s page", __wt_page_type_string(dsk->type));
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		__dmsg(ds, ", recno %" PRIu64, dsk->recno);
		/* FALLTHROUGH */
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		__dmsg(ds, ", entries %" PRIu32 "\n", dsk->u.entries);
		break;
	case WT_PAGE_OVFL:
		__dmsg(ds, ", datalen %" PRIu32 "\n", dsk->u.datalen);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
		__debug_dsk_col_fix(ds, dsk);
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		ret = __debug_dsk_cell(ds, dsk);
		break;
	default:
		break;
	}

	__dmsg_wrapup(ds);

	return (ret);
}

/*
 * __debug_dsk_col_fix --
 *	Dump a WT_PAGE_COL_FIX page.
 */
static void
__debug_dsk_col_fix(WT_DBG *ds, const WT_PAGE_HEADER *dsk)
{
	WT_BTREE *btree;
	uint32_t i;
	uint8_t v;

	btree = S2BT(ds->session);

	WT_FIX_FOREACH(btree, dsk, v, i) {
		__dmsg(ds, "\t{");
		__debug_hex_byte(ds, v);
		__dmsg(ds, "}\n");
	}
}

/*
 * __debug_dsk_cell --
 *	Dump a page of WT_CELL's.
+ */ +static int +__debug_dsk_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + uint32_t i; + + btree = S2BT(ds->session); + unpack = &_unpack; + + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + __wt_cell_unpack(cell, unpack); + WT_RET(__debug_cell(ds, dsk, unpack)); + } + return (0); +} + +/* + * __debug_shape_info -- + * Pretty-print information about a page. + */ +static char * +__debug_tree_shape_info(WT_PAGE *page) +{ + uint64_t v; + static char buf[32]; + + v = page->memory_footprint; + if (v >= WT_GIGABYTE) + snprintf(buf, sizeof(buf), "(%" PRIu64 "G)", v / WT_GIGABYTE); + else if (v >= WT_MEGABYTE) + snprintf(buf, sizeof(buf), "(%" PRIu64 "M)", v / WT_MEGABYTE); + else + snprintf(buf, sizeof(buf), "(%" PRIu64 ")", v); + return (buf); +} + +/* + * __debug_tree_shape_worker -- + * Dump information about the current page and descend. + */ +static void +__debug_tree_shape_worker(WT_DBG *ds, WT_PAGE *page, int level) +{ + WT_REF *ref; + WT_SESSION_IMPL *session; + + session = ds->session; + + if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) { + __dmsg(ds, "%*s" "I" "%s\n", + level, " ", __debug_tree_shape_info(page)); + WT_INTL_FOREACH_BEGIN(session, page, ref) { + if (ref->state == WT_REF_MEM) + __debug_tree_shape_worker( + ds, ref->page, level + 3); + } WT_INTL_FOREACH_END; + } else + __dmsg(ds, "%*s" "L" "%s\n", + level, " ", __debug_tree_shape_info(page)); +} + +/* + * __wt_debug_tree_shape -- + * Dump the shape of the in-memory tree. + */ +int +__wt_debug_tree_shape( + WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile) +{ + WT_DBG *ds, _ds; + + ds = &_ds; + WT_RET(__debug_config(session, ds, ofile)); + + /* A NULL page starts at the top of the tree -- it's a convenience. 
*/ + if (page == NULL) + page = S2BT(session)->root.page; + + __debug_tree_shape_worker(ds, page, 0); + + __dmsg_wrapup(ds); + return (0); +} + +#define WT_DEBUG_TREE_LEAF 0x01 /* Debug leaf pages */ +#define WT_DEBUG_TREE_WALK 0x02 /* Descend the tree */ + +/* + * __wt_debug_tree_all -- + * Dump the in-memory information for a tree, including leaf pages. + */ +int +__wt_debug_tree_all(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile) +{ + return (__debug_tree( + session, page, ofile, WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK)); +} + +/* + * __wt_debug_tree -- + * Dump the in-memory information for a tree, not including leaf pages. + */ +int +__wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile) +{ + return (__debug_tree(session, page, ofile, WT_DEBUG_TREE_WALK)); +} + +/* + * __wt_debug_page -- + * Dump the in-memory information for a page. + */ +int +__wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile) +{ + WT_DBG *ds, _ds; + WT_DECL_RET; + + ds = &_ds; + WT_RET(__debug_config(session, ds, ofile)); + + ret = __debug_page(ds, page, WT_DEBUG_TREE_LEAF); + + __dmsg_wrapup(ds); + + return (ret); +} + +/* + * __debug_tree -- + * Dump the in-memory information for a tree. + */ +static int +__debug_tree( + WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile, uint32_t flags) +{ + WT_DBG *ds, _ds; + WT_DECL_RET; + + ds = &_ds; + WT_RET(__debug_config(session, ds, ofile)); + + /* A NULL page starts at the top of the tree -- it's a convenience. */ + if (page == NULL) + page = S2BT(session)->root.page; + + ret = __debug_page(ds, page, flags); + + __dmsg_wrapup(ds); + + return (ret); +} + +/* + * __debug_page -- + * Dump the in-memory information for an in-memory page. + */ +static int +__debug_page(WT_DBG *ds, WT_PAGE *page, uint32_t flags) +{ + WT_SESSION_IMPL *session; + + session = ds->session; + + /* Dump the page metadata. */ + WT_RET(__debug_page_metadata(ds, page)); + + /* Dump the page. 
*/ + switch (page->type) { + case WT_PAGE_COL_FIX: + if (LF_ISSET(WT_DEBUG_TREE_LEAF)) + __debug_page_col_fix(ds, page); + break; + case WT_PAGE_COL_INT: + WT_RET(__debug_page_col_int(ds, page, flags)); + break; + case WT_PAGE_COL_VAR: + if (LF_ISSET(WT_DEBUG_TREE_LEAF)) + WT_RET(__debug_page_col_var(ds, page)); + break; + case WT_PAGE_ROW_INT: + WT_RET(__debug_page_row_int(ds, page, flags)); + break; + case WT_PAGE_ROW_LEAF: + if (LF_ISSET(WT_DEBUG_TREE_LEAF)) + WT_RET(__debug_page_row_leaf(ds, page)); + break; + WT_ILLEGAL_VALUE(session); + } + + return (0); +} + +/* + * __debug_page_metadata -- + * Dump an in-memory page's metadata. + */ +static int +__debug_page_metadata(WT_DBG *ds, WT_PAGE *page) +{ + WT_PAGE_INDEX *pindex; + WT_PAGE_MODIFY *mod; + WT_SESSION_IMPL *session; + uint32_t entries; + + session = ds->session; + mod = page->modify; + + __dmsg(ds, "%p", page); + + switch (page->type) { + case WT_PAGE_COL_INT: + __dmsg(ds, " recno %" PRIu64, page->pg_intl_recno); + pindex = WT_INTL_INDEX_COPY(page); + entries = pindex->entries; + break; + case WT_PAGE_COL_FIX: + __dmsg(ds, " recno %" PRIu64, page->pg_fix_recno); + entries = page->pg_fix_entries; + break; + case WT_PAGE_COL_VAR: + __dmsg(ds, " recno %" PRIu64, page->pg_var_recno); + entries = page->pg_var_entries; + break; + case WT_PAGE_ROW_INT: + pindex = WT_INTL_INDEX_COPY(page); + entries = pindex->entries; + break; + case WT_PAGE_ROW_LEAF: + entries = page->pg_row_entries; + break; + WT_ILLEGAL_VALUE(session); + } + + __dmsg(ds, ": %s\n", __wt_page_type_string(page->type)); + __dmsg(ds, "\t" "disk %p, entries %" PRIu32, page->dsk, entries); + __dmsg(ds, "%s", __wt_page_is_modified(page) ? 
", dirty" : ", clean"); + if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) + __dmsg(ds, ", keys-built"); + if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) + __dmsg(ds, ", disk-alloc"); + if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED)) + __dmsg(ds, ", disk-mapped"); + if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) + __dmsg(ds, ", evict-lru"); + if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING)) + __dmsg(ds, ", scanning"); + if (F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING)) + __dmsg(ds, ", splitting"); + + if (mod != NULL) + switch (F_ISSET(mod, WT_PM_REC_MASK)) { + case WT_PM_REC_EMPTY: + __dmsg(ds, ", empty"); + break; + case WT_PM_REC_MULTIBLOCK: + __dmsg(ds, ", multiblock"); + break; + case WT_PM_REC_REPLACE: + __dmsg(ds, ", replaced"); + break; + case 0: + break; + WT_ILLEGAL_VALUE(session); + } + if (mod != NULL) + __dmsg(ds, ", write generation=%" PRIu32, mod->write_gen); + __dmsg(ds, "\n"); + + return (0); +} + +/* + * __debug_page_col_fix -- + * Dump an in-memory WT_PAGE_COL_FIX page. + */ +static void +__debug_page_col_fix(WT_DBG *ds, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_INSERT *ins; + const WT_PAGE_HEADER *dsk; + WT_SESSION_IMPL *session; + uint64_t recno; + uint32_t i; + uint8_t v; + + session = ds->session; + btree = S2BT(session); + dsk = page->dsk; + recno = page->pg_fix_recno; + + if (dsk != NULL) { + ins = WT_SKIP_FIRST(WT_COL_UPDATE_SINGLE(page)); + WT_FIX_FOREACH(btree, dsk, v, i) { + __dmsg(ds, "\t%" PRIu64 "\t{", recno); + __debug_hex_byte(ds, v); + __dmsg(ds, "}\n"); + + /* Check for a match on the update list. 
*/ + if (ins != NULL && WT_INSERT_RECNO(ins) == recno) { + __dmsg(ds, + "\tupdate %" PRIu64 "\n", + WT_INSERT_RECNO(ins)); + __debug_update(ds, ins->upd, 1); + ins = WT_SKIP_NEXT(ins); + } + ++recno; + } + } + + if (WT_COL_UPDATE_SINGLE(page) != NULL) { + __dmsg(ds, "%s", sep); + __debug_col_skip(ds, WT_COL_UPDATE_SINGLE(page), "update", 1); + } + if (WT_COL_APPEND(page) != NULL) { + __dmsg(ds, "%s", sep); + __debug_col_skip(ds, WT_COL_APPEND(page), "append", 1); + } +} + +/* + * __debug_page_col_int -- + * Dump an in-memory WT_PAGE_COL_INT page. + */ +static int +__debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags) +{ + WT_REF *ref; + WT_SESSION_IMPL *session; + + session = ds->session; + + WT_INTL_FOREACH_BEGIN(session, page, ref) { + __dmsg(ds, "\trecno %" PRIu64 "\n", ref->key.recno); + WT_RET(__debug_ref(ds, ref)); + } WT_INTL_FOREACH_END; + + if (LF_ISSET(WT_DEBUG_TREE_WALK)) + WT_INTL_FOREACH_BEGIN(session, page, ref) { + if (ref->state == WT_REF_MEM) { + __dmsg(ds, "\n"); + WT_RET(__debug_page(ds, ref->page, flags)); + } + } WT_INTL_FOREACH_END; + + return (0); +} + +/* + * __debug_page_col_var -- + * Dump an in-memory WT_PAGE_COL_VAR page. 
+ */
+static int
+__debug_page_col_var(WT_DBG *ds, WT_PAGE *page)
+{
+	WT_CELL *cell;
+	WT_CELL_UNPACK *unpack, _unpack;
+	WT_COL *cip;
+	WT_INSERT_HEAD *update;
+	uint64_t recno, rle;
+	uint32_t i;
+	char tag[64];
+
+	unpack = &_unpack;
+	recno = page->pg_var_recno;
+
+	WT_COL_FOREACH(page, cip, i) {
+		if ((cell = WT_COL_PTR(page, cip)) == NULL) {
+			/* A missing cell is a deleted entry with RLE 1. */
+			unpack = NULL;
+			rle = 1;
+		} else {
+			/*
+			 * A previous deleted entry cleared unpack; re-point it
+			 * at the on-stack unpack structure before unpacking,
+			 * otherwise we'd pass NULL to __wt_cell_unpack.
+			 */
+			unpack = &_unpack;
+			__wt_cell_unpack(cell, unpack);
+			rle = __wt_cell_rle(unpack);
+		}
+		/* Tag each value with its starting recno and repeat count. */
+		snprintf(tag, sizeof(tag), "%" PRIu64 " %" PRIu64, recno, rle);
+		WT_RET(
+		    __debug_cell_data(ds, page, WT_PAGE_COL_VAR, tag, unpack));
+
+		if ((update = WT_COL_UPDATE(page, cip)) != NULL)
+			__debug_col_skip(ds, update, "update", 0);
+		recno += rle;
+	}
+
+	/* Dump entries appended past the end of the original page. */
+	if (WT_COL_APPEND(page) != NULL) {
+		__dmsg(ds, "%s", sep);
+		__debug_col_skip(ds, WT_COL_APPEND(page), "append", 0);
+	}
+
+	return (0);
+}
+
+/*
+ * __debug_page_row_int --
+ *	Dump an in-memory WT_PAGE_ROW_INT page.
+ */
+static int
+__debug_page_row_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
+{
+	WT_REF *ref;
+	WT_SESSION_IMPL *session;
+	size_t len;
+	uint8_t *p;
+
+	session = ds->session;
+
+	/* Dump this page's keys and child references. */
+	WT_INTL_FOREACH_BEGIN(session, page, ref) {
+		__wt_ref_key(page, ref, &p, &len);
+		__debug_item(ds, "K", p, len);
+		WT_RET(__debug_ref(ds, ref));
+	} WT_INTL_FOREACH_END;
+
+	/* Optionally descend into in-memory children. */
+	if (LF_ISSET(WT_DEBUG_TREE_WALK))
+		WT_INTL_FOREACH_BEGIN(session, page, ref) {
+			if (ref->state == WT_REF_MEM) {
+				__dmsg(ds, "\n");
+				WT_RET(__debug_page(ds, ref->page, flags));
+			}
+		} WT_INTL_FOREACH_END;
+	return (0);
+}
+
+/*
+ * __debug_page_row_leaf --
+ *	Dump an in-memory WT_PAGE_ROW_LEAF page.
+ */ +static int +__debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) +{ + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_INSERT_HEAD *insert; + WT_ROW *rip; + WT_SESSION_IMPL *session; + WT_UPDATE *upd; + uint32_t i; + + session = ds->session; + unpack = &_unpack; + WT_RET(__wt_scr_alloc(session, 256, &key)); + + /* + * Dump any K/V pairs inserted into the page before the first from-disk + * key on the page. + */ + if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) + __debug_row_skip(ds, insert); + + /* Dump the page's K/V pairs. */ + WT_ROW_FOREACH(page, rip, i) { + WT_RET(__wt_row_leaf_key(session, page, rip, key, 0)); + __debug_item(ds, "K", key->data, key->size); + + if ((cell = __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) + __dmsg(ds, "\tV {}\n"); + else { + __wt_cell_unpack(cell, unpack); + WT_ERR(__debug_cell_data( + ds, page, WT_PAGE_ROW_LEAF, "V", unpack)); + } + + if ((upd = WT_ROW_UPDATE(page, rip)) != NULL) + __debug_update(ds, upd, 0); + + if ((insert = WT_ROW_INSERT(page, rip)) != NULL) + __debug_row_skip(ds, insert); + } + +err: __wt_scr_free(&key); + return (ret); +} + +/* + * __debug_col_skip -- + * Dump a column-store skiplist. + */ +static void +__debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, int hexbyte) +{ + WT_INSERT *ins; + + WT_SKIP_FOREACH(ins, head) { + __dmsg(ds, + "\t%s %" PRIu64 "\n", tag, WT_INSERT_RECNO(ins)); + __debug_update(ds, ins->upd, hexbyte); + } +} + +/* + * __debug_row_skip -- + * Dump an insert list. + */ +static void +__debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head) +{ + WT_INSERT *ins; + + WT_SKIP_FOREACH(ins, head) { + __debug_item(ds, + "insert", WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins)); + __debug_update(ds, ins->upd, 0); + } +} + +/* + * __debug_update -- + * Dump an update list. 
+ */ +static void +__debug_update(WT_DBG *ds, WT_UPDATE *upd, int hexbyte) +{ + for (; upd != NULL; upd = upd->next) + if (WT_UPDATE_DELETED_ISSET(upd)) + __dmsg(ds, "\tvalue {deleted}\n"); + else if (hexbyte) { + __dmsg(ds, "\t{"); + __debug_hex_byte(ds, + ((uint8_t *)WT_UPDATE_DATA(upd))[0]); + __dmsg(ds, "}\n"); + } else + __debug_item(ds, + "value", WT_UPDATE_DATA(upd), upd->size); +} + +/* + * __debug_ref -- + * Dump a WT_REF structure. + */ +static int +__debug_ref(WT_DBG *ds, WT_REF *ref) +{ + WT_SESSION_IMPL *session; + size_t addr_size; + const uint8_t *addr; + + session = ds->session; + + __dmsg(ds, "\t"); + switch (ref->state) { + case WT_REF_DISK: + __dmsg(ds, "disk"); + break; + case WT_REF_DELETED: + __dmsg(ds, "deleted"); + break; + case WT_REF_LOCKED: + __dmsg(ds, "locked %p", ref->page); + break; + case WT_REF_MEM: + __dmsg(ds, "memory %p", ref->page); + break; + case WT_REF_READING: + __dmsg(ds, "reading"); + break; + case WT_REF_SPLIT: + __dmsg(ds, "split"); + break; + WT_ILLEGAL_VALUE(session); + } + + WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); + __dmsg(ds, " %s\n", + __wt_addr_string(session, addr, addr_size, ds->tmp)); + + return (0); +} + +/* + * __debug_cell -- + * Dump a single unpacked WT_CELL. + */ +static int +__debug_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack) +{ + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_SESSION_IMPL *session; + const char *type; + + session = ds->session; + + __dmsg(ds, "\t%s: len %" PRIu32, + __wt_cell_type_string(unpack->raw), unpack->size); + + /* Dump cell's per-disk page type information. 
*/ + switch (dsk->type) { + case WT_PAGE_COL_INT: + switch (unpack->type) { + case WT_CELL_VALUE: + __dmsg(ds, ", recno: %" PRIu64, unpack->v); + break; + } + break; + case WT_PAGE_COL_VAR: + switch (unpack->type) { + case WT_CELL_DEL: + case WT_CELL_KEY_OVFL_RM: + case WT_CELL_VALUE: + case WT_CELL_VALUE_OVFL: + case WT_CELL_VALUE_OVFL_RM: + __dmsg(ds, ", rle: %" PRIu64, __wt_cell_rle(unpack)); + break; + } + break; + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + switch (unpack->type) { + case WT_CELL_KEY: + __dmsg(ds, ", pfx: %" PRIu8, unpack->prefix); + break; + } + break; + } + + /* Dump addresses. */ + switch (unpack->raw) { + case WT_CELL_ADDR_DEL: + type = "addr/del"; + goto addr; + case WT_CELL_ADDR_INT: + type = "addr/int"; + goto addr; + case WT_CELL_ADDR_LEAF: + type = "addr/leaf"; + goto addr; + case WT_CELL_ADDR_LEAF_NO: + type = "addr/leaf-no"; + goto addr; + case WT_CELL_KEY_OVFL: + case WT_CELL_KEY_OVFL_RM: + case WT_CELL_VALUE_OVFL: + case WT_CELL_VALUE_OVFL_RM: + type = "ovfl"; +addr: WT_RET(__wt_scr_alloc(session, 128, &buf)); + __dmsg(ds, ", %s %s", type, + __wt_addr_string(session, unpack->data, unpack->size, buf)); + __wt_scr_free(&buf); + WT_RET(ret); + break; + } + __dmsg(ds, "\n"); + + return (__debug_cell_data(ds, NULL, dsk->type, NULL, unpack)); +} + +/* + * __debug_cell_data -- + * Dump a single cell's data in debugging mode. + */ +static int +__debug_cell_data(WT_DBG *ds, + WT_PAGE *page, int page_type, const char *tag, WT_CELL_UNPACK *unpack) +{ + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_SESSION_IMPL *session; + const char *p; + + session = ds->session; + + /* + * Column-store references to deleted cells return a NULL cell + * reference. 
+ */ + if (unpack == NULL) { + __debug_item(ds, tag, "deleted", strlen("deleted")); + return (0); + } + + switch (unpack->raw) { + case WT_CELL_ADDR_DEL: + case WT_CELL_ADDR_INT: + case WT_CELL_ADDR_LEAF: + case WT_CELL_ADDR_LEAF_NO: + case WT_CELL_DEL: + case WT_CELL_KEY_OVFL_RM: + case WT_CELL_VALUE_OVFL_RM: + p = __wt_cell_type_string(unpack->raw); + __debug_item(ds, tag, p, strlen(p)); + break; + case WT_CELL_KEY: + case WT_CELL_KEY_OVFL: + case WT_CELL_KEY_PFX: + case WT_CELL_KEY_SHORT: + case WT_CELL_KEY_SHORT_PFX: + case WT_CELL_VALUE: + case WT_CELL_VALUE_COPY: + case WT_CELL_VALUE_OVFL: + case WT_CELL_VALUE_SHORT: + WT_RET(__wt_scr_alloc(session, 256, &buf)); + ret = page == NULL ? + __wt_dsk_cell_data_ref(session, page_type, unpack, buf) : + __wt_page_cell_data_ref(session, page, unpack, buf); + if (ret == 0) + __debug_item(ds, tag, buf->data, buf->size); + __wt_scr_free(&buf); + break; + WT_ILLEGAL_VALUE(session); + } + + return (ret); +} + +/* + * __debug_item -- + * Dump a single data/size pair, with an optional tag. + */ +static void +__debug_item(WT_DBG *ds, const char *tag, const void *data_arg, size_t size) +{ + size_t i; + int ch; + const uint8_t *data; + + __dmsg(ds, "\t%s%s{", tag == NULL ? "" : tag, tag == NULL ? "" : " "); + for (data = data_arg, i = 0; i < size; ++i, ++data) { + ch = data[0]; + if (isprint(ch)) + __dmsg(ds, "%c", ch); + else + __debug_hex_byte(ds, data[0]); + } + __dmsg(ds, "}\n"); +} +#endif diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c new file mode 100644 index 00000000000..2fc1b0d5460 --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -0,0 +1,339 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * Fast-delete support. 
+ * + * This file contains most of the code that allows WiredTiger to delete pages + * of data without reading them into the cache. (This feature is currently + * only available for row-store objects.) + * + * The way cursor truncate works in a row-store object is it explicitly reads + * the first and last pages of the truncate range, then walks the tree with a + * flag so the cursor walk code marks any page within the range, that hasn't + * yet been read and which has no overflow items, as deleted, by changing the + * WT_REF state to WT_REF_DELETED. Pages already in the cache or with overflow + * items, have their rows updated/deleted individually. The transaction for the + * delete operation is stored in memory referenced by the WT_REF.page_del field. + * + * Future cursor walks of the tree will skip the deleted page based on the + * transaction stored for the delete, but it gets more complicated if a read is + * done using a random key, or a cursor walk is done with a transaction where + * the delete is not visible. In those cases, we read the original contents of + * the page. The page-read code notices a deleted page is being read, and as + * part of the read instantiates the contents of the page, creating a WT_UPDATE + * with a deleted operation, in the same transaction as deleted the page. In + * other words, the read process makes it appear as if the page was read and + * each individual row deleted, exactly as would have happened if the page had + * been in the cache all along. + * + * There's an additional complication to support rollback of the page delete. + * When the page was marked deleted, a pointer to the WT_REF was saved in the + * deleting session's transaction list and the delete is unrolled by resetting + * the WT_REF_DELETED state back to WT_REF_DISK. However, if the page has been + * instantiated by some reading thread, that's not enough, each individual row + * on the page must have the delete operation reset. 
If the page split, the + * WT_UPDATE lists might have been saved/restored during reconciliation and + * appear on multiple pages, and the WT_REF stored in the deleting session's + * transaction list is no longer useful. For this reason, when the page is + * instantiated by a read, a list of the WT_UPDATE structures on the page is + * stored in the WT_REF.page_del field, with the transaction ID, that way the + * session unrolling the delete can find all of the WT_UPDATE structures that + * require update. + * + * One final note: pages can also be marked deleted if emptied and evicted. In + * that case, the WT_REF state will be set to WT_REF_DELETED but there will not + * be any associated WT_REF.page_del field. These pages are always skipped + * during cursor traversal (the page could not have been evicted if there were + * updates that weren't globally visible), and if read is forced to instantiate + * such a page, it simply creates an empty page from scratch. + */ + +/* + * __wt_delete_page -- + * If deleting a range, try to delete the page without instantiating it. + */ +int +__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp) +{ + WT_DECL_RET; + WT_PAGE *parent; + + *skipp = 0; + + /* + * Atomically switch the page's state to lock it. If the page is not + * on-disk, other threads may be using it, no fast delete. + * + * Possible optimization: if the page is already deleted and the delete + * is visible to us (the delete has been committed), we could skip the + * page instead of instantiating it and figuring out there are no rows + * in the page. While that's a huge amount of work to no purpose, it's + * unclear optimizing for overlapping range deletes is worth the effort. + */ + if (ref->state != WT_REF_DISK || + !WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_LOCKED)) + return (0); + + /* + * We cannot fast-delete pages that have overflow key/value items as + * the overflow blocks have to be discarded. 
The way we figure that + * out is to check the on-page cell type for the page, cells for leaf + * pages that have no overflow items are special. + * + * In some cases, the reference address may not reference an on-page + * cell (for example, some combination of page splits), in which case + * we can't check the original cell value and we fail. + * + * To look at an on-page cell, we need to look at the parent page, and + * that's dangerous, our parent page could change without warning if + * the parent page were to split, deepening the tree. It's safe: the + * page's reference will always point to some valid page, and if we find + * any problems we simply fail the fast-delete optimization. + * + * !!! + * I doubt it's worth the effort, but we could copy the cell's type into + * the reference structure, and then we wouldn't need an on-page cell. + */ + parent = ref->home; + if (__wt_off_page(parent, ref->addr) || + __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO) + goto err; + + /* + * This action dirties the parent page: mark it dirty now, there's no + * future reconciliation of the child leaf page that will dirty it as + * we write the tree. + */ + WT_ERR(__wt_page_parent_modify_set(session, ref, 0)); + + /* + * Record the change in the transaction structure and set the change's + * transaction ID. + */ + WT_ERR(__wt_calloc_def(session, 1, &ref->page_del)); + ref->page_del->txnid = session->txn.id; + + WT_ERR(__wt_txn_modify_ref(session, ref)); + + *skipp = 1; + WT_PUBLISH(ref->state, WT_REF_DELETED); + return (0); + +err: __wt_free(session, ref->page_del); + + /* + * Restore the page to on-disk status, we'll have to instantiate it. + */ + WT_PUBLISH(ref->state, WT_REF_DISK); + return (ret); +} + +/* + * __wt_delete_page_rollback -- + * Abort pages that were deleted without being instantiated. 
+ */ +void +__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_UPDATE **upd; + + /* + * If the page is still "deleted", it's as we left it, reset the state + * to on-disk and we're done. Otherwise, we expect the page is either + * instantiated or being instantiated. Loop because it's possible for + * the page to return to the deleted state if instantiation fails. + */ + for (;; __wt_yield()) + switch (ref->state) { + case WT_REF_DISK: + case WT_REF_READING: + WT_ASSERT(session, 0); /* Impossible, assert */ + break; + case WT_REF_DELETED: + /* + * If the page is still "deleted", it's as we left it, + * reset the state. + */ + if (WT_ATOMIC_CAS4( + ref->state, WT_REF_DELETED, WT_REF_DISK)) + return; + break; + case WT_REF_LOCKED: + /* + * A possible state, the page is being instantiated. + */ + break; + case WT_REF_MEM: + case WT_REF_SPLIT: + /* + * We can't use the normal read path to get a copy of + * the page because the session may have closed the + * cursor, we no longer have the reference to the tree + * required for a hazard pointer. We're safe because + * with unresolved transactions, the page isn't going + * anywhere. + * + * The page is in an in-memory state, walk the list of + * update structures and abort them. + */ + for (upd = + ref->page_del->update_list; *upd != NULL; ++upd) + (*upd)->txnid = WT_TXN_ABORTED; + + /* + * Discard the memory, the transaction can't abort + * twice. + */ + __wt_free(session, ref->page_del->update_list); + __wt_free(session, ref->page_del); + return; + } +} + +/* + * __wt_delete_page_skip -- + * If iterating a cursor, skip deleted pages that are visible to us. + */ +int +__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) +{ + int skip; + + /* + * Deleted pages come from two sources: either it's a fast-delete as + * described above, or the page has been emptied by other operations + * and eviction deleted it. + * + * In both cases, the WT_REF state will be WT_REF_DELETED. 
In the case + * of a fast-delete page, there will be a WT_PAGE_DELETED structure with + * the transaction ID of the transaction that deleted the page, and the + * page is visible if that transaction ID is visible. In the case of an + * empty page, there will be no WT_PAGE_DELETED structure and the delete + * is by definition visible, eviction could not have deleted the page if + * there were changes on it that were not globally visible. + * + * We're here because we found a WT_REF state set to WT_REF_DELETED. It + * is possible the page is being read into memory right now, though, and + * the page could switch to an in-memory state at any time. Lock down + * the structure, just to be safe. + */ + if (!WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED)) + return (0); + + skip = ref->page_del == NULL || + __wt_txn_visible(session, ref->page_del->txnid) ? 1 : 0; + + WT_PUBLISH(ref->state, WT_REF_DELETED); + return (skip); +} + +/* + * __wt_delete_page_instantiate -- + * Instantiate an entirely deleted row-store leaf page. + */ +int +__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_DELETED *page_del; + WT_UPDATE **upd_array, *upd; + uint32_t i; + + btree = S2BT(session); + page = ref->page; + page_del = ref->page_del; + + /* + * Give the page a modify structure. + * + * If the tree is already dirty and so will be written, mark the page + * dirty. (We'd like to free the deleted pages, but if the handle is + * read-only or if the application never modifies the tree, we're not + * able to do so.) + */ + if (btree->modified) { + WT_RET(__wt_page_modify_init(session, page)); + __wt_page_modify_set(session, page); + } + + /* + * An operation is accessing a "deleted" page, and we're building an + * in-memory version of the page (making it look like all entries in + * the page were individually updated by a remove operation). 
There + * are two cases where we end up here: + * + * First, a running transaction used a truncate call to delete the page + * without reading it, in which case the page reference includes a + * structure with a transaction ID; the page we're building might split + * in the future, so we update that structure to include references to + * all of the update structures we create, so the transaction can abort. + * + * Second, a truncate call deleted a page and the truncate committed, + * but an older transaction in the system forced us to keep the old + * version of the page around, then we crashed and recovered, and now + * we're being forced to read that page. + * + * In the first case, we have a page reference structure, in the second + * second, we don't. + * + * Allocate the per-reference update array; in the case of instantiating + * a page, deleted by a running transaction that might eventually abort, + * we need a list of the update structures so we can do that abort. The + * hard case is if a page splits: the update structures might be moved + * to different pages, and we still have to find them all for an abort. + */ + + if (page_del != NULL) + WT_RET(__wt_calloc_def( + session, page->pg_row_entries + 1, &page_del->update_list)); + + /* Allocate the per-page update array. */ + WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array)); + page->pg_row_upd = upd_array; + + /* + * Fill in the per-reference update array with references to update + * structures, fill in the per-page update array with references to + * deleted items. 
+ */ + for (i = 0; i < page->pg_row_entries; ++i) { + WT_ERR(__wt_calloc_def(session, 1, &upd)); + WT_UPDATE_DELETED_SET(upd); + + if (page_del == NULL) + upd->txnid = WT_TXN_NONE; /* Globally visible */ + else { + upd->txnid = page_del->txnid; + page_del->update_list[i] = upd; + } + + upd->next = upd_array[i]; + upd_array[i] = upd; + } + + __wt_cache_page_inmem_incr(session, page, + page->pg_row_entries * (sizeof(WT_UPDATE *) + sizeof(WT_UPDATE))); + + return (0); + +err: /* + * There's no need to free the page update structures on error, our + * caller will discard the page and do that work for us. We could + * similarly leave the per-reference update array alone because it + * won't ever be used by any page that's not in-memory, but cleaning + * it up makes sense, especially if we come back in to this function + * attempting to instantiate this page again. + */ + if (page_del != NULL) + __wt_free(session, page_del->update_list); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c new file mode 100644 index 00000000000..a162e2dc841 --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -0,0 +1,422 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
 */

#include "wt_internal.h"

/*
 * Forward declarations: the __free_* helpers below tear down the individual
 * pieces of an in-memory page and are only called from __wt_page_out.
 */
static void __free_page_modify(WT_SESSION_IMPL *, WT_PAGE *);
static void __free_page_col_var(WT_SESSION_IMPL *, WT_PAGE *);
static void __free_page_int(WT_SESSION_IMPL *, WT_PAGE *);
static void __free_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
static void __free_skip_array(WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t);
static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *);
static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t);
static void __free_update_list(WT_SESSION_IMPL *, WT_UPDATE *);

/*
 * __wt_ref_out --
 *	Discard an in-memory page, freeing all memory associated with it.
 */
void
__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref)
{
	/*
	 * A version of the page-out function that allows us to make additional
	 * diagnostic checks: the page being discarded must never be the tree's
	 * current eviction walk point.
	 */
	WT_ASSERT(session, S2BT(session)->evict_ref != ref);

	__wt_page_out(session, &ref->page);
}

/*
 * __wt_page_out --
 *	Discard an in-memory page, freeing all memory associated with it.
 */
void
__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
{
	WT_PAGE *page;
	WT_PAGE_HEADER *dsk;
	WT_PAGE_MODIFY *mod;

	/*
	 * Kill our caller's reference, do our best to catch races.
	 */
	page = *pagep;
	*pagep = NULL;

	/*
	 * We should never discard a dirty page, the file's current eviction
	 * point or a page queued for LRU eviction.
	 */
	WT_ASSERT(session, !__wt_page_is_modified(page));
	WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
	WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING));

#ifdef HAVE_DIAGNOSTIC
	{
	WT_HAZARD *hp;
	int i;
	/*
	 * Make sure no other thread has a hazard pointer on the page we are
	 * about to discard.  This is complicated by the fact that readers
	 * publish their hazard pointer before re-checking the page state, so
	 * our check can race with readers without indicating a real problem.
	 * Wait for up to a second (100 iterations of a 10ms sleep) for hazard
	 * pointers to be cleared.
	 */
	for (hp = NULL, i = 0; i < 100; i++) {
		if ((hp = __wt_page_hazard_check(session, page)) == NULL)
			break;
		__wt_sleep(0, 10000);
	}
	if (hp != NULL)
		__wt_errx(session,
		    "discarded page has hazard pointer: (%p: %s, line %d)",
		    hp->page, hp->file, hp->line);
	WT_ASSERT(session, hp == NULL);
	}
#endif

	/*
	 * If a root page split, there may be one or more pages linked from the
	 * page; walk the list, discarding pages.  The recursive call handles a
	 * chain of root splits, each modify structure linking the next page.
	 */
	switch (page->type) {
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		mod = page->modify;
		if (mod != NULL && mod->mod_root_split != NULL)
			__wt_page_out(session, &mod->mod_root_split);
		break;
	}

	/* Update the cache's information. */
	__wt_cache_page_evict(session, page);

	/*
	 * If discarding the page as part of process exit, the application may
	 * configure to leak the memory rather than do the work.
	 */
	if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY))
		return;

	/* Free the page modification information. */
	if (page->modify != NULL)
		__free_page_modify(session, page);

	/*
	 * Free the per-page-type structures; fixed-length column-store pages
	 * have no additional allocated memory.
	 */
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		__free_page_int(session, page);
		break;
	case WT_PAGE_COL_VAR:
		__free_page_col_var(session, page);
		break;
	case WT_PAGE_ROW_LEAF:
		__free_page_row_leaf(session, page);
		break;
	}

	/* Discard any disk image: either allocated memory or a file mapping. */
	dsk = (WT_PAGE_HEADER *)page->dsk;
	if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
		__wt_overwrite_and_free_len(session, dsk, dsk->mem_size);
	if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
		(void)__wt_mmap_discard(session, dsk, dsk->mem_size);

	__wt_overwrite_and_free(session, page);
}

/*
 * __free_page_modify --
 *	Discard the page's associated modification structures.
 */
static void
__free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_INSERT_HEAD *append;
	WT_MULTI *multi;
	WT_PAGE_MODIFY *mod;
	uint32_t i;

	mod = page->modify;

	/* Free reconciliation results, based on how the page was written. */
	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
	case WT_PM_REC_MULTIBLOCK:
		/* Free list of replacement blocks. */
		for (multi = mod->mod_multi,
		    i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
			switch (page->type) {
			case WT_PAGE_ROW_INT:
			case WT_PAGE_ROW_LEAF:
				__wt_free(session, multi->key.ikey);
				break;
			}
			__wt_free(session, multi->skip);
			__wt_free(session, multi->skip_dsk);
			__wt_free(session, multi->addr.addr);
		}
		__wt_free(session, mod->mod_multi);
		break;
	case WT_PM_REC_REPLACE:
		/*
		 * Discard any replacement address: this memory is usually moved
		 * into the parent's WT_REF, but at the root that can't happen.
		 */
		__wt_free(session, mod->mod_replace.addr);
		break;
	}

	switch (page->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_VAR:
		/* Free the append array (entries past the end of the page). */
		if ((append = WT_COL_APPEND(page)) != NULL) {
			__free_skip_list(session, WT_SKIP_FIRST(append));
			__wt_free(session, append);
			__wt_free(session, mod->mod_append);
		}

		/*
		 * Free the insert/update array; fixed-length column-store
		 * pages use a single slot, variable-length pages one slot
		 * per entry.
		 */
		if (mod->mod_update != NULL)
			__free_skip_array(session, mod->mod_update,
			    page->type ==
			    WT_PAGE_COL_FIX ? 1 : page->pg_var_entries);
		break;
	}

	/* Free the overflow on-page, reuse and transaction-cache skiplists. */
	__wt_ovfl_reuse_free(session, page);
	__wt_ovfl_txnc_free(session, page);
	__wt_ovfl_discard_free(session, page);

	__wt_free(session, page->modify->ovfl_track);

	__wt_free(session, page->modify);
}

/*
 * __free_page_int --
 *	Discard a WT_PAGE_COL_INT or WT_PAGE_ROW_INT page. 
 */
static void
__free_page_int(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	/* Internal pages keep their memory in the page index: free it. */
	__wt_free_ref_index(session, page, WT_INTL_INDEX_COPY(page), 0);
}

/*
 * __wt_free_ref --
 *	Discard the contents of a WT_REF structure (optionally including the
 * pages it references).
 */
void
__wt_free_ref(
    WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages)
{
	WT_IKEY *ikey;

	if (ref == NULL)
		return;

	/*
	 * Optionally free the referenced pages.  (The path to free referenced
	 * page is used for error cleanup, no instantiated and then discarded
	 * page should have WT_REF entries with real pages.  The page may have
	 * been marked dirty as well; page discard checks for that, so we mark
	 * it clean explicitly.)
	 */
	if (free_pages && ref->page != NULL) {
		if (ref->page->modify != NULL) {
			ref->page->modify->write_gen = 0;
			__wt_cache_dirty_decr(session, ref->page);
		}
		__wt_page_out(session, &ref->page);
	}

	/* Free any key allocation: row-store keys may be instantiated. */
	switch (page->type) {
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		if ((ikey = __wt_ref_key_instantiated(ref)) != NULL)
			__wt_free(session, ikey);
		break;
	}

	/* Free any address allocation not part of the page's disk image. */
	if (ref->addr != NULL && __wt_off_page(page, ref->addr)) {
		__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
		__wt_free(session, ref->addr);
	}

	/* Free any page-deleted information. */
	if (ref->page_del != NULL) {
		__wt_free(session, ref->page_del->update_list);
		__wt_free(session, ref->page_del);
	}

	__wt_overwrite_and_free(session, ref);
}

/*
 * __wt_free_ref_index --
 *	Discard a page index and it's references.
 */
void
__wt_free_ref_index(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_PAGE_INDEX *pindex, int free_pages)
{
	uint32_t i;

	if (pindex == NULL)
		return;

	for (i = 0; i < pindex->entries; ++i)
		__wt_free_ref(session, page, pindex->index[i], free_pages);
	__wt_free(session, pindex);
}

/*
 * __free_page_col_var --
 *	Discard a WT_PAGE_COL_VAR page.
 */
static void
__free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	/* Free the RLE lookup array. */
	__wt_free(session, page->pg_var_repeats);
}

/*
 * __free_page_row_leaf --
 *	Discard a WT_PAGE_ROW_LEAF page.
 */
static void
__free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_IKEY *ikey;
	WT_ROW *rip;
	uint32_t i;
	void *copy;

	/*
	 * Free the in-memory index array.
	 *
	 * For each entry, see if the key was an allocation (that is, if it
	 * points somewhere other than the original page), and if so, free
	 * the memory.
	 */
	WT_ROW_FOREACH(page, rip, i) {
		copy = WT_ROW_KEY_COPY(rip);
		(void)__wt_row_leaf_key_info(
		    page, copy, &ikey, NULL, NULL, NULL);
		if (ikey != NULL)
			__wt_free(session, ikey);
	}

	/*
	 * Free the insert array.
	 *
	 * Row-store tables have one additional slot in the insert array (the
	 * insert array has an extra slot to hold keys that sort before keys
	 * found on the original page).
	 */
	if (page->pg_row_ins != NULL)
		__free_skip_array(
		    session, page->pg_row_ins, page->pg_row_entries + 1);

	/* Free the update array. */
	if (page->pg_row_upd != NULL)
		__free_update(session, page->pg_row_upd, page->pg_row_entries);
}

/*
 * __free_skip_array --
 *	Discard an array of skip list headers.
 */
static void
__free_skip_array(
    WT_SESSION_IMPL *session, WT_INSERT_HEAD **head_arg, uint32_t entries)
{
	WT_INSERT_HEAD **head;

	/*
	 * For each non-NULL slot in the page's array of inserts, free the
	 * linked list anchored in that slot.
	 */
	for (head = head_arg; entries > 0; --entries, ++head)
		if (*head != NULL) {
			__free_skip_list(session, WT_SKIP_FIRST(*head));
			__wt_free(session, *head);
		}

	/* Free the header array. */
	__wt_free(session, head_arg);
}

/*
 * __free_skip_list --
 *	Walk a WT_INSERT forward-linked list and free the per-thread combination
 * of a WT_INSERT structure and its associated chain of WT_UPDATE structures.
 */
static void
__free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins)
{
	WT_INSERT *next;

	/* Save the next pointer before freeing the current node. */
	for (; ins != NULL; ins = next) {
		__free_update_list(session, ins->upd);
		next = WT_SKIP_NEXT(ins);
		__wt_free(session, ins);
	}
}

/*
 * __free_update --
 *	Discard the update array.
 */
static void
__free_update(
    WT_SESSION_IMPL *session, WT_UPDATE **update_head, uint32_t entries)
{
	WT_UPDATE **updp;

	/*
	 * For each non-NULL slot in the page's array of updates, free the
	 * linked list anchored in that slot.
	 */
	for (updp = update_head; entries > 0; --entries, ++updp)
		if (*updp != NULL)
			__free_update_list(session, *updp);

	/* Free the update array. */
	__wt_free(session, update_head);
}

/*
 * __free_update_list --
 *	Walk a WT_UPDATE forward-linked list and free the per-thread combination
 * of a WT_UPDATE structure and its associated data.
 */
static void
__free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
	WT_UPDATE *next;

	for (; upd != NULL; upd = next) {
		/* Everything we free should be visible to everyone. 
*/ + WT_ASSERT(session, + F_ISSET(session, WT_SESSION_DISCARD_FORCE) || + upd->txnid == WT_TXN_ABORTED || + __wt_txn_visible_all(session, upd->txnid)); + + next = upd->next; + __wt_free(session, upd); + } +} diff --git a/src/third_party/wiredtiger/src/btree/bt_evict.c b/src/third_party/wiredtiger/src/btree/bt_evict.c new file mode 100644 index 00000000000..ff049553c7f --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_evict.c @@ -0,0 +1,1297 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +static int __evict_clear_walks(WT_SESSION_IMPL *); +static int __evict_has_work(WT_SESSION_IMPL *, uint32_t *); +static int __evict_lru(WT_SESSION_IMPL *, uint32_t); +static int __evict_lru_cmp(const void *, const void *); +static int __evict_lru_pages(WT_SESSION_IMPL *, int); +static int __evict_pass(WT_SESSION_IMPL *); +static int __evict_walk(WT_SESSION_IMPL *, uint32_t *, uint32_t); +static int __evict_walk_file(WT_SESSION_IMPL *, u_int *, uint32_t); +static void *__evict_worker(void *); + +/* + * __evict_read_gen -- + * Get the adjusted read generation for an eviction entry. + */ +static inline uint64_t +__evict_read_gen(const WT_EVICT_ENTRY *entry) +{ + WT_PAGE *page; + uint64_t read_gen; + + /* Never prioritize empty slots. */ + if (entry->ref == NULL) + return (UINT64_MAX); + + page = entry->ref->page; + read_gen = page->read_gen + entry->btree->evict_priority; + + /* + * Skew the read generation for internal pages, we prefer to evict leaf + * pages. + */ + if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) + read_gen += WT_EVICT_INT_SKEW; + + return (read_gen); +} + +/* + * __evict_lru_cmp -- + * Qsort function: sort the eviction array. + */ +static int +__evict_lru_cmp(const void *a, const void *b) +{ + uint64_t a_lru, b_lru; + + a_lru = __evict_read_gen(a); + b_lru = __evict_read_gen(b); + + return ((a_lru < b_lru) ? 
-1 : (a_lru == b_lru) ? 0 : 1);
}

/*
 * __evict_list_clear --
 *	Clear an entry in the LRU eviction list.
 */
static inline void
__evict_list_clear(WT_SESSION_IMPL *session, WT_EVICT_ENTRY *e)
{
	if (e->ref != NULL) {
		WT_ASSERT(session,
		    F_ISSET_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU));
		F_CLR_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU);
	}
	e->ref = NULL;
	/* Poison the btree pointer to catch use of a cleared entry. */
	e->btree = WT_DEBUG_POINT;
}

/*
 * __wt_evict_list_clear_page --
 *	Make sure a page is not in the LRU eviction list.  This called from the
 * page eviction code to make sure there is no attempt to evict a child
 * page multiple times.
 */
void
__wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_CACHE *cache;
	WT_EVICT_ENTRY *evict;
	uint32_t i, elem;

	/*
	 * The ref must be locked (or be the root) so the page can't be evicted
	 * out from under us while we search the queue.
	 */
	WT_ASSERT(session,
	    __wt_ref_is_root(ref) || ref->state == WT_REF_LOCKED);

	/* Fast path: if the page isn't on the queue, don't bother searching. */
	if (!F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU))
		return;

	cache = S2C(session)->cache;
	__wt_spin_lock(session, &cache->evict_lock);

	elem = cache->evict_max;
	for (i = 0, evict = cache->evict; i < elem; i++, evict++)
		if (evict->ref == ref) {
			__evict_list_clear(session, evict);
			break;
		}

	WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));

	__wt_spin_unlock(session, &cache->evict_lock);
}

/*
 * __wt_evict_server_wake --
 *	Wake the eviction server thread.
 */
int
__wt_evict_server_wake(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);
	cache = conn->cache;

	if (WT_VERBOSE_ISSET(session, WT_VERB_EVICTSERVER)) {
		uint64_t bytes_inuse, bytes_max;

		bytes_inuse = __wt_cache_bytes_inuse(cache);
		bytes_max = conn->cache_size;
		WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
		    "waking, bytes inuse %s max (%" PRIu64
		    "MB %s %" PRIu64 "MB)",
		    bytes_inuse <= bytes_max ? "<=" : ">",
		    bytes_inuse / WT_MEGABYTE,
		    bytes_inuse <= bytes_max ? "<=" : ">",
		    bytes_max / WT_MEGABYTE));
	}

	return (__wt_cond_signal(session, cache->evict_cond));
}

/*
 * __evict_server --
 *	Thread to evict pages from the cache.
 */
static void *
__evict_server(void *arg)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_EVICT_WORKER *worker;
	WT_SESSION_IMPL *session;

	session = arg;
	conn = S2C(session);
	cache = conn->cache;

	while (F_ISSET(conn, WT_CONN_EVICTION_RUN)) {
		/* Evict pages from the cache as needed. */
		WT_ERR(__evict_pass(session));

		if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
			break;

		/*
		 * If we have caught up and there are more than the minimum
		 * number of eviction workers running, shut one down.
		 */
		if (conn->evict_workers > conn->evict_workers_min) {
			WT_TRET(__wt_verbose(session, WT_VERB_EVICTSERVER,
			    "Stopping evict worker: %"PRIu32"\n",
			    conn->evict_workers));
			worker = &conn->evict_workctx[--conn->evict_workers];
			F_CLR(worker, WT_EVICT_WORKER_RUN);
			WT_TRET(__wt_cond_signal(
			    session, cache->evict_waiter_cond));
			WT_TRET(__wt_thread_join(session, worker->tid));
			/*
			 * Flag errors here with a message, but don't shut down
			 * the eviction server - that's fatal.
			 */
			WT_ASSERT(session, ret == 0);
			if (ret != 0) {
				(void)__wt_msg(session,
				    "Error stopping eviction worker: %d", ret);
				ret = 0;
			}
		}
		F_CLR(cache, WT_EVICT_ACTIVE);
		WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"));
		/* Don't rely on signals: check periodically. 
 */
		WT_ERR(__wt_cond_wait(session, cache->evict_cond, 100000));
		WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "waking"));
	}

	WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "exiting"));

err:
	if (ret != 0) {
		WT_PANIC_MSG(session, ret, "eviction server error");
		return (NULL);
	}

	/*
	 * Sanity-check the cache accounting on clean shutdown: everything
	 * brought into memory should have been evicted, and nothing should
	 * still be dirty.
	 */
	if (cache->pages_inmem != cache->pages_evict)
		__wt_errx(session,
		    "cache server: exiting with %" PRIu64 " pages in "
		    "memory and %" PRIu64 " pages evicted",
		    cache->pages_inmem, cache->pages_evict);
	if (cache->bytes_inmem != cache->bytes_evict)
		__wt_errx(session,
		    "cache server: exiting with %" PRIu64 " bytes in "
		    "memory and %" PRIu64 " bytes evicted",
		    cache->bytes_inmem, cache->bytes_evict);
	if (cache->bytes_dirty != 0 || cache->pages_dirty != 0)
		__wt_errx(session,
		    "cache server: exiting with %" PRIu64
		    " bytes dirty and %" PRIu64 " pages dirty",
		    cache->bytes_dirty, cache->pages_dirty);

	return (NULL);
}

/*
 * __wt_evict_create --
 *	Start the eviction server thread.
 */
int
__wt_evict_create(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_EVICT_WORKER *workers;
	u_int i;

	conn = S2C(session);

	/* Set first, the thread might run before we finish up. */
	F_SET(conn, WT_CONN_EVICTION_RUN);

	/* We need a session handle because we're reading/writing pages. */
	WT_RET(__wt_open_internal_session(
	    conn, "eviction-server", 0, 0, &conn->evict_session));
	session = conn->evict_session;

	/*
	 * If there's only a single eviction thread, it may be called upon to
	 * perform slow operations for the block manager.  (The flag is not
	 * reset if reconfigured later, but I doubt that's a problem.)
	 */
	if (conn->evict_workers_max == 0)
		F_SET(session, WT_SESSION_CAN_WAIT);

	if (conn->evict_workers_max > 0) {
		WT_RET(__wt_calloc_def(
		    session, conn->evict_workers_max, &workers));
		conn->evict_workctx = workers;

		/*
		 * Create sessions for all potential workers up front, but only
		 * start the minimum number of worker threads; more are started
		 * on demand by the eviction server.
		 */
		for (i = 0; i < conn->evict_workers_max; i++) {
			WT_RET(__wt_open_internal_session(conn,
			    "eviction-worker", 0, 0, &workers[i].session));
			workers[i].id = i;
			F_SET(workers[i].session, WT_SESSION_CAN_WAIT);

			if (i < conn->evict_workers_min) {
				++conn->evict_workers;
				F_SET(&workers[i], WT_EVICT_WORKER_RUN);
				WT_RET(__wt_thread_create(
				    workers[i].session, &workers[i].tid,
				    __evict_worker, &workers[i]));
			}
		}
	}

	/*
	 * Start the primary eviction server thread after the worker threads
	 * have started to avoid it starting additional worker threads before
	 * the worker's sessions are created.
	 */
	WT_RET(__wt_thread_create(
	    session, &conn->evict_tid, __evict_server, session));
	conn->evict_tid_set = 1;

	return (0);
}

/*
 * __wt_evict_destroy --
 *	Destroy the eviction server thread. 
 */
int
__wt_evict_destroy(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_EVICT_WORKER *workers;
	WT_SESSION *wt_session;
	u_int i;

	conn = S2C(session);
	cache = conn->cache;
	workers = conn->evict_workctx;

	/* Ask all eviction threads to exit, then wake and join them. */
	F_CLR(conn, WT_CONN_EVICTION_RUN);

	WT_TRET(__wt_verbose(
	    session, WT_VERB_EVICTSERVER, "waiting for helper threads"));
	for (i = 0; i < conn->evict_workers; i++) {
		WT_TRET(__wt_cond_signal(session, cache->evict_waiter_cond));
		WT_TRET(__wt_thread_join(session, workers[i].tid));
	}
	/* Handle shutdown when cleaning up after a failed open */
	if (conn->evict_workctx != NULL) {
		for (i = 0; i < conn->evict_workers_max; i++) {
			wt_session = &conn->evict_workctx[i].session->iface;
			WT_TRET(wt_session->close(wt_session, NULL));
		}
		__wt_free(session, conn->evict_workctx);
	}

	if (conn->evict_tid_set) {
		WT_TRET(__wt_evict_server_wake(session));
		WT_TRET(__wt_thread_join(session, conn->evict_tid));
		conn->evict_tid_set = 0;
	}

	if (conn->evict_session != NULL) {
		wt_session = &conn->evict_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));

		conn->evict_session = NULL;
	}

	return (ret);
}

/*
 * __evict_worker --
 *	Thread to help evict pages from the cache.
 */
static void *
__evict_worker(void *arg)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_EVICT_WORKER *worker;
	WT_SESSION_IMPL *session;
	uint32_t flags;

	worker = arg;
	session = worker->session;
	conn = S2C(session);
	cache = conn->cache;

	while (F_ISSET(conn, WT_CONN_EVICTION_RUN) &&
	    F_ISSET(worker, WT_EVICT_WORKER_RUN)) {
		/* Don't spin in a busy loop if there is no work to do */
		WT_ERR(__evict_has_work(session, &flags));
		if (flags == 0)
			WT_ERR(__wt_cond_wait(
			    session, cache->evict_waiter_cond, 10000));
		else
			WT_ERR(__evict_lru_pages(session, 1));
	}

	if (0) {
err:		__wt_err(session, ret, "cache eviction helper error");
	}

	WT_TRET(__wt_verbose(session, WT_VERB_EVICTSERVER, "helper exiting"));

	return (NULL);
}

/*
 * __evict_has_work --
 *	Find out if there is eviction work to be done.
 */
static int
__evict_has_work(WT_SESSION_IMPL *session, uint32_t *flagsp)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	uint32_t flags;
	uint64_t bytes_inuse, bytes_max, dirty_inuse;

	conn = S2C(session);
	cache = conn->cache;
	flags = 0;
	*flagsp = 0;

	if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
		return (0);

	/*
	 * Figure out whether the cache usage exceeds either the eviction
	 * target or the dirty target.
	 */
	bytes_inuse = __wt_cache_bytes_inuse(cache);
	dirty_inuse = cache->bytes_dirty;
	bytes_max = conn->cache_size;

	/* Check to see if the eviction server should run. */
	if (bytes_inuse > (cache->eviction_target * bytes_max) / 100)
		LF_SET(WT_EVICT_PASS_ALL);
	else if (dirty_inuse >
	    (cache->eviction_dirty_target * bytes_max) / 100)
		/* Ignore clean pages unless the cache is too large */
		LF_SET(WT_EVICT_PASS_DIRTY);

	if (F_ISSET(cache, WT_EVICT_STUCK))
		LF_SET(WT_EVICT_PASS_AGGRESSIVE);

	*flagsp = flags;
	return (0);
}

/*
 * __evict_pass --
 *	Evict pages from memory. 
 */
static int
__evict_pass(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_EVICT_WORKER *worker;
	int loop;
	uint32_t flags;
	uint64_t bytes_inuse;

	conn = S2C(session);
	cache = conn->cache;

	/* Evict pages from the cache. */
	for (loop = 0;; loop++) {
		/*
		 * If there is a request to clear eviction walks, do that now,
		 * before checking if the cache is full.
		 */
		if (F_ISSET(cache, WT_EVICT_CLEAR_WALKS)) {
			F_CLR(cache, WT_EVICT_CLEAR_WALKS);
			WT_RET(__evict_clear_walks(session));
			/* Signal threads waiting for the walk-clear. */
			WT_RET(__wt_cond_signal(
			    session, cache->evict_waiter_cond));
		}

		WT_RET(__evict_has_work(session, &flags));
		if (flags == 0)
			break;

		/* Escalate after repeated passes without reaching the goal. */
		if (loop > 10)
			LF_SET(WT_EVICT_PASS_AGGRESSIVE);

		bytes_inuse = __wt_cache_bytes_inuse(cache);
		/*
		 * When the cache is full, track whether pages are being
		 * evicted.  This will be cleared by the next thread to
		 * successfully evict a page.
		 */
		if (bytes_inuse > conn->cache_size) {
			F_SET(cache, WT_EVICT_NO_PROGRESS);
		} else
			F_CLR(cache, WT_EVICT_NO_PROGRESS);

		/* Start a worker if we have capacity and the cache is full. */
		if (bytes_inuse > conn->cache_size &&
		    conn->evict_workers < conn->evict_workers_max) {
			WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
			    "Starting evict worker: %"PRIu32"\n",
			    conn->evict_workers));
			worker = &conn->evict_workctx[conn->evict_workers++];
			F_SET(worker, WT_EVICT_WORKER_RUN);
			WT_RET(__wt_thread_create(session,
			    &worker->tid, __evict_worker, worker));
		}

		F_SET(cache, WT_EVICT_ACTIVE);
		WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
		    "Eviction pass with: Max: %" PRIu64
		    " In use: %" PRIu64 " Dirty: %" PRIu64,
		    conn->cache_size, bytes_inuse, cache->bytes_dirty));

		WT_RET(__evict_lru(session, flags));

		/*
		 * If we're making progress, keep going; if we're not making
		 * any progress at all, mark the cache "stuck" and go back to
		 * sleep, it's not something we can fix.
		 */
		if (F_ISSET(cache, WT_EVICT_NO_PROGRESS)) {
			if (F_ISSET(cache, WT_EVICT_STUCK))
				break;
			if (loop == 100) {
				F_SET(cache, WT_EVICT_STUCK);
				WT_STAT_FAST_CONN_INCR(
				    session, cache_eviction_slow);
				WT_RET(__wt_verbose(
				    session, WT_VERB_EVICTSERVER,
				    "unable to reach eviction goal"));
				break;
			}
		} else
			loop = 0;
	}
	return (0);
}

/*
 * __evict_clear_walks --
 *	Clear the eviction walk points for all files.
 */
static int
__evict_clear_walks(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	WT_REF *ref;

	conn = S2C(session);
	cache = conn->cache;
	cache->evict_file_next = NULL;

	/*
	 * Lock the dhandle list so sweeping cannot change the pointers out
	 * from under us.
	 *
	 * NOTE: we don't hold the schema lock, so we have to take care
	 * that the handles we see are open and valid.
	 */
	__wt_spin_lock(session, &conn->dhandle_lock);

	SLIST_FOREACH(dhandle, &conn->dhlh, l) {
		/* Ignore non-file handles, or handles that aren't open. */
		if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
		    !F_ISSET(dhandle, WT_DHANDLE_OPEN))
			continue;

		btree = dhandle->handle;
		session->dhandle = dhandle;
		if ((ref = btree->evict_ref) != NULL) {
			/*
			 * Clear evict_ref first, in case releasing it forces
			 * eviction (we assert that we never try to evict the
			 * current eviction walk point).
			 */
			btree->evict_ref = NULL;
			WT_TRET(__wt_page_release(session, ref, 0));
		}
		session->dhandle = NULL;
	}

	__wt_spin_unlock(session, &conn->dhandle_lock);

	return (ret);
}

/*
 * __evict_tree_walk_clear --
 *	Clear the tree's current eviction point, acquiring the eviction lock. 
 */
static int
__evict_tree_walk_clear(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_DECL_RET;

	btree = S2BT(session);
	cache = S2C(session)->cache;

	/*
	 * Ask the eviction server to clear walk points and wait until this
	 * tree's walk point has actually been released.
	 */
	while (btree->evict_ref != NULL) {
		F_SET(cache, WT_EVICT_CLEAR_WALKS);
		WT_RET(__wt_cond_wait(
		    session, cache->evict_waiter_cond, 100000));
	}

	return (ret);
}

/*
 * __wt_evict_page --
 *	Evict a given page.
 */
int
__wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_ISOLATION saved_iso;

	/*
	 * We have to take care when evicting pages not to write a change that:
	 *  (a) is not yet committed; or
	 *  (b) is committed more recently than an in-progress checkpoint.
	 *
	 * We handle both of these cases by setting up the transaction context
	 * before evicting, using a special "eviction" isolation level, where
	 * only globally visible updates can be evicted.
	 */
	__wt_txn_update_oldest(session);
	txn = &session->txn;
	saved_iso = txn->isolation;
	txn->isolation = TXN_ISO_EVICTION;

	/*
	 * Sanity check: if a transaction has updates, its updates should not
	 * be visible to eviction.
	 */
	WT_ASSERT(session,
	    !F_ISSET(txn, TXN_HAS_ID) || !__wt_txn_visible(session, txn->id));

	ret = __wt_rec_evict(session, ref, 0);
	/* Always restore the caller's isolation level. */
	txn->isolation = saved_iso;

	return (ret);
}

/*
 * __wt_evict_file_exclusive_on --
 *	Get exclusive eviction access to a file and discard any of the file's
 * blocks queued for eviction.
 */
int
__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_EVICT_ENTRY *evict;
	u_int i, elem;

	btree = S2BT(session);
	cache = S2C(session)->cache;

	/*
	 * Hold the walk lock to set the "no eviction" flag: no new pages from
	 * the file will be queued for eviction after this point.
	 */
	__wt_spin_lock(session, &cache->evict_walk_lock);
	F_SET(btree, WT_BTREE_NO_EVICTION);
	__wt_spin_unlock(session, &cache->evict_walk_lock);

	/* Clear any existing LRU eviction walk for the file. */
	WT_RET(__evict_tree_walk_clear(session));

	/* Hold the evict lock to remove any queued pages from this file. */
	__wt_spin_lock(session, &cache->evict_lock);

	/*
	 * The eviction candidate list might reference pages from the file,
	 * clear it.
	 */
	elem = cache->evict_max;
	for (i = 0, evict = cache->evict; i < elem; i++, evict++)
		if (evict->btree == btree)
			__evict_list_clear(session, evict);
	__wt_spin_unlock(session, &cache->evict_lock);

	/*
	 * We have disabled further eviction: wait for concurrent LRU eviction
	 * activity to drain.
	 */
	while (btree->evict_busy > 0)
		__wt_yield();

	return (0);
}

/*
 * __wt_evict_file_exclusive_off --
 *	Release exclusive eviction access to a file.
 */
void
__wt_evict_file_exclusive_off(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;

	btree = S2BT(session);

	WT_ASSERT(session, btree->evict_ref == NULL);

	F_CLR(btree, WT_BTREE_NO_EVICTION);
}

/*
 * __evict_lru_pages --
 *	Get pages from the LRU queue to evict.
 */
static int
__evict_lru_pages(WT_SESSION_IMPL *session, int is_app)
{
	WT_DECL_RET;

	/*
	 * Reconcile and discard some pages: EBUSY is returned if a page fails
	 * eviction because it's unavailable, continue in that case.
	 */
	while ((ret = __wt_evict_lru_page(session, is_app)) == 0 ||
	    ret == EBUSY)
		;
	/* An empty queue (WT_NOTFOUND) is not an error. */
	return (ret == WT_NOTFOUND ? 0 : ret);
}

/*
 * __evict_lru --
 *	Evict pages from the cache based on their read generation.
 */
static int
__evict_lru(WT_SESSION_IMPL *session, uint32_t flags)
{
	WT_CACHE *cache;
	WT_EVICT_ENTRY *evict;
	uint64_t cutoff;
	uint32_t candidates, entries, i;

	cache = S2C(session)->cache;

	/* Get some more pages to consider for eviction. 
 */
	WT_RET(__evict_walk(session, &entries, flags));

	/* Sort the list into LRU order and restart. */
	__wt_spin_lock(session, &cache->evict_lock);

	qsort(cache->evict,
	    entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);

	/* Trim empty slots: cleared refs sort to the end of the array. */
	while (entries > 0 && cache->evict[entries - 1].ref == NULL)
		--entries;

	cache->evict_entries = entries;

	if (entries == 0) {
		/*
		 * If there are no entries, there cannot be any candidates.
		 * Make sure application threads don't read past the end of the
		 * candidate list, or they may race with the next walk.
		 */
		cache->evict_candidates = 0;
		cache->evict_current = NULL;
		__wt_spin_unlock(session, &cache->evict_lock);
		return (0);
	}

	WT_ASSERT(session, cache->evict[0].ref != NULL);

	/* Find the bottom 25% of read generations. */
	cutoff = (3 * __evict_read_gen(&cache->evict[0]) +
	    __evict_read_gen(&cache->evict[entries - 1])) / 4;

	/*
	 * Don't take less than 10% or more than 50% of entries, regardless.
	 * That said, if there is only one entry, which is normal when
	 * populating an empty file, don't exclude it.
	 */
	for (candidates = 1 + entries / 10;
	    candidates < entries / 2;
	    candidates++)
		if (__evict_read_gen(&cache->evict[candidates]) > cutoff)
			break;
	cache->evict_candidates = candidates;

	/* If we have more than the minimum number of entries, clear them. */
	if (cache->evict_entries > WT_EVICT_WALK_BASE) {
		for (i = WT_EVICT_WALK_BASE, evict = cache->evict + i;
		    i < cache->evict_entries;
		    i++, evict++)
			__evict_list_clear(session, evict);
		cache->evict_entries = WT_EVICT_WALK_BASE;
	}

	cache->evict_current = cache->evict;
	__wt_spin_unlock(session, &cache->evict_lock);

	/*
	 * The eviction server thread doesn't do any actual eviction if there
	 * are multiple eviction workers running.
	 */
	WT_RET(__wt_cond_signal(session, cache->evict_waiter_cond));

	if (S2C(session)->evict_workers > 1) {
		WT_STAT_FAST_CONN_INCR(
		    session, cache_eviction_server_not_evicting);
		/*
		 * If there are candidates queued, give other threads a chance
		 * to access them before gathering more.
		 */
		if (candidates > 10 && cache->evict_current != NULL)
			__wt_yield();
	} else {
		WT_STAT_FAST_CONN_INCR(session, cache_eviction_server_evicting);
		WT_RET(__evict_lru_pages(session, 0));
	}

	return (0);
}

/*
 * __evict_walk --
 *	Fill in the array by walking the next set of pages.
 */
static int
__evict_walk(WT_SESSION_IMPL *session, u_int *entriesp, uint32_t flags)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	u_int max_entries, old_slot, retries, slot;

	conn = S2C(session);
	cache = S2C(session)->cache;
	retries = 0;

	/* Increment the shared read generation. */
	__wt_cache_read_gen_incr(session);

	/*
	 * Update the oldest ID: we use it to decide whether pages are
	 * candidates for eviction.  Without this, if all threads are blocked
	 * after a long-running transaction (such as a checkpoint) completes,
	 * we may never start evicting again.
	 */
	__wt_txn_update_oldest(session);

	/*
	 * Set the starting slot in the queue and the maximum pages added
	 * per walk.
	 */
	slot = cache->evict_entries;
	max_entries = slot + WT_EVICT_WALK_INCR;
	if (cache->evict_current == NULL)
		WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty);
	else
		WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_not_empty);

	/*
	 * Lock the dhandle list so sweeping cannot change the pointers out
	 * from under us.
	 *
	 * NOTE: we don't hold the schema lock, so we have to take care
	 * that the handles we see are open and valid. 
 */
	__wt_spin_lock(session, &conn->dhandle_lock);

retry:	SLIST_FOREACH(dhandle, &conn->dhlh, l) {
		/* Ignore non-file handles, or handles that aren't open. */
		if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
		    !F_ISSET(dhandle, WT_DHANDLE_OPEN))
			continue;

		/*
		 * Each time we reenter this function, start at the next handle
		 * on the list.
		 */
		if (cache->evict_file_next != NULL &&
		    cache->evict_file_next != dhandle)
			continue;
		cache->evict_file_next = NULL;

		/* Skip files that don't allow eviction. */
		btree = dhandle->handle;
		if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
			continue;

		/*
		 * Also skip files that are configured to stick in cache until
		 * we get aggressive.
		 */
		if (btree->evict_priority != 0 &&
		    !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE))
			continue;

		/*
		 * If we are filling the queue, skip files that haven't been
		 * useful in the past.
		 */
		if (btree->evict_walk_period != 0 &&
		    cache->evict_entries >= WT_EVICT_WALK_INCR &&
		    btree->evict_walk_skips++ < btree->evict_walk_period)
			continue;
		btree->evict_walk_skips = 0;
		old_slot = slot;

		__wt_spin_lock(session, &cache->evict_walk_lock);

		/*
		 * Re-check the "no eviction" flag -- it is used to enforce
		 * exclusive access when a handle is being closed.
		 */
		if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
			WT_WITH_BTREE(session, btree,
			    ret = __evict_walk_file(session, &slot, flags));

		__wt_spin_unlock(session, &cache->evict_walk_lock);

		/*
		 * If we didn't find enough candidates in the file, skip it
		 * next time (exponential back-off, capped at 1000 visits).
		 */
		if (slot >= old_slot + WT_EVICT_WALK_PER_FILE ||
		    slot >= max_entries)
			btree->evict_walk_period = 0;
		else
			btree->evict_walk_period = WT_MIN(
			    WT_MAX(1, 2 * btree->evict_walk_period), 1000);

		if (ret != 0 || slot >= max_entries)
			break;
	}

	/* Walk the list of files a few times if we don't find enough pages. */
	if (ret == 0 && slot < max_entries && ++retries < 10)
		goto retry;

	/* Remember the file we should visit first, next loop. */
	if (dhandle != NULL)
		dhandle = SLIST_NEXT(dhandle, l);
	cache->evict_file_next = dhandle;

	__wt_spin_unlock(session, &conn->dhandle_lock);

	*entriesp = slot;
	return (ret);
}

/*
 * __evict_init_candidate --
 *	Initialize a WT_EVICT_ENTRY structure with a given page.
 */
static void
__evict_init_candidate(
    WT_SESSION_IMPL *session, WT_EVICT_ENTRY *evict, WT_REF *ref)
{
	WT_CACHE *cache;
	u_int slot;

	cache = S2C(session)->cache;

	/* Keep track of the maximum slot we are using. */
	slot = (u_int)(evict - cache->evict);
	if (slot >= cache->evict_max)
		cache->evict_max = slot + 1;

	if (evict->ref != NULL)
		__evict_list_clear(session, evict);
	evict->ref = ref;
	evict->btree = S2BT(session);

	/* Mark the page on the list */
	F_SET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU);
}

/*
 * __evict_walk_file --
 *	Get a few page eviction candidates from a single underlying file.
 */
static int
__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_EVICT_ENTRY *end, *evict, *start;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	uint64_t pages_walked;
	uint32_t walk_flags;
	int internal_pages, modified, restarts;

	btree = S2BT(session);
	cache = S2C(session)->cache;
	start = cache->evict + *slotp;
	end = WT_MIN(start + WT_EVICT_WALK_PER_FILE,
	    cache->evict + cache->evict_slots);

	walk_flags =
	    WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;

	/*
	 * Get some more eviction candidate pages.  The walk resumes from the
	 * tree's saved position (btree->evict_ref) and advances at the bottom
	 * of the loop.
	 */
	for (evict = start, pages_walked = 0, internal_pages = restarts = 0;
	    evict < end && (ret == 0 || ret == WT_NOTFOUND);
	    ret = __wt_tree_walk(session, &btree->evict_ref, walk_flags),
	    ++pages_walked) {
		if (btree->evict_ref == NULL) {
			/*
			 * Take care with terminating this loop. 
+ * + * Don't make an extra call to __wt_tree_walk: that will + * leave a page pinned, which may prevent any work from + * being done. + */ + if (++restarts == 2) + break; + continue; + } + + /* Ignore root pages entirely. */ + if (__wt_ref_is_root(btree->evict_ref)) + continue; + page = btree->evict_ref->page; + + /* + * Use the EVICT_LRU flag to avoid putting pages onto the list + * multiple times. + */ + if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) + continue; + + /* Limit internal pages to 50% unless we get aggressive. */ + if ((page->type == WT_PAGE_COL_INT || + page->type == WT_PAGE_ROW_INT) && + ++internal_pages > WT_EVICT_WALK_PER_FILE / 2 && + !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE)) + break; + + /* + * If this page has never been considered for eviction, + * set its read generation to a little bit in the + * future and move on, give readers a chance to start + * updating the read generation. + */ + if (page->read_gen == WT_READGEN_NOTSET) { + page->read_gen = __wt_cache_read_gen_set(session); + continue; + } + + /* + * If the file is being checkpointed, there's a period of time + * where we can't discard dirty pages because of possible races + * with the checkpointing thread. + */ + modified = __wt_page_is_modified(page); + if (modified && btree->checkpointing) + continue; + + /* Optionally ignore clean pages. */ + if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY)) + continue; + + /* + * If the page is clean but has modifications that appear too + * new to evict, skip it. + */ + mod = page->modify; + if (!modified && mod != NULL && + !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE) && + !__wt_txn_visible_all(session, mod->rec_max_txn)) + continue; + + /* + * If the oldest transaction hasn't changed since the + * last time this page was written, it's unlikely that + * we can make progress. Similarly, if the most recent + * update on the page is not yet globally visible, + * eviction will fail. These heuristics attempt to + * avoid repeated attempts to evict the same page. 
+ * + * That said, if eviction is stuck, or the file is + * being checkpointed, try anyway: maybe a transaction + * that was running last time we wrote the page has + * since rolled back, or we can help get the checkpoint + * completed sooner. + */ + if (modified && !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE) && + !btree->checkpointing && + (mod->disk_snap_min == S2C(session)->txn_global.oldest_id || + !__wt_txn_visible_all(session, mod->update_txn))) + continue; + + WT_ASSERT(session, evict->ref == NULL); + __evict_init_candidate(session, evict, btree->evict_ref); + ++evict; + + WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, + "select: %p, size %" PRIu64, page, page->memory_footprint)); + } + + /* If the walk was interrupted by a locked page, that's okay. */ + if (ret == WT_NOTFOUND) + ret = 0; + + *slotp += (u_int)(evict - start); + WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, pages_walked); + return (ret); +} + +/* + * __evict_get_ref -- + * Get a page for eviction. + */ +static int +__evict_get_ref( + WT_SESSION_IMPL *session, int is_app, WT_BTREE **btreep, WT_REF **refp) +{ + WT_CACHE *cache; + WT_EVICT_ENTRY *evict; + uint32_t candidates; + WT_DECL_SPINLOCK_ID(id); /* Must appear last */ + + cache = S2C(session)->cache; + *btreep = NULL; + *refp = NULL; + + /* + * A pathological case: if we're the oldest transaction in the system + * and the eviction server is stuck trying to find space, abort the + * transaction to give up all hazard pointers before trying again. + */ + if (is_app && F_ISSET(cache, WT_EVICT_STUCK) && + __wt_txn_am_oldest(session)) { + F_CLR(cache, WT_EVICT_STUCK); + WT_STAT_FAST_CONN_INCR(session, txn_fail_cache); + return (WT_ROLLBACK); + } + + /* + * Avoid the LRU lock if no pages are available. If there are pages + * available, spin until we get the lock. If this function returns + * without getting a page to evict, application threads assume there + * are no more pages available and will attempt to wake the eviction + * server. 
+ */ + for (;;) { + if (cache->evict_current == NULL) + return (WT_NOTFOUND); + if (__wt_spin_trylock(session, &cache->evict_lock, &id) == 0) + break; + __wt_yield(); + } + + /* + * The eviction server only tries to evict half of the pages before + * looking for more. + */ + candidates = cache->evict_candidates; + if (!is_app && candidates > 1) + candidates /= 2; + + /* Get the next page queued for eviction. */ + while ((evict = cache->evict_current) != NULL && + evict < cache->evict + candidates && evict->ref != NULL) { + WT_ASSERT(session, evict->btree != NULL); + + /* Move to the next item. */ + ++cache->evict_current; + + /* + * Lock the page while holding the eviction mutex to prevent + * multiple attempts to evict it. For pages that are already + * being evicted, this operation will fail and we will move on. + */ + if (!WT_ATOMIC_CAS4( + evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) { + __evict_list_clear(session, evict); + continue; + } + + /* + * Increment the busy count in the btree handle to prevent it + * from being closed under us. + */ + (void)WT_ATOMIC_ADD4(evict->btree->evict_busy, 1); + + *btreep = evict->btree; + *refp = evict->ref; + + /* + * Remove the entry so we never try to reconcile the same page + * on reconciliation error. + */ + __evict_list_clear(session, evict); + break; + } + + /* Clear the current pointer if there are no more candidates. */ + if (evict >= cache->evict + cache->evict_candidates) + cache->evict_current = NULL; + __wt_spin_unlock(session, &cache->evict_lock); + + return ((*refp == NULL) ? WT_NOTFOUND : 0); +} + +/* + * __wt_evict_lru_page -- + * Called by both eviction and application threads to evict a page. 
+ */
+int
+__wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app)
+{
+	WT_BTREE *btree;
+	WT_CACHE *cache;
+	WT_DECL_RET;
+	WT_PAGE *page;
+	WT_REF *ref;
+
+	WT_RET(__evict_get_ref(session, is_app, &btree, &ref));
+	WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+
+	/*
+	 * In case something goes wrong, don't pick the same set of pages every
+	 * time.
+	 *
+	 * We used to bump the page's read generation only if eviction failed,
+	 * but that isn't safe: at that point, eviction has already unlocked
+	 * the page and some other thread may have evicted it by the time we
+	 * look at it.
+	 */
+	page = ref->page;
+	if (page->read_gen != WT_READGEN_OLDEST)
+		page->read_gen = __wt_cache_read_gen_set(session);
+
+	WT_WITH_BTREE(session, btree, ret = __wt_evict_page(session, ref));
+
+	/*
+	 * Release the handle-busy count taken in __evict_get_ref, whether or
+	 * not the eviction attempt succeeded, before checking for error.
+	 */
+	(void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
+
+	WT_RET(ret);
+
+	/* A successful eviction is progress: clear any stuck/no-progress state. */
+	cache = S2C(session)->cache;
+	if (F_ISSET(cache, WT_EVICT_NO_PROGRESS | WT_EVICT_STUCK))
+		F_CLR(cache, WT_EVICT_NO_PROGRESS | WT_EVICT_STUCK);
+
+	return (ret);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_cache_dump --
+ *	Dump debugging information to stdout about the size of the files in the
+ * cache.
+ *
+ * NOTE: this function is not called anywhere, it is intended to be called
+ * from a debugger.
+ */
+void
+__wt_cache_dump(WT_SESSION_IMPL *session)
+{
+	WT_BTREE *btree;
+	WT_CONNECTION_IMPL *conn;
+	WT_DATA_HANDLE *dhandle;
+	WT_REF *next_walk;
+	WT_PAGE *page;
+	uint64_t file_intl_pages, file_leaf_pages;
+	uint64_t file_bytes, file_dirty, total_bytes;
+
+	conn = S2C(session);
+	total_bytes = 0;
+
+	SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+		if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
+		    !F_ISSET(dhandle, WT_DHANDLE_OPEN))
+			continue;
+
+		btree = dhandle->handle;
+		if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+			continue;
+
+		file_bytes = file_dirty = file_intl_pages = file_leaf_pages = 0;
+		next_walk = NULL;
+		/* Point the session at this handle for the walk; restored below. */
+		session->dhandle = dhandle;
+		while (__wt_tree_walk(session,
+		    &next_walk, WT_READ_CACHE | WT_READ_NO_WAIT) == 0 &&
+		    next_walk != NULL) {
+			page = next_walk->page;
+			if (page->type == WT_PAGE_COL_INT ||
+			    page->type == WT_PAGE_ROW_INT)
+				++file_intl_pages;
+			else
+				++file_leaf_pages;
+			file_bytes += page->memory_footprint;
+			if (__wt_page_is_modified(page))
+				file_dirty += page->memory_footprint;
+		}
+		session->dhandle = NULL;
+
+		printf("cache dump: %s [%s]:"
+		    " %" PRIu64 " intl pages, %" PRIu64 " leaf pages,"
+		    " %" PRIu64 "MB, %" PRIu64 "MB dirty\n",
+		    dhandle->name, dhandle->checkpoint,
+		    file_intl_pages, file_leaf_pages,
+		    file_bytes >> 20, file_dirty >> 20);
+
+		total_bytes += file_bytes;
+	}
+	printf("cache dump: total found = %" PRIu64 "MB"
+	    " vs tracked inuse %" PRIu64 "MB\n",
+	    total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20);
+	fflush(stdout);
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
new file mode 100644
index 00000000000..a21d6d277d3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -0,0 +1,770 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt);
+static int __btree_get_last_recno(WT_SESSION_IMPL *);
+static int __btree_page_sizes(WT_SESSION_IMPL *);
+static int __btree_preload(WT_SESSION_IMPL *);
+static int __btree_tree_open_empty(WT_SESSION_IMPL *, int, int);
+
+static int pse1(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t);
+static int pse2(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t, int);
+
+/*
+ * __wt_btree_open --
+ *	Open a Btree.
+ */
+int
+__wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
+{
+	WT_BM *bm;
+	WT_BTREE *btree;
+	WT_CKPT ckpt;
+	WT_CONFIG_ITEM cval;
+	WT_DATA_HANDLE *dhandle;
+	WT_DECL_RET;
+	size_t root_addr_size;
+	uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
+	int creation, forced_salvage, readonly;
+	const char *filename;
+
+	dhandle = session->dhandle;
+	btree = S2BT(session);
+
+	/* Checkpoint files are readonly. */
+	readonly = dhandle->checkpoint == NULL ? 0 : 1;
+
+	/* Get the checkpoint information for this name/checkpoint pair. */
+	WT_CLEAR(ckpt);
+	WT_RET(__wt_meta_checkpoint(
+	    session, dhandle->name, dhandle->checkpoint, &ckpt));
+
+	/*
+	 * Bulk-load is only permitted on newly created files, not any empty
+	 * file -- see the checkpoint code for a discussion.
+	 */
+	creation = ckpt.raw.size == 0;
+	if (!creation && F_ISSET(btree, WT_BTREE_BULK))
+		WT_ERR_MSG(session, EINVAL,
+		    "bulk-load is only supported on newly created objects");
+
+	/* Handle salvage configuration. */
+	forced_salvage = 0;
+	if (F_ISSET(btree, WT_BTREE_SALVAGE)) {
+		WT_ERR(__wt_config_gets(session, op_cfg, "force", &cval));
+		forced_salvage = (cval.val != 0);
+	}
+
+	/* Initialize and configure the WT_BTREE structure. */
+	WT_ERR(__btree_conf(session, &ckpt));
+
+	/* Connect to the underlying block manager.
+	 */
+	filename = dhandle->name;
+	if (!WT_PREFIX_SKIP(filename, "file:"))
+		WT_ERR_MSG(session, EINVAL, "expected a 'file:' URI");
+
+	WT_ERR(__wt_block_manager_open(session, filename, dhandle->cfg,
+	    forced_salvage, readonly, btree->allocsize, &btree->bm));
+	bm = btree->bm;
+
+	/*
+	 * !!!
+	 * As part of block-manager configuration, we need to return the maximum
+	 * sized address cookie that a block manager will ever return.  There's
+	 * a limit of WT_BTREE_MAX_ADDR_COOKIE, but at 255B, it's too large for
+	 * a Btree with 512B internal pages.  The default block manager packs
+	 * a wt_off_t and 2 uint32_t's into its cookie, so there's no problem
+	 * now, but when we create a block manager extension API, we need some
+	 * way to consider the block manager's maximum cookie size versus the
+	 * minimum Btree internal node size.
+	 */
+	btree->block_header = bm->block_header(bm);
+
+	/*
+	 * Open the specified checkpoint unless it's a special command (special
+	 * commands are responsible for loading their own checkpoints, if any).
+	 */
+	if (!F_ISSET(btree,
+	    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
+		/*
+		 * There are two reasons to load an empty tree rather than a
+		 * checkpoint: either there is no checkpoint (the file is
+		 * being created), or the load call returns no root page (the
+		 * checkpoint is for an empty file).
+		 */
+		WT_ERR(bm->checkpoint_load(bm, session,
+		    ckpt.raw.data, ckpt.raw.size,
+		    root_addr, &root_addr_size, readonly));
+		if (creation || root_addr_size == 0)
+			WT_ERR(__btree_tree_open_empty(
+			    session, creation, readonly));
+		else {
+			WT_ERR(__wt_btree_tree_open(
+			    session, root_addr, root_addr_size));
+
+			/* Warm the cache, if possible. */
+			WT_ERR(__btree_preload(session));
+
+			/* Get the last record number in a column-store file.
+			 */
+			if (btree->type != BTREE_ROW)
+				WT_ERR(__btree_get_last_recno(session));
+		}
+	}
+
+	/*
+	 * Error path: the "if (0)" wrapper ensures the close is only executed
+	 * by a goto to the err label; the checkpoint is freed on both paths.
+	 */
+	if (0) {
+err:		WT_TRET(__wt_btree_close(session));
+	}
+	__wt_meta_checkpoint_free(session, &ckpt);
+
+	return (ret);
+}
+
+/*
+ * __wt_btree_close --
+ *	Close a Btree.
+ */
+int
+__wt_btree_close(WT_SESSION_IMPL *session)
+{
+	WT_BM *bm;
+	WT_BTREE *btree;
+	WT_DATA_HANDLE *dhandle;
+	WT_DECL_RET;
+
+	dhandle = session->dhandle;
+	btree = S2BT(session);
+
+	if ((bm = btree->bm) != NULL) {
+		/* Unload the checkpoint, unless it's a special command. */
+		if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+		    !F_ISSET(btree,
+		    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
+			WT_TRET(bm->checkpoint_unload(bm, session));
+
+		/* Close the underlying block manager reference. */
+		WT_TRET(bm->close(bm, session));
+
+		btree->bm = NULL;
+	}
+
+	/* Close the Huffman tree. */
+	__wt_btree_huffman_close(session);
+
+	/* Destroy locks. */
+	WT_TRET(__wt_rwlock_destroy(session, &btree->ovfl_lock));
+	__wt_spin_destroy(session, &btree->flush_lock);
+
+	/* Free allocated memory. */
+	__wt_free(session, btree->key_format);
+	__wt_free(session, btree->value_format);
+
+	/* Terminate an owned collator before dropping the reference. */
+	if (btree->collator_owned) {
+		if (btree->collator->terminate != NULL)
+			WT_TRET(btree->collator->terminate(
+			    btree->collator, &session->iface));
+		btree->collator_owned = 0;
+	}
+	btree->collator = NULL;
+
+	btree->bulk_load_ok = 0;
+
+	return (ret);
+}
+
+/*
+ * __btree_conf --
+ *	Configure a WT_BTREE structure.
+ */
+static int
+__btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
+{
+	WT_BTREE *btree;
+	WT_CONFIG_ITEM cval;
+	WT_CONNECTION_IMPL *conn;
+	WT_NAMED_COMPRESSOR *ncomp;
+	int64_t maj_version, min_version;
+	uint32_t bitcnt;
+	int fixed;
+	const char **cfg;
+
+	btree = S2BT(session);
+	conn = S2C(session);
+	cfg = btree->dhandle->cfg;
+
+	/* Dump out format information.
+ */
+	if (WT_VERBOSE_ISSET(session, WT_VERB_VERSION)) {
+		WT_RET(__wt_config_gets(session, cfg, "version.major", &cval));
+		maj_version = cval.val;
+		WT_RET(__wt_config_gets(session, cfg, "version.minor", &cval));
+		min_version = cval.val;
+		/*
+		 * NOTE(review): maj_version/min_version are int64_t but are
+		 * formatted with PRIu64; PRId64 would match the signedness --
+		 * confirm intended.
+		 */
+		WT_RET(__wt_verbose(session, WT_VERB_VERSION,
+		    "%" PRIu64 ".%" PRIu64, maj_version, min_version));
+	}
+
+	/* Get the file ID. */
+	WT_RET(__wt_config_gets(session, cfg, "id", &cval));
+	btree->id = (uint32_t)cval.val;
+
+	/* Validate file types and check the data format plan. */
+	WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
+	WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
+	if (WT_STRING_MATCH("r", cval.str, cval.len))
+		btree->type = BTREE_COL_VAR;
+	else
+		btree->type = BTREE_ROW;
+	WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->key_format));
+
+	WT_RET(__wt_config_gets(session, cfg, "value_format", &cval));
+	WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
+	WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->value_format));
+
+	/* Row-store key comparison and key gap for prefix compression. */
+	if (btree->type == BTREE_ROW) {
+		WT_RET(__wt_collator_config(
+		    session, cfg, &btree->collator, &btree->collator_owned));
+
+		WT_RET(__wt_config_gets(session, cfg, "key_gap", &cval));
+		btree->key_gap = (uint32_t)cval.val;
+	}
+
+	/*
+	 * Column-store: check for fixed-size data.  At this point cval still
+	 * holds the value_format read above (the row-store branch that would
+	 * overwrite it is mutually exclusive with this one).
+	 */
+	if (btree->type == BTREE_COL_VAR) {
+		WT_RET(__wt_struct_check(
+		    session, cval.str, cval.len, &fixed, &bitcnt));
+		if (fixed) {
+			if (bitcnt == 0 || bitcnt > 8)
+				WT_RET_MSG(session, EINVAL,
+				    "fixed-width field sizes must be greater "
+				    "than 0 and less than or equal to 8");
+			btree->bitcnt = (uint8_t)bitcnt;
+			btree->type = BTREE_COL_FIX;
+		}
+	}
+
+	/* Page sizes */
+	WT_RET(__btree_page_sizes(session));
+
+	/* Eviction; the metadata file is never evicted.
+	 */
+	if (WT_IS_METADATA(btree->dhandle))
+		F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD);
+	else {
+		WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
+		if (cval.val)
+			F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD);
+		else
+			F_CLR(btree, WT_BTREE_NO_EVICTION);
+	}
+
+	/* Checksums */
+	WT_RET(__wt_config_gets(session, cfg, "checksum", &cval));
+	if (WT_STRING_MATCH("on", cval.str, cval.len))
+		btree->checksum = CKSUM_ON;
+	else if (WT_STRING_MATCH("off", cval.str, cval.len))
+		btree->checksum = CKSUM_OFF;
+	else
+		btree->checksum = CKSUM_UNCOMPRESSED;
+
+	/* Huffman encoding */
+	WT_RET(__wt_btree_huffman_open(session));
+
+	/*
+	 * Reconciliation configuration:
+	 *	Block compression (all)
+	 *	Dictionary compression (variable-length column-store, row-store)
+	 *	Page-split percentage
+	 *	Prefix compression (row-store)
+	 *	Suffix compression (row-store)
+	 */
+	switch (btree->type) {
+	case BTREE_COL_FIX:
+		break;
+	case BTREE_ROW:
+		WT_RET(__wt_config_gets(
+		    session, cfg, "internal_key_truncate", &cval));
+		btree->internal_key_truncate = cval.val == 0 ? 0 : 1;
+
+		WT_RET(__wt_config_gets(
+		    session, cfg, "prefix_compression", &cval));
+		btree->prefix_compression = cval.val == 0 ? 0 : 1;
+		WT_RET(__wt_config_gets(
+		    session, cfg, "prefix_compression_min", &cval));
+		btree->prefix_compression_min = (u_int)cval.val;
+		/* FALLTHROUGH */
+	case BTREE_COL_VAR:
+		WT_RET(__wt_config_gets(session, cfg, "dictionary", &cval));
+		btree->dictionary = (u_int)cval.val;
+		break;
+	}
+
+	WT_RET(__wt_config_gets(session, cfg, "block_compressor", &cval));
+	if (cval.len > 0) {
+		TAILQ_FOREACH(ncomp, &conn->compqh, q)
+			if (WT_STRING_MATCH(ncomp->name, cval.str, cval.len)) {
+				btree->compressor = ncomp->compressor;
+				break;
+			}
+		if (btree->compressor == NULL)
+			WT_RET_MSG(session, EINVAL,
+			    "unknown block compressor '%.*s'",
+			    (int)cval.len, cval.str);
+	}
+
+	/* Initialize locks.
+	 */
+	WT_RET(__wt_rwlock_alloc(
+	    session, &btree->ovfl_lock, "btree overflow lock"));
+	WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush lock"));
+
+	__wt_stat_init_dsrc_stats(&btree->dhandle->stats);
+
+	btree->write_gen = ckpt->write_gen;		/* Write generation */
+	btree->modified = 0;				/* Clean */
+
+	return (0);
+}
+
+/*
+ * __wt_root_ref_init --
+ *	Initialize a tree root reference, and link in the root page.
+ */
+void
+__wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno)
+{
+	memset(root_ref, 0, sizeof(*root_ref));
+
+	root_ref->page = root;
+	root_ref->state = WT_REF_MEM;
+
+	/* Column-store record numbers start at 1; row stores leave it 0. */
+	root_ref->key.recno = is_recno ? 1 : 0;
+
+	root->pg_intl_parent_ref = root_ref;
+}
+
+/*
+ * __wt_btree_tree_open --
+ *	Read in a tree from disk.
+ */
+int
+__wt_btree_tree_open(
+    WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_ITEM dsk;
+	WT_PAGE *page;
+
+	btree = S2BT(session);
+
+	/*
+	 * A buffer into which we read a root page; don't use a scratch buffer,
+	 * the buffer's allocated memory becomes the persistent in-memory page.
+	 */
+	WT_CLEAR(dsk);
+
+	/* Read the page, then build the in-memory version of the page. */
+	WT_ERR(__wt_bt_read(session, &dsk, addr, addr_size));
+	WT_ERR(__wt_page_inmem(session, NULL, dsk.data,
+	    WT_DATA_IN_ITEM(&dsk) ?
+	    WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED , &page));
+
+	/* Finish initializing the root, root reference links. */
+	__wt_root_ref_init(&btree->root, page, btree->type != BTREE_ROW);
+
+	/* On success the page owns the buffer; only free it on error. */
+	if (0) {
+err:		__wt_buf_free(session, &dsk);
+	}
+	return (ret);
+}
+
+/*
+ * __btree_tree_open_empty --
+ *	Create an empty in-memory tree.
+ */
+static int
+__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation, int readonly)
+{
+	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_PAGE *root, *leaf;
+	WT_PAGE_INDEX *pindex;
+	WT_REF *ref;
+
+	btree = S2BT(session);
+	root = leaf = NULL;
+
+	/*
+	 * Newly created objects can be used for cursor inserts or for bulk
+	 * loads; set a flag that's cleared when a row is inserted into the
+	 * tree.  Objects being bulk-loaded cannot be evicted, we set it
+	 * globally, there's no point in searching empty trees for eviction.
+	 */
+	if (creation) {
+		btree->bulk_load_ok = 1;
+		__wt_btree_evictable(session, 0);
+	}
+
+	/*
+	 * A note about empty trees: the initial tree is a root page and a leaf
+	 * page.  We need a pair of pages instead of just a single page because
+	 * we can reconcile the leaf page while the root stays pinned in memory.
+	 * If the pair is evicted without being modified, that's OK, nothing is
+	 * ever written.
+	 *
+	 * Create the root and leaf pages.
+	 *
+	 * !!!
+	 * Be cautious about changing the order of updates in this code: to call
+	 * __wt_page_out on error, we require a correct page setup at each point
+	 * where we might fail.
+	 */
+	switch (btree->type) {
+	case BTREE_COL_FIX:
+	case BTREE_COL_VAR:
+		WT_ERR(
+		    __wt_page_alloc(session, WT_PAGE_COL_INT, 1, 1, 1, &root));
+		root->pg_intl_parent_ref = &btree->root;
+
+		pindex = WT_INTL_INDEX_COPY(root);
+		ref = pindex->index[0];
+		ref->home = root;
+		WT_ERR(__wt_btree_new_leaf_page(session, &leaf));
+		ref->page = leaf;
+		ref->addr = NULL;
+		ref->state = WT_REF_MEM;
+		ref->key.recno = 1;
+		break;
+	case BTREE_ROW:
+		WT_ERR(
+		    __wt_page_alloc(session, WT_PAGE_ROW_INT, 0, 1, 1, &root));
+		root->pg_intl_parent_ref = &btree->root;
+
+		pindex = WT_INTL_INDEX_COPY(root);
+		ref = pindex->index[0];
+		ref->home = root;
+		WT_ERR(__wt_btree_new_leaf_page(session, &leaf));
+		ref->page = leaf;
+		ref->addr = NULL;
+		ref->state = WT_REF_MEM;
+		WT_ERR(__wt_row_ikey_incr(
+		    session, root, 0, "", 1, &ref->key.ikey));
+		break;
+	WT_ILLEGAL_VALUE_ERR(session);
+	}
+
+	/*
+	 * Mark the leaf page dirty: we didn't create an entirely valid root
+	 * page (specifically, the root page's disk address isn't set, and it's
+	 * the act of reconciling the leaf page that makes it work, we don't
+	 * try and use the original disk address of modified pages).  We could
+	 * get around that by leaving the leaf page clean and building a better
+	 * root page, but then we get into trouble because a checkpoint marks
+	 * the root page dirty to force a write, and without reconciling the
+	 * leaf page we won't realize there's no records to write, we'll write
+	 * a root page, which isn't correct for an empty tree.
+	 *
+	 * Earlier versions of this code kept the leaf page clean, but with the
+	 * "empty" flag set in the leaf page's modification structure; in that
+	 * case, checkpoints works (forced reconciliation of a root with a
+	 * single "empty" page wouldn't write any blocks).  That version had
+	 * memory leaks because the eviction code didn't correctly handle pages
+	 * that were "clean" (and so never reconciled), yet "modified" with an
+	 * "empty" flag.  The goal of this code is to mimic a real tree that
+	 * simply has no records, for whatever reason, and trust reconciliation
+	 * to figure out it's empty and not write any blocks.
+	 *
+	 * We do not set the tree's modified flag because the checkpoint code
+	 * skips unmodified files in closing checkpoints (checkpoints that
+	 * don't require a write unless the file is actually dirty).  There's
+	 * no need to reconcile this file unless the application does a real
+	 * checkpoint or it's actually modified.
+	 *
+	 * Only do this for a live tree, not for checkpoints.  If we open an
+	 * empty checkpoint, the leaf page cannot be dirty or eviction may try
+	 * to write it, which will fail because checkpoints are read-only.
+	 */
+	if (!readonly) {
+		WT_ERR(__wt_page_modify_init(session, leaf));
+		__wt_page_only_modify_set(session, leaf);
+	}
+
+	/* Finish initializing the root, root reference links. */
+	__wt_root_ref_init(&btree->root, root, btree->type != BTREE_ROW);
+
+	return (0);
+
+err:	if (leaf != NULL)
+		__wt_page_out(session, &leaf);
+	if (root != NULL)
+		__wt_page_out(session, &root);
+	return (ret);
+}
+
+/*
+ * __wt_btree_new_leaf_page --
+ *	Create an empty leaf page and link it into a reference in its parent.
+ */
+int
+__wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep)
+{
+	WT_BTREE *btree;
+
+	btree = S2BT(session);
+
+	switch (btree->type) {
+	case BTREE_COL_FIX:
+		WT_RET(
+		    __wt_page_alloc(session, WT_PAGE_COL_FIX, 1, 0, 1, pagep));
+		break;
+	case BTREE_COL_VAR:
+		WT_RET(
+		    __wt_page_alloc(session, WT_PAGE_COL_VAR, 1, 0, 1, pagep));
+		break;
+	case BTREE_ROW:
+		WT_RET(
+		    __wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, 1, pagep));
+		break;
+	WT_ILLEGAL_VALUE(session);
+	}
+	return (0);
+}
+
+/*
+ * __wt_btree_evictable --
+ *	Setup or release a cache-resident tree.
+ */
+void
+__wt_btree_evictable(WT_SESSION_IMPL *session, int on)
+{
+	WT_BTREE *btree;
+
+	btree = S2BT(session);
+
+	/* The metadata file is never evicted.
+	 */
+	if (on && !WT_IS_METADATA(btree->dhandle))
+		F_CLR(btree, WT_BTREE_NO_EVICTION);
+	else
+		F_SET(btree, WT_BTREE_NO_EVICTION);
+}
+
+/*
+ * __btree_preload --
+ *	Pre-load internal pages.
+ */
+static int
+__btree_preload(WT_SESSION_IMPL *session)
+{
+	WT_BM *bm;
+	WT_BTREE *btree;
+	WT_REF *ref;
+	size_t addr_size;
+	const uint8_t *addr;
+
+	btree = S2BT(session);
+	bm = btree->bm;
+
+	/* Pre-load the second-level internal pages. */
+	WT_INTL_FOREACH_BEGIN(session, btree->root.page, ref) {
+		WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+		if (addr != NULL)
+			WT_RET(bm->preload(bm, session, addr, addr_size));
+	} WT_INTL_FOREACH_END;
+	return (0);
+}
+
+/*
+ * __btree_get_last_recno --
+ *	Set the last record number for a column-store.
+ */
+static int
+__btree_get_last_recno(WT_SESSION_IMPL *session)
+{
+	WT_BTREE *btree;
+	WT_PAGE *page;
+	WT_REF *next_walk;
+
+	btree = S2BT(session);
+
+	/* Walk the tree backward (WT_READ_PREV) to reach the last page. */
+	next_walk = NULL;
+	WT_RET(__wt_tree_walk(session, &next_walk, WT_READ_PREV));
+	if (next_walk == NULL)
+		return (WT_NOTFOUND);
+
+	page = next_walk->page;
+	btree->last_recno = page->type == WT_PAGE_COL_VAR ?
+	    __col_var_last_recno(page) : __col_fix_last_recno(page);
+
+	return (__wt_page_release(session, next_walk, 0));
+}
+
+/*
+ * __btree_page_sizes --
+ *	Verify the page sizes.  Some of these sizes are automatically checked
+ * using limits defined in the API, don't duplicate the logic here.
+ */
+static int
+__btree_page_sizes(WT_SESSION_IMPL *session)
+{
+	WT_BTREE *btree;
+	WT_CONFIG_ITEM cval;
+	uint64_t cache_size;
+	uint32_t intl_split_size, leaf_split_size;
+	const char **cfg;
+
+	btree = S2BT(session);
+	cfg = btree->dhandle->cfg;
+
+	WT_RET(__wt_direct_io_size_check(
+	    session, cfg, "allocation_size", &btree->allocsize));
+	WT_RET(__wt_direct_io_size_check(
+	    session, cfg, "internal_page_max", &btree->maxintlpage));
+	WT_RET(__wt_config_gets(session, cfg, "internal_item_max", &cval));
+	btree->maxintlitem = (uint32_t)cval.val;
+	WT_RET(__wt_direct_io_size_check(
+	    session, cfg, "leaf_page_max", &btree->maxleafpage));
+	WT_RET(__wt_config_gets(session, cfg, "leaf_item_max", &cval));
+	btree->maxleafitem = (uint32_t)cval.val;
+
+	/*
+	 * NOTE(review): split_pct is assumed to lie in (0, 100]; presumably
+	 * enforced by configuration validation -- confirm.
+	 */
+	WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval));
+	btree->split_pct = (int)cval.val;
+
+	/*
+	 * When a page is forced to split, we want at least 50 entries on its
+	 * parent.
+	 */
+	WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
+	btree->maxmempage = WT_MAX((uint64_t)cval.val, 50 * btree->maxleafpage);
+
+	/*
+	 * Don't let pages grow to more than half the cache size.  Otherwise,
+	 * with very small caches, we can end up in a situation where nothing
+	 * can be evicted.  Take care getting the cache size: with a shared
+	 * cache, it may not have been set.
+	 */
+	cache_size = S2C(session)->cache_size;
+	if (cache_size > 0)
+		btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 2);
+
+	/* Allocation sizes must be a power-of-two, nothing else makes sense. */
+	if (!__wt_ispo2(btree->allocsize))
+		WT_RET_MSG(session,
+		    EINVAL, "the allocation size must be a power of two");
+
+	/* All page sizes must be in units of the allocation size.
+	 */
+	if (btree->maxintlpage < btree->allocsize ||
+	    btree->maxintlpage % btree->allocsize != 0 ||
+	    btree->maxleafpage < btree->allocsize ||
+	    btree->maxleafpage % btree->allocsize != 0)
+		WT_RET_MSG(session, EINVAL,
+		    "page sizes must be a multiple of the page allocation "
+		    "size (%" PRIu32 "B)", btree->allocsize);
+
+	/*
+	 * Set the split percentage: reconciliation splits to a smaller-than-
+	 * maximum page size so we don't split every time a new entry is added.
+	 */
+	intl_split_size = __wt_split_page_size(btree, btree->maxintlpage);
+	leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage);
+
+	/*
+	 * Default values for internal and leaf page items: make sure at least
+	 * 8 items fit on split pages.
+	 */
+	if (btree->maxintlitem == 0)
+		btree->maxintlitem = intl_split_size / 8;
+	if (btree->maxleafitem == 0)
+		btree->maxleafitem = leaf_split_size / 8;
+
+	/*
+	 * If raw compression is configured, the application owns page layout,
+	 * it's not our problem.  Hopefully the application chose well.
+	 */
+	if (btree->compressor != NULL &&
+	    btree->compressor->compress_raw != NULL)
+		return (0);
+
+	/* Check we can fit at least 2 items on a page. */
+	if (btree->maxintlitem > btree->maxintlpage / 2)
+		return (pse1(session, "internal",
+		    btree->maxintlpage, btree->maxintlitem));
+	if (btree->maxleafitem > btree->maxleafpage / 2)
+		return (pse1(session, "leaf",
+		    btree->maxleafpage, btree->maxleafitem));
+
+	/*
+	 * Take into account the size of a split page:
+	 *
+	 * Make it a separate error message so it's clear what went wrong.
+	 */
+	if (btree->maxintlitem > intl_split_size / 2)
+		return (pse2(session, "internal",
+		    btree->maxintlpage, btree->maxintlitem, btree->split_pct));
+	if (btree->maxleafitem > leaf_split_size / 2)
+		return (pse2(session, "leaf",
+		    btree->maxleafpage, btree->maxleafitem, btree->split_pct));
+
+	return (0);
+}
+
+/*
+ * __wt_split_page_size --
+ *	Split page size calculation: we don't want to repeatedly split every
+ * time a new entry is added, so we split to a smaller-than-maximum page size.
+ */
+uint32_t
+__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
+{
+	uintmax_t a;
+	uint32_t split_size;
+
+	/*
+	 * Ideally, the split page size is some percentage of the maximum page
+	 * size rounded to an allocation unit (round to an allocation unit so
+	 * we don't waste space when we write).
+	 */
+	a = maxpagesize;			/* Don't overflow. */
+	split_size = (uint32_t)
+	    WT_ALIGN((a * (u_int)btree->split_pct) / 100, btree->allocsize);
+
+	/*
+	 * If the result of that calculation is the same as the allocation unit
+	 * (that happens if the maximum size is the same size as an allocation
+	 * unit, use a percentage of the maximum page size).
+	 */
+	if (split_size == btree->allocsize)
+		split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100);
+
+	return (split_size);
+}
+
+/*
+ * pse1 --
+ *	Page size error message 1.
+ */
+static int
+pse1(WT_SESSION_IMPL *session, const char *type, uint32_t max, uint32_t ovfl)
+{
+	WT_RET_MSG(session, EINVAL,
+	    "%s page size (%" PRIu32 "B) too small for the maximum item size "
+	    "(%" PRIu32 "B); the page must be able to hold at least 2 items",
+	    type, max, ovfl);
+}
+
+/*
+ * pse2 --
+ *	Page size error message 2.
+ */
+static int
+pse2(WT_SESSION_IMPL *session,
+    const char *type, uint32_t max, uint32_t ovfl, int pct)
+{
+	WT_RET_MSG(session, EINVAL,
+	    "%s page size (%" PRIu32 "B) too small for the maximum item size "
+	    "(%" PRIu32 "B), because of the split percentage (%d %%); a split "
+	    "page must be able to hold at least 2 items",
+	    type, max, ovfl, pct);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_huffman.c b/src/third_party/wiredtiger/src/btree/bt_huffman.c
new file mode 100644
index 00000000000..aa6e7c36451
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_huffman.c
@@ -0,0 +1,340 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * 7-bit ASCII, with English language frequencies.
+ *
+ * Based on "Case-sensitive letter and bigram frequency counts from large-scale
+ * English corpora"
+ *	Michael N. Jones and D.J.K. Mewhort
+ *	Queen's University, Kingston, Ontario, Canada
+ *	Behavior Research Methods, Instruments, & Computers 2004, 36 (3), 388-396
+ *
+ * Additionally supports space and tab characters; space is the most common
+ * character in text where it occurs, and tab appears about as frequently as
+ * 'a' and 'n' in text where it occurs.
+ */
+struct __wt_huffman_table {
+	uint32_t symbol;
+	uint32_t frequency;
+};
+static const struct __wt_huffman_table __wt_huffman_nytenglish[] = {
+	/* nul */	{ 0x00, 0 },	/* For an escape character. */
+	/* ht */	{ 0x09, 5263779 },
+	/* sp */	{ 0x20, 8000000 },
+	/* ! */	{ 0x21, 2178 },
+	/* " */	{ 0x22, 284671 },
+	/* # */	{ 0x23, 10 },
+	/* $ */	{ 0x24, 51572 },
+	/* % */	{ 0x25, 1993 },
+	/* & */	{ 0x26, 6523 },
+	/* ' */	{ 0x27, 204497 },
+	/* ( */	{ 0x28, 53398 },
+	/* ) */	{ 0x29, 53735 },
+	/* * */	{ 0x2a, 20716 },
+	/* + */	{ 0x2b, 309 },
+	/* , */	{ 0x2c, 984969 },
+	/* - */	{ 0x2d, 252302 },
+	/* .
*/ { 0x2e, 946136 }, + /* / */ { 0x2f, 8161 }, + /* 0 */ { 0x30, 546233 }, + /* 1 */ { 0x31, 460946 }, + /* 2 */ { 0x32, 333499 }, + /* 3 */ { 0x33, 187606 }, + /* 4 */ { 0x34, 192528 }, + /* 5 */ { 0x35, 374413 }, + /* 6 */ { 0x36, 153865 }, + /* 7 */ { 0x37, 120094 }, + /* 8 */ { 0x38, 182627 }, + /* 9 */ { 0x39, 282364 }, + /* : */ { 0x3a, 54036 }, + /* ; */ { 0x3b, 36727 }, + /* < */ { 0x3c, 82 }, + /* = */ { 0x3d, 22 }, + /* > */ { 0x3e, 83 }, + /* ? */ { 0x3f, 12357 }, + /* @ */ { 0x40, 1 }, + /* A */ { 0x41, 280937 }, + /* B */ { 0x42, 169474 }, + /* C */ { 0x43, 229363 }, + /* D */ { 0x44, 129632 }, + /* E */ { 0x45, 138443 }, + /* F */ { 0x46, 100751 }, + /* G */ { 0x47, 93212 }, + /* H */ { 0x48, 123632 }, + /* I */ { 0x49, 223312 }, + /* J */ { 0x4a, 78706 }, + /* K */ { 0x4b, 46580 }, + /* L */ { 0x4c, 106984 }, + /* M */ { 0x4d, 259474 }, + /* N */ { 0x4e, 205409 }, + /* O */ { 0x4f, 105700 }, + /* P */ { 0x50, 144239 }, + /* Q */ { 0x51, 11659 }, + /* R */ { 0x52, 146448 }, + /* S */ { 0x53, 304971 }, + /* T */ { 0x54, 325462 }, + /* U */ { 0x55, 57488 }, + /* V */ { 0x56, 31053 }, + /* W */ { 0x57, 107195 }, + /* X */ { 0x58, 7578 }, + /* Y */ { 0x59, 94297 }, + /* Z */ { 0x5a, 5610 }, + /* [ */ { 0x5b, 1 }, + /* \ */ { 0x5c, 1 }, + /* ] */ { 0x5d, 1 }, + /* ^ */ { 0x5e, 1 }, + /* _ */ { 0x5f, 1 }, + /* ` */ { 0x60, 1 }, + /* a */ { 0x61, 5263779 }, + /* b */ { 0x62, 866156 }, + /* c */ { 0x63, 1960412 }, + /* d */ { 0x64, 2369820 }, + /* e */ { 0x65, 7741842 }, + /* f */ { 0x66, 1296925 }, + /* g */ { 0x67, 1206747 }, + /* h */ { 0x68, 2955858 }, + /* i */ { 0x69, 4527332 }, + /* j */ { 0x6a, 65856 }, + /* k */ { 0x6b, 460788 }, + /* l */ { 0x6c, 2553152 }, + /* m */ { 0x6d, 1467376 }, + /* n */ { 0x6e, 4535545 }, + /* o */ { 0x6f, 4729266 }, + /* p */ { 0x70, 1255579 }, + /* q */ { 0x71, 54221 }, + /* r */ { 0x72, 4137949 }, + /* s */ { 0x73, 4186210 }, + /* t */ { 0x74, 5507692 }, + /* u */ { 0x75, 1613323 }, + /* v */ { 0x76, 653370 }, + /* w */ 
{ 0x77, 1015656 }, + /* x */ { 0x78, 123577 }, + /* y */ { 0x79, 1062040 }, + /* z */ { 0x7a, 66423 }, + /* { */ { 0x7b, 1 }, + /* | */ { 0x7c, 1 }, + /* } */ { 0x7d, 1 }, + /* ~ */ { 0x7e, 1 } +}; + +static int __wt_huffman_read(WT_SESSION_IMPL *, + WT_CONFIG_ITEM *, struct __wt_huffman_table **, u_int *, u_int *); + +/* + * __wt_btree_huffman_open -- + * Configure Huffman encoding for the tree. + */ +int +__wt_btree_huffman_open(WT_SESSION_IMPL *session) +{ + struct __wt_huffman_table *table; + WT_BTREE *btree; + WT_CONFIG_ITEM key_conf, value_conf; + WT_DECL_RET; + const char **cfg; + u_int entries, numbytes; + + btree = S2BT(session); + cfg = btree->dhandle->cfg; + + WT_RET(__wt_config_gets(session, cfg, "huffman_key", &key_conf)); + WT_RET(__wt_config_gets(session, cfg, "huffman_value", &value_conf)); + if (key_conf.len == 0 && value_conf.len == 0) + return (0); + + switch (btree->type) { /* Check file type compatibility. */ + case BTREE_COL_FIX: + WT_RET_MSG(session, EINVAL, + "fixed-size column-store files may not be Huffman encoded"); + case BTREE_COL_VAR: + if (key_conf.len != 0) + WT_RET_MSG(session, EINVAL, + "the keys of variable-length column-store files " + "may not be Huffman encoded"); + break; + case BTREE_ROW: + break; + } + + if (strncasecmp(key_conf.str, "english", key_conf.len) == 0) { + struct __wt_huffman_table + copy[WT_ELEMENTS(__wt_huffman_nytenglish)]; + + memcpy(copy, + __wt_huffman_nytenglish, sizeof(__wt_huffman_nytenglish)); + WT_RET(__wt_huffman_open(session, copy, + WT_ELEMENTS(__wt_huffman_nytenglish), + 1, &btree->huffman_key)); + + /* Check for a shared key/value table. 
*/ + if (strncasecmp( + value_conf.str, "english", value_conf.len) == 0) { + btree->huffman_value = btree->huffman_key; + return (0); + } + } else { + WT_RET(__wt_huffman_read( + session, &key_conf, &table, &entries, &numbytes)); + ret = __wt_huffman_open(session, table, + entries, numbytes, &btree->huffman_key); + __wt_free(session, table); + if (ret != 0) + return (ret); + + /* Check for a shared key/value table. */ + if (value_conf.len != 0 && key_conf.len == value_conf.len && + memcmp(key_conf.str, value_conf.str, key_conf.len) == 0) { + btree->huffman_value = btree->huffman_key; + return (0); + } + } + if (strncasecmp(value_conf.str, "english", value_conf.len) == 0) { + struct __wt_huffman_table + copy[WT_ELEMENTS(__wt_huffman_nytenglish)]; + + memcpy(copy, + __wt_huffman_nytenglish, sizeof(__wt_huffman_nytenglish)); + WT_RET(__wt_huffman_open(session, copy, + WT_ELEMENTS(__wt_huffman_nytenglish), + 1, &btree->huffman_value)); + } else { + WT_RET(__wt_huffman_read( + session, &value_conf, &table, &entries, &numbytes)); + ret = __wt_huffman_open(session, table, + entries, numbytes, &btree->huffman_value); + __wt_free(session, table); + if (ret != 0) + return (ret); + } + + return (0); +} + +/* + * __wt_huffman_read -- + * Read a Huffman table from a file. + */ +static int +__wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip, + struct __wt_huffman_table **tablep, u_int *entriesp, u_int *numbytesp) +{ + struct __wt_huffman_table *table, *tp; + FILE *fp; + WT_DECL_RET; + uint64_t symbol, frequency; + u_int entries, lineno; + char *file; + + *tablep = NULL; + *entriesp = *numbytesp = 0; + + fp = NULL; + file = NULL; + table = NULL; + + /* + * UTF-8 table is 256 bytes, with a range of 0-255. + * UTF-16 is 128KB (2 * 65536) bytes, with a range of 0-65535. 
+ */ + if (strncasecmp(ip->str, "utf8", 4) == 0) { + entries = UINT8_MAX; + *numbytesp = 1; + WT_ERR(__wt_calloc_def(session, entries, &table)); + + if (ip->len == 4) + WT_ERR_MSG(session, EINVAL, + "no Huffman table file name specified"); + WT_ERR(__wt_calloc_def(session, ip->len, &file)); + memcpy(file, ip->str + 4, ip->len - 4); + } else if (strncasecmp(ip->str, "utf16", 5) == 0) { + entries = UINT16_MAX; + *numbytesp = 2; + WT_ERR(__wt_calloc_def(session, entries, &table)); + + if (ip->len == 5) + WT_ERR_MSG(session, EINVAL, + "no Huffman table file name specified"); + WT_ERR(__wt_calloc_def(session, ip->len, &file)); + memcpy(file, ip->str + 5, ip->len - 5); + } else { + WT_ERR_MSG(session, EINVAL, + "unknown Huffman configuration value %.*s", + (int)ip->len, ip->str); + } + + if ((fp = fopen(file, "r")) == NULL) + WT_ERR_MSG(session, __wt_errno(), + "unable to read Huffman table file %.*s", + (int)ip->len, ip->str); + + for (tp = table, lineno = 1; (ret = + fscanf(fp, "%" SCNu64 " %" SCNu64, &symbol, &frequency)) != EOF; + ++tp, ++lineno) { + if (lineno > entries) + WT_ERR_MSG(session, EINVAL, + "Huffman table file %.*s is corrupted, " + "more than %" PRIu32 " entries", + (int)ip->len, ip->str, entries); + if (ret != 2) + WT_ERR_MSG(session, EINVAL, + "line %u of Huffman table file %.*s is corrupted: " + "expected two unsigned integral values", + lineno, (int)ip->len, ip->str); + if (symbol > entries) + WT_ERR_MSG(session, EINVAL, + "line %u of Huffman table file %.*s is corrupted: " + "symbol larger than maximum value of %u", + lineno, (int)ip->len, ip->str, entries); + if (frequency > UINT32_MAX) + WT_ERR_MSG(session, EINVAL, + "line %u of Huffman table file %.*s is corrupted: " + "frequency larger than maximum value of %" PRIu32, + lineno, (int)ip->len, ip->str, UINT32_MAX); + + tp->symbol = (uint32_t)symbol; + tp->frequency = (uint32_t)frequency; + } + + *entriesp = lineno - 1; + *tablep = table; + + if (0) { +err: __wt_free(session, table); + } + if (fp 
!= NULL) + (void)fclose(fp); + __wt_free(session, file); + return (ret); +} + +/* + * __wt_btree_huffman_close -- + * Close the Huffman tables. + */ +void +__wt_btree_huffman_close(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + + btree = S2BT(session); + + if (btree->huffman_key != NULL) { + /* Key and data may use the same table, only close it once. */ + if (btree->huffman_value == btree->huffman_key) + btree->huffman_value = NULL; + + __wt_huffman_close(session, btree->huffman_key); + btree->huffman_key = NULL; + } + if (btree->huffman_value != NULL) { + __wt_huffman_close(session, btree->huffman_value); + btree->huffman_value = NULL; + } +} diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c new file mode 100644 index 00000000000..ccc67c994dc --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_io.c @@ -0,0 +1,304 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_bt_read -- + * Read a cookie referenced block into a buffer. + */ +int +__wt_bt_read(WT_SESSION_IMPL *session, + WT_ITEM *buf, const uint8_t *addr, size_t addr_size) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + const WT_PAGE_HEADER *dsk; + size_t result_len; + + btree = S2BT(session); + bm = btree->bm; + + /* + * If anticipating a compressed block, read into a scratch buffer and + * decompress into the caller's buffer. Else, read directly into the + * caller's buffer. + */ + if (btree->compressor == NULL) { + WT_RET(bm->read(bm, session, buf, addr, addr_size)); + dsk = buf->data; + } else { + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(bm->read(bm, session, tmp, addr, addr_size)); + dsk = tmp->data; + } + + /* + * If the block is compressed, copy the skipped bytes of the original + * image into place, then decompress. 
+ */ + if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) { + if (btree->compressor == NULL || + btree->compressor->decompress == NULL) + WT_ERR_MSG(session, WT_ERROR, + "read compressed block where no compression engine " + "configured"); + + /* + * We're allocating the exact number of bytes we're expecting + * from decompression. + */ + WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size)); + + /* + * Note the source length is NOT the number of compressed bytes, + * it's the length of the block we just read (minus the skipped + * bytes). We don't store the number of compressed bytes: some + * compression engines need that length stored externally, they + * don't have markers in the stream to signal the end of the + * compressed bytes. Those engines must store the compressed + * byte length somehow, see the snappy compression extension for + * an example. + */ + memcpy(buf->mem, tmp->data, WT_BLOCK_COMPRESS_SKIP); + ret = btree->compressor->decompress( + btree->compressor, &session->iface, + (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP, + tmp->size - WT_BLOCK_COMPRESS_SKIP, + (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP, + dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len); + + /* + * If checksums were turned off because we're depending on the + * decompression to fail on any corrupted data, we'll end up + * here after corruption happens. If we're salvaging the file, + * it's OK, otherwise it's really, really bad. + */ + if (ret != 0 || + result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) + WT_ERR( + F_ISSET(btree, WT_BTREE_VERIFY) || + F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ? + WT_ERROR : + __wt_illegal_value(session, btree->dhandle->name)); + } else + if (btree->compressor == NULL) + buf->size = dsk->mem_size; + else + /* + * We guessed wrong: there was a compressor, but this + * block was not compressed, and now the page is in the + * wrong buffer and the buffer may be of the wrong size. 
+ * This should be rare, but happens with small blocks + * that aren't worth compressing. + */ + WT_ERR(__wt_buf_set( + session, buf, tmp->data, dsk->mem_size)); + + /* If the handle is a verify handle, verify the physical page. */ + if (F_ISSET(btree, WT_BTREE_VERIFY)) { + if (tmp == NULL) + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size)); + WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf)); + } + + WT_STAT_FAST_CONN_INCR(session, cache_read); + WT_STAT_FAST_DATA_INCR(session, cache_read); + if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) + WT_STAT_FAST_DATA_INCR(session, compress_read); + WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size); + WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size); + +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * __wt_bt_write -- + * Write a buffer into a block, returning the block's addr/size and + * checksum. + */ +int +__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, + uint8_t *addr, size_t *addr_sizep, int checkpoint, int compressed) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_ITEM *ip; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_PAGE_HEADER *dsk; + size_t len, src_len, dst_len, result_len, size; + int data_cksum, compression_failed; + uint8_t *src, *dst; + + btree = S2BT(session); + bm = btree->bm; + + /* Checkpoint calls are different than standard calls. */ + WT_ASSERT(session, + (checkpoint == 0 && addr != NULL && addr_sizep != NULL) || + (checkpoint == 1 && addr == NULL && addr_sizep == NULL)); + +#ifdef HAVE_DIAGNOSTIC + /* + * We're passed a table's disk image. Decompress if necessary and + * verify the image. Always check the in-memory length for accuracy. 
+ */ + dsk = buf->mem; + WT_ASSERT(session, dsk->u.entries != 0); + if (compressed) { + WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp)); + + memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP); + WT_ERR(btree->compressor->decompress( + btree->compressor, &session->iface, + (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP, + buf->size - WT_BLOCK_COMPRESS_SKIP, + (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP, + tmp->memsize - WT_BLOCK_COMPRESS_SKIP, + &result_len)); + WT_ASSERT(session, + dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP); + tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP; + ip = tmp; + } else { + WT_ASSERT(session, dsk->mem_size == buf->size); + ip = buf; + } + WT_ERR(__wt_verify_dsk(session, "[write-check]", ip)); + __wt_scr_free(&tmp); +#endif + + /* + * Optionally stream-compress the data, but don't compress blocks that + * are already as small as they're going to get. + */ + if (btree->compressor == NULL || + btree->compressor->compress == NULL || compressed) + ip = buf; + else if (buf->size <= btree->allocsize) { + ip = buf; + WT_STAT_FAST_DATA_INCR(session, compress_write_too_small); + } else { + /* Skip the header bytes of the source data. */ + src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP; + src_len = buf->size - WT_BLOCK_COMPRESS_SKIP; + + /* + * Compute the size needed for the destination buffer. We only + * allocate enough memory for a copy of the original by default, + * if any compressed version is bigger than the original, we + * won't use it. However, some compression engines (snappy is + * one example), may need more memory because they don't stop + * just because there's no more memory into which to compress. 
+ */ + if (btree->compressor->pre_size == NULL) + len = src_len; + else + WT_ERR(btree->compressor->pre_size(btree->compressor, + &session->iface, src, src_len, &len)); + + size = len + WT_BLOCK_COMPRESS_SKIP; + WT_ERR(bm->write_size(bm, session, &size)); + WT_ERR(__wt_scr_alloc(session, size, &tmp)); + + /* Skip the header bytes of the destination data. */ + dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP; + dst_len = len; + + compression_failed = 0; + WT_ERR(btree->compressor->compress(btree->compressor, + &session->iface, + src, src_len, + dst, dst_len, + &result_len, &compression_failed)); + result_len += WT_BLOCK_COMPRESS_SKIP; + + /* + * If compression fails, or doesn't gain us at least one unit of + * allocation, fallback to the original version. This isn't + * unexpected: if compression doesn't work for some chunk of + * data for some reason (noting likely additional format/header + * information which compressed output requires), it just means + * the uncompressed version is as good as it gets, and that's + * what we use. + */ + if (compression_failed || + buf->size / btree->allocsize == + result_len / btree->allocsize) { + ip = buf; + WT_STAT_FAST_DATA_INCR(session, compress_write_fail); + } else { + compressed = 1; + WT_STAT_FAST_DATA_INCR(session, compress_write); + + /* + * Copy in the skipped header bytes, set the final data + * size. + */ + memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP); + tmp->size = result_len; + ip = tmp; + } + } + dsk = ip->mem; + + /* If the buffer is compressed, set the flag. */ + if (compressed) + F_SET(dsk, WT_PAGE_COMPRESSED); + + /* + * We increment the block's write generation so it's easy to identify + * newer versions of blocks during salvage. (It's common in WiredTiger, + * at least for the default block manager, for multiple blocks to be + * internally consistent with identical first and last keys, so we need + * a way to know the most recent state of the block. 
We could check + * which leaf is referenced by a valid internal page, but that implies + * salvaging internal pages, which I don't want to do, and it's not + * as good anyway, because the internal page may not have been written + * after the leaf page was updated. So, write generations it is. + * + * Nothing is locked at this point but two versions of a page with the + * same generation is pretty unlikely, and if we did, they're going to + * be roughly identical for the purposes of salvage, anyway. + */ + dsk->write_gen = ++btree->write_gen; + + /* + * Checksum the data if the buffer isn't compressed or checksums are + * configured. + */ + switch (btree->checksum) { + case CKSUM_ON: + data_cksum = 1; + break; + case CKSUM_OFF: + data_cksum = 0; + break; + case CKSUM_UNCOMPRESSED: + default: + data_cksum = !compressed; + break; + } + + /* Call the block manager to write the block. */ + WT_ERR(checkpoint ? + bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) : + bm->write(bm, session, ip, addr, addr_sizep, data_cksum)); + + WT_STAT_FAST_CONN_INCR(session, cache_write); + WT_STAT_FAST_DATA_INCR(session, cache_write); + WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, ip->size); + WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, ip->size); + +err: __wt_scr_free(&tmp); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_misc.c b/src/third_party/wiredtiger/src/btree/bt_misc.c new file mode 100644 index 00000000000..cba1c0c61aa --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_misc.c @@ -0,0 +1,128 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_page_type_string -- + * Return a string representing the page type. 
+ */ +const char * +__wt_page_type_string(u_int type) +{ + switch (type) { + case WT_PAGE_INVALID: + return ("invalid"); + case WT_PAGE_BLOCK_MANAGER: + return ("block manager"); + case WT_PAGE_COL_FIX: + return ("column-store fixed-length leaf"); + case WT_PAGE_COL_INT: + return ("column-store internal"); + case WT_PAGE_COL_VAR: + return ("column-store variable-length leaf"); + case WT_PAGE_OVFL: + return ("overflow"); + case WT_PAGE_ROW_INT: + return ("row-store internal"); + case WT_PAGE_ROW_LEAF: + return ("row-store leaf"); + default: + return ("unknown"); + } + /* NOTREACHED */ +} + +/* + * __wt_cell_type_string -- + * Return a string representing the cell type. + */ +const char * +__wt_cell_type_string(uint8_t type) +{ + switch (type) { + case WT_CELL_ADDR_DEL: + return ("addr/del"); + case WT_CELL_ADDR_INT: + return ("addr/int"); + case WT_CELL_ADDR_LEAF: + return ("addr/leaf"); + case WT_CELL_ADDR_LEAF_NO: + return ("addr/leaf-no"); + case WT_CELL_DEL: + return ("deleted"); + case WT_CELL_KEY: + return ("key"); + case WT_CELL_KEY_PFX: + return ("key/pfx"); + case WT_CELL_KEY_OVFL: + return ("key/ovfl"); + case WT_CELL_KEY_SHORT: + return ("key/short"); + case WT_CELL_KEY_SHORT_PFX: + return ("key/short,pfx"); + case WT_CELL_KEY_OVFL_RM: + return ("key/ovfl,rm"); + case WT_CELL_VALUE: + return ("value"); + case WT_CELL_VALUE_COPY: + return ("value/copy"); + case WT_CELL_VALUE_OVFL: + return ("value/ovfl"); + case WT_CELL_VALUE_OVFL_RM: + return ("value/ovfl,rm"); + case WT_CELL_VALUE_SHORT: + return ("value/short"); + default: + return ("unknown"); + } + /* NOTREACHED */ +} + +/* + * __wt_page_addr_string -- + * Figure out a page's "address" and load a buffer with a printable, + * nul-terminated representation of that address. 
+ */ +const char * +__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf) +{ + size_t addr_size; + const uint8_t *addr; + + if (__wt_ref_is_root(ref)) { + buf->data = "[Root]"; + buf->size = strlen("[Root]"); + return (buf->data); + } + + (void)__wt_ref_info(session, ref, &addr, &addr_size, NULL); + return (__wt_addr_string(session, addr, addr_size, buf)); +} + +/* + * __wt_addr_string -- + * Load a buffer with a printable, nul-terminated representation of an + * address. + */ +const char * +__wt_addr_string(WT_SESSION_IMPL *session, + const uint8_t *addr, size_t addr_size, WT_ITEM *buf) +{ + WT_BM *bm; + + bm = S2BT(session)->bm; + + if (addr == NULL) { + buf->data = "[NoAddr]"; + buf->size = strlen("[NoAddr]"); + } else if (bm->addr_string(bm, session, buf, addr, addr_size) != 0) { + buf->data = "[Error]"; + buf->size = strlen("[Error]"); + } + return (buf->data); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c new file mode 100644 index 00000000000..4cd317f1e8f --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ -0,0 +1,270 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __ovfl_read -- + * Read an overflow item from the disk. + */ +static int +__ovfl_read(WT_SESSION_IMPL *session, + const uint8_t *addr, size_t addr_size, WT_ITEM *store) +{ + WT_BTREE *btree; + const WT_PAGE_HEADER *dsk; + + btree = S2BT(session); + + /* + * Read the overflow item from the block manager, then reference the + * start of the data and set the data's length. + * + * Overflow reads are synchronous. That may bite me at some point, but + * WiredTiger supports large page sizes, overflow items should be rare. 
+ */ + WT_RET(__wt_bt_read(session, store, addr, addr_size)); + dsk = store->data; + store->data = WT_PAGE_HEADER_BYTE(btree, dsk); + store->size = dsk->u.datalen; + + WT_STAT_FAST_DATA_INCR(session, cache_read_overflow); + + return (0); +} + +/* + * __wt_ovfl_read -- + * Bring an overflow item into memory. + */ +int +__wt_ovfl_read(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store) +{ + WT_DECL_RET; + + /* + * If no page specified, there's no need to lock and there's no cache + * to search, we don't care about WT_CELL_VALUE_OVFL_RM cells. + */ + if (page == NULL) + return ( + __ovfl_read(session, unpack->data, unpack->size, store)); + + /* + * WT_CELL_VALUE_OVFL_RM cells: If reconciliation deleted an overflow + * value, but there was still a reader in the system that might need it, + * the on-page cell type will have been reset to WT_CELL_VALUE_OVFL_RM + * and we will be passed a page so we can look-aside into the cache of + * such values. + * + * Acquire the overflow lock, and retest the on-page cell's value inside + * the lock. + */ + WT_RET(__wt_readlock(session, S2BT(session)->ovfl_lock)); + ret = __wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM ? + __wt_ovfl_txnc_search(page, unpack->data, unpack->size, store) : + __ovfl_read(session, unpack->data, unpack->size, store); + WT_TRET(__wt_readunlock(session, S2BT(session)->ovfl_lock)); + + return (ret); +} + +/* + * __ovfl_cache_col_visible -- + * column-store: check for a globally visible update. + */ +static int +__ovfl_cache_col_visible( + WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack) +{ + /* + * Column-store is harder than row_store: we're here because there's a + * reader in the system that might read the original version of an + * overflow record, which might match a number of records. 
For example, + * the original overflow value was for records 100-200, we've replaced + * each of those records individually, but there exists a reader that + * might read any one of those records, and all of those records have + * different update entries with different transaction IDs. Since it's + * infeasible to determine if there's a globally visible update for each + * reader for each record, we test the simple case where a single record + * has a single, globally visible update. If that's not the case, cache + * the value. + */ + if (__wt_cell_rle(unpack) == 1 && + upd != NULL && /* Sanity: upd should always be set. */ + __wt_txn_visible_all(session, upd->txnid)) + return (1); + return (0); +} + +/* + * __ovfl_cache_row_visible -- + * row-store: check for a globally visible update. + */ +static int +__ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip) +{ + WT_UPDATE *upd; + + /* Check to see if there's a globally visible update. */ + for (upd = WT_ROW_UPDATE(page, rip); upd != NULL; upd = upd->next) + if (__wt_txn_visible_all(session, upd->txnid)) + return (1); + + return (0); +} + +/* + * __ovfl_cache -- + * Cache a deleted overflow value. + */ +static int +__ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack) +{ + WT_DECL_ITEM(tmp); + WT_DECL_RET; + size_t addr_size; + const uint8_t *addr; + + addr = unpack->data; + addr_size = unpack->size; + + WT_RET(__wt_scr_alloc(session, 1024, &tmp)); + + /* Enter the value into the overflow cache. */ + WT_ERR(__ovfl_read(session, addr, addr_size, tmp)); + WT_ERR(__wt_ovfl_txnc_add( + session, page, addr, addr_size, tmp->data, tmp->size)); + +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * __wt_ovfl_cache -- + * Handle deletion of an overflow value. + */ +int +__wt_ovfl_cache(WT_SESSION_IMPL *session, + WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack) +{ + int visible; + + /* + * This function solves a problem in reconciliation. 
The scenario is: + * - reconciling a leaf page that references an overflow item + * - the item is updated and the update committed + * - a checkpoint runs, freeing the backing overflow blocks + * - a snapshot transaction wants the original version of the item + * + * In summary, we may need the original version of an overflow item for + * a snapshot transaction after the item was deleted from a page that's + * subsequently been checkpointed, where the checkpoint must know about + * the freed blocks. We don't have any way to delay a free of the + * underlying blocks until a particular set of transactions exit (and + * this shouldn't be a common scenario), so cache the overflow value in + * memory. + * + * This gets hard because the snapshot transaction reader might: + * - search the WT_UPDATE list and not find an useful entry + * - read the overflow value's address from the on-page cell + * - go to sleep + * - checkpoint runs, caches the overflow value, frees the blocks + * - another thread allocates and overwrites the blocks + * - the reader wakes up and reads the wrong value + * + * Use a read/write lock and the on-page cell to fix the problem: hold + * a write lock when changing the cell type from WT_CELL_VALUE_OVFL to + * WT_CELL_VALUE_OVFL_RM and hold a read lock when reading an overflow + * item. + * + * The read/write lock is per btree, but it could be per page or even + * per overflow item. We don't do any of that because overflow values + * are supposed to be rare and we shouldn't see contention for the lock. + * + * Check for a globally visible update. If there is a globally visible + * update, we don't need to cache the item because it's not possible for + * a running thread to have moved past it. 
+ */ + switch (page->type) { + case WT_PAGE_COL_VAR: + visible = __ovfl_cache_col_visible(session, cookie, vpack); + break; + case WT_PAGE_ROW_LEAF: + visible = __ovfl_cache_row_visible(session, page, cookie); + break; + WT_ILLEGAL_VALUE(session); + } + + /* + * If there's no globally visible update, there's a reader in the system + * that might try and read the old value, cache it. + */ + if (!visible) { + WT_RET(__ovfl_cache(session, page, vpack)); + WT_STAT_FAST_DATA_INCR(session, cache_overflow_value); + } + + /* + * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the + * underlying overflow value's blocks to be freed when reconciliation + * completes. + */ + return (__wt_ovfl_discard_add(session, page, vpack->cell)); +} + +/* + * __wt_ovfl_discard -- + * Discard an on-page overflow value, and reset the page's cell. + */ +int +__wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_CELL_UNPACK *unpack, _unpack; + WT_DECL_RET; + + btree = S2BT(session); + bm = btree->bm; + unpack = &_unpack; + + __wt_cell_unpack(cell, unpack); + + /* + * Finally remove overflow key/value objects, called when reconciliation + * finishes after successfully writing a page. + * + * Keys must have already been instantiated and value objects must have + * already been cached (if they might potentially still be read by any + * running transaction). + * + * Acquire the overflow lock to avoid racing with a thread reading the + * backing overflow blocks. + */ + WT_RET(__wt_writelock(session, btree->ovfl_lock)); + + switch (unpack->raw) { + case WT_CELL_KEY_OVFL: + __wt_cell_type_reset(session, + unpack->cell, WT_CELL_KEY_OVFL, WT_CELL_KEY_OVFL_RM); + break; + case WT_CELL_VALUE_OVFL: + __wt_cell_type_reset(session, + unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM); + break; + WT_ILLEGAL_VALUE(session); + } + + WT_TRET(__wt_writeunlock(session, btree->ovfl_lock)); + + /* Free the backing disk blocks. 
*/ + WT_TRET(bm->free(bm, session, unpack->data, unpack->size)); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c new file mode 100644 index 00000000000..c5f24c06286 --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -0,0 +1,734 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +static void __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *); +static void __inmem_col_int(WT_SESSION_IMPL *, WT_PAGE *); +static int __inmem_col_var(WT_SESSION_IMPL *, WT_PAGE *, size_t *); +static int __inmem_row_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *); +static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *); +static int __inmem_row_leaf_entries( + WT_SESSION_IMPL *, const WT_PAGE_HEADER *, uint32_t *); + +/* + * __evict_force_check -- + * Check if a page matches the criteria for forced eviction. + */ +static int +__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BTREE *btree; + + btree = S2BT(session); + + /* Pages are usually small enough, check that first. */ + if (page->memory_footprint < btree->maxmempage) + return (0); + + /* Leaf pages only. */ + if (page->type != WT_PAGE_COL_FIX && + page->type != WT_PAGE_COL_VAR && + page->type != WT_PAGE_ROW_LEAF) + return (0); + + /* Eviction may be turned off, although that's rare. */ + if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) + return (0); + + /* + * It's hard to imagine a page with a huge memory footprint that has + * never been modified, but check to be sure. + */ + if (page->modify == NULL) + return (0); + + /* Trigger eviction on the next page release. */ + page->read_gen = WT_READGEN_OLDEST; + + return (1); +} + +/* + * __wt_page_in_func -- + * Acquire a hazard pointer to a page; if the page is not in-memory, + * read it from the disk and build an in-memory version. 
+ */ +int +__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ) +{ + WT_DECL_RET; + WT_PAGE *page; + int busy, force_attempts, oldgen; + + for (force_attempts = oldgen = 0;;) { + switch (ref->state) { + case WT_REF_DISK: + case WT_REF_DELETED: + if (LF_ISSET(WT_READ_CACHE)) + return (WT_NOTFOUND); + + /* + * The page isn't in memory, attempt to read it. + * Make sure there is space in the cache. + */ + WT_RET(__wt_cache_full_check(session)); + WT_RET(__wt_cache_read(session, ref)); + oldgen = LF_ISSET(WT_READ_WONT_NEED) || + F_ISSET(session, WT_SESSION_NO_CACHE); + continue; + case WT_REF_READING: + if (LF_ISSET(WT_READ_CACHE)) + return (WT_NOTFOUND); + /* FALLTHROUGH */ + case WT_REF_LOCKED: + if (LF_ISSET(WT_READ_NO_WAIT)) + return (WT_NOTFOUND); + /* The page is busy -- wait. */ + break; + case WT_REF_SPLIT: + return (WT_RESTART); + case WT_REF_MEM: + /* + * The page is in memory: get a hazard pointer, update + * the page's LRU and return. The expected reason we + * can't get a hazard pointer is because the page is + * being evicted; yield and try again. + */ +#ifdef HAVE_DIAGNOSTIC + WT_RET( + __wt_hazard_set(session, ref, &busy, file, line)); +#else + WT_RET(__wt_hazard_set(session, ref, &busy)); +#endif + if (busy) + break; + + page = ref->page; + WT_ASSERT(session, page != NULL); + + /* Forcibly evict pages that are too big. */ + if (!LF_ISSET(WT_READ_NO_EVICT) && + force_attempts < 10 && + __evict_force_check(session, page)) { + ++force_attempts; + WT_RET(__wt_page_release(session, ref, flags)); + break; + } + + /* Check if we need an autocommit transaction. */ + if ((ret = __wt_txn_autocommit_check(session)) != 0) { + WT_TRET(__wt_hazard_clear(session, page)); + return (ret); + } + + /* + * If we read the page and we are configured to not + * trash the cache, set the oldest read generation so + * the page is forcibly evicted as soon as possible. 
+ * + * Otherwise, update the page's read generation. + */ + if (oldgen && page->read_gen == WT_READGEN_NOTSET) + page->read_gen = WT_READGEN_OLDEST; + else if (!LF_ISSET(WT_READ_NO_GEN) && + page->read_gen < __wt_cache_read_gen(session)) + page->read_gen = + __wt_cache_read_gen_set(session); + + return (0); + WT_ILLEGAL_VALUE(session); + } + + /* We failed to get the page -- yield before retrying. */ + __wt_yield(); + } +} + +/* + * __wt_page_alloc -- + * Create or read a page into the cache. + */ +int +__wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, + uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep) +{ + WT_CACHE *cache; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + size_t size; + uint32_t i; + void *p; + + *pagep = NULL; + + cache = S2C(session)->cache; + page = NULL; + + size = sizeof(WT_PAGE); + switch (type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + break; + case WT_PAGE_COL_VAR: + /* + * Variable-length column-store leaf page: allocate memory to + * describe the page's contents with the initial allocation. + */ + size += alloc_entries * sizeof(WT_COL); + break; + case WT_PAGE_ROW_LEAF: + /* + * Row-store leaf page: allocate memory to describe the page's + * contents with the initial allocation. + */ + size += alloc_entries * sizeof(WT_ROW); + break; + WT_ILLEGAL_VALUE(session); + } + + WT_RET(__wt_calloc(session, 1, size, &page)); + + page->type = type; + page->read_gen = WT_READGEN_NOTSET; + + switch (type) { + case WT_PAGE_COL_FIX: + page->pg_fix_recno = recno; + page->pg_fix_entries = alloc_entries; + break; + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + page->pg_intl_recno = recno; + + /* + * Internal pages have an array of references to objects so they + * can split. Allocate the array of references and optionally, + * the objects to which they point. 
+ */ + WT_ERR(__wt_calloc(session, 1, + sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *), + &p)); + size += + sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *); + pindex = p; + pindex->index = (WT_REF **)((WT_PAGE_INDEX *)p + 1); + pindex->entries = alloc_entries; + WT_INTL_INDEX_SET(page, pindex); + if (alloc_refs) + for (i = 0; i < pindex->entries; ++i) { + WT_ERR(__wt_calloc_def( + session, 1, &pindex->index[i])); + size += sizeof(WT_REF); + } + if (0) { +err: if ((pindex = WT_INTL_INDEX_COPY(page)) != NULL) { + for (i = 0; i < pindex->entries; ++i) + __wt_free(session, pindex->index[i]); + __wt_free(session, pindex); + } + __wt_free(session, page); + return (ret); + } + break; + case WT_PAGE_COL_VAR: + page->pg_var_recno = recno; + page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE)); + page->pg_var_entries = alloc_entries; + break; + case WT_PAGE_ROW_LEAF: + page->pg_row_d = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE)); + page->pg_row_entries = alloc_entries; + break; + WT_ILLEGAL_VALUE(session); + } + + /* Increment the cache statistics. */ + __wt_cache_page_inmem_incr(session, page, size); + (void)WT_ATOMIC_ADD8(cache->pages_inmem, 1); + + *pagep = page; + return (0); +} + +/* + * __wt_page_inmem -- + * Build in-memory page information. + */ +int +__wt_page_inmem(WT_SESSION_IMPL *session, + WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep) +{ + WT_DECL_RET; + WT_PAGE *page; + const WT_PAGE_HEADER *dsk; + uint32_t alloc_entries; + size_t size; + + *pagep = NULL; + + dsk = image; + alloc_entries = 0; + + /* + * Figure out how many underlying objects the page references so we can + * allocate them along with the page. + */ + switch (dsk->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_INT: + case WT_PAGE_COL_VAR: + /* + * Column-store leaf page entries map one-to-one to the number + * of physical entries on the page (each physical entry is a + * value item). 
+ * + * Column-store internal page entries map one-to-one to the + * number of physical entries on the page (each entry is a + * location cookie). + */ + alloc_entries = dsk->u.entries; + break; + case WT_PAGE_ROW_INT: + /* + * Row-store internal page entries map one-to-two to the number + * of physical entries on the page (each entry is a key and + * location cookie pair). + */ + alloc_entries = dsk->u.entries / 2; + break; + case WT_PAGE_ROW_LEAF: + /* + * If the "no empty values" flag is set, row-store leaf page + * entries map one-to-one to the number of physical entries + * on the page (each physical entry is a key or value item). + * If that flag is not set, there are more keys than values, + * we have to walk the page to figure it out. + */ + if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL)) + alloc_entries = dsk->u.entries; + else if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE)) + alloc_entries = dsk->u.entries / 2; + else + WT_RET(__inmem_row_leaf_entries( + session, dsk, &alloc_entries)); + break; + WT_ILLEGAL_VALUE(session); + } + + /* Allocate and initialize a new WT_PAGE. */ + WT_RET(__wt_page_alloc( + session, dsk->type, dsk->recno, alloc_entries, 1, &page)); + page->dsk = dsk; + F_SET_ATOMIC(page, flags); + + /* + * Track the memory allocated to build this page so we can update the + * cache statistics in a single call. + */ + size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0; + + switch (page->type) { + case WT_PAGE_COL_FIX: + __inmem_col_fix(session, page); + break; + case WT_PAGE_COL_INT: + __inmem_col_int(session, page); + break; + case WT_PAGE_COL_VAR: + WT_ERR(__inmem_col_var(session, page, &size)); + break; + case WT_PAGE_ROW_INT: + WT_ERR(__inmem_row_int(session, page, &size)); + break; + case WT_PAGE_ROW_LEAF: + WT_ERR(__inmem_row_leaf(session, page)); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Update the page's in-memory size and the cache statistics. 
*/ + __wt_cache_page_inmem_incr(session, page, size); + + /* Link the new internal page to the parent. */ + if (ref != NULL) { + switch (page->type) { + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + page->pg_intl_parent_ref = ref; + break; + } + ref->page = page; + } + + *pagep = page; + return (0); + +err: __wt_page_out(session, &page); + return (ret); +} + +/* + * __inmem_col_fix -- + * Build in-memory index for fixed-length column-store leaf pages. + */ +static void +__inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BTREE *btree; + const WT_PAGE_HEADER *dsk; + + btree = S2BT(session); + dsk = page->dsk; + + page->pg_fix_bitf = WT_PAGE_HEADER_BYTE(btree, dsk); +} + +/* + * __inmem_col_int -- + * Build in-memory index for column-store internal pages. + */ +static void +__inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + const WT_PAGE_HEADER *dsk; + WT_PAGE_INDEX *pindex; + WT_REF **refp, *ref; + uint32_t i; + + btree = S2BT(session); + dsk = page->dsk; + unpack = &_unpack; + + /* + * Walk the page, building references: the page contains value items. + * The value items are on-page items (WT_CELL_VALUE). + */ + pindex = WT_INTL_INDEX_COPY(page); + refp = pindex->index; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + ref = *refp++; + ref->home = page; + + __wt_cell_unpack(cell, unpack); + ref->addr = cell; + ref->key.recno = unpack->v; + } +} + +/* + * __inmem_col_var_repeats -- + * Count the number of repeat entries on the page. + */ +static int +__inmem_col_var_repeats(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t *np) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + const WT_PAGE_HEADER *dsk; + uint32_t i; + + btree = S2BT(session); + dsk = page->dsk; + unpack = &_unpack; + + /* Walk the page, counting entries for the repeats array. 
*/ + *np = 0; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + __wt_cell_unpack(cell, unpack); + if (__wt_cell_rle(unpack) > 1) + ++*np; + } + return (0); +} + +/* + * __inmem_col_var -- + * Build in-memory index for variable-length, data-only leaf pages in + * column-store trees. + */ +static int +__inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) +{ + WT_BTREE *btree; + WT_COL *cip; + WT_COL_RLE *repeats; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + const WT_PAGE_HEADER *dsk; + uint64_t recno, rle; + size_t bytes_allocated; + uint32_t i, indx, n, repeat_off; + + btree = S2BT(session); + dsk = page->dsk; + recno = page->pg_var_recno; + + repeats = NULL; + repeat_off = 0; + unpack = &_unpack; + bytes_allocated = 0; + + /* + * Walk the page, building references: the page contains unsorted value + * items. The value items are on-page (WT_CELL_VALUE), overflow items + * (WT_CELL_VALUE_OVFL) or deleted items (WT_CELL_DEL). + */ + indx = 0; + cip = page->pg_var_d; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + __wt_cell_unpack(cell, unpack); + WT_COL_PTR_SET(cip, WT_PAGE_DISK_OFFSET(page, cell)); + cip++; + + /* + * Add records with repeat counts greater than 1 to an array we + * use for fast lookups. The first entry we find needing the + * repeats array triggers a re-walk from the start of the page + * to determine the size of the array. + */ + rle = __wt_cell_rle(unpack); + if (rle > 1) { + if (repeats == NULL) { + WT_RET( + __inmem_col_var_repeats(session, page, &n)); + WT_RET(__wt_realloc_def(session, + &bytes_allocated, n + 1, &repeats)); + + page->pg_var_repeats = repeats; + page->pg_var_nrepeats = n; + *sizep += bytes_allocated; + } + repeats[repeat_off].indx = indx; + repeats[repeat_off].recno = recno; + repeats[repeat_off++].rle = rle; + } + indx++; + recno += rle; + } + + return (0); +} + +/* + * __inmem_row_int -- + * Build in-memory index for row-store internal pages. 
+ */
+static int
+__inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
+{
+	WT_BTREE *btree;
+	WT_CELL *cell;
+	WT_CELL_UNPACK *unpack, _unpack;
+	WT_DECL_ITEM(current);
+	WT_DECL_RET;
+	const WT_PAGE_HEADER *dsk;
+	WT_PAGE_INDEX *pindex;
+	WT_REF *ref, **refp;
+	uint32_t i;
+
+	btree = S2BT(session);
+	unpack = &_unpack;
+	dsk = page->dsk;
+
+	WT_RET(__wt_scr_alloc(session, 0, &current));
+
+	/*
+	 * Walk the page, instantiating keys: the page contains sorted key and
+	 * location cookie pairs.  Keys are on-page/overflow items and location
+	 * cookies are WT_CELL_ADDR_XXX items.
+	 */
+	pindex = WT_INTL_INDEX_COPY(page);
+	refp = pindex->index;
+	WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+		ref = *refp;
+		ref->home = page;
+
+		__wt_cell_unpack(cell, unpack);
+		switch (unpack->type) {
+		case WT_CELL_KEY:
+			/*
+			 * Note: we don't Huffman encode internal page keys,
+			 * there's no decoding work to do.
+			 */
+			__wt_ref_key_onpage_set(page, ref, unpack);
+			break;
+		case WT_CELL_KEY_OVFL:
+			/* Instantiate any overflow records. */
+			WT_ERR(__wt_dsk_cell_data_ref(
+			    session, page->type, unpack, current));
+
+			WT_ERR(__wt_row_ikey_incr(session, page,
+			    WT_PAGE_DISK_OFFSET(page, cell),
+			    current->data, current->size, &ref->key.ikey));
+
+			*sizep += sizeof(WT_IKEY) + current->size;
+			break;
+		case WT_CELL_ADDR_DEL:
+			/*
+			 * A cell may reference a deleted leaf page: if a leaf
+			 * page was deleted without being read (fast truncate),
+			 * and the deletion committed, but older transactions
+			 * in the system required the previous version of the
+			 * page to remain available, a special deleted-address
+			 * type cell is written.  The only reason we'd ever see
+			 * that cell on a page we're reading is if we crashed
+			 * and recovered (otherwise a version of the page w/o
+			 * that cell would have eventually been written).
If we
+			 * crash and recover to a page with a deleted-address
+			 * cell, we want to discard the page from the backing
+			 * store (it was never discarded), and, of course, by
+			 * definition no earlier transaction will ever need it.
+			 *
+			 * Re-create the state of a deleted page.
+			 */
+			ref->addr = cell;
+			ref->state = WT_REF_DELETED;
+			++refp;
+
+			/*
+			 * If the tree is already dirty and so will be written,
+			 * mark the page dirty.  (We want to free the deleted
+			 * pages, but if the handle is read-only or if the
+			 * application never modifies the tree, we're not able
+			 * to do so.)
+			 */
+			if (btree->modified) {
+				WT_ERR(__wt_page_modify_init(session, page));
+				__wt_page_modify_set(session, page);
+			}
+			break;
+		case WT_CELL_ADDR_INT:
+		case WT_CELL_ADDR_LEAF:
+		case WT_CELL_ADDR_LEAF_NO:
+			ref->addr = cell;
+			++refp;
+			break;
+		WT_ILLEGAL_VALUE_ERR(session);
+		}
+	}
+
+err:	__wt_scr_free(&current);
+	return (ret);
+}
+
+/*
+ * __inmem_row_leaf_entries --
+ *	Return the number of entries for row-store leaf pages.
+ */
+static int
+__inmem_row_leaf_entries(
+    WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint32_t *nindxp)
+{
+	WT_BTREE *btree;
+	WT_CELL *cell;
+	WT_CELL_UNPACK *unpack, _unpack;
+	uint32_t i, nindx;
+
+	btree = S2BT(session);
+	unpack = &_unpack;
+
+	/*
+	 * Leaf row-store page entries map to a maximum of one-to-one to the
+	 * number of physical entries on the page (each physical entry might be
+	 * a key without a subsequent data item).  To avoid over-allocation in
+	 * workloads without empty data items, first walk the page counting the
+	 * number of keys, then allocate the indices.
+	 *
+	 * The page contains key/data pairs.  Keys are on-page (WT_CELL_KEY) or
+	 * overflow (WT_CELL_KEY_OVFL) items, data are either non-existent or a
+	 * single on-page (WT_CELL_VALUE) or overflow (WT_CELL_VALUE_OVFL) item.
+ */ + nindx = 0; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + __wt_cell_unpack(cell, unpack); + switch (unpack->type) { + case WT_CELL_KEY: + case WT_CELL_KEY_OVFL: + ++nindx; + break; + case WT_CELL_VALUE: + case WT_CELL_VALUE_OVFL: + break; + WT_ILLEGAL_VALUE(session); + } + } + + *nindxp = nindx; + return (0); +} + +/* + * __inmem_row_leaf -- + * Build in-memory index for row-store leaf pages. + */ +static int +__inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + const WT_PAGE_HEADER *dsk; + WT_ROW *rip; + uint32_t i; + + btree = S2BT(session); + dsk = page->dsk; + unpack = &_unpack; + + /* Walk the page, building indices. */ + rip = page->pg_row_d; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + __wt_cell_unpack(cell, unpack); + switch (unpack->type) { + case WT_CELL_KEY_OVFL: + __wt_row_leaf_key_set_cell(page, rip, cell); + ++rip; + break; + case WT_CELL_KEY: + /* + * Simple keys without compression (not Huffman encoded + * or prefix compressed), can be directly referenced on + * the page to avoid repeatedly unpacking their cells. + */ + if (!btree->huffman_key && unpack->prefix == 0) + __wt_row_leaf_key_set(page, rip, unpack); + else + __wt_row_leaf_key_set_cell(page, rip, cell); + ++rip; + break; + case WT_CELL_VALUE: + /* + * Simple values without compression can be directly + * referenced on the page to avoid repeatedly unpacking + * their cells. + */ + if (!btree->huffman_value) + __wt_row_leaf_value_set(page, rip - 1, unpack); + break; + case WT_CELL_VALUE_OVFL: + break; + WT_ILLEGAL_VALUE(session); + } + } + + /* + * We do not currently instantiate keys on leaf pages when the page is + * loaded, they're instantiated on demand. 
+ */ + return (0); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c new file mode 100644 index 00000000000..9cd6f8310af --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -0,0 +1,88 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_cache_read -- + * Read a page from the file. + */ +int +__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_DECL_RET; + WT_ITEM tmp; + WT_PAGE *page; + WT_PAGE_STATE previous_state; + size_t addr_size; + const uint8_t *addr; + + page = NULL; + + /* + * Don't pass an allocated buffer to the underlying block read function, + * force allocation of new memory of the appropriate size. + */ + WT_CLEAR(tmp); + + /* + * Attempt to set the state to WT_REF_READING for normal reads, or + * WT_REF_LOCKED, for deleted pages. If successful, we've won the + * race, read the page. + */ + if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING)) + previous_state = WT_REF_DISK; + else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED)) + previous_state = WT_REF_DELETED; + else + return (0); + + /* + * Get the address: if there is no address, the page was deleted, but a + * subsequent search or insert is forcing re-creation of the name space. + * Otherwise, there's an address, read the backing disk page and build + * an in-memory version of the page. + */ + WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); + if (addr == NULL) { + WT_ASSERT(session, previous_state == WT_REF_DELETED); + + WT_ERR(__wt_btree_new_leaf_page(session, &page)); + ref->page = page; + } else { + /* Read the backing disk page. */ + WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); + + /* Build the in-memory version of the page. */ + WT_ERR(__wt_page_inmem(session, ref, tmp.data, + WT_DATA_IN_ITEM(&tmp) ? 
+ WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); + + /* If the page was deleted, instantiate that information. */ + if (previous_state == WT_REF_DELETED) + WT_ERR(__wt_delete_page_instantiate(session, ref)); + } + + WT_ERR(__wt_verbose(session, WT_VERB_READ, + "page %p: %s", page, __wt_page_type_string(page->type))); + + WT_PUBLISH(ref->state, WT_REF_MEM); + return (0); + +err: /* + * If the function building an in-memory version of the page failed, + * it discarded the page, but not the disk image. Discard the page + * and separately discard the disk image in all cases. + */ + if (ref->page != NULL) + __wt_ref_out(session, ref); + WT_PUBLISH(ref->state, previous_state); + + __wt_buf_free(session, &tmp); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c new file mode 100644 index 00000000000..25b4bfc3005 --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_kv_return -- + * Return a page referenced key/value pair to the application. + */ +int +__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK unpack; + WT_CURSOR *cursor; + WT_PAGE *page; + WT_ROW *rip; + uint8_t v; + + btree = S2BT(session); + + page = cbt->ref->page; + cursor = &cbt->iface; + + switch (page->type) { + case WT_PAGE_COL_FIX: + /* + * The interface cursor's record has usually been set, but that + * isn't universally true, specifically, cursor.search_near may + * call here without first setting the interface cursor. + */ + cursor->recno = cbt->recno; + + /* If the cursor references a WT_UPDATE item, return it. 
*/ + if (upd != NULL) { + cursor->value.data = WT_UPDATE_DATA(upd); + cursor->value.size = upd->size; + return (0); + } + + /* Take the value from the original page. */ + v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt); + return (__wt_buf_set(session, &cursor->value, &v, 1)); + case WT_PAGE_COL_VAR: + /* + * The interface cursor's record has usually been set, but that + * isn't universally true, specifically, cursor.search_near may + * call here without first setting the interface cursor. + */ + cursor->recno = cbt->recno; + + /* If the cursor references a WT_UPDATE item, return it. */ + if (upd != NULL) { + cursor->value.data = WT_UPDATE_DATA(upd); + cursor->value.size = upd->size; + return (0); + } + + /* Take the value from the original page cell. */ + cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]); + break; + case WT_PAGE_ROW_LEAF: + rip = &page->pg_row_d[cbt->slot]; + + /* + * If the cursor references a WT_INSERT item, take its key. + * Else, if we have an exact match, we copied the key in the + * search function, take it from there. + * If we don't have an exact match, take the key from the + * original page. + */ + if (cbt->ins != NULL) { + cursor->key.data = WT_INSERT_KEY(cbt->ins); + cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins); + } else if (cbt->compare == 0) { + cursor->key.data = cbt->search_key.data; + cursor->key.size = cbt->search_key.size; + } else + WT_RET(__wt_row_leaf_key( + session, page, rip, &cursor->key, 0)); + + /* If the cursor references a WT_UPDATE item, return it. */ + if (upd != NULL) { + cursor->value.data = WT_UPDATE_DATA(upd); + cursor->value.size = upd->size; + return (0); + } + + /* Simple values have their location encoded in the WT_ROW. */ + if (__wt_row_leaf_value(page, rip, &cursor->value)) + return (0); + + /* + * Take the value from the original page cell (which may be + * empty). 
+ */ + if ((cell = + __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) { + cursor->value.size = 0; + return (0); + } + break; + WT_ILLEGAL_VALUE(session); + } + + /* The value is an on-page cell, unpack and expand it as necessary. */ + __wt_cell_unpack(cell, &unpack); + WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value)); + + return (0); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c new file mode 100644 index 00000000000..10366e91a0e --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -0,0 +1,2520 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +struct __wt_stuff; typedef struct __wt_stuff WT_STUFF; +struct __wt_track; typedef struct __wt_track WT_TRACK; +struct __wt_track_shared; typedef struct __wt_track_shared WT_TRACK_SHARED; + +/* + * There's a bunch of stuff we pass around during salvage, group it together + * to make the code prettier. + */ +struct __wt_stuff { + WT_SESSION_IMPL *session; /* Salvage session */ + + WT_TRACK **pages; /* Pages */ + uint32_t pages_next; /* Next empty slot */ + size_t pages_allocated; /* Bytes allocated */ + + WT_TRACK **ovfl; /* Overflow pages */ + uint32_t ovfl_next; /* Next empty slot */ + size_t ovfl_allocated; /* Bytes allocated */ + + WT_REF root_ref; /* Created root page */ + + uint8_t page_type; /* Page type */ + + /* If need to free blocks backing merged page ranges. */ + int merge_free; + + WT_ITEM *tmp1; /* Verbose print buffer */ + WT_ITEM *tmp2; /* Verbose print buffer */ + + uint64_t fcnt; /* Progress counter */ +}; + +/* + * WT_TRACK_SHARED -- + * Information shared between pages being merged. + */ +struct __wt_track_shared { + uint32_t ref; /* Reference count */ + + /* + * Physical information about the file block. 
+ */ + WT_ADDR addr; /* Page address */ + uint32_t size; /* Page size */ + uint64_t gen; /* Page generation */ + + /* + * Pages that reference overflow pages contain a list of the overflow + * pages they reference. We start out with a list of addresses, and + * convert to overflow array slots during the reconciliation of page + * references to overflow records. + */ + WT_ADDR *ovfl_addr; /* Overflow pages by address */ + uint32_t *ovfl_slot; /* Overflow pages by slot */ + uint32_t ovfl_cnt; /* Overflow reference count */ +}; + +/* + * WT_TRACK -- + * Structure to track chunks, one per chunk; we start out with a chunk per + * page (either leaf or overflow), but when we find overlapping key ranges, we + * split the leaf page chunks up, one chunk for each unique key range. + */ +struct __wt_track { +#define trk_addr shared->addr.addr +#define trk_addr_size shared->addr.size +#define trk_gen shared->gen +#define trk_ovfl_addr shared->ovfl_addr +#define trk_ovfl_cnt shared->ovfl_cnt +#define trk_ovfl_slot shared->ovfl_slot +#define trk_size shared->size + WT_TRACK_SHARED *shared; /* Shared information */ + + WT_STUFF *ss; /* Enclosing stuff */ + + union { + struct { +#undef row_start +#define row_start u.row._row_start + WT_ITEM _row_start; /* Row-store start range */ +#undef row_stop +#define row_stop u.row._row_stop + WT_ITEM _row_stop; /* Row-store stop range */ + } row; + + struct { +#undef col_start +#define col_start u.col._col_start + uint64_t _col_start; /* Col-store start range */ +#undef col_stop +#define col_stop u.col._col_stop + uint64_t _col_stop; /* Col-store stop range */ +#undef col_missing +#define col_missing u.col._col_missing + uint64_t _col_missing; /* Col-store missing range */ + } col; + } u; + +#define WT_TRACK_CHECK_START 0x01 /* Row: initial key updated */ +#define WT_TRACK_CHECK_STOP 0x02 /* Row: last key updated */ +#define WT_TRACK_MERGE 0x04 /* Page requires merging */ +#define WT_TRACK_OVFL_REFD 0x08 /* Overflow page referenced */ + u_int 
flags; +}; + +static int __slvg_cleanup(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_col_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *); +static int __slvg_col_build_leaf(WT_SESSION_IMPL *, WT_TRACK *, WT_REF *); +static int __slvg_col_ovfl( + WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint64_t, uint64_t); +static int __slvg_col_range(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_col_range_missing(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_col_range_overlap( + WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *); +static void __slvg_col_trk_update_start(uint32_t, WT_STUFF *); +static int __slvg_merge_block_free(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_ovfl_compare(const void *, const void *); +static int __slvg_ovfl_discard(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_ovfl_reconcile(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_ovfl_ref(WT_SESSION_IMPL *, WT_TRACK *, int); +static int __slvg_ovfl_ref_all(WT_SESSION_IMPL *, WT_TRACK *); +static int __slvg_read(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_row_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *); +static int __slvg_row_build_leaf( + WT_SESSION_IMPL *, WT_TRACK *, WT_REF *, WT_STUFF *); +static int __slvg_row_ovfl( + WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint32_t, uint32_t); +static int __slvg_row_range(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_row_range_overlap( + WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *); +static int __slvg_row_trk_update_start( + WT_SESSION_IMPL *, WT_ITEM *, uint32_t, WT_STUFF *); +static int __slvg_trk_compare_addr(const void *, const void *); +static int __slvg_trk_compare_gen(const void *, const void *); +static int __slvg_trk_compare_key(const void *, const void *); +static int __slvg_trk_free(WT_SESSION_IMPL *, WT_TRACK **, int); +static void __slvg_trk_free_addr(WT_SESSION_IMPL *, WT_TRACK *); +static int __slvg_trk_init(WT_SESSION_IMPL *, uint8_t *, + size_t, uint32_t, uint64_t, WT_STUFF *, WT_TRACK 
**); +static int __slvg_trk_leaf(WT_SESSION_IMPL *, + const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *); +static int __slvg_trk_leaf_ovfl( + WT_SESSION_IMPL *, const WT_PAGE_HEADER *, WT_TRACK *); +static int __slvg_trk_ovfl(WT_SESSION_IMPL *, + const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *); +static int __slvg_trk_split(WT_SESSION_IMPL *, WT_TRACK *, WT_TRACK **); + +/* + * __wt_bt_salvage -- + * Salvage a Btree. + */ +int +__wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_DECL_RET; + WT_STUFF *ss, stuff; + uint32_t i, leaf_cnt; + + WT_UNUSED(cfg); + + btree = S2BT(session); + bm = btree->bm; + + WT_CLEAR(stuff); + ss = &stuff; + ss->session = session; + ss->page_type = WT_PAGE_INVALID; + + /* Allocate temporary buffers. */ + WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp1)); + WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp2)); + + /* + * Step 1: + * Inform the underlying block manager that we're salvaging the file. + */ + WT_ERR(bm->salvage_start(bm, session)); + + /* + * Step 2: + * Read the file and build in-memory structures that reference any leaf + * or overflow page. Any pages other than leaf or overflow pages are + * added to the free list. + * + * Turn off read checksum and verification error messages while we're + * reading the file, we expect to see corrupted blocks. + */ + F_SET(session, WT_SESSION_SALVAGE_CORRUPT_OK); + ret = __slvg_read(session, ss); + F_CLR(session, WT_SESSION_SALVAGE_CORRUPT_OK); + WT_ERR(ret); + + /* + * Step 3: + * Discard any page referencing a non-existent overflow page. We do + * this before checking overlapping key ranges on the grounds that a + * bad key range we can use is better than a terrific key range that + * references pages we don't have. 
On the other hand, we subsequently + * discard key ranges where there are better overlapping ranges, and + * it would be better if we let the availability of an overflow value + * inform our choices as to the key ranges we select, ideally on a + * per-key basis. + * + * A complicating problem is found in variable-length column-store + * objects, where we potentially split key ranges within RLE units. + * For example, if there's a page with rows 15-20 and we later find + * row 17 with a larger LSN, the range splits into 3 chunks, 15-16, + * 17, and 18-20. If rows 15-20 were originally a single value (an + * RLE of 6), and that record is an overflow record, we end up with + * two chunks, both of which want to reference the same overflow value. + * + * Instead of the approach just described, we're first discarding any + * pages referencing non-existent overflow pages, then we're reviewing + * our key ranges and discarding any that overlap. We're doing it that + * way for a few reasons: absent corruption, missing overflow items are + * strong arguments the page was replaced (on the other hand, some kind + * of file corruption is probably why we're here); it's a significant + * amount of additional complexity to simultaneously juggle overlapping + * ranges and missing overflow items; finally, real-world applications + * usually don't have a lot of overflow items, as WiredTiger supports + * very large page sizes, overflow items shouldn't be common. + * + * Step 4: + * Add unreferenced overflow page blocks to the free list so they are + * reused immediately. + */ + if (ss->ovfl_next != 0) { + WT_ERR(__slvg_ovfl_reconcile(session, ss)); + WT_ERR(__slvg_ovfl_discard(session, ss)); + } + + /* + * Step 5: + * Walk the list of pages looking for overlapping ranges to resolve. + * If we find a range that needs to be resolved, set a global flag + * and a per WT_TRACK flag on the pages requiring modification. 
+ * + * This requires sorting the page list by key, and secondarily by LSN. + * + * !!! + * It's vanishingly unlikely and probably impossible for fixed-length + * column-store files to have overlapping key ranges. It's possible + * for an entire key range to go missing (if a page is corrupted and + * lost), but because pages can't split, it shouldn't be possible to + * find pages where the key ranges overlap. That said, we check for + * it and clean up after it in reconciliation because it doesn't cost + * much and future column-store formats or operations might allow for + * fixed-length format ranges to overlap during salvage, and I don't + * want to have to retrofit the code later. + */ + qsort(ss->pages, + (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_key); + if (ss->page_type == WT_PAGE_ROW_LEAF) + WT_ERR(__slvg_row_range(session, ss)); + else + WT_ERR(__slvg_col_range(session, ss)); + + /* + * Step 6: + * We may have lost key ranges in column-store databases, that is, some + * part of the record number space is gone. Look for missing ranges. + */ + switch (ss->page_type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + WT_ERR(__slvg_col_range_missing(session, ss)); + break; + case WT_PAGE_ROW_LEAF: + break; + } + + /* + * Step 7: + * Build an internal page that references all of the leaf pages, + * and write it, as well as any merged pages, to the file. + * + * Count how many leaf pages we have (we could track this during the + * array shuffling/splitting, but that's a lot harder). 
+ */ + for (leaf_cnt = i = 0; i < ss->pages_next; ++i) + if (ss->pages[i] != NULL) + ++leaf_cnt; + if (leaf_cnt != 0) + switch (ss->page_type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + WT_ERR( + __slvg_col_build_internal(session, leaf_cnt, ss)); + break; + case WT_PAGE_ROW_LEAF: + WT_ERR( + __slvg_row_build_internal(session, leaf_cnt, ss)); + break; + } + + /* + * Step 8: + * If we had to merge key ranges, we have to do a final pass through + * the leaf page array and discard file pages used during key merges. + * We can't do it earlier: if we free'd the leaf pages we're merging as + * we merged them, the write of subsequent leaf pages or the internal + * page might allocate those free'd file blocks, and if the salvage run + * subsequently fails, we'd have overwritten pages used to construct the + * final key range. In other words, if the salvage run fails, we don't + * want to overwrite data the next salvage run might need. + */ + if (ss->merge_free) + WT_ERR(__slvg_merge_block_free(session, ss)); + + /* + * Step 9: + * Evict the newly created root page, creating a checkpoint. + */ + if (ss->root_ref.page != NULL) { + btree->ckpt = ckptbase; + ret = __wt_rec_evict(session, &ss->root_ref, 1); + ss->root_ref.page = NULL; + btree->ckpt = NULL; + } + + /* + * Step 10: + * Inform the underlying block manager that we're done. + */ +err: WT_TRET(bm->salvage_end(bm, session)); + + /* Discard any root page we created. */ + if (ss->root_ref.page != NULL) + __wt_ref_out(session, &ss->root_ref); + + /* Discard the leaf and overflow page memory. */ + WT_TRET(__slvg_cleanup(session, ss)); + + /* Discard temporary buffers. */ + __wt_scr_free(&ss->tmp1); + __wt_scr_free(&ss->tmp2); + + /* Wrap up reporting. */ + WT_TRET(__wt_progress(session, NULL, ss->fcnt)); + + return (ret); +} + +/* + * __slvg_read -- + * Read the file and build a table of the pages we can use. 
+ */ +static int +__slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss) +{ + WT_BM *bm; + WT_DECL_ITEM(as); + WT_DECL_ITEM(buf); + WT_DECL_RET; + const WT_PAGE_HEADER *dsk; + size_t addr_size; + uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE]; + int eof, valid; + + bm = S2BT(session)->bm; + WT_ERR(__wt_scr_alloc(session, 0, &as)); + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + + for (;;) { + /* Get the next block address from the block manager. */ + WT_ERR(bm->salvage_next(bm, session, addr, &addr_size, &eof)); + if (eof) + break; + + /* Report progress every 10 chunks. */ + if (++ss->fcnt % 10 == 0) + WT_ERR(__wt_progress(session, NULL, ss->fcnt)); + + /* + * Read (and potentially decompress) the block; the underlying + * block manager might return only good blocks if checksums are + * configured, or both good and bad blocks if we're relying on + * compression. + * + * Report the block's status to the block manager. + */ + if ((ret = __wt_bt_read(session, buf, addr, addr_size)) == 0) + valid = 1; + else { + valid = 0; + if (ret == WT_ERROR) + ret = 0; + WT_ERR(ret); + } + WT_ERR(bm->salvage_valid(bm, session, addr, addr_size, valid)); + if (!valid) + continue; + + /* Create a printable version of the address. */ + WT_ERR(bm->addr_string(bm, session, as, addr, addr_size)); + + /* + * Make sure it's an expected page type for the file. + * + * We only care about leaf and overflow pages from here on out; + * discard all of the others. We put them on the free list now, + * because we might as well overwrite them, we want the file to + * grow as little as possible, or shrink, and future salvage + * calls don't need them either. + */ + dsk = buf->data; + switch (dsk->type) { + case WT_PAGE_BLOCK_MANAGER: + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, + "%s page ignored %s", + __wt_page_type_string(dsk->type), + (const char *)as->data)); + WT_ERR(bm->free(bm, session, addr, addr_size)); + continue; + } + + /* + * Verify the page. 
It's unlikely a page could have a valid + * checksum and still be broken, but paranoia is healthy in + * salvage. Regardless, verify does return failure because + * it detects failures we'd expect to see in a corrupted file, + * like overflow references past the end of the file or + * overflow references to non-existent pages, might as well + * discard these pages now. + */ + if (__wt_verify_dsk(session, as->data, buf) != 0) { + WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, + "%s page failed verify %s", + __wt_page_type_string(dsk->type), + (const char *)as->data)); + WT_ERR(bm->free(bm, session, addr, addr_size)); + continue; + } + + WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, + "tracking %s page, generation %" PRIu64 " %s", + __wt_page_type_string(dsk->type), dsk->write_gen, + (const char *)as->data)); + + switch (dsk->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + case WT_PAGE_ROW_LEAF: + if (ss->page_type == WT_PAGE_INVALID) + ss->page_type = dsk->type; + if (ss->page_type != dsk->type) + WT_ERR_MSG(session, WT_ERROR, + "file contains multiple file formats (both " + "%s and %s), and cannot be salvaged", + __wt_page_type_string(ss->page_type), + __wt_page_type_string(dsk->type)); + + WT_ERR(__slvg_trk_leaf( + session, dsk, addr, addr_size, ss)); + break; + case WT_PAGE_OVFL: + WT_ERR(__slvg_trk_ovfl( + session, dsk, addr, addr_size, ss)); + break; + } + } + +err: __wt_scr_free(&as); + __wt_scr_free(&buf); + + return (ret); +} + +/* + * __slvg_trk_init -- + * Initialize tracking information for a page. 
+ */ +static int +__slvg_trk_init(WT_SESSION_IMPL *session, + uint8_t *addr, size_t addr_size, + uint32_t size, uint64_t gen, WT_STUFF *ss, WT_TRACK **retp) +{ + WT_DECL_RET; + WT_TRACK *trk; + + WT_RET(__wt_calloc_def(session, 1, &trk)); + WT_ERR(__wt_calloc_def(session, 1, &trk->shared)); + trk->shared->ref = 1; + + trk->ss = ss; + WT_ERR(__wt_strndup(session, addr, addr_size, &trk->trk_addr)); + trk->trk_addr_size = (uint8_t)addr_size; + trk->trk_size = size; + trk->trk_gen = gen; + + *retp = trk; + return (0); + +err: __wt_free(session, trk->trk_addr); + __wt_free(session, trk->shared); + __wt_free(session, trk); + return (ret); +} + +/* + * __slvg_trk_split -- + * Split a tracked chunk. + */ +static int +__slvg_trk_split(WT_SESSION_IMPL *session, WT_TRACK *orig, WT_TRACK **newp) +{ + WT_TRACK *trk; + + WT_RET(__wt_calloc_def(session, 1, &trk)); + + trk->shared = orig->shared; + trk->ss = orig->ss; + + ++orig->shared->ref; + + *newp = trk; + return (0); +} + +/* + * __slvg_trk_leaf -- + * Track a leaf page. + */ +static int +__slvg_trk_leaf(WT_SESSION_IMPL *session, + const WT_PAGE_HEADER *dsk, uint8_t *addr, size_t addr_size, WT_STUFF *ss) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + WT_DECL_RET; + WT_PAGE *page; + WT_TRACK *trk; + uint64_t stop_recno; + uint32_t i; + + btree = S2BT(session); + unpack = &_unpack; + page = NULL; + trk = NULL; + + /* Re-allocate the array of pages, as necessary. */ + WT_RET(__wt_realloc_def( + session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages)); + + /* Allocate a WT_TRACK entry for this new page and fill it in. */ + WT_RET(__slvg_trk_init( + session, addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &trk)); + + switch (dsk->type) { + case WT_PAGE_COL_FIX: + /* + * Column-store fixed-sized format: start and stop keys can be + * taken from the block's header, and doesn't contain overflow + * items. 
+ */ + trk->col_start = dsk->recno; + trk->col_stop = dsk->recno + (dsk->u.entries - 1); + + WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, + "%s records %" PRIu64 "-%" PRIu64, + __wt_addr_string( + session, trk->trk_addr, trk->trk_addr_size, ss->tmp1), + trk->col_start, trk->col_stop)); + break; + case WT_PAGE_COL_VAR: + /* + * Column-store variable-length format: the start key can be + * taken from the block's header, stop key requires walking + * the page. + */ + stop_recno = dsk->recno; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + __wt_cell_unpack(cell, unpack); + stop_recno += __wt_cell_rle(unpack); + } + + trk->col_start = dsk->recno; + trk->col_stop = stop_recno - 1; + + WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, + "%s records %" PRIu64 "-%" PRIu64, + __wt_addr_string( + session, trk->trk_addr, trk->trk_addr_size, ss->tmp1), + trk->col_start, trk->col_stop)); + + /* Column-store pages can contain overflow items. */ + WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk)); + break; + case WT_PAGE_ROW_LEAF: + /* + * Row-store format: copy the first and last keys on the page. + * Keys are prefix-compressed, the simplest and slowest thing + * to do is instantiate the in-memory page, then instantiate + * and copy the full keys, then free the page. We do this + * on every leaf page, and if you need to speed up the salvage, + * it's probably a great place to start. 
+ */ + WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, &page)); + WT_ERR(__wt_row_leaf_key_copy(session, + page, &page->pg_row_d[0], &trk->row_start)); + WT_ERR(__wt_row_leaf_key_copy(session, page, + &page->pg_row_d[page->pg_row_entries - 1], &trk->row_stop)); + + if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) { + WT_ERR(__wt_buf_set_printable(session, ss->tmp1, + trk->row_start.data, trk->row_start.size)); + WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, + "%s start key %.*s", + __wt_addr_string(session, + trk->trk_addr, trk->trk_addr_size, ss->tmp2), + (int)ss->tmp1->size, (char *)ss->tmp1->data)); + WT_ERR(__wt_buf_set_printable(session, ss->tmp1, + trk->row_stop.data, trk->row_stop.size)); + WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, + "%s stop key %.*s", + __wt_addr_string(session, + trk->trk_addr, trk->trk_addr_size, ss->tmp2), + (int)ss->tmp1->size, (char *)ss->tmp1->data)); + } + + /* Row-store pages can contain overflow items. */ + WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk)); + break; + } + ss->pages[ss->pages_next++] = trk; + + if (0) { +err: __wt_free(session, trk); + } + if (page != NULL) + __wt_page_out(session, &page); + return (ret); +} + +/* + * __slvg_trk_ovfl -- + * Track an overflow page. + */ +static int +__slvg_trk_ovfl(WT_SESSION_IMPL *session, + const WT_PAGE_HEADER *dsk, uint8_t *addr, size_t addr_size, WT_STUFF *ss) +{ + WT_TRACK *trk; + + /* + * Reallocate the overflow page array as necessary, then save the + * page's location information. + */ + WT_RET(__wt_realloc_def( + session, &ss->ovfl_allocated, ss->ovfl_next + 1, &ss->ovfl)); + + WT_RET(__slvg_trk_init( + session, addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &trk)); + ss->ovfl[ss->ovfl_next++] = trk; + + return (0); +} + +/* + * __slvg_trk_leaf_ovfl -- + * Search a leaf page for overflow items. 
+ */ +static int +__slvg_trk_leaf_ovfl( + WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_TRACK *trk) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + uint32_t i, ovfl_cnt; + + btree = S2BT(session); + unpack = &_unpack; + + /* + * Two passes: count the overflow items, then copy them into an + * allocated array. + */ + ovfl_cnt = 0; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + __wt_cell_unpack(cell, unpack); + if (unpack->ovfl) + ++ovfl_cnt; + } + if (ovfl_cnt == 0) + return (0); + + /* Allocate room for the array of overflow addresses and fill it in. */ + WT_RET(__wt_calloc_def(session, ovfl_cnt, &trk->trk_ovfl_addr)); + trk->trk_ovfl_cnt = ovfl_cnt; + + ovfl_cnt = 0; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + __wt_cell_unpack(cell, unpack); + if (unpack->ovfl) { + WT_RET(__wt_strndup(session, unpack->data, + unpack->size, &trk->trk_ovfl_addr[ovfl_cnt].addr)); + trk->trk_ovfl_addr[ovfl_cnt].size = + (uint8_t)unpack->size; + + WT_RET(__wt_verbose(session, WT_VERB_SALVAGE, + "%s overflow reference %s", + __wt_addr_string(session, + trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), + __wt_addr_string(session, + unpack->data, unpack->size, trk->ss->tmp2))); + + if (++ovfl_cnt == trk->trk_ovfl_cnt) + break; + } + } + + return (0); +} + +/* + * __slvg_col_range -- + * Figure out the leaf pages we need and free the leaf pages we don't. + * + * When pages split, the key range is split across multiple pages. If not all + * of the old versions of the page are overwritten, or not all of the new pages + * are written, or some of the pages are corrupted, salvage will read different + * pages with overlapping key ranges, at different LSNs. + * + * We salvage all of the key ranges we find, at the latest LSN value: this means + * we may resurrect pages of deleted items, as page deletion doesn't write leaf + * pages and salvage will read and instantiate the contents of an old version of + * the deleted page. 
+ * + * The leaf page array is sorted in key order, and secondarily on LSN: what this + * means is that for each new key range, the first page we find is the best page + * for that key. The process is to walk forward from each page until we reach + * a page with a starting key after the current page's stopping key. + * + * For each of page, check to see if they overlap the current page's key range. + * If they do, resolve the overlap. Because WiredTiger rarely splits pages, + * overlap resolution usually means discarding a page because the key ranges + * are the same, and one of the pages is simply an old version of the other. + * + * However, it's possible more complex resolution is necessary. For example, + * here's an improbably complex list of page ranges and LSNs: + * + * Page Range LSN + * 30 A-G 3 + * 31 C-D 4 + * 32 B-C 5 + * 33 C-F 6 + * 34 C-D 7 + * 35 F-M 8 + * 36 H-O 9 + * + * We walk forward from each page reviewing all other pages in the array that + * overlap the range. For each overlap, the current or the overlapping + * page is updated so the page with the most recent information for any range + * "owns" that range. Here's an example for page 30. + * + * Review page 31: because page 31 has the range C-D and a higher LSN than page + * 30, page 30 would "split" into two ranges, A-C and E-G, conceding the C-D + * range to page 31. The new track element would be inserted into array with + * the following result: + * + * Page Range LSN + * 30 A-C 3 << Changed WT_TRACK element + * 31 C-D 4 + * 32 B-C 5 + * 33 C-F 6 + * 34 C-D 7 + * 30 E-G 3 << New WT_TRACK element + * 35 F-M 8 + * 36 H-O 9 + * + * Continue the review of the first element, using its new values. + * + * Review page 32: because page 31 has the range B-C and a higher LSN than page + * 30, page 30's A-C range would be truncated, conceding the B-C range to page + * 32. 
+ * 30 A-B 3 + * E-G 3 + * 31 C-D 4 + * 32 B-C 5 + * 33 C-F 6 + * 34 C-D 7 + * + * Review page 33: because page 33 has a starting key (C) past page 30's ending + * key (B), we stop evaluating page 30's A-B range, as there can be no further + * overlaps. + * + * This process is repeated for each page in the array. + * + * When page 33 is processed, we'd discover that page 33's C-F range overlaps + * page 30's E-G range, and page 30's E-G range would be updated, conceding the + * E-F range to page 33. + * + * This is not computationally expensive because we don't walk far forward in + * the leaf array because it's sorted by starting key, and because WiredTiger + * splits are rare, the chance of finding the kind of range overlap requiring + * re-sorting the array is small. + */ +static int +__slvg_col_range(WT_SESSION_IMPL *session, WT_STUFF *ss) +{ + WT_TRACK *jtrk; + uint32_t i, j; + + /* + * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR + * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE + * BEING HANDLED. + * + * Walk the page array looking for overlapping key ranges, adjusting + * the ranges based on the LSN until there are no overlaps. + * + * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE + * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE + * PLUS OFFSET. + */ + for (i = 0; i < ss->pages_next; ++i) { + if (ss->pages[i] == NULL) + continue; + + /* Check for pages that overlap our page. */ + for (j = i + 1; j < ss->pages_next; ++j) { + if (ss->pages[j] == NULL) + continue; + /* + * We're done if this page starts after our stop, no + * subsequent pages can overlap our page. + */ + if (ss->pages[j]->col_start > + ss->pages[i]->col_stop) + break; + + /* There's an overlap, fix it up. 
*/ + jtrk = ss->pages[j]; + WT_RET(__slvg_col_range_overlap(session, i, j, ss)); + + /* + * If the overlap resolution changed the entry's start + * key, the entry might have moved and the page array + * re-sorted, and pages[j] would reference a different + * page. We don't move forward if that happened, we + * re-process the slot again (by decrementing j before + * the loop's increment). + */ + if (ss->pages[j] != NULL && jtrk != ss->pages[j]) + --j; + } + } + return (0); +} + +/* + * __slvg_col_range_overlap -- + * Two column-store key ranges overlap, deal with it. + */ +static int +__slvg_col_range_overlap( + WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_slot, WT_STUFF *ss) +{ + WT_TRACK *a_trk, *b_trk, *new; + uint32_t i; + + /* + * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR + * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE + * BEING HANDLED. + */ + a_trk = ss->pages[a_slot]; + b_trk = ss->pages[b_slot]; + + WT_RET(__wt_verbose(session, WT_VERB_SALVAGE, + "%s and %s range overlap", + __wt_addr_string( + session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1), + __wt_addr_string( + session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2))); + + /* + * The key ranges of two WT_TRACK pages in the array overlap -- choose + * the ranges we're going to take from each. 
+ * + * We can think of the overlap possibilities as 11 different cases: + * + * AAAAAAAAAAAAAAAAAA + * #1 BBBBBBBBBBBBBBBBBB pages are the same + * #2 BBBBBBBBBBBBB overlaps the beginning + * #3 BBBBBBBBBBBBBBBB overlaps the end + * #4 BBBBB B is a prefix of A + * #5 BBBBBB B is middle of A + * #6 BBBBBBBBBB B is a suffix of A + * + * and: + * + * BBBBBBBBBBBBBBBBBB + * #7 AAAAAAAAAAAAA same as #3 + * #8 AAAAAAAAAAAAAAAA same as #2 + * #9 AAAAA A is a prefix of B + * #10 AAAAAA A is middle of B + * #11 AAAAAAAAAA A is a suffix of B + * + * Note the leaf page array was sorted by key and a_trk appears earlier + * in the array than b_trk, so cases #2/8, #10 and #11 are impossible. + * + * Finally, there's one additional complicating factor -- final ranges + * are assigned based on the page's LSN. + */ + /* Case #2/8, #10, #11 */ + if (a_trk->col_start > b_trk->col_start) + WT_PANIC_RET( + session, EINVAL, "unexpected merge array sort order"); + + if (a_trk->col_start == b_trk->col_start) { /* Case #1, #4 and #9 */ + /* + * The secondary sort of the leaf page array was the page's LSN, + * in high-to-low order, which means a_trk has a higher LSN, and + * is more desirable, than b_trk. In cases #1 and #4 and #9, + * where the start of the range is the same for the two pages, + * this simplifies things, it guarantees a_trk has a higher LSN + * than b_trk. + */ + if (a_trk->col_stop >= b_trk->col_stop) + /* + * Case #1, #4: a_trk is a superset of b_trk, and a_trk + * is more desirable -- discard b_trk. + */ + goto delete_b; + + /* + * Case #9: b_trk is a superset of a_trk, but a_trk is more + * desirable: keep both but delete a_trk's key range from + * b_trk. 
+ */ + b_trk->col_start = a_trk->col_stop + 1; + __slvg_col_trk_update_start(b_slot, ss); + F_SET(b_trk, WT_TRACK_MERGE); + goto merge; + } + + if (a_trk->col_stop == b_trk->col_stop) { /* Case #6 */ + if (a_trk->trk_gen > b_trk->trk_gen) + /* + * Case #6: a_trk is a superset of b_trk and a_trk is + * more desirable -- discard b_trk. + */ + goto delete_b; + + /* + * Case #6: a_trk is a superset of b_trk, but b_trk is more + * desirable: keep both but delete b_trk's key range from a_trk. + */ + a_trk->col_stop = b_trk->col_start - 1; + F_SET(a_trk, WT_TRACK_MERGE); + goto merge; + } + + if (a_trk->col_stop < b_trk->col_stop) { /* Case #3/7 */ + if (a_trk->trk_gen > b_trk->trk_gen) { + /* + * Case #3/7: a_trk is more desirable, delete a_trk's + * key range from b_trk; + */ + b_trk->col_start = a_trk->col_stop + 1; + __slvg_col_trk_update_start(b_slot, ss); + F_SET(b_trk, WT_TRACK_MERGE); + } else { + /* + * Case #3/7: b_trk is more desirable, delete b_trk's + * key range from a_trk; + */ + a_trk->col_stop = b_trk->col_start - 1; + F_SET(a_trk, WT_TRACK_MERGE); + } + goto merge; + } + + /* + * Case #5: a_trk is a superset of b_trk and a_trk is more desirable -- + * discard b_trk. + */ + if (a_trk->trk_gen > b_trk->trk_gen) { +delete_b: /* + * After page and overflow reconciliation, one (and only one) + * page can reference an overflow record. But, if we split a + * page into multiple chunks, any of the chunks might own any + * of the backing overflow records, so overflow records won't + * normally be discarded until after the merge phase completes. + * (The merge phase is where the final pages are written, and + * we figure out which overflow records are actually used.) + * If freeing a chunk and there are no other references to the + * underlying shared information, the overflow records must be + * useless, discard them to keep the final file size small. 
+ */ + if (b_trk->shared->ref == 1) + for (i = 0; i < b_trk->trk_ovfl_cnt; ++i) + WT_RET(__slvg_trk_free(session, + &ss->ovfl[b_trk->trk_ovfl_slot[i]], 1)); + return (__slvg_trk_free(session, &ss->pages[b_slot], 1)); + } + + /* + * Case #5: b_trk is more desirable and is a middle chunk of a_trk. + * Split a_trk into two parts, the key range before b_trk and the + * key range after b_trk. + */ + WT_RET(__slvg_trk_split(session, a_trk, &new)); + + /* + * Second, reallocate the array of pages if necessary, and then insert + * the new element into the array after the existing element (that's + * probably wrong, but we'll fix it up in a second). + */ + WT_RET(__wt_realloc_def( + session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages)); + memmove(ss->pages + a_slot + 1, ss->pages + a_slot, + (ss->pages_next - a_slot) * sizeof(*ss->pages)); + ss->pages[a_slot + 1] = new; + ++ss->pages_next; + + /* + * Third, set its start key to be the first key after the stop key of + * the middle chunk (that's b_trk), and its stop key to be the stop key + * of the original chunk, and call __slvg_col_trk_update_start. That + * function will re-sort the WT_TRACK array as necessary to move our + * new entry into the right sorted location. + */ + new->col_start = b_trk->col_stop + 1; + new->col_stop = a_trk->col_stop; + __slvg_col_trk_update_start(a_slot + 1, ss); + + /* + * Fourth, set the original WT_TRACK information to reference only + * the initial key space in the page, that is, everything up to the + * starting key of the middle chunk (that's b_trk). 
+ */ + a_trk->col_stop = b_trk->col_start - 1; + + F_SET(new, WT_TRACK_MERGE); + F_SET(a_trk, WT_TRACK_MERGE); + +merge: WT_RET(__wt_verbose(session, WT_VERB_SALVAGE, + "%s and %s require merge", + __wt_addr_string( + session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1), + __wt_addr_string( + session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2))); + return (0); +} + +/* + * __slvg_col_trk_update_start -- + * Update a column-store page's start key after an overlap. + */ +static void +__slvg_col_trk_update_start(uint32_t slot, WT_STUFF *ss) +{ + WT_TRACK *trk; + uint32_t i; + + trk = ss->pages[slot]; + + /* + * If we deleted an initial piece of the WT_TRACK name space, it may no + * longer be in the right location. + * + * For example, imagine page #1 has the key range 30-50, it split, and + * we wrote page #2 with key range 30-40, and page #3 key range with + * 40-50, where pages #2 and #3 have larger LSNs than page #1. When the + * key ranges were sorted, page #2 came first, then page #1 (because of + * their earlier start keys than page #3), and page #2 came before page + * #1 because of its LSN. When we resolve the overlap between page #2 + * and page #1, we truncate the initial key range of page #1, and it now + * sorts after page #3, because it has the same starting key of 40, and + * a lower LSN. + * + * We have already updated b_trk's start key; what we may have to do is + * re-sort some number of elements in the list. + */ + for (i = slot + 1; i < ss->pages_next; ++i) { + if (ss->pages[i] == NULL) + continue; + if (ss->pages[i]->col_start > trk->col_stop) + break; + } + i -= slot; + if (i > 1) + qsort(ss->pages + slot, (size_t)i, + sizeof(WT_TRACK *), __slvg_trk_compare_key); +} + +/* + * __slvg_col_range_missing -- + * Detect missing ranges from column-store files. 
+ */ +static int +__slvg_col_range_missing(WT_SESSION_IMPL *session, WT_STUFF *ss) +{ + WT_TRACK *trk; + uint64_t r; + uint32_t i; + + for (i = 0, r = 0; i < ss->pages_next; ++i) { + if ((trk = ss->pages[i]) == NULL) + continue; + if (trk->col_start != r + 1) { + WT_RET(__wt_verbose(session, WT_VERB_SALVAGE, + "%s column-store missing range from %" + PRIu64 " to %" PRIu64 " inclusive", + __wt_addr_string(session, + trk->trk_addr, trk->trk_addr_size, ss->tmp1), + r + 1, trk->col_start - 1)); + + /* + * We need to instantiate deleted items for the missing + * record range. + */ + trk->col_missing = r + 1; + F_SET(trk, WT_TRACK_MERGE); + } + r = trk->col_stop; + } + return (0); +} + +/* + * __slvg_modify_init -- + * Initialize a salvage page's modification information. + */ +static int +__slvg_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_RET(__wt_page_modify_init(session, page)); + __wt_page_modify_set(session, page); + + return (0); +} + +/* + * __slvg_col_build_internal -- + * Build a column-store in-memory page that references all of the leaf + * pages we've found. + */ +static int +__slvg_col_build_internal( + WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss) +{ + WT_ADDR *addr; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + WT_REF *ref, **refp; + WT_TRACK *trk; + uint32_t i; + + addr = NULL; + + /* Allocate a column-store root (internal) page and fill it in. */ + WT_RET( + __wt_page_alloc(session, WT_PAGE_COL_INT, 1, leaf_cnt, 1, &page)); + WT_ERR(__slvg_modify_init(session, page)); + + pindex = WT_INTL_INDEX_COPY(page); + for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) { + if ((trk = ss->pages[i]) == NULL) + continue; + + ref = *refp++; + ref->home = page; + ref->page = NULL; + + WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr)); + WT_ERR(__wt_strndup( + session, trk->trk_addr, trk->trk_addr_size, &addr->addr)); + addr->size = trk->trk_addr_size; + addr->type = + trk->trk_ovfl_cnt == 0 ? 
WT_ADDR_LEAF_NO : WT_ADDR_LEAF; + ref->addr = addr; + addr = NULL; + + ref->key.recno = trk->col_start; + ref->state = WT_REF_DISK; + + /* + * If the page's key range is unmodified from when we read it + * (in other words, we didn't merge part of this page with + * another page), we can use the page without change, and the + * only thing we need to do is mark all overflow records the + * page references as in-use. + * + * If we did merge with another page, we have to build a page + * reflecting the updated key range. Note, that requires an + * additional pass to free the merge page's backing blocks. + */ + if (F_ISSET(trk, WT_TRACK_MERGE)) { + ss->merge_free = 1; + + WT_ERR(__slvg_col_build_leaf(session, trk, ref)); + } else + WT_ERR(__slvg_ovfl_ref_all(session, trk)); + ++ref; + } + + __wt_root_ref_init(&ss->root_ref, page, 1); + + if (0) { +err: if (addr != NULL) + __wt_free(session, addr); + __wt_page_out(session, &page); + } + return (ret); +} + +/* + * __slvg_col_build_leaf -- + * Build a column-store leaf page for a merged page. + */ +static int +__slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) +{ + WT_COL *save_col_var; + WT_DECL_RET; + WT_PAGE *page; + WT_SALVAGE_COOKIE *cookie, _cookie; + uint64_t skip, take; + uint32_t *entriesp, save_entries; + + cookie = &_cookie; + WT_CLEAR(*cookie); + + /* Get the original page, including the full in-memory setup. */ + WT_RET(__wt_page_in(session, ref, 0)); + page = ref->page; + + entriesp = page->type == WT_PAGE_COL_VAR ? + &page->pg_var_entries : &page->pg_fix_entries; + + save_col_var = page->pg_var_d; + save_entries = *entriesp; + + /* + * Calculate the number of K/V entries we are going to skip, and + * the total number of K/V entries we'll take from this page. 
+ */ + cookie->skip = skip = trk->col_start - page->pg_var_recno; + cookie->take = take = (trk->col_stop - trk->col_start) + 1; + + WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, + "%s merge discarding first %" PRIu64 " records, " + "then taking %" PRIu64 " records", + __wt_addr_string( + session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), + skip, take)); + + /* Set the referenced flag on overflow pages we're using. */ + if (page->type == WT_PAGE_COL_VAR && trk->trk_ovfl_cnt != 0) + WT_ERR(__slvg_col_ovfl(session, trk, page, skip, take)); + + /* + * If we're missing some part of the range, the real start range is in + * trk->col_missing, else, it's in trk->col_start. Update the parent's + * reference as well as the page itself. + */ + if (trk->col_missing == 0) + page->pg_var_recno = trk->col_start; + else { + page->pg_var_recno = trk->col_missing; + cookie->missing = trk->col_start - trk->col_missing; + + WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, + "%s merge inserting %" PRIu64 " missing records", + __wt_addr_string( + session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), + cookie->missing)); + } + ref->key.recno = page->pg_var_recno; + + /* + * We can't discard the original blocks associated with this page now. + * (The problem is we don't want to overwrite any original information + * until the salvage run succeeds -- if we free the blocks now, the next + * merge page we write might allocate those blocks and overwrite them, + * and should the salvage run eventually fail, the original information + * would have been lost.) Clear the reference addr so eviction doesn't + * free the underlying blocks. + */ + __wt_free(session, ((WT_ADDR *)ref->addr)->addr); + __wt_free(session, ref->addr); + ref->addr = NULL; + + /* Write the new version of the leaf page to disk. */ + WT_ERR(__slvg_modify_init(session, page)); + WT_ERR(__wt_rec_write(session, ref, cookie, WT_SKIP_UPDATE_ERR)); + + /* Reset the page. 
*/ + page->pg_var_d = save_col_var; + *entriesp = save_entries; + + ret = __wt_page_release(session, ref, 0); + if (ret == 0) + ret = __wt_rec_evict(session, ref, 1); + + if (0) { +err: WT_TRET(__wt_page_release(session, ref, 0)); + } + + return (ret); +} + +/* + * __slvg_col_ovfl_single -- + * Find a single overflow record in the merge page's list, and mark it as + * referenced. + */ +static int +__slvg_col_ovfl_single( + WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK *unpack) +{ + WT_TRACK *ovfl; + uint32_t i; + + /* + * Search the list of overflow records for this page -- we should find + * exactly one match, and we mark it as referenced. + */ + for (i = 0; i < trk->trk_ovfl_cnt; ++i) { + ovfl = trk->ss->ovfl[trk->trk_ovfl_slot[i]]; + if (unpack->size == ovfl->trk_addr_size && + memcmp(unpack->data, ovfl->trk_addr, unpack->size) == 0) + return (__slvg_ovfl_ref(session, ovfl, 0)); + } + + WT_PANIC_RET(session, + EINVAL, "overflow record at column-store page merge not found"); +} + +/* + * __slvg_col_ovfl -- + * Mark overflow items referenced by the merged page. + */ +static int +__slvg_col_ovfl(WT_SESSION_IMPL *session, + WT_TRACK *trk, WT_PAGE *page, uint64_t skip, uint64_t take) +{ + WT_CELL_UNPACK unpack; + WT_CELL *cell; + WT_COL *cip; + WT_DECL_RET; + uint64_t recno, start, stop; + uint32_t i; + + /* + * Merging a variable-length column-store page, and we took some number + * of records, figure out which (if any) overflow records we used. + */ + recno = page->pg_var_recno; + start = recno + skip; + stop = (recno + skip + take) - 1; + + WT_COL_FOREACH(page, cip, i) { + cell = WT_COL_PTR(page, cip); + __wt_cell_unpack(cell, &unpack); + recno += __wt_cell_rle(&unpack); + + /* + * I keep getting this calculation wrong, so here's the logic. + * Start is the first record we want, stop is the last record + * we want. 
The record number has already been incremented one + * past the maximum record number for this page entry, that is, + * it's set to the first record number for the next page entry. + * The test of start should be greater-than (not greater-than- + * or-equal), because of that increment, if the record number + * equals start, we want the next record, not this one. The + * test against stop is greater-than, not greater-than-or-equal + * because stop is the last record wanted, if the record number + * equals stop, we want the next record. + */ + if (recno > start && unpack.type == WT_CELL_VALUE_OVFL) { + ret = __slvg_col_ovfl_single(session, trk, &unpack); + + /* + * When handling overlapping ranges on variable-length + * column-store leaf pages, we split ranges without + * considering if we were splitting RLE units. (See + * note at the beginning of this file for explanation + * of the overall process.) If the RLE unit was on-page, + * we can simply write it again. If the RLE unit was an + * overflow value that's already been used by another + * row (from some other page created by a range split), + * there's not much to do, this row can't reference an + * overflow record we don't have: delete the row. + */ + if (ret == EBUSY) { + __wt_cell_type_reset(session, + cell, WT_CELL_VALUE_OVFL, WT_CELL_DEL); + ret = 0; + } + WT_RET(ret); + } + if (recno > stop) + break; + } + return (0); +} + +/* + * __slvg_row_range -- + * Figure out the leaf pages we need and discard everything else. At the + * same time, tag the overflow pages they reference. + */ +static int +__slvg_row_range(WT_SESSION_IMPL *session, WT_STUFF *ss) +{ + WT_TRACK *jtrk; + WT_BTREE *btree; + uint32_t i, j; + int cmp; + + btree = S2BT(session); + + /* + * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR + * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE + * BEING HANDLED. 
	 *
	 * Walk the page array looking for overlapping key ranges, adjusting
	 * the ranges based on the LSN until there are no overlaps.
	 *
	 * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE
	 * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE
	 * PLUS OFFSET.
	 */
	for (i = 0; i < ss->pages_next; ++i) {
		if (ss->pages[i] == NULL)
			continue;

		/* Check for pages that overlap our page. */
		for (j = i + 1; j < ss->pages_next; ++j) {
			if (ss->pages[j] == NULL)
				continue;
			/*
			 * We're done if this page starts after our stop, no
			 * subsequent pages can overlap our page.
			 */
			WT_RET(__wt_compare(session, btree->collator,
			    &ss->pages[j]->row_start, &ss->pages[i]->row_stop,
			    &cmp));
			if (cmp > 0)
				break;

			/* There's an overlap, fix it up. */
			jtrk = ss->pages[j];
			WT_RET(__slvg_row_range_overlap(session, i, j, ss));

			/*
			 * If the overlap resolution changed the entry's start
			 * key, the entry might have moved and the page array
			 * re-sorted, and pages[j] would reference a different
			 * page.  We don't move forward if that happened, we
			 * re-process the slot again (by decrementing j before
			 * the loop's increment).
			 */
			if (ss->pages[j] != NULL && jtrk != ss->pages[j])
				--j;
		}
	}
	return (0);
}

/*
 * __slvg_row_range_overlap --
 *	Two row-store key ranges overlap, deal with it.
 */
static int
__slvg_row_range_overlap(
    WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_slot, WT_STUFF *ss)
{
	WT_BTREE *btree;
	WT_TRACK *a_trk, *b_trk, *new;
	uint32_t i;
	int start_cmp, stop_cmp;

	/*
	 * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
	 * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
	 * BEING HANDLED.
	 */
	btree = S2BT(session);

	a_trk = ss->pages[a_slot];
	b_trk = ss->pages[b_slot];

	WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
	    "%s and %s range overlap",
	    __wt_addr_string(
		session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
	    __wt_addr_string(
		session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));

	/*
	 * The key ranges of two WT_TRACK pages in the array overlap -- choose
	 * the ranges we're going to take from each.
	 *
	 * We can think of the overlap possibilities as 11 different cases:
	 *
	 *		AAAAAAAAAAAAAAAAAA
	 * #1		BBBBBBBBBBBBBBBBBB		pages are the same
	 * #2	BBBBBBBBBBBBB				overlaps the beginning
	 * #3			BBBBBBBBBBBBBBBB	overlaps the end
	 * #4		BBBBB				B is a prefix of A
	 * #5			BBBBBB			B is middle of A
	 * #6			BBBBBBBBBB		B is a suffix of A
	 *
	 * and:
	 *
	 *		BBBBBBBBBBBBBBBBBB
	 * #7	AAAAAAAAAAAAA				same as #3
	 * #8		AAAAAAAAAAAAAAAA		same as #2
	 * #9		AAAAA				A is a prefix of B
	 * #10			AAAAAA			A is middle of B
	 * #11			AAAAAAAAAA		A is a suffix of B
	 *
	 * Note the leaf page array was sorted by key and a_trk appears earlier
	 * in the array than b_trk, so cases #2/8, #10 and #11 are impossible.
	 *
	 * Finally, there's one additional complicating factor -- final ranges
	 * are assigned based on the page's LSN.
	 */
#define	A_TRK_START	(&a_trk->row_start)
#define	A_TRK_STOP	(&a_trk->row_stop)
#define	B_TRK_START	(&b_trk->row_start)
#define	B_TRK_STOP	(&b_trk->row_stop)
#define	SLOT_START(i)	(&ss->pages[i]->row_start)
#define	__slvg_key_copy(session, dst, src)				\
	__wt_buf_set(session, dst, (src)->data, (src)->size)

	WT_RET(__wt_compare(
	    session, btree->collator, A_TRK_START, B_TRK_START, &start_cmp));
	WT_RET(__wt_compare(
	    session, btree->collator, A_TRK_STOP, B_TRK_STOP, &stop_cmp));

	if (start_cmp > 0)			/* Case #2/8, #10, #11 */
		WT_PANIC_RET(
		    session, EINVAL, "unexpected merge array sort order");

	if (start_cmp == 0) {			/* Case #1, #4, #9 */
		/*
		 * The secondary sort of the leaf page array was the page's LSN,
		 * in high-to-low order, which means a_trk has a higher LSN, and
		 * is more desirable, than b_trk.  In cases #1 and #4 and #9,
		 * where the start of the range is the same for the two pages,
		 * this simplifies things, it guarantees a_trk has a higher LSN
		 * than b_trk.
		 */
		if (stop_cmp >= 0)
			/*
			 * Case #1, #4: a_trk is a superset of b_trk, and a_trk
			 * is more desirable -- discard b_trk.
			 */
			goto delete_b;

		/*
		 * Case #9: b_trk is a superset of a_trk, but a_trk is more
		 * desirable: keep both but delete a_trk's key range from
		 * b_trk.
		 */
		WT_RET(__slvg_row_trk_update_start(
		    session, A_TRK_STOP, b_slot, ss));
		F_SET(b_trk, WT_TRACK_CHECK_START | WT_TRACK_MERGE);
		goto merge;
	}

	if (stop_cmp == 0) {			/* Case #6 */
		if (a_trk->trk_gen > b_trk->trk_gen)
			/*
			 * Case #6: a_trk is a superset of b_trk and a_trk is
			 * more desirable -- discard b_trk.
			 */
			goto delete_b;

		/*
		 * Case #6: a_trk is a superset of b_trk, but b_trk is more
		 * desirable: keep both but delete b_trk's key range from a_trk.
		 */
		WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START));
		F_SET(a_trk, WT_TRACK_CHECK_STOP | WT_TRACK_MERGE);
		goto merge;
	}

	if (stop_cmp < 0) {			/* Case #3/7 */
		if (a_trk->trk_gen > b_trk->trk_gen) {
			/*
			 * Case #3/7: a_trk is more desirable, delete a_trk's
			 * key range from b_trk;
			 */
			WT_RET(__slvg_row_trk_update_start(
			    session, A_TRK_STOP, b_slot, ss));
			F_SET(b_trk, WT_TRACK_CHECK_START | WT_TRACK_MERGE);
		} else {
			/*
			 * Case #3/7: b_trk is more desirable, delete b_trk's
			 * key range from a_trk;
			 */
			WT_RET(__slvg_key_copy(
			    session, A_TRK_STOP, B_TRK_START));
			F_SET(a_trk, WT_TRACK_CHECK_STOP | WT_TRACK_MERGE);
		}
		goto merge;
	}

	/*
	 * Case #5: a_trk is a superset of b_trk and a_trk is more desirable --
	 * discard b_trk.
	 */
	if (a_trk->trk_gen > b_trk->trk_gen) {
delete_b:	/*
		 * After page and overflow reconciliation, one (and only one)
		 * page can reference an overflow record.  But, if we split a
		 * page into multiple chunks, any of the chunks might own any
		 * of the backing overflow records, so overflow records won't
		 * normally be discarded until after the merge phase completes.
		 * (The merge phase is where the final pages are written, and
		 * we figure out which overflow records are actually used.)
		 * If freeing a chunk and there are no other references to the
		 * underlying shared information, the overflow records must be
		 * useless, discard them to keep the final file size small.
		 */
		if (b_trk->shared->ref == 1)
			for (i = 0; i < b_trk->trk_ovfl_cnt; ++i)
				WT_RET(__slvg_trk_free(session,
				    &ss->ovfl[b_trk->trk_ovfl_slot[i]], 1));
		return (__slvg_trk_free(session, &ss->pages[b_slot], 1));
	}

	/*
	 * Case #5: b_trk is more desirable and is a middle chunk of a_trk.
	 * Split a_trk into two parts, the key range before b_trk and the
	 * key range after b_trk.
	 */
	WT_RET(__slvg_trk_split(session, a_trk, &new));

	/*
	 * Second, reallocate the array of pages if necessary, and then insert
	 * the new element into the array after the existing element (that's
	 * probably wrong, but we'll fix it up in a second).
	 */
	WT_RET(__wt_realloc_def(
	    session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages));
	memmove(ss->pages + a_slot + 1, ss->pages + a_slot,
	    (ss->pages_next - a_slot) * sizeof(*ss->pages));
	ss->pages[a_slot + 1] = new;
	++ss->pages_next;

	/*
	 * Third, set its stop key to be the stop key of the original chunk,
	 * and call __slvg_row_trk_update_start.  That function will both set
	 * the start key to be the first key after the stop key of the middle
	 * chunk (that's b_trk), and re-sort the WT_TRACK array as necessary to
	 * move our new entry into the right sorted location.
	 */
	WT_RET(__slvg_key_copy(session, &new->row_stop, A_TRK_STOP));
	WT_RET(
	    __slvg_row_trk_update_start(session, B_TRK_STOP, a_slot + 1, ss));

	/*
	 * Fourth, set the original WT_TRACK information to reference only
	 * the initial key space in the page, that is, everything up to the
	 * starting key of the middle chunk (that's b_trk).
	 */
	WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START));
	F_SET(new, WT_TRACK_CHECK_START);
	F_SET(a_trk, WT_TRACK_CHECK_STOP);

	F_SET(new, WT_TRACK_MERGE);
	F_SET(a_trk, WT_TRACK_MERGE);

merge:	WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
	    "%s and %s require merge",
	    __wt_addr_string(
		session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
	    __wt_addr_string(
		session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));
	return (0);
}

/*
 * __slvg_row_trk_update_start --
 *	Update a row-store page's start key after an overlap.
 */
static int
__slvg_row_trk_update_start(
    WT_SESSION_IMPL *session, WT_ITEM *stop, uint32_t slot, WT_STUFF *ss)
{
	WT_BTREE *btree;
	WT_DECL_ITEM(dsk);
	WT_DECL_ITEM(key);
	WT_DECL_RET;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_TRACK *trk;
	uint32_t i;
	int cmp, found;

	btree = S2BT(session);
	page = NULL;
	found = 0;

	trk = ss->pages[slot];

	/*
	 * If we deleted an initial piece of the WT_TRACK name space, it may no
	 * longer be in the right location.
	 *
	 * For example, imagine page #1 has the key range 30-50, it split, and
	 * we wrote page #2 with key range 30-40, and page #3 key range with
	 * 40-50, where pages #2 and #3 have larger LSNs than page #1.  When the
	 * key ranges were sorted, page #2 came first, then page #1 (because of
	 * their earlier start keys than page #3), and page #2 came before page
	 * #1 because of its LSN.  When we resolve the overlap between page #2
	 * and page #1, we truncate the initial key range of page #1, and it now
	 * sorts after page #3, because it has the same starting key of 40, and
	 * a lower LSN.
	 *
	 * First, update the WT_TRACK start key based on the specified stop key.
	 *
	 * Read and instantiate the WT_TRACK page (we don't have to verify the
	 * page, nor do we have to be quiet on error, we've already read this
	 * page successfully).
	 */
	WT_RET(__wt_scr_alloc(session, trk->trk_size, &dsk));
	WT_ERR(__wt_bt_read(session, dsk, trk->trk_addr, trk->trk_addr_size));
	WT_ERR(__wt_page_inmem(session, NULL, dsk->mem, 0, &page));

	/*
	 * Walk the page, looking for a key sorting greater than the specified
	 * stop key -- that's our new start key.
	 */
	WT_ERR(__wt_scr_alloc(session, 0, &key));
	WT_ROW_FOREACH(page, rip, i) {
		WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
		WT_ERR(__wt_compare(session, btree->collator, key, stop, &cmp));
		if (cmp > 0) {
			found = 1;
			break;
		}
	}

	/*
	 * We know that at least one key on the page sorts after the specified
	 * stop key, otherwise the page would have entirely overlapped and we
	 * would have discarded it, we wouldn't be here.  Therefore, this test
	 * is safe.  (But, it never hurts to check.)
	 */
	WT_ERR_TEST(!found, WT_ERROR);
	WT_ERR(__slvg_key_copy(session, &trk->row_start, key));

	/*
	 * We may need to re-sort some number of elements in the list.  Walk
	 * forward in the list until reaching an entry which cannot overlap
	 * the adjusted entry.  If it's more than a single slot, re-sort the
	 * entries.
	 */
	for (i = slot + 1; i < ss->pages_next; ++i) {
		if (ss->pages[i] == NULL)
			continue;
		WT_ERR(__wt_compare(session,
		    btree->collator, SLOT_START(i), &trk->row_stop, &cmp));
		if (cmp > 0)
			break;
	}
	i -= slot;
	if (i > 1)
		qsort(ss->pages + slot, (size_t)i,
		    sizeof(WT_TRACK *), __slvg_trk_compare_key);

err:	if (page != NULL)
		__wt_page_out(session, &page);
	__wt_scr_free(&dsk);
	__wt_scr_free(&key);

	return (ret);
}

/*
 * __slvg_row_build_internal --
 *	Build a row-store in-memory page that references all of the leaf
 * pages we've found.
 */
static int
__slvg_row_build_internal(
    WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
{
	WT_ADDR *addr;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_INDEX *pindex;
	WT_REF *ref, **refp;
	WT_TRACK *trk;
	uint32_t i;

	addr = NULL;

	/* Allocate a row-store root (internal) page and fill it in. */
	WT_RET(
	    __wt_page_alloc(session, WT_PAGE_ROW_INT, 0, leaf_cnt, 1, &page));
	WT_ERR(__slvg_modify_init(session, page));

	pindex = WT_INTL_INDEX_COPY(page);
	for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) {
		if ((trk = ss->pages[i]) == NULL)
			continue;

		ref = *refp++;
		ref->home = page;
		ref->page = NULL;

		WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
		WT_ERR(__wt_strndup(
		    session, trk->trk_addr, trk->trk_addr_size, &addr->addr));
		addr->size = trk->trk_addr_size;
		addr->type =
		    trk->trk_ovfl_cnt == 0 ? WT_ADDR_LEAF_NO : WT_ADDR_LEAF;
		ref->addr = addr;
		addr = NULL;

		__wt_ref_key_clear(ref);
		ref->state = WT_REF_DISK;

		/*
		 * If the page's key range is unmodified from when we read it
		 * (in other words, we didn't merge part of this page with
		 * another page), we can use the page without change, and the
		 * only thing we need to do is mark all overflow records the
		 * page references as in-use.
		 *
		 * If we did merge with another page, we have to build a page
		 * reflecting the updated key range.  Note, that requires an
		 * additional pass to free the merge page's backing blocks.
		 */
		if (F_ISSET(trk, WT_TRACK_MERGE)) {
			ss->merge_free = 1;

			WT_ERR(__slvg_row_build_leaf(session, trk, ref, ss));
		} else {
			WT_ERR(__wt_row_ikey_incr(session, page, 0,
			    trk->row_start.data, trk->row_start.size,
			    &ref->key.ikey));

			WT_ERR(__slvg_ovfl_ref_all(session, trk));
		}
		/*
		 * NOTE(review): ref is reassigned from *refp++ at the top of
		 * each iteration, so this increment appears to be dead code --
		 * confirm against the upstream column-store variant, which
		 * iterates ref directly.
		 */
		++ref;
	}

	__wt_root_ref_init(&ss->root_ref, page, 0);

	if (0) {
err:		if (addr != NULL)
			__wt_free(session, addr);
		__wt_page_out(session, &page);
	}
	return (ret);
}

/*
 * __slvg_row_build_leaf --
 *	Build a row-store leaf page for a merged page.
 */
static int
__slvg_row_build_leaf(
    WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref, WT_STUFF *ss)
{
	WT_BTREE *btree;
	WT_DECL_ITEM(key);
	WT_DECL_RET;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_SALVAGE_COOKIE *cookie, _cookie;
	uint32_t i, skip_start, skip_stop;
	int cmp;

	btree = S2BT(session);
	page = NULL;

	cookie = &_cookie;
	WT_CLEAR(*cookie);

	/* Allocate temporary space in which to instantiate the keys. */
	WT_RET(__wt_scr_alloc(session, 0, &key));

	/* Get the original page, including the full in-memory setup. */
	WT_ERR(__wt_page_in(session, ref, 0));
	page = ref->page;

	/*
	 * Figure out how many page keys we want to take and how many we want
	 * to skip.
	 *
	 * If checking the starting range key, the key we're searching for will
	 * be equal to the starting range key.  This is because we figured out
	 * the true merged-page start key as part of discarding initial keys
	 * from the page (see the __slvg_row_range_overlap function, and its
	 * calls to __slvg_row_trk_update_start for more information).
	 *
	 * If checking the stopping range key, we want the keys on the page that
	 * are less-than the stopping range key.  This is because we copied a
	 * key from another page to define this page's stop range: that page is
	 * the page that owns the "equal to" range space.
	 */
	skip_start = skip_stop = 0;
	if (F_ISSET(trk, WT_TRACK_CHECK_START))
		WT_ROW_FOREACH(page, rip, i) {
			WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));

			/*
			 * >= is correct: see the comment above.
			 */
			WT_ERR(__wt_compare(session,
			    btree->collator, key, &trk->row_start, &cmp));
			if (cmp >= 0)
				break;
			if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
				WT_ERR(__wt_buf_set_printable(session,
				    ss->tmp1, key->data, key->size));
				WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
				    "%s merge discarding leading key %.*s",
				    __wt_addr_string(session,
				    trk->trk_addr, trk->trk_addr_size,
				    ss->tmp2), (int)ss->tmp1->size,
				    (char *)ss->tmp1->data));
			}
			++skip_start;
		}
	if (F_ISSET(trk, WT_TRACK_CHECK_STOP))
		WT_ROW_FOREACH_REVERSE(page, rip, i) {
			WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));

			/*
			 * < is correct: see the comment above.
			 */
			WT_ERR(__wt_compare(session,
			    btree->collator, key, &trk->row_stop, &cmp));
			if (cmp < 0)
				break;
			if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
				WT_ERR(__wt_buf_set_printable(session,
				    ss->tmp1, key->data, key->size));
				WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
				    "%s merge discarding trailing key %.*s",
				    __wt_addr_string(session,
				    trk->trk_addr, trk->trk_addr_size,
				    ss->tmp2), (int)ss->tmp1->size,
				    (char *)ss->tmp1->data));
			}
			++skip_stop;
		}

	/* We should have selected some entries, but not the entire page. */
	WT_ASSERT(session,
	    skip_start + skip_stop > 0 &&
	    skip_start + skip_stop < page->pg_row_entries);

	/*
	 * Take a copy of this page's first key to define the start of
	 * its range.  The key may require processing, otherwise, it's
	 * a copy from the page.
	 */
	rip = page->pg_row_d + skip_start;
	WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
	WT_ERR(__wt_row_ikey_incr(session,
	    ref->home, 0, key->data, key->size, &ref->key.ikey));

	/* Set the referenced flag on overflow pages we're using. */
	if (trk->trk_ovfl_cnt != 0)
		WT_ERR(__slvg_row_ovfl(session,
		    trk, page, skip_start, page->pg_row_entries - skip_stop));

	/*
	 * Change the page to reflect the correct record count: there is no
	 * need to copy anything on the page itself, the entries value limits
	 * the number of page items.
	 */
	page->pg_row_entries -= skip_stop;
	cookie->skip = skip_start;

	/*
	 * We can't discard the original blocks associated with this page now.
	 * (The problem is we don't want to overwrite any original information
	 * until the salvage run succeeds -- if we free the blocks now, the next
	 * merge page we write might allocate those blocks and overwrite them,
	 * and should the salvage run eventually fail, the original information
	 * would have been lost.)  Clear the reference addr so eviction doesn't
	 * free the underlying blocks.
	 */
	__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
	__wt_free(session, ref->addr);
	ref->addr = NULL;

	/* Write the new version of the leaf page to disk. */
	WT_ERR(__slvg_modify_init(session, page));
	WT_ERR(__wt_rec_write(session, ref, cookie, WT_SKIP_UPDATE_ERR));

	/* Reset the page. */
	page->pg_row_entries += skip_stop;

	/*
	 * Discard our hazard pointer and evict the page, updating the
	 * parent's reference.
	 */
	ret = __wt_page_release(session, ref, 0);
	if (ret == 0)
		ret = __wt_rec_evict(session, ref, 1);

	if (0) {
err:		WT_TRET(__wt_page_release(session, ref, 0));
	}
	__wt_scr_free(&key);

	return (ret);
}

/*
 * __slvg_row_ovfl_single --
 *	Find a single overflow record in the merge page's list, and mark it as
 * referenced.
 */
static int
__slvg_row_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL *cell)
{
	WT_CELL_UNPACK unpack;
	WT_TRACK *ovfl;
	uint32_t i;

	/* Unpack the cell, and check if it's an overflow record.
*/ + __wt_cell_unpack(cell, &unpack); + if (unpack.type != WT_CELL_KEY_OVFL && + unpack.type != WT_CELL_VALUE_OVFL) + return (0); + + /* + * Search the list of overflow records for this page -- we should find + * exactly one match, and we mark it as referenced. + */ + for (i = 0; i < trk->trk_ovfl_cnt; ++i) { + ovfl = trk->ss->ovfl[trk->trk_ovfl_slot[i]]; + if (unpack.size == ovfl->trk_addr_size && + memcmp(unpack.data, ovfl->trk_addr, unpack.size) == 0) + return (__slvg_ovfl_ref(session, ovfl, 1)); + } + + WT_PANIC_RET(session, + EINVAL, "overflow record at row-store page merge not found"); +} + +/* + * __slvg_row_ovfl -- + * Mark overflow items referenced by the merged page. + */ +static int +__slvg_row_ovfl(WT_SESSION_IMPL *session, + WT_TRACK *trk, WT_PAGE *page, uint32_t start, uint32_t stop) +{ + WT_CELL *cell; + WT_ROW *rip; + void *copy; + + /* + * We're merging a row-store page, and we took some number of records, + * figure out which (if any) overflow records we used. + */ + for (rip = page->pg_row_d + start; start < stop; ++start, ++rip) { + copy = WT_ROW_KEY_COPY(rip); + (void)__wt_row_leaf_key_info( + page, copy, NULL, &cell, NULL, NULL); + if (cell != NULL) + WT_RET(__slvg_row_ovfl_single(session, trk, cell)); + cell = __wt_row_leaf_value_cell(page, rip, NULL); + if (cell != NULL) + WT_RET(__slvg_row_ovfl_single(session, trk, cell)); + } + return (0); +} + +/* + * __slvg_trk_compare_addr -- + * Compare two WT_TRACK array entries by address cookie. + */ +static int +__slvg_trk_compare_addr(const void *a, const void *b) +{ + WT_DECL_RET; + WT_TRACK *a_trk, *b_trk; + size_t len; + + a_trk = *(WT_TRACK **)a; + b_trk = *(WT_TRACK **)b; + + /* + * We don't care about the order because these are opaque cookies -- + * we're just sorting them so we can binary search instead of linear + * search. 
+ */ + len = WT_MIN(a_trk->trk_addr_size, b_trk->trk_addr_size); + ret = memcmp(a_trk->trk_addr, b_trk->trk_addr, len); + if (ret == 0) + ret = a_trk->trk_addr_size > b_trk->trk_addr_size ? -1 : 1; + return (ret); +} + +/* + * __slvg_ovfl_compare -- + * Bsearch comparison routine for the overflow array. + */ +static int +__slvg_ovfl_compare(const void *a, const void *b) +{ + WT_ADDR *addr; + WT_DECL_RET; + WT_TRACK *trk; + size_t len; + + addr = (WT_ADDR *)a; + trk = *(WT_TRACK **)b; + + len = WT_MIN(trk->trk_addr_size, addr->size); + ret = memcmp(addr->addr, trk->trk_addr, len); + if (ret == 0 && addr->size != trk->trk_addr_size) + ret = addr->size < trk->trk_addr_size ? -1 : 1; + return (ret); +} + +/* + * __slvg_ovfl_reconcile -- + * Review relationships between leaf pages and the overflow pages, delete + * leaf pages until there's a one-to-one relationship between leaf and overflow + * pages. + */ +static int +__slvg_ovfl_reconcile(WT_SESSION_IMPL *session, WT_STUFF *ss) +{ + WT_ADDR *addr; + WT_DECL_RET; + WT_TRACK **searchp, *trk; + uint32_t i, j, *slot; + + slot = NULL; + + /* + * If an overflow page is referenced more than once, discard leaf pages + * with the lowest LSNs until overflow pages are only referenced once. + * + * This requires sorting the page list by LSN, and the overflow array + * by address cookie. + */ + qsort(ss->pages, + (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_gen); + qsort(ss->ovfl, + (size_t)ss->ovfl_next, sizeof(WT_TRACK *), __slvg_trk_compare_addr); + + /* + * Walk the list of pages and discard any pages referencing non-existent + * overflow pages or referencing overflow pages also referenced by pages + * with higher LSNs. Our caller sorted the page list by LSN, high to + * low, so we don't have to do explicit testing of the page LSNs, the + * first page to reference an overflow page is the best page to own it. 
 */
	for (i = 0; i < ss->pages_next; ++i) {
		if ((trk = ss->pages[i]) == NULL || trk->trk_ovfl_cnt == 0)
			continue;

		WT_ERR(__wt_calloc_def(session, trk->trk_ovfl_cnt, &slot));
		for (j = 0; j < trk->trk_ovfl_cnt; ++j) {
			addr = &trk->trk_ovfl_addr[j];
			searchp = bsearch(addr, ss->ovfl, ss->ovfl_next,
			    sizeof(WT_TRACK *), __slvg_ovfl_compare);

			/*
			 * If the overflow page doesn't exist or if another page
			 * has already claimed it, this leaf page isn't usable.
			 */
			if (searchp != NULL &&
			    !F_ISSET(*searchp, WT_TRACK_OVFL_REFD)) {
				/*
				 * Convert each block address into a slot in the
				 * list of overflow pages as we go.
				 */
				slot[j] = (uint32_t)(searchp - ss->ovfl);
				F_SET(*searchp, WT_TRACK_OVFL_REFD);
				continue;
			}

			WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
			    "%s references unavailable overflow page %s",
			    __wt_addr_string(session,
				trk->trk_addr, trk->trk_addr_size, ss->tmp1),
			    __wt_addr_string(session,
				addr->addr, addr->size, ss->tmp2)));

			/*
			 * Clear the "referenced" flag for any overflow pages
			 * already claimed by this leaf page, so some other
			 * page might claim them.
			 */
			while (j > 0)
				F_CLR(ss->ovfl[slot[--j]], WT_TRACK_OVFL_REFD);
			trk = NULL;
			WT_ERR(__slvg_trk_free(session, &ss->pages[i], 1));
			break;
		}

		/*
		 * We now have a reference to the overflow WT_TRACK, and so no
		 * longer need the page's address array, discard it.  Note, we
		 * potentially freed the WT_TRACK in the loop above, check it's
		 * still valid.
		 */
		if (trk == NULL)
			__wt_free(session, slot);
		else {
			__slvg_trk_free_addr(session, trk);

			trk->trk_ovfl_slot = slot;
			slot = NULL;
		}
	}
	return (0);

err:	__wt_free(session, slot);
	return (ret);
}

/*
 * __slvg_trk_compare_key --
 *	Compare two WT_TRACK array entries by key, and secondarily, by LSN.
 */
static int
__slvg_trk_compare_key(const void *a, const void *b)
{
	WT_SESSION_IMPL *session;
	WT_TRACK *a_trk, *b_trk;
	uint64_t a_gen, a_recno, b_gen, b_recno;
	int cmp;

	a_trk = *(WT_TRACK **)a;
	b_trk = *(WT_TRACK **)b;

	/* NULL entries sort to the end of the array. */
	if (a_trk == NULL)
		return (b_trk == NULL ? 0 : 1);
	if (b_trk == NULL)
		return (-1);

	switch (a_trk->ss->page_type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_VAR:
		a_recno = a_trk->col_start;
		b_recno = b_trk->col_start;
		if (a_recno == b_recno)
			break;
		if (a_recno > b_recno)
			return (1);
		if (a_recno < b_recno)
			return (-1);
		break;
	case WT_PAGE_ROW_LEAF:
		/*
		 * XXX
		 * __wt_compare can potentially fail, and we're ignoring that
		 * error because this routine is called as an underlying qsort
		 * routine.
		 */
		session = a_trk->ss->session;
		(void)__wt_compare(session, S2BT(session)->collator,
		    &a_trk->row_start, &b_trk->row_start, &cmp);
		if (cmp != 0)
			return (cmp);
		break;
	}

	/*
	 * If the primary keys compare equally, differentiate based on LSN.
	 * Sort from highest LSN to lowest, that is, the earlier pages in
	 * the array are more desirable.
	 */
	a_gen = a_trk->trk_gen;
	b_gen = b_trk->trk_gen;
	return (a_gen > b_gen ? -1 : (a_gen < b_gen ? 1 : 0));
}

/*
 * __slvg_trk_compare_gen --
 *	Compare two WT_TRACK array entries by LSN.
 */
static int
__slvg_trk_compare_gen(const void *a, const void *b)
{
	WT_TRACK *a_trk, *b_trk;
	uint64_t a_gen, b_gen;

	a_trk = *(WT_TRACK **)a;
	b_trk = *(WT_TRACK **)b;

	/*
	 * Sort from highest LSN to lowest, that is, the earlier pages in the
	 * array are more desirable.
	 */
	a_gen = a_trk->trk_gen;
	b_gen = b_trk->trk_gen;
	return (a_gen > b_gen ? -1 : (a_gen < b_gen ? 1 : 0));
}

/*
 * __slvg_merge_block_free --
 *	Clean up backing file and overflow blocks after the merge phase.
 */
static int
__slvg_merge_block_free(WT_SESSION_IMPL *session, WT_STUFF *ss)
{
	WT_TRACK *trk;
	uint32_t i;

	/* Free any underlying file blocks for merged pages. */
	for (i = 0; i < ss->pages_next; ++i) {
		if ((trk = ss->pages[i]) == NULL)
			continue;
		if (F_ISSET(trk, WT_TRACK_MERGE))
			WT_RET(__slvg_trk_free(session, &ss->pages[i], 1));
	}

	/* Free any unused overflow records. */
	return (__slvg_ovfl_discard(session, ss));
}

/*
 * __slvg_ovfl_ref --
 *	Reference an overflow page, checking for multiple references.
 */
static int
__slvg_ovfl_ref(WT_SESSION_IMPL *session, WT_TRACK *trk, int multi_panic)
{
	/*
	 * If the record is already referenced, either return EBUSY to the
	 * caller (multi_panic == 0) or treat it as a fatal inconsistency.
	 */
	if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) {
		if (!multi_panic)
			return (EBUSY);
		WT_PANIC_RET(session, EINVAL,
		    "overflow record unexpectedly referenced multiple times "
		    "during leaf page merge");
	}

	F_SET(trk, WT_TRACK_OVFL_REFD);
	return (0);
}

/*
 * __slvg_ovfl_ref_all --
 *	Reference all of the page's overflow pages.
 */
static int
__slvg_ovfl_ref_all(WT_SESSION_IMPL *session, WT_TRACK *trk)
{
	uint32_t i;

	for (i = 0; i < trk->trk_ovfl_cnt; ++i)
		WT_RET(__slvg_ovfl_ref(
		    session, trk->ss->ovfl[trk->trk_ovfl_slot[i]], 1));

	return (0);
}

/*
 * __slvg_ovfl_discard --
 *	Discard unused overflow pages.
 */
static int
__slvg_ovfl_discard(WT_SESSION_IMPL *session, WT_STUFF *ss)
{
	WT_TRACK *trk;
	uint32_t i;

	/*
	 * Walk the overflow page array: if an overflow page isn't referenced,
	 * add its file blocks to the free list.
	 *
	 * Clear the reference flag (it's reused to figure out if the overflow
	 * record is referenced, but never used, by merged pages).
	 */
	for (i = 0; i < ss->ovfl_next; ++i) {
		if ((trk = ss->ovfl[i]) == NULL)
			continue;

		if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) {
			F_CLR(trk, WT_TRACK_OVFL_REFD);
			continue;
		}
		WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
		    "%s unused overflow page",
		    __wt_addr_string(
			session, trk->trk_addr, trk->trk_addr_size, ss->tmp1)));
		WT_RET(__slvg_trk_free(session, &ss->ovfl[i], 1));
	}

	return (0);
}

/*
 * __slvg_cleanup --
 *	Discard memory allocated to the page and overflow arrays.
 */
static int
__slvg_cleanup(WT_SESSION_IMPL *session, WT_STUFF *ss)
{
	uint32_t i;

	/* Discard the leaf page array. */
	for (i = 0; i < ss->pages_next; ++i)
		if (ss->pages[i] != NULL)
			WT_RET(__slvg_trk_free(session, &ss->pages[i], 0));
	__wt_free(session, ss->pages);

	/* Discard the ovfl page array. */
	for (i = 0; i < ss->ovfl_next; ++i)
		if (ss->ovfl[i] != NULL)
			WT_RET(__slvg_trk_free(session, &ss->ovfl[i], 0));
	__wt_free(session, ss->ovfl);

	return (0);
}

/*
 * __slvg_trk_free_addr --
 *	Discard address information.
 */
static void
__slvg_trk_free_addr(WT_SESSION_IMPL *session, WT_TRACK *trk)
{
	uint32_t i;

	if (trk->trk_ovfl_addr != NULL) {
		for (i = 0; i < trk->trk_ovfl_cnt; ++i)
			__wt_free(session, trk->trk_ovfl_addr[i].addr);
		__wt_free(session, trk->trk_ovfl_addr);
	}
}

/*
 * __slvg_trk_free_block --
 *	Discard underlying blocks.
 */
static int
__slvg_trk_free_block(WT_SESSION_IMPL *session, WT_TRACK *trk)
{
	WT_BM *bm;

	bm = S2BT(session)->bm;

	/*
	 * If freeing underlying file blocks or overflow pages, this is a page
	 * we were tracking but eventually decided not to use.
	 */
	WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
	    "%s blocks discarded: discard freed file bytes %" PRIu32,
	    __wt_addr_string(session,
	    trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), trk->trk_size));

	return (bm->free(bm, session, trk->trk_addr, trk->trk_addr_size));
}

/*
 * __slvg_trk_free --
 *	Discard a WT_TRACK structure and (optionally) its underlying blocks.
 */
static int
__slvg_trk_free(WT_SESSION_IMPL *session, WT_TRACK **trkp, int free_on_last_ref)
{
	WT_TRACK *trk;

	trk = *trkp;
	*trkp = NULL;

	/*
	 * If we're the last user of shared information, clean up.
	 */
	WT_ASSERT(session, trk->shared->ref > 0);
	if (--trk->shared->ref == 0) {
		/*
		 * If the free-on-last-ref flag is set, this chunk isn't going
		 * to use the backing physical blocks.  As we're the last user
		 * of those blocks, nobody is going to use them and they can be
		 * discarded.
		 */
		if (free_on_last_ref)
			WT_RET(__slvg_trk_free_block(session, trk));

		__wt_free(session, trk->trk_addr);

		__slvg_trk_free_addr(session, trk);

		__wt_free(session, trk->trk_ovfl_slot);

		__wt_free(session, trk->shared);
	}

	if (trk->ss->page_type == WT_PAGE_ROW_LEAF) {
		__wt_buf_free(session, &trk->row_start);
		__wt_buf_free(session, &trk->row_stop);
	}

	__wt_free(session, trk);

	return (0);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
new file mode 100644
index 00000000000..3da0bcf346c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -0,0 +1,190 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
static int __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *);
static int __stat_page_row_leaf(WT_PAGE *, WT_DSRC_STATS *);

/*
 * __wt_btree_stat_init --
 *	Initialize the Btree statistics.
 */
int
__wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_DSRC_STATS *stats;
	WT_REF *next_walk;

	btree = S2BT(session);
	bm = btree->bm;
	stats = &btree->dhandle->stats;

	/* Fill in the block-manager statistics first. */
	WT_RET(bm->stat(bm, session, stats));

	WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt);
	WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth);
	WT_STAT_SET(stats, btree_maxintlitem, btree->maxintlitem);
	WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage);
	WT_STAT_SET(stats, btree_maxleafitem, btree->maxleafitem);
	WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage);

	/* Everything else is really, really expensive. */
	if (!F_ISSET(cst, WT_CONN_STAT_ALL))
		return (0);

	/* Walk every page in the tree, aggregating per-page statistics. */
	next_walk = NULL;
	while ((ret =
	    __wt_tree_walk(session, &next_walk, 0)) == 0 && next_walk != NULL)
		WT_RET(__stat_page(session, next_walk->page, stats));
	return (ret == WT_NOTFOUND ? 0 : ret);
}

/*
 * __stat_page --
 *	Stat any Btree page.
 */
static int
__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
{
	WT_PAGE_INDEX *pindex;

	/*
	 * All internal pages and overflow pages are trivial, all we track is
	 * a count of the page type.
	 */
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		WT_STAT_INCR(stats, btree_column_fix);
		WT_STAT_INCRV(stats, btree_entries, page->pg_fix_entries);
		break;
	case WT_PAGE_COL_INT:
		WT_STAT_INCR(stats, btree_column_internal);
		pindex = WT_INTL_INDEX_COPY(page);
		WT_STAT_INCRV(stats, btree_entries, pindex->entries);
		break;
	case WT_PAGE_COL_VAR:
		WT_RET(__stat_page_col_var(page, stats));
		break;
	case WT_PAGE_OVFL:
		WT_STAT_INCR(stats, btree_overflow);
		break;
	case WT_PAGE_ROW_INT:
		WT_STAT_INCR(stats, btree_row_internal);
		pindex = WT_INTL_INDEX_COPY(page);
		WT_STAT_INCRV(stats, btree_entries, pindex->entries);
		break;
	case WT_PAGE_ROW_LEAF:
		WT_RET(__stat_page_row_leaf(page, stats));
		break;
	WT_ILLEGAL_VALUE(session);	/* Default: unexpected page type. */
	}
	return (0);
}

/*
 * __stat_page_col_var --
 *	Stat a WT_PAGE_COL_VAR page.
 */
static int
__stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
{
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_UPDATE *upd;
	uint32_t i;
	int orig_deleted;

	unpack = &_unpack;

	WT_STAT_INCR(stats, btree_column_variable);

	/*
	 * Walk the page, counting regular and overflow data items, and checking
	 * to be sure any updates weren't deletions.  If the item was updated,
	 * assume it was updated by an item of the same size (it's expensive to
	 * figure out if it will require the same space or not, especially if
	 * there's Huffman encoding).
	 */
	WT_COL_FOREACH(page, cip, i) {
		if ((cell = WT_COL_PTR(page, cip)) == NULL) {
			orig_deleted = 1;
			WT_STAT_INCR(stats, btree_column_deleted);
		} else {
			orig_deleted = 0;
			__wt_cell_unpack(cell, unpack);
			WT_STAT_INCRV(
			    stats, btree_entries, __wt_cell_rle(unpack));
		}

		/*
		 * Walk the insert list, checking for changes.  For each insert
		 * we find, correct the original count based on its state.
		 */
		WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
			upd = ins->upd;
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				if (orig_deleted)
					continue;
				WT_STAT_INCR(stats, btree_column_deleted);
				WT_STAT_DECR(stats, btree_entries);
			} else {
				if (!orig_deleted)
					continue;
				WT_STAT_DECR(stats, btree_column_deleted);
				WT_STAT_INCR(stats, btree_entries);
			}
		}
	}
	return (0);
}

/*
 * __stat_page_row_leaf --
 *	Stat a WT_PAGE_ROW_LEAF page.
 */
static int
__stat_page_row_leaf(WT_PAGE *page, WT_DSRC_STATS *stats)
{
	WT_INSERT *ins;
	WT_ROW *rip;
	WT_UPDATE *upd;
	uint32_t cnt, i;

	WT_STAT_INCR(stats, btree_row_leaf);

	/*
	 * Stat any K/V pairs inserted into the page before the first from-disk
	 * key on the page.
	 */
	cnt = 0;
	WT_SKIP_FOREACH(ins, WT_ROW_INSERT_SMALLEST(page))
		if (!WT_UPDATE_DELETED_ISSET(ins->upd))
			++cnt;

	/* Stat the page's K/V pairs. */
	WT_ROW_FOREACH(page, rip, i) {
		upd = WT_ROW_UPDATE(page, rip);
		if (upd == NULL || !WT_UPDATE_DELETED_ISSET(upd))
			++cnt;

		/* Stat inserted K/V pairs. */
		WT_SKIP_FOREACH(ins, WT_ROW_INSERT(page, rip))
			if (!WT_UPDATE_DELETED_ISSET(ins->upd))
				++cnt;
	}

	WT_STAT_INCRV(stats, btree_entries, cnt);

	return (0);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
new file mode 100644
index 00000000000..607e7919513
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -0,0 +1,373 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __sync_file --
 *	Flush pages for a specific file.
+ */ +static int +__sync_file(WT_SESSION_IMPL *session, int syncop) +{ + struct timespec end, start; + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_MODIFY *mod; + WT_REF *walk; + WT_TXN *txn; + uint64_t internal_bytes, leaf_bytes; + uint64_t internal_pages, leaf_pages; + uint32_t flags; + + btree = S2BT(session); + + flags = WT_READ_CACHE | WT_READ_NO_GEN; + walk = NULL; + txn = &session->txn; + + internal_bytes = leaf_bytes = 0; + internal_pages = leaf_pages = 0; + if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) + WT_RET(__wt_epoch(session, &start)); + + switch (syncop) { + case WT_SYNC_WRITE_LEAVES: + /* + * Write all immediately available, dirty in-cache leaf pages. + * + * Writing the leaf pages is done without acquiring a high-level + * lock, serialize so multiple threads don't walk the tree at + * the same time. + */ + if (!btree->modified) + return (0); + __wt_spin_lock(session, &btree->flush_lock); + if (!btree->modified) { + __wt_spin_unlock(session, &btree->flush_lock); + return (0); + } + + flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL; + for (walk = NULL;;) { + WT_ERR(__wt_tree_walk(session, &walk, flags)); + if (walk == NULL) + break; + + /* Write dirty pages if nobody beat us to it. */ + page = walk->page; + if (__wt_page_is_modified(page)) { + if (txn->isolation == TXN_ISO_READ_COMMITTED) + __wt_txn_refresh(session, 1); + leaf_bytes += page->memory_footprint; + ++leaf_pages; + WT_ERR(__wt_rec_write(session, walk, NULL, 0)); + } + } + break; + case WT_SYNC_CHECKPOINT: + /* + * We cannot check the tree modified flag in the case of a + * checkpoint, the checkpoint code has already cleared it. + * + * Writing the leaf pages is done without acquiring a high-level + * lock, serialize so multiple threads don't walk the tree at + * the same time. We're holding the schema lock, but need the + * lower-level lock as well. 
+ */ + __wt_spin_lock(session, &btree->flush_lock); + + /* + * When internal pages are being reconciled by checkpoint their + * child pages cannot disappear from underneath them or be split + * into them, nor can underlying blocks be freed until the block + * lists for the checkpoint are stable. Set the checkpointing + * flag to block eviction of dirty pages until the checkpoint's + * internal page pass is complete, then wait for any existing + * eviction to complete. + */ + btree->checkpointing = 1; + + if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { + WT_ERR(__wt_evict_file_exclusive_on(session)); + __wt_evict_file_exclusive_off(session); + } + + /* Write all dirty in-cache pages. */ + flags |= WT_READ_NO_EVICT; + for (walk = NULL;;) { + WT_ERR(__wt_tree_walk(session, &walk, flags)); + if (walk == NULL) + break; + + /* + * Write dirty pages, unless we can be sure they only + * became dirty after the checkpoint started. + * + * We can skip dirty pages if: + * (1) they are leaf pages; + * (2) there is a snapshot transaction active (which + * is the case in ordinary application checkpoints + * but not all internal cases); and + * (3) the first dirty update on the page is + * sufficiently recent that the checkpoint + * transaction would skip them. 
+ */ + page = walk->page; + mod = page->modify; + if (__wt_page_is_modified(page) && + (WT_PAGE_IS_INTERNAL(page) || + !F_ISSET(txn, TXN_HAS_SNAPSHOT) || + TXNID_LE(mod->first_dirty_txn, txn->snap_max))) { + if (WT_PAGE_IS_INTERNAL(page)) { + internal_bytes += + page->memory_footprint; + ++internal_pages; + } else { + leaf_bytes += page->memory_footprint; + ++leaf_pages; + } + WT_ERR(__wt_rec_write(session, walk, NULL, 0)); + } + } + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { + WT_ERR(__wt_epoch(session, &end)); + WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT, + "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64 + " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64 + " bytes, %" PRIu64 " pages of internal\n\t" + "Took: %" PRIu64 "ms", + syncop == WT_SYNC_WRITE_LEAVES ? + "WRITE_LEAVES" : "CHECKPOINT", + leaf_bytes, leaf_pages, internal_bytes, internal_pages, + WT_TIMEDIFF(end, start) / WT_MILLION)); + } + +err: /* On error, clear any left-over tree walk. */ + if (walk != NULL) + WT_TRET(__wt_page_release(session, walk, flags)); + + if (txn->isolation == TXN_ISO_READ_COMMITTED && session->ncursors == 0) + __wt_txn_release_snapshot(session); + + if (btree->checkpointing) { + /* + * Clear the checkpoint flag and push the change; not required, + * but publishing the change means stalled eviction gets moving + * as soon as possible. + */ + btree->checkpointing = 0; + WT_FULL_BARRIER(); + + /* + * Wake the eviction server, in case application threads have + * stalled while the eviction server decided it couldn't make + * progress. Without this, application threads will be stalled + * until the eviction server next wakes. + */ + WT_TRET(__wt_evict_server_wake(session)); + } + + __wt_spin_unlock(session, &btree->flush_lock); + + /* + * Leaves are written before a checkpoint (or as part of a file close, + * before checkpointing the file). Start a flush to stable storage, + * but don't wait for it. 
+ */ + if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES) + WT_RET(btree->bm->sync(btree->bm, session, 1)); + + return (ret); +} + +/* + * __evict_file -- + * Discard pages for a specific file. + */ +static int +__evict_file(WT_SESSION_IMPL *session, int syncop) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + WT_REF *next_ref, *ref; + int eviction_enabled; + + btree = S2BT(session); + eviction_enabled = !F_ISSET(btree, WT_BTREE_NO_EVICTION); + + /* + * We need exclusive access to the file -- disable ordinary eviction + * and drain any blocks already queued. + */ + if (eviction_enabled) + WT_RET(__wt_evict_file_exclusive_on(session)); + + /* Make sure the oldest transaction ID is up-to-date. */ + __wt_txn_update_oldest(session); + + /* Walk the tree, discarding pages. */ + next_ref = NULL; + WT_ERR(__wt_tree_walk( + session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT)); + while ((ref = next_ref) != NULL) { + page = ref->page; + + /* + * Eviction can fail when a page in the evicted page's subtree + * switches state. For example, if we don't evict a page marked + * empty, because we expect it to be merged into its parent, it + * might no longer be empty after it's reconciled, in which case + * eviction of its parent would fail. We can either walk the + * tree multiple times (until it's finally empty), or reconcile + * each page to get it to its final state before considering if + * it's an eviction target or will be merged into its parent. + * + * Don't limit this test to any particular page type, that tends + * to introduce bugs when the reconciliation of other page types + * changes, and there's no advantage to doing so. + * + * Eviction can also fail because an update cannot be written. + * If sessions have disjoint sets of files open, updates in a + * no-longer-referenced file may not yet be globally visible, + * and the write will fail with EBUSY. Our caller handles that + * error, retrying later. 
+ */ + if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page)) + WT_ERR(__wt_rec_write(session, ref, NULL, WT_EVICTING)); + + /* + * We can't evict the page just returned to us (it marks our + * place in the tree), so move the walk to one page ahead of + * the page being evicted. Note, we reconciled the returned + * page first: if reconciliation of that page were to change + * the shape of the tree, and we did the next walk call before + * the reconciliation, the next walk call could miss a page in + * the tree. + */ + WT_ERR(__wt_tree_walk( + session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT)); + + switch (syncop) { + case WT_SYNC_CLOSE: + /* + * Evict the page. + * Do not attempt to evict pages expected to be merged + * into their parents, with the exception that the root + * page can't be merged, it must be written. + */ + if (__wt_ref_is_root(ref) || + page->modify == NULL || + !F_ISSET(page->modify, WT_PM_REC_EMPTY)) + WT_ERR(__wt_rec_evict(session, ref, 1)); + break; + case WT_SYNC_DISCARD: + case WT_SYNC_DISCARD_FORCE: + /* + * Discard the page, whether clean or dirty. + * + * Clean the page, both to keep statistics correct, and + * to let the page-discard function assert no dirty page + * is ever discarded. + */ + if (__wt_page_is_modified(page)) { + page->modify->write_gen = 0; + __wt_cache_dirty_decr(session, page); + } + /* + * If the page contains an update that is too recent to + * evict, stop. This should never happen during + * connection close, and in other paths our caller + * should be prepared to deal with this case. + */ + if (syncop == WT_SYNC_DISCARD && + page->modify != NULL && + !__wt_txn_visible_all(session, + page->modify->rec_max_txn)) + return (EBUSY); + if (syncop == WT_SYNC_DISCARD_FORCE) + F_SET(session, WT_SESSION_DISCARD_FORCE); + __wt_ref_out(session, ref); + /* + * In case we don't discard the whole tree, make sure + * that future readers know that the page is no longer + * in cache. 
+ */ + ref->state = WT_REF_DISK; + F_CLR(session, WT_SESSION_DISCARD_FORCE); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + } + + if (0) { +err: /* On error, clear any left-over tree walk. */ + if (next_ref != NULL) + WT_TRET(__wt_page_release( + session, next_ref, WT_READ_NO_EVICT)); + } + + if (eviction_enabled) + __wt_evict_file_exclusive_off(session); + + return (ret); +} + +/* + * __wt_cache_op -- + * Cache operations. + */ +int +__wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op) +{ + WT_DECL_RET; + WT_BTREE *btree; + + btree = S2BT(session); + + switch (op) { + case WT_SYNC_CHECKPOINT: + case WT_SYNC_CLOSE: + /* + * Set the checkpoint reference for reconciliation; it's ugly, + * but drilling a function parameter path from our callers to + * the reconciliation of the tree's root page is going to be + * worse. + */ + WT_ASSERT(session, btree->ckpt == NULL); + btree->ckpt = ckptbase; + break; + } + + switch (op) { + case WT_SYNC_CHECKPOINT: + case WT_SYNC_WRITE_LEAVES: + WT_ERR(__sync_file(session, op)); + break; + case WT_SYNC_CLOSE: + case WT_SYNC_DISCARD: + case WT_SYNC_DISCARD_FORCE: + WT_ERR(__evict_file(session, op)); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + +err: switch (op) { + case WT_SYNC_CHECKPOINT: + case WT_SYNC_CLOSE: + btree->ckpt = NULL; + break; + } + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_upgrade.c b/src/third_party/wiredtiger/src/btree/bt_upgrade.c new file mode 100644 index 00000000000..d65c8793fbb --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_upgrade.c @@ -0,0 +1,22 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_upgrade -- + * Upgrade a file. + */ +int +__wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_UNUSED(cfg); + + /* There's nothing to upgrade, yet. 
*/ + WT_RET(__wt_progress(session, NULL, 1)); + return (0); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c new file mode 100644 index 00000000000..e7caf02fd2f --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -0,0 +1,666 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * There's a bunch of stuff we pass around during verification, group it + * together to make the code prettier. + */ +typedef struct { + uint64_t record_total; /* Total record count */ + + WT_ITEM *max_key; /* Largest key */ + WT_ITEM *max_addr; /* Largest key page */ + + uint64_t fcnt; /* Progress counter */ + + int dump_address; /* Debugging hooks */ + int dump_pages; + int dump_blocks; + + WT_ITEM *tmp1; /* Temporary buffer */ + WT_ITEM *tmp2; /* Temporary buffer */ +} WT_VSTUFF; + +static void __verify_checkpoint_reset(WT_VSTUFF *); +static int __verify_config(WT_SESSION_IMPL *, const char *[], WT_VSTUFF *); +static int __verify_config_offsets(WT_SESSION_IMPL *, const char *[], int *); +static int __verify_overflow( + WT_SESSION_IMPL *, const uint8_t *, size_t, WT_VSTUFF *); +static int __verify_overflow_cell( + WT_SESSION_IMPL *, WT_REF *, int *, WT_VSTUFF *); +static int __verify_row_int_key_order( + WT_SESSION_IMPL *, WT_PAGE *, WT_REF *, uint32_t, WT_VSTUFF *); +static int __verify_row_leaf_key_order( + WT_SESSION_IMPL *, WT_REF *, WT_VSTUFF *); +static int __verify_tree(WT_SESSION_IMPL *, WT_REF *, WT_VSTUFF *); + +/* + * __wt_verify -- + * Verify a file. 
+ */ +int +__wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_CKPT *ckptbase, *ckpt; + WT_DECL_RET; + WT_VSTUFF *vs, _vstuff; + size_t root_addr_size; + uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE]; + int bm_start, quit; + + btree = S2BT(session); + bm = btree->bm; + ckptbase = NULL; + bm_start = 0; + + WT_CLEAR(_vstuff); + vs = &_vstuff; + WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key)); + WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr)); + WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1)); + WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2)); + + /* Check configuration strings. */ + WT_ERR(__verify_config(session, cfg, vs)); + + /* Optionally dump specific block offsets. */ + WT_ERR(__verify_config_offsets(session, cfg, &quit)); + if (quit) + goto done; + + /* Get a list of the checkpoints for this file. */ + WT_ERR( + __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase)); + + /* Inform the underlying block manager we're verifying. */ + WT_ERR(bm->verify_start(bm, session, ckptbase)); + bm_start = 1; + + /* Loop through the file's checkpoints, verifying each one. */ + WT_CKPT_FOREACH(ckptbase, ckpt) { + WT_ERR(__wt_verbose(session, WT_VERB_VERIFY, + "%s: checkpoint %s", btree->dhandle->name, ckpt->name)); + + /* Fake checkpoints require no work. */ + if (F_ISSET(ckpt, WT_CKPT_FAKE)) + continue; + + /* House-keeping between checkpoints. */ + __verify_checkpoint_reset(vs); + +#ifdef HAVE_DIAGNOSTIC + if (vs->dump_address || vs->dump_blocks || vs->dump_pages) + WT_ERR(__wt_msg(session, "%s: checkpoint %s", + btree->dhandle->name, ckpt->name)); +#endif + /* Load the checkpoint. */ + WT_ERR(bm->checkpoint_load(bm, session, + ckpt->raw.data, ckpt->raw.size, + root_addr, &root_addr_size, 1)); + + /* + * Ignore trees with no root page. + * Verify, then discard the checkpoint from the cache. 
+ */ + if (root_addr_size != 0 && + (ret = __wt_btree_tree_open( + session, root_addr, root_addr_size)) == 0) { +#ifdef HAVE_DIAGNOSTIC + if (vs->dump_address || + vs->dump_blocks || vs->dump_pages) + WT_ERR(__wt_msg(session, "Root: %s %s", + __wt_addr_string(session, + root_addr, root_addr_size, vs->tmp1), + __wt_page_type_string( + btree->root.page->type))); +#endif + ret = __verify_tree(session, &btree->root, vs); + + WT_TRET(__wt_cache_op(session, NULL, WT_SYNC_DISCARD)); + } + + /* Unload the checkpoint. */ + WT_TRET(bm->checkpoint_unload(bm, session)); + WT_ERR(ret); + } + +done: +err: /* Inform the underlying block manager we're done. */ + if (bm_start) + WT_TRET(bm->verify_end(bm, session)); + + /* Discard the list of checkpoints. */ + if (ckptbase != NULL) + __wt_meta_ckptlist_free(session, ckptbase); + + /* Wrap up reporting. */ + WT_TRET(__wt_progress(session, NULL, vs->fcnt)); + + /* Free allocated memory. */ + __wt_scr_free(&vs->max_key); + __wt_scr_free(&vs->max_addr); + __wt_scr_free(&vs->tmp1); + __wt_scr_free(&vs->tmp2); + + return (ret); +} + +/* + * __verify_config -- + * Debugging: verification supports dumping pages in various formats. + */ +static int +__verify_config(WT_SESSION_IMPL *session, const char *cfg[], WT_VSTUFF *vs) +{ + WT_CONFIG_ITEM cval; + + WT_RET(__wt_config_gets(session, cfg, "dump_address", &cval)); + vs->dump_address = cval.val != 0; + + WT_RET(__wt_config_gets(session, cfg, "dump_blocks", &cval)); + vs->dump_blocks = cval.val != 0; + + WT_RET(__wt_config_gets(session, cfg, "dump_pages", &cval)); + vs->dump_pages = cval.val != 0; + +#if !defined(HAVE_DIAGNOSTIC) + if (vs->dump_address || vs->dump_blocks || vs->dump_pages) + WT_RET_MSG(session, ENOTSUP, + "the WiredTiger library was not built in diagnostic mode"); +#endif + return (0); +} + +/* + * __verify_config_offsets -- + * Debugging: optionally dump specific blocks from the file. 
 */
static int
__verify_config_offsets(WT_SESSION_IMPL *session, const char *cfg[], int *quitp)
{
	WT_CONFIG list;
	WT_CONFIG_ITEM cval, k, v;
	WT_DECL_RET;
	u_long offset;

	*quitp = 0;

	WT_RET(__wt_config_gets(session, cfg, "dump_offsets", &cval));
	WT_RET(__wt_config_subinit(session, &list, &cval));
	while ((ret = __wt_config_next(&list, &k, &v)) == 0) {
		/*
		 * Quit after dumping the requested blocks. (That's hopefully
		 * what the user wanted, all of this stuff is just hooked into
		 * verify because that's where we "dump blocks" for debugging.)
		 */
		*quitp = 1;
		/* Each list entry must be a bare decimal file offset. */
		if (v.len != 0 || sscanf(k.str, "%lu", &offset) != 1)
			WT_RET_MSG(session, EINVAL,
			    "unexpected dump offset format");
#if !defined(HAVE_DIAGNOSTIC)
		WT_RET_MSG(session, ENOTSUP,
		    "the WiredTiger library was not built in diagnostic mode");
#else
		WT_TRET(
		    __wt_debug_offset_blind(session, (wt_off_t)offset, NULL));
#endif
	}
	/* WT_NOTFOUND marks the normal end of the configuration list. */
	return (ret == WT_NOTFOUND ? 0 : ret);
}

/*
 * __verify_checkpoint_reset --
 *	Reset anything needing to be reset for each new checkpoint verification.
 */
static void
__verify_checkpoint_reset(WT_VSTUFF *vs)
{
	/*
	 * Key order is per checkpoint, reset the data length that serves as a
	 * flag value.
	 */
	vs->max_addr->size = 0;

	/* Record total is per checkpoint, reset the record count. */
	vs->record_total = 0;
}

/*
 * __verify_tree --
 *	Verify a tree, recursively descending through it in depth-first fashion.
 * The page argument was physically verified (so we know it's correctly formed),
 * and the in-memory version built. Our job is to check logical relationships
 * in the page and in the tree.
+ */ +static int +__verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs) +{ + WT_BM *bm; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + WT_COL *cip; + WT_DECL_RET; + WT_PAGE *page; + WT_REF *child_ref; + uint64_t recno; + uint32_t entry, i; + int found; + + bm = S2BT(session)->bm; + page = ref->page; + + unpack = &_unpack; + WT_CLEAR(*unpack); /* -Wuninitialized */ + + WT_RET(__wt_verbose(session, WT_VERB_VERIFY, "%s %s", + __wt_page_addr_string(session, ref, vs->tmp1), + __wt_page_type_string(page->type))); +#ifdef HAVE_DIAGNOSTIC + if (vs->dump_address) + WT_RET(__wt_msg(session, "%s %s", + __wt_page_addr_string(session, ref, vs->tmp1), + __wt_page_type_string(page->type))); +#endif + + /* + * The page's physical structure was verified when it was read into + * memory by the read server thread, and then the in-memory version + * of the page was built. Now we make sure the page and tree are + * logically consistent. + * + * !!! + * The problem: (1) the read server has to build the in-memory version + * of the page because the read server is the thread that flags when + * any thread can access the page in the tree; (2) we can't build the + * in-memory version of the page until the physical structure is known + * to be OK, so the read server has to verify at least the physical + * structure of the page; (3) doing complete page verification requires + * reading additional pages (for example, overflow keys imply reading + * overflow pages in order to test the key's order in the page); (4) + * the read server cannot read additional pages because it will hang + * waiting on itself. For this reason, we split page verification + * into a physical verification, which allows the in-memory version + * of the page to be built, and then a subsequent logical verification + * which happens here. + * + * Report progress every 10 pages. 
+ */ + if (++vs->fcnt % 10 == 0) + WT_RET(__wt_progress(session, NULL, vs->fcnt)); + +#ifdef HAVE_DIAGNOSTIC + /* Optionally dump the blocks or page in debugging mode. */ + if (vs->dump_blocks) + WT_RET(__wt_debug_disk(session, page->dsk, NULL)); + if (vs->dump_pages) + WT_RET(__wt_debug_page(session, page, NULL)); +#endif + + /* + * Column-store key order checks: check the page's record number and + * then update the total record count. + */ + switch (page->type) { + case WT_PAGE_COL_FIX: + recno = page->pg_fix_recno; + goto recno_chk; + case WT_PAGE_COL_INT: + recno = page->pg_intl_recno; + goto recno_chk; + case WT_PAGE_COL_VAR: + recno = page->pg_var_recno; +recno_chk: if (recno != vs->record_total + 1) + WT_RET_MSG(session, WT_ERROR, + "page at %s has a starting record of %" PRIu64 + " when the expected starting record is %" PRIu64, + __wt_page_addr_string(session, ref, vs->tmp1), + recno, vs->record_total + 1); + break; + } + switch (page->type) { + case WT_PAGE_COL_FIX: + vs->record_total += page->pg_fix_entries; + break; + case WT_PAGE_COL_VAR: + recno = 0; + WT_COL_FOREACH(page, cip, i) + if ((cell = WT_COL_PTR(page, cip)) == NULL) + ++recno; + else { + __wt_cell_unpack(cell, unpack); + recno += __wt_cell_rle(unpack); + } + vs->record_total += recno; + break; + } + + /* + * Row-store leaf page key order check: it's a depth-first traversal, + * the first key on this page should be larger than any key previously + * seen. + */ + switch (page->type) { + case WT_PAGE_ROW_LEAF: + WT_RET(__verify_row_leaf_key_order(session, ref, vs)); + break; + } + + /* If it's not the root page, unpack the parent cell. */ + if (!__wt_ref_is_root(ref)) { + __wt_cell_unpack(ref->addr, unpack); + + /* Compare the parent cell against the page type. 
*/ + switch (page->type) { + case WT_PAGE_COL_FIX: + if (unpack->raw != WT_CELL_ADDR_LEAF_NO) + goto celltype_err; + break; + case WT_PAGE_COL_VAR: + case WT_PAGE_ROW_LEAF: + if (unpack->raw != WT_CELL_ADDR_LEAF && + unpack->raw != WT_CELL_ADDR_LEAF_NO) + goto celltype_err; + break; + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + if (unpack->raw != WT_CELL_ADDR_INT) +celltype_err: WT_RET_MSG(session, WT_ERROR, + "page at %s, of type %s, is referenced in " + "its parent by a cell of type %s", + __wt_page_addr_string( + session, ref, vs->tmp1), + __wt_page_type_string(page->type), + __wt_cell_type_string(unpack->raw)); + break; + } + } + + /* + * Check overflow pages. We check overflow cells separately from other + * tests that walk the page as it's simpler, and I don't care much how + * fast table verify runs. + */ + switch (page->type) { + case WT_PAGE_COL_VAR: + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + WT_RET(__verify_overflow_cell(session, ref, &found, vs)); + if (__wt_ref_is_root(ref) || page->type == WT_PAGE_ROW_INT) + break; + + /* + * Object if a leaf-no-overflow address cell references a page + * with overflow keys, but don't object if a leaf address cell + * references a page without overflow keys. Reconciliation + * doesn't guarantee every leaf page without overflow items will + * be a leaf-no-overflow type. + */ + if (found && unpack->raw == WT_CELL_ADDR_LEAF_NO) + WT_RET_MSG(session, WT_ERROR, + "page at %s, of type %s and referenced in its " + "parent by a cell of type %s, contains overflow " + "items", + __wt_page_addr_string(session, ref, vs->tmp1), + __wt_page_type_string(page->type), + __wt_cell_type_string(WT_CELL_ADDR_LEAF_NO)); + break; + } + + /* Check tree connections and recursively descend the tree. */ + switch (page->type) { + case WT_PAGE_COL_INT: + /* For each entry in an internal page, verify the subtree. 
*/ + entry = 0; + WT_INTL_FOREACH_BEGIN(session, page, child_ref) { + /* + * It's a depth-first traversal: this entry's starting + * record number should be 1 more than the total records + * reviewed to this point. + */ + ++entry; + if (child_ref->key.recno != vs->record_total + 1) { + WT_RET_MSG(session, WT_ERROR, + "the starting record number in entry %" + PRIu32 " of the column internal page at " + "%s is %" PRIu64 " and the expected " + "starting record number is %" PRIu64, + entry, + __wt_page_addr_string( + session, child_ref, vs->tmp1), + child_ref->key.recno, + vs->record_total + 1); + } + + /* Verify the subtree. */ + WT_RET(__wt_page_in(session, child_ref, 0)); + ret = __verify_tree(session, child_ref, vs); + WT_TRET(__wt_page_release(session, child_ref, 0)); + WT_RET(ret); + + __wt_cell_unpack(child_ref->addr, unpack); + WT_RET(bm->verify_addr( + bm, session, unpack->data, unpack->size)); + } WT_INTL_FOREACH_END; + break; + case WT_PAGE_ROW_INT: + /* For each entry in an internal page, verify the subtree. */ + entry = 0; + WT_INTL_FOREACH_BEGIN(session, page, child_ref) { + /* + * It's a depth-first traversal: this entry's starting + * key should be larger than the largest key previously + * reviewed. + * + * The 0th key of any internal page is magic, and we + * can't test against it. + */ + ++entry; + if (entry != 1) + WT_RET(__verify_row_int_key_order( + session, page, child_ref, entry, vs)); + + /* Verify the subtree. */ + WT_RET(__wt_page_in(session, child_ref, 0)); + ret = __verify_tree(session, child_ref, vs); + WT_TRET(__wt_page_release(session, child_ref, 0)); + WT_RET(ret); + + __wt_cell_unpack(child_ref->addr, unpack); + WT_RET(bm->verify_addr( + bm, session, unpack->data, unpack->size)); + } WT_INTL_FOREACH_END; + break; + } + return (0); +} + +/* + * __verify_row_int_key_order -- + * Compare a key on an internal page to the largest key we've seen so + * far; update the largest key we've seen so far to that key. 
 */
static int
__verify_row_int_key_order(WT_SESSION_IMPL *session,
    WT_PAGE *parent, WT_REF *ref, uint32_t entry, WT_VSTUFF *vs)
{
	WT_BTREE *btree;
	WT_ITEM item;
	int cmp;

	btree = S2BT(session);

	/* The maximum key is set, we updated it from a leaf page first. */
	WT_ASSERT(session, vs->max_addr->size != 0);

	/* Get the parent page's internal key. */
	__wt_ref_key(parent, ref, &item.data, &item.size);

	/*
	 * Compare the key against the largest key we've seen so far.
	 *
	 * Internal keys (other than each page's 0th key, which the caller
	 * skips) must sort strictly after everything already seen, so an
	 * equal comparison fails the check as well.
	 */
	WT_RET(__wt_compare(
	    session, btree->collator, &item, vs->max_key, &cmp));
	if (cmp <= 0)
		WT_RET_MSG(session, WT_ERROR,
		    "the internal key in entry %" PRIu32 " on the page at %s "
		    "sorts before the last key appearing on page %s, earlier "
		    "in the tree",
		    entry,
		    __wt_page_addr_string(session, ref, vs->tmp1),
		    (char *)vs->max_addr->data);

	/* Update the largest key we've seen to the key just checked. */
	WT_RET(__wt_buf_set(session, vs->max_key, item.data, item.size));
	(void)__wt_page_addr_string(session, ref, vs->max_addr);

	return (0);
}

/*
 * __verify_row_leaf_key_order --
 *	Compare the first key on a leaf page to the largest key we've seen so
 * far; update the largest key we've seen so far to the last key on the page.
 */
static int
__verify_row_leaf_key_order(
    WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
{
	WT_BTREE *btree;
	WT_PAGE *page;
	int cmp;

	btree = S2BT(session);
	page = ref->page;

	/*
	 * If a tree is empty (just created), it won't have keys; if there
	 * are no keys, we're done.
	 */
	if (page->pg_row_entries == 0)
		return (0);

	/*
	 * We visit our first leaf page before setting the maximum key (the 0th
	 * keys on the internal pages leading to the smallest leaf in the tree
	 * are all empty entries).
	 */
	if (vs->max_addr->size != 0) {
		WT_RET(__wt_row_leaf_key_copy(
		    session, page, page->pg_row_d, vs->tmp1));

		/*
		 * Compare the key against the largest key we've seen so far.
+ * + * If we're comparing against a key taken from an internal page, + * we can compare equal (which is an expected path, the internal + * page key is often a copy of the leaf page's first key). But, + * in the case of the 0th slot on an internal page, the last key + * we've seen was a key from a previous leaf page, and it's not + * OK to compare equally in that case. + */ + WT_RET(__wt_compare(session, + btree->collator, vs->tmp1, (WT_ITEM *)vs->max_key, &cmp)); + if (cmp < 0) + WT_RET_MSG(session, WT_ERROR, + "the first key on the page at %s sorts equal to or " + "less than a key appearing on the page at %s, " + "earlier in the tree", + __wt_page_addr_string(session, ref, vs->tmp1), + (char *)vs->max_addr->data); + } + + /* Update the largest key we've seen to the last key on this page. */ + WT_RET(__wt_row_leaf_key_copy(session, page, + page->pg_row_d + (page->pg_row_entries - 1), vs->max_key)); + (void)__wt_page_addr_string(session, ref, vs->max_addr); + + return (0); +} + +/* + * __verify_overflow_cell -- + * Verify any overflow cells on the page. + */ +static int +__verify_overflow_cell( + WT_SESSION_IMPL *session, WT_REF *ref, int *found, WT_VSTUFF *vs) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + WT_DECL_RET; + const WT_PAGE_HEADER *dsk; + uint32_t cell_num, i; + + btree = S2BT(session); + unpack = &_unpack; + *found = 0; + + /* + * If a tree is empty (just created), it won't have a disk image; + * if there is no disk image, we're done. + */ + if ((dsk = ref->page->dsk) == NULL) + return (0); + + /* Walk the disk page, verifying pages referenced by overflow cells. 
*/ + cell_num = 0; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + ++cell_num; + __wt_cell_unpack(cell, unpack); + switch (unpack->type) { + case WT_CELL_KEY_OVFL: + case WT_CELL_VALUE_OVFL: + *found = 1; + WT_ERR(__verify_overflow( + session, unpack->data, unpack->size, vs)); + break; + } + } + + return (0); + +err: WT_RET_MSG(session, ret, + "cell %" PRIu32 " on page at %s references an overflow item at %s " + "that failed verification", + cell_num - 1, + __wt_page_addr_string(session, ref, vs->tmp1), + __wt_addr_string(session, unpack->data, unpack->size, vs->tmp2)); +} + +/* + * __verify_overflow -- + * Read in an overflow page and check it. + */ +static int +__verify_overflow(WT_SESSION_IMPL *session, + const uint8_t *addr, size_t addr_size, WT_VSTUFF *vs) +{ + WT_BM *bm; + const WT_PAGE_HEADER *dsk; + + bm = S2BT(session)->bm; + + /* Read and verify the overflow item. */ + WT_RET(__wt_bt_read(session, vs->tmp1, addr, addr_size)); + + /* + * The physical page has already been verified, but we haven't confirmed + * it was an overflow page, only that it was a valid page. Confirm it's + * the type of page we expected. + */ + dsk = vs->tmp1->data; + if (dsk->type != WT_PAGE_OVFL) + WT_RET_MSG(session, WT_ERROR, + "overflow referenced page at %s is not an overflow page", + __wt_addr_string(session, addr, addr_size, vs->tmp1)); + + WT_RET(bm->verify_addr(bm, session, addr, addr_size)); + return (0); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c new file mode 100644 index 00000000000..a14f9f1078e --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c @@ -0,0 +1,739 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "wt_internal.h" + +static int __err_cell_corrupted(WT_SESSION_IMPL *, uint32_t, const char *); +static int __err_cell_type( + WT_SESSION_IMPL *, uint32_t, const char *, uint8_t, uint8_t); +static int __err_eof(WT_SESSION_IMPL *, uint32_t, const char *); +static int __verify_dsk_chunk( + WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, uint32_t); +static int __verify_dsk_col_fix( + WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *); +static int __verify_dsk_col_int( + WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *); +static int __verify_dsk_col_var( + WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *); +static int __verify_dsk_memsize( + WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_CELL *); +static int __verify_dsk_row( + WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *); + +#define WT_ERR_VRFY(session, ...) do { \ + if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \ + __wt_errx(session, __VA_ARGS__); \ + goto err; \ +} while (0) + +#define WT_RET_VRFY(session, ...) do { \ + if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \ + __wt_errx(session, __VA_ARGS__); \ + return (WT_ERROR); \ +} while (0) + +/* + * __wt_verify_dsk_image -- + * Verify a single block as read from disk. + */ +int +__wt_verify_dsk_image(WT_SESSION_IMPL *session, + const char *addr, const WT_PAGE_HEADER *dsk, size_t size) +{ + const uint8_t *p, *end; + u_int i; + uint8_t flags; + + /* Check the page type. */ + switch (dsk->type) { + case WT_PAGE_BLOCK_MANAGER: + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_INT: + case WT_PAGE_COL_VAR: + case WT_PAGE_OVFL: + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + break; + case WT_PAGE_INVALID: + default: + WT_RET_VRFY(session, + "page at %s has an invalid type of %" PRIu32, + addr, dsk->type); + } + + /* Check the page record number. 
*/ + switch (dsk->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_INT: + case WT_PAGE_COL_VAR: + if (dsk->recno != 0) + break; + WT_RET_VRFY(session, + "%s page at %s has a record number of zero", + __wt_page_type_string(dsk->type), addr); + case WT_PAGE_BLOCK_MANAGER: + case WT_PAGE_OVFL: + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + if (dsk->recno == 0) + break; + WT_RET_VRFY(session, + "%s page at %s has a non-zero record number", + __wt_page_type_string(dsk->type), addr); + } + + /* Check the page flags. */ + flags = dsk->flags; + if (LF_ISSET(WT_PAGE_COMPRESSED)) + LF_CLR(WT_PAGE_COMPRESSED); + if (dsk->type == WT_PAGE_ROW_LEAF) { + if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) && + LF_ISSET(WT_PAGE_EMPTY_V_NONE)) + WT_RET_VRFY(session, + "page at %s has invalid flags combination: 0x%" + PRIx8, + addr, dsk->flags); + if (LF_ISSET(WT_PAGE_EMPTY_V_ALL)) + LF_CLR(WT_PAGE_EMPTY_V_ALL); + if (LF_ISSET(WT_PAGE_EMPTY_V_NONE)) + LF_CLR(WT_PAGE_EMPTY_V_NONE); + } + if (flags != 0) + WT_RET_VRFY(session, + "page at %s has invalid flags set: 0x%" PRIx8, + addr, flags); + + /* Unused bytes */ + for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i) + if (*p != '\0') + WT_RET_VRFY(session, + "page at %s has non-zero unused page header bytes", + addr); + + /* + * Any bytes after the data chunk should be nul bytes; ignore if the + * size is 0, that allows easy checking of disk images where we don't + * have the size. + */ + if (size != 0) { + p = (uint8_t *)dsk + dsk->mem_size; + end = (uint8_t *)dsk + size; + for (; p < end; ++p) + if (*p != '\0') + WT_RET_VRFY(session, + "%s page at %s has non-zero trailing bytes", + __wt_page_type_string(dsk->type), addr); + } + + /* Verify the items on the page. 
*/ + switch (dsk->type) { + case WT_PAGE_COL_INT: + return (__verify_dsk_col_int(session, addr, dsk)); + case WT_PAGE_COL_FIX: + return (__verify_dsk_col_fix(session, addr, dsk)); + case WT_PAGE_COL_VAR: + return (__verify_dsk_col_var(session, addr, dsk)); + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + return (__verify_dsk_row(session, addr, dsk)); + case WT_PAGE_BLOCK_MANAGER: + case WT_PAGE_OVFL: + return (__verify_dsk_chunk(session, addr, dsk, dsk->u.datalen)); + WT_ILLEGAL_VALUE(session); + } + /* NOTREACHED */ +} + +/* + * __wt_verify_dsk -- + * Verify a single Btree page as read from disk. + */ +int +__wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf) +{ + return (__wt_verify_dsk_image(session, addr, buf->data, buf->size)); +} + +/* + * __verify_dsk_row -- + * Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it. + */ +static int +__verify_dsk_row( + WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + WT_DECL_ITEM(current); + WT_DECL_ITEM(last_ovfl); + WT_DECL_ITEM(last_pfx); + WT_DECL_RET; + WT_ITEM *last; + enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type; + void *huffman; + uint32_t cell_num, cell_type, i, key_cnt, prefix; + uint8_t *end; + int cmp; + + btree = S2BT(session); + bm = btree->bm; + unpack = &_unpack; + huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key; + + WT_ERR(__wt_scr_alloc(session, 0, ¤t)); + WT_ERR(__wt_scr_alloc(session, 0, &last_pfx)); + WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl)); + last = last_ovfl; + + end = (uint8_t *)dsk + dsk->mem_size; + + last_cell_type = FIRST; + cell_num = 0; + key_cnt = 0; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + ++cell_num; + + /* Carefully unpack the cell. */ + if (__wt_cell_unpack_safe(cell, unpack, end) != 0) { + ret = __err_cell_corrupted(session, cell_num, addr); + goto err; + } + + /* Check the raw and collapsed cell types. 
*/ + WT_ERR(__err_cell_type( + session, cell_num, addr, unpack->raw, dsk->type)); + WT_ERR(__err_cell_type( + session, cell_num, addr, unpack->type, dsk->type)); + cell_type = unpack->type; + + /* + * Check ordering relationships between the WT_CELL entries. + * For row-store internal pages, check for: + * two values in a row, + * two keys in a row, + * a value as the first cell on a page. + * For row-store leaf pages, check for: + * two values in a row, + * a value as the first cell on a page. + */ + switch (cell_type) { + case WT_CELL_KEY: + case WT_CELL_KEY_OVFL: + ++key_cnt; + switch (last_cell_type) { + case FIRST: + case WAS_VALUE: + break; + case WAS_KEY: + if (dsk->type == WT_PAGE_ROW_LEAF) + break; + WT_ERR_VRFY(session, + "cell %" PRIu32 " on page at %s is the " + "first of two adjacent keys", + cell_num - 1, addr); + } + last_cell_type = WAS_KEY; + break; + case WT_CELL_ADDR_DEL: + case WT_CELL_ADDR_INT: + case WT_CELL_ADDR_LEAF: + case WT_CELL_ADDR_LEAF_NO: + case WT_CELL_VALUE: + case WT_CELL_VALUE_OVFL: + switch (last_cell_type) { + case FIRST: + WT_ERR_VRFY(session, + "page at %s begins with a value", addr); + case WAS_KEY: + break; + case WAS_VALUE: + WT_ERR_VRFY(session, + "cell %" PRIu32 " on page at %s is the " + "first of two adjacent values", + cell_num - 1, addr); + } + last_cell_type = WAS_VALUE; + break; + } + + /* Check if any referenced item has a valid address. */ + switch (cell_type) { + case WT_CELL_ADDR_DEL: + case WT_CELL_ADDR_INT: + case WT_CELL_ADDR_LEAF: + case WT_CELL_ADDR_LEAF_NO: + case WT_CELL_KEY_OVFL: + case WT_CELL_VALUE_OVFL: + if (!bm->addr_valid(bm, + session, unpack->data, unpack->size)) + goto eof; + break; + } + + /* + * Remaining checks are for key order and prefix compression. + * If this cell isn't a key, we're done, move to the next cell. + * If this cell is an overflow item, instantiate the key and + * compare it with the last key. Otherwise, we have to deal + * with prefix compression. 
+ */ + switch (cell_type) { + case WT_CELL_KEY: + break; + case WT_CELL_KEY_OVFL: + WT_ERR(__wt_dsk_cell_data_ref( + session, dsk->type, unpack, current)); + goto key_compare; + default: + /* Not a key -- continue with the next cell. */ + continue; + } + + /* + * Prefix compression checks. + * + * Confirm the first non-overflow key on a page has a zero + * prefix compression count. + */ + prefix = unpack->prefix; + if (last_pfx->size == 0 && prefix != 0) + WT_ERR_VRFY(session, + "the %" PRIu32 " key on page at %s is the first " + "non-overflow key on the page and has a non-zero " + "prefix compression value", + cell_num, addr); + + /* Confirm the prefix compression count is possible. */ + if (cell_num > 1 && prefix > last->size) + WT_ERR_VRFY(session, + "key %" PRIu32 " on page at %s has a prefix " + "compression count of %" PRIu32 ", larger than " + "the length of the previous key, %" WT_SIZET_FMT, + cell_num, addr, prefix, last->size); + + /* + * If Huffman decoding required, unpack the cell to build the + * key, then resolve the prefix. Else, we can do it faster + * internally because we don't have to shuffle memory around as + * much. + */ + if (huffman != NULL) { + WT_ERR(__wt_dsk_cell_data_ref( + session, dsk->type, unpack, current)); + + /* + * If there's a prefix, make sure there's enough buffer + * space, then shift the decoded data past the prefix + * and copy the prefix into place. Take care with the + * pointers: current->data may be pointing inside the + * buffer. + */ + if (prefix != 0) { + WT_ERR(__wt_buf_grow( + session, current, prefix + current->size)); + memmove((uint8_t *)current->mem + prefix, + current->data, current->size); + memcpy(current->mem, last->data, prefix); + current->data = current->mem; + current->size += prefix; + } + } else { + /* + * Get the cell's data/length and make sure we have + * enough buffer space. + */ + WT_ERR(__wt_buf_init( + session, current, prefix + unpack->size)); + + /* Copy the prefix then the data into place. 
*/ + if (prefix != 0) + memcpy(current->mem, last->data, prefix); + memcpy((uint8_t *)current->mem + prefix, unpack->data, + unpack->size); + current->size = prefix + unpack->size; + } + +key_compare: /* + * Compare the current key against the last key. + * + * Be careful about the 0th key on internal pages: we only store + * the first byte and custom collators may not be able to handle + * truncated keys. + */ + if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) || + (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) { + WT_ERR(__wt_compare( + session, btree->collator, last, current, &cmp)); + if (cmp >= 0) + WT_ERR_VRFY(session, + "the %" PRIu32 " and %" PRIu32 " keys on " + "page at %s are incorrectly sorted", + cell_num - 2, cell_num, addr); + } + + /* + * Swap the buffers: last always references the last key entry, + * last_pfx and last_ovfl reference the last prefix-compressed + * and last overflow key entries. Current gets pointed to the + * buffer we're not using this time around, which is where the + * next key goes. + */ + last = current; + if (cell_type == WT_CELL_KEY) { + current = last_pfx; + last_pfx = last; + } else { + current = last_ovfl; + last_ovfl = last; + } + WT_ASSERT(session, last != current); + } + WT_ERR(__verify_dsk_memsize(session, addr, dsk, cell)); + + /* + * On row-store internal pages, and on row-store leaf pages, where the + * "no empty values" flag is set, the key count should be equal to half + * the number of physical entries. On row-store leaf pages where the + * "all empty values" flag is set, the key count should be equal to the + * number of physical entries. 
+ */ + if (dsk->type == WT_PAGE_ROW_INT && key_cnt * 2 != dsk->u.entries) + WT_ERR_VRFY(session, + "%s page at %s has a key count of %" PRIu32 " and a " + "physical entry count of %" PRIu32, + __wt_page_type_string(dsk->type), + addr, key_cnt, dsk->u.entries); + if (dsk->type == WT_PAGE_ROW_LEAF && + F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL) && + key_cnt != dsk->u.entries) + WT_ERR_VRFY(session, + "%s page at %s with the 'all empty values' flag set has a " + "key count of %" PRIu32 " and a physical entry count of %" + PRIu32, + __wt_page_type_string(dsk->type), + addr, key_cnt, dsk->u.entries); + if (dsk->type == WT_PAGE_ROW_LEAF && + F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE) && + key_cnt * 2 != dsk->u.entries) + WT_ERR_VRFY(session, + "%s page at %s with the 'no empty values' flag set has a " + "key count of %" PRIu32 " and a physical entry count of %" + PRIu32, + __wt_page_type_string(dsk->type), + addr, key_cnt, dsk->u.entries); + + if (0) { +eof: ret = __err_eof(session, cell_num, addr); + } + + if (0) { +err: if (ret == 0) + ret = WT_ERROR; + } + __wt_scr_free(¤t); + __wt_scr_free(&last_pfx); + __wt_scr_free(&last_ovfl); + return (ret); +} + +/* + * __verify_dsk_col_int -- + * Walk a WT_PAGE_COL_INT disk page and verify it. + */ +static int +__verify_dsk_col_int( + WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + uint32_t cell_num, i; + uint8_t *end; + + btree = S2BT(session); + bm = btree->bm; + unpack = &_unpack; + end = (uint8_t *)dsk + dsk->mem_size; + + cell_num = 0; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + ++cell_num; + + /* Carefully unpack the cell. */ + if (__wt_cell_unpack_safe(cell, unpack, end) != 0) + return (__err_cell_corrupted(session, cell_num, addr)); + + /* Check the raw and collapsed cell types. 
*/ + WT_RET(__err_cell_type( + session, cell_num, addr, unpack->raw, dsk->type)); + WT_RET(__err_cell_type( + session, cell_num, addr, unpack->type, dsk->type)); + + /* Check if any referenced item is entirely in the file. */ + if (!bm->addr_valid(bm, session, unpack->data, unpack->size)) + return (__err_eof(session, cell_num, addr)); + } + WT_RET(__verify_dsk_memsize(session, addr, dsk, cell)); + + return (0); +} + +/* + * __verify_dsk_col_fix -- + * Walk a WT_PAGE_COL_FIX disk page and verify it. + */ +static int +__verify_dsk_col_fix( + WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk) +{ + WT_BTREE *btree; + uint32_t datalen; + + btree = S2BT(session); + + datalen = __bitstr_size(btree->bitcnt * dsk->u.entries); + return (__verify_dsk_chunk(session, addr, dsk, datalen)); +} + +/* + * __verify_dsk_col_var -- + * Walk a WT_PAGE_COL_VAR disk page and verify it. + */ +static int +__verify_dsk_col_var( + WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + size_t last_size; + uint32_t cell_num, cell_type, i; + int last_deleted; + const uint8_t *last_data; + uint8_t *end; + + btree = S2BT(session); + bm = btree->bm; + unpack = &_unpack; + end = (uint8_t *)dsk + dsk->mem_size; + + last_data = NULL; + last_size = 0; + last_deleted = 0; + + cell_num = 0; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + ++cell_num; + + /* Carefully unpack the cell. */ + if (__wt_cell_unpack_safe(cell, unpack, end) != 0) + return (__err_cell_corrupted(session, cell_num, addr)); + + /* Check the raw and collapsed cell types. */ + WT_RET(__err_cell_type( + session, cell_num, addr, unpack->raw, dsk->type)); + WT_RET(__err_cell_type( + session, cell_num, addr, unpack->type, dsk->type)); + cell_type = unpack->type; + + /* Check if any referenced item is entirely in the file. 
*/ + if (cell_type == WT_CELL_VALUE_OVFL && + !bm->addr_valid(bm, session, unpack->data, unpack->size)) + return (__err_eof(session, cell_num, addr)); + + /* + * Compare the last two items and see if reconciliation missed + * a chance for RLE encoding. We don't have to care about data + * encoding or anything else, a byte comparison is enough. + */ + if (last_deleted == 1) { + if (cell_type == WT_CELL_DEL) + goto match_err; + } else + if (cell_type == WT_CELL_VALUE && + last_data != NULL && + last_size == unpack->size && + memcmp(last_data, unpack->data, last_size) == 0) +match_err: WT_RET_VRFY(session, + "data entries %" PRIu32 " and %" PRIu32 + " on page at %s are identical and should " + "have been run-length encoded", + cell_num - 1, cell_num, addr); + + switch (cell_type) { + case WT_CELL_DEL: + last_deleted = 1; + last_data = NULL; + break; + case WT_CELL_VALUE_OVFL: + last_deleted = 0; + last_data = NULL; + break; + case WT_CELL_VALUE: + last_deleted = 0; + last_data = unpack->data; + last_size = unpack->size; + break; + } + } + WT_RET(__verify_dsk_memsize(session, addr, dsk, cell)); + + return (0); +} + +/* + * __verify_dsk_memsize -- + * Verify the last cell on the page matches the page's memory size. + */ +static int +__verify_dsk_memsize(WT_SESSION_IMPL *session, + const char *addr, const WT_PAGE_HEADER *dsk, WT_CELL *cell) +{ + size_t len; + + /* + * We use the fact that cells exactly fill a page to detect the case of + * a row-store leaf page where the last cell is a key (that is, there's + * no subsequent value cell). Check for any page type containing cells. + */ + len = WT_PTRDIFF((uint8_t *)dsk + dsk->mem_size, cell); + if (len == 0) + return (0); + WT_RET_VRFY(session, + "%s page at %s has %" WT_SIZET_FMT " unexpected bytes of data " + "after the last cell", + __wt_page_type_string(dsk->type), addr, len); +} + +/* + * __verify_dsk_chunk -- + * Verify a Chunk O' Data on a Btree page. 
+ */ +static int +__verify_dsk_chunk(WT_SESSION_IMPL *session, + const char *addr, const WT_PAGE_HEADER *dsk, uint32_t datalen) +{ + WT_BTREE *btree; + uint8_t *p, *end; + + btree = S2BT(session); + end = (uint8_t *)dsk + dsk->mem_size; + + /* + * Fixed-length column-store and overflow pages are simple chunks of + * data. + */ + if (datalen == 0) + WT_RET_VRFY(session, + "%s page at %s has no data", + __wt_page_type_string(dsk->type), addr); + + /* Verify the data doesn't overflow the end of the page. */ + p = WT_PAGE_HEADER_BYTE(btree, dsk); + if (p + datalen > end) + WT_RET_VRFY(session, + "data on page at %s extends past the end of the page", + addr); + + /* Any bytes after the data chunk should be nul bytes. */ + for (p += datalen; p < end; ++p) + if (*p != '\0') + WT_RET_VRFY(session, + "%s page at %s has non-zero trailing bytes", + __wt_page_type_string(dsk->type), addr); + + return (0); +} + +/* + * __err_cell_corrupted -- + * Generic corrupted cell, we couldn't read it. + */ +static int +__err_cell_corrupted( + WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr) +{ + WT_RET_VRFY(session, + "item %" PRIu32 " on page at %s is a corrupted cell", + entry_num, addr); +} + +/* + * __err_cell_type -- + * Generic illegal cell type for a particular page type error. 
+ */ +static int +__err_cell_type(WT_SESSION_IMPL *session, + uint32_t entry_num, const char *addr, uint8_t cell_type, uint8_t dsk_type) +{ + switch (cell_type) { + case WT_CELL_ADDR_DEL: + case WT_CELL_ADDR_INT: + case WT_CELL_ADDR_LEAF: + case WT_CELL_ADDR_LEAF_NO: + if (dsk_type == WT_PAGE_COL_INT || + dsk_type == WT_PAGE_ROW_INT) + return (0); + break; + case WT_CELL_DEL: + if (dsk_type == WT_PAGE_COL_VAR) + return (0); + break; + case WT_CELL_KEY: + case WT_CELL_KEY_OVFL: + case WT_CELL_KEY_SHORT: + if (dsk_type == WT_PAGE_ROW_INT || + dsk_type == WT_PAGE_ROW_LEAF) + return (0); + break; + case WT_CELL_KEY_PFX: + case WT_CELL_KEY_SHORT_PFX: + if (dsk_type == WT_PAGE_ROW_LEAF) + return (0); + break; + case WT_CELL_KEY_OVFL_RM: + case WT_CELL_VALUE_OVFL_RM: + /* + * Removed overflow cells are in-memory only, it's an error to + * ever see one on a disk page. + */ + break; + case WT_CELL_VALUE: + case WT_CELL_VALUE_COPY: + case WT_CELL_VALUE_OVFL: + case WT_CELL_VALUE_SHORT: + if (dsk_type == WT_PAGE_COL_VAR || + dsk_type == WT_PAGE_ROW_LEAF) + return (0); + break; + default: + break; + } + + WT_RET_VRFY(session, + "illegal cell and page type combination: cell %" PRIu32 + " on page at %s is a %s cell on a %s page", + entry_num, addr, + __wt_cell_type_string(cell_type), __wt_page_type_string(dsk_type)); +} + +/* + * __err_eof -- + * Generic item references non-existent file pages error. + */ +static int +__err_eof(WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr) +{ + WT_RET_VRFY(session, + "off-page item %" PRIu32 + " on page at %s references non-existent file pages", + entry_num, addr); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c new file mode 100644 index 00000000000..ef35d215ec0 --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -0,0 +1,285 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. 
+ * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_tree_walk -- + * Move to the next/previous page in the tree. + */ +int +__wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + WT_REF *couple, *ref; + WT_TXN_STATE *txn_state; + int descending, prev, skip; + uint32_t slot; + + btree = S2BT(session); + descending = 0; + + /* + * Tree walks are special: they look inside page structures that splits + * may want to free. Publish that the tree is active during this + * window. + */ + WT_ENTER_PAGE_INDEX(session); + + /* + * !!! + * Fast-truncate currently only works on row-store trees. + */ + if (btree->type != BTREE_ROW) + LF_CLR(WT_READ_TRUNCATE); + + prev = LF_ISSET(WT_READ_PREV) ? 1 : 0; + + /* + * Pin a transaction ID, required to safely look at page index + * structures, if our caller has not already done so. + */ + txn_state = WT_SESSION_TXN_STATE(session); + if (txn_state->snap_min == WT_TXN_NONE) + txn_state->snap_min = S2C(session)->txn_global.last_running; + else + txn_state = NULL; + + /* + * There are multiple reasons and approaches to walking the in-memory + * tree: + * + * (1) finding pages to evict (the eviction server); + * (2) writing just dirty leaves or internal nodes (checkpoint); + * (3) discarding pages (close); + * (4) truncating pages in a range (fast truncate); + * (5) skipping pages based on outside information (compaction); + * (6) cursor scans (applications). + * + * Except for cursor scans and compaction, the walk is limited to the + * cache, no pages are read. In all cases, hazard pointers protect the + * walked pages from eviction. + * + * Walks use hazard-pointer coupling through the tree and that's OK + * (hazard pointers can't deadlock, so there's none of the usual + * problems found when logically locking up a btree). 
If the eviction + * thread tries to evict the active page, it fails because of our + * hazard pointer. If eviction tries to evict our parent, that fails + * because the parent has a child page that can't be discarded. We do + * play one game: don't couple up to our parent and then back down to a + * new leaf, couple to the next page to which we're descending, it + * saves a hazard-pointer swap for each cursor page movement. + * + * !!! + * NOTE: we depend on the fact it's OK to release a page we don't hold, + * that is, it's OK to release couple when couple is set to NULL. + * + * Take a copy of any held page and clear the return value. Remember + * the hazard pointer we're currently holding. + * + * We may be passed a pointer to btree->evict_page that we are clearing + * here. We check when discarding pages that we're not discarding that + * page, so this clear must be done before the page is released. + */ + couple = ref = *refp; + *refp = NULL; + + /* If no page is active, begin a walk from the start of the tree. */ + if (ref == NULL) { + ref = &btree->root; + if (ref->page == NULL) { + if (txn_state != NULL) + txn_state->snap_min = WT_TXN_NONE; + goto done; + } + goto descend; + } + +ascend: /* + * If the active page was the root, we've reached the walk's end. + * Release any hazard-pointer we're holding. + */ + if (__wt_ref_is_root(ref)) { + WT_ERR(__wt_page_release(session, couple, flags)); + goto done; + } + + /* Figure out the current slot in the WT_REF array. */ + __wt_page_refp(session, ref, &pindex, &slot); + + if (0) { +restart: /* + * The page we're moving to might have split, in which case find + * the last position we held. + * + * If we were starting a tree walk, begin again. + * + * If we were in the process of descending, repeat the descent. + * If we were moving within a single level of the tree, repeat + * the last move. 
+ */ + ref = couple; + if (ref == &btree->root) { + ref = &btree->root; + if (ref->page == NULL) { + if (txn_state != NULL) + txn_state->snap_min = WT_TXN_NONE; + goto done; + } + goto descend; + } + __wt_page_refp(session, ref, &pindex, &slot); + if (descending) + goto descend; + } + + for (;;) { + /* + * If we're at the last/first slot on the page, return this page + * in post-order traversal. Otherwise we move to the next/prev + * slot and left/right-most element in its subtree. + */ + if ((prev && slot == 0) || + (!prev && slot == pindex->entries - 1)) { + ref = ref->home->pg_intl_parent_ref; + + /* Optionally skip internal pages. */ + if (LF_ISSET(WT_READ_SKIP_INTL)) + goto ascend; + + /* + * We've ascended the tree and are returning an internal + * page. If it's the root, discard our hazard pointer, + * otherwise, swap our hazard pointer for the page we'll + * return. + */ + if (__wt_ref_is_root(ref)) + WT_ERR(__wt_page_release( + session, couple, flags)); + else { + /* + * Locate the reference to our parent page then + * swap our child hazard pointer for the parent. + * We don't handle a restart return because it + * would require additional complexity in the + * restart code (ascent code somewhat like the + * descent code already there), and it's not a + * possible return: we're moving to the parent + * of the current child, not another child of + * the same parent, there's no way our parent + * split. + */ + __wt_page_refp(session, ref, &pindex, &slot); + if ((ret = __wt_page_swap( + session, couple, ref, flags)) != 0) { + WT_TRET(__wt_page_release( + session, couple, flags)); + WT_ERR(ret); + } + } + + *refp = ref; + goto done; + } + + if (prev) + --slot; + else + ++slot; + + for (descending = 0;;) { + ref = pindex->index[slot]; + + if (LF_ISSET(WT_READ_CACHE)) { + /* + * Only look at unlocked pages in memory: + * fast-path some common cases. 
+ */ + if (LF_ISSET(WT_READ_NO_WAIT) && + ref->state != WT_REF_MEM) + break; + } else if (LF_ISSET(WT_READ_TRUNCATE)) { + /* + * If deleting a range, try to delete the page + * without instantiating it. + */ + WT_ERR(__wt_delete_page(session, ref, &skip)); + if (skip) + break; + } else if (LF_ISSET(WT_READ_COMPACT)) { + /* + * Skip deleted pages, rewriting them doesn't + * seem useful. + */ + if (ref->state == WT_REF_DELETED) + break; + + /* + * If the page is in-memory, we want to look at + * it (it may have been modified and written, + * and the current location is the interesting + * one in terms of compaction, not the original + * location). If the page isn't in-memory, test + * if the page will help with compaction, don't + * read it if we don't have to. + */ + if (ref->state == WT_REF_DISK) { + WT_ERR(__wt_compact_page_skip( + session, ref, &skip)); + if (skip) + break; + } + } else { + /* + * If iterating a cursor, try to skip deleted + * pages that are visible to us. + */ + if (ref->state == WT_REF_DELETED && + __wt_delete_page_skip(session, ref)) + break; + } + + ret = __wt_page_swap(session, couple, ref, flags); + if (ret == WT_NOTFOUND) { + ret = 0; + break; + } + if (ret == WT_RESTART) + goto restart; + WT_ERR(ret); + + /* + * Entering a new page: configure for traversal of any + * internal page's children, else return (or optionally + * skip), the leaf page. + */ +descend: couple = ref; + page = ref->page; + if (page->type == WT_PAGE_ROW_INT || + page->type == WT_PAGE_COL_INT) { + pindex = WT_INTL_INDEX_COPY(page); + slot = prev ? 
		    pindex->entries - 1 : 0;
			descending = 1;
		} else if (LF_ISSET(WT_READ_SKIP_LEAF))
			goto ascend;
		else {
			*refp = ref;
			goto done;
		}
		}
	}
	}

done:
err:	/* Release the pinned oldest-transaction marker, if we set one. */
	if (txn_state != NULL)
		txn_state->snap_min = WT_TXN_NONE;

	WT_LEAVE_PAGE_INDEX(session);
	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c new file mode 100644 index 00000000000..3a4a2a2987d --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -0,0 +1,223 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int __col_insert_alloc(
    WT_SESSION_IMPL *, uint64_t, u_int, WT_INSERT **, size_t *);

/*
 * __wt_col_modify --
 *	Column-store delete, insert, and update.
 *
 * recno 0 means "append, allocate a new record number".  A non-NULL upd
 * means the caller supplies a pre-built update (used when restoring
 * updates that could not be evicted); otherwise one is allocated from
 * value.  is_remove turns the operation into a delete.
 */
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
    uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_INSERT *ins;
	WT_INSERT_HEAD *ins_head, **ins_headp;
	WT_ITEM _value;
	WT_PAGE *page;
	WT_UPDATE *old_upd;
	size_t ins_size, upd_size;
	u_int i, skipdepth;
	int append, logged;

	btree = cbt->btree;
	ins = NULL;
	page = cbt->ref->page;
	append = logged = 0;

	/*
	 * This code expects a remove to have a NULL value.  Fixed-length
	 * column stores have no out-of-band "deleted" value, a remove is
	 * stored as a single zero byte.
	 */
	if (is_remove) {
		if (btree->type == BTREE_COL_FIX) {
			value = &_value;
			value->data = "";
			value->size = 1;
		} else
			value = NULL;
	} else {
		/*
		 * There's some chance the application specified a record past
		 * the last record on the page.  If that's the case, and we're
		 * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
		 * append list, not the update list.  In addition, a recno of
		 * 0 implies an append operation, we're allocating a new row.
		 */
		if (recno == 0 ||
		    recno > (btree->type == BTREE_COL_VAR ?
		    __col_var_last_recno(page) : __col_fix_last_recno(page)))
			append = 1;
	}

	/* If we don't yet have a modify structure, we'll need one. */
	WT_RET(__wt_page_modify_init(session, page));

	/*
	 * Delete, insert or update a column-store entry.
	 *
	 * If modifying a previously modified record, create a new WT_UPDATE
	 * entry and have a serialized function link it into an existing
	 * WT_INSERT entry's WT_UPDATE list.
	 *
	 * Else, allocate an insert array as necessary, build a WT_INSERT and
	 * WT_UPDATE structure pair, and call a serialized function to insert
	 * the WT_INSERT structure.
	 */
	if (cbt->compare == 0 && cbt->ins != NULL) {
		/*
		 * If we are restoring updates that couldn't be evicted, the
		 * key must not exist on the new page.
		 */
		WT_ASSERT(session, upd == NULL);

		/* Make sure the update can proceed (no write conflict). */
		WT_ERR(__wt_txn_update_check(
		    session, old_upd = cbt->ins->upd));

		/* Allocate a WT_UPDATE structure and transaction ID. */
		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
		WT_ERR(__wt_txn_modify(session, upd));
		logged = 1;

		/* Avoid a data copy in WT_CURSOR.update. */
		cbt->modify_update = upd;

		/*
		 * Point the new WT_UPDATE item to the next element in the list.
		 * If we get it right, the serialization function lock acts as
		 * our memory barrier to flush this write.
		 */
		upd->next = old_upd;

		/* Serialize the update. */
		WT_ERR(__wt_update_serial(
		    session, page, &cbt->ins->upd, &upd, upd_size));
	} else {
		/* Allocate the append/update list reference as necessary. */
		if (append) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, page->modify->mod_append, ins_headp, 1);
			ins_headp = &page->modify->mod_append[0];
		} else if (page->type == WT_PAGE_COL_FIX) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, page->modify->mod_update, ins_headp, 1);
			ins_headp = &page->modify->mod_update[0];
		} else {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, page->modify->mod_update, ins_headp,
			    page->pg_var_entries);
			ins_headp = &page->modify->mod_update[cbt->slot];
		}

		/* Allocate the WT_INSERT_HEAD structure as necessary. */
		WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
		ins_head = *ins_headp;

		/* Choose a skiplist depth for this insert. */
		skipdepth = __wt_skip_choose_depth(session);

		/*
		 * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
		 * update the cursor to reference it (the WT_INSERT_HEAD might
		 * be allocated, the WT_INSERT was allocated).
		 */
		WT_ERR(__col_insert_alloc(
		    session, recno, skipdepth, &ins, &ins_size));
		cbt->ins_head = ins_head;
		cbt->ins = ins;

		if (upd == NULL) {
			WT_ERR(
			    __wt_update_alloc(session, value, &upd, &upd_size));
			WT_ERR(__wt_txn_modify(session, upd));
			logged = 1;

			/* Avoid a data copy in WT_CURSOR.update. */
			cbt->modify_update = upd;
		} else
			upd_size = sizeof(WT_UPDATE) + upd->size;
		ins->upd = upd;
		ins_size += upd_size;

		/*
		 * If there was no insert list during the search, or there was
		 * no search because the record number has not been allocated
		 * yet, the cursor's information cannot be correct, search
		 * couldn't have initialized it.
		 *
		 * Otherwise, point the new WT_INSERT item's skiplist to the
		 * next elements in the insert list (which we will check are
		 * still valid inside the serialization function).
		 *
		 * The serial mutex acts as our memory barrier to flush these
		 * writes before inserting them into the list.
		 */
		if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0)
			for (i = 0; i < skipdepth; i++) {
				cbt->ins_stack[i] = &ins_head->head[i];
				ins->next[i] = cbt->next_stack[i] = NULL;
			}
		else
			for (i = 0; i < skipdepth; i++)
				ins->next[i] = cbt->next_stack[i];

		/* Append or insert the WT_INSERT structure. */
		if (append)
			WT_ERR(__wt_col_append_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, &cbt->recno, skipdepth));
		else
			WT_ERR(__wt_insert_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, skipdepth));
	}

	/* If the update was successful, add it to the in-memory log. */
	if (logged)
		WT_ERR(__wt_txn_log_op(session, cbt));

	if (0) {
err:		/*
		 * Remove the update from the current transaction, so we don't
		 * try to modify it on rollback.
		 */
		if (logged)
			__wt_txn_unmodify(session);
		__wt_free(session, ins);
		__wt_free(session, upd);
	}

	return (ret);
}

/*
 * __col_insert_alloc --
 *	Column-store insert: allocate a WT_INSERT structure and fill it in.
 *
 * The skiplist next pointers are allocated contiguously after the
 * structure; the total allocation size is returned through ins_sizep so
 * callers can account for cache memory.
 */
static int
__col_insert_alloc(WT_SESSION_IMPL *session,
    uint64_t recno, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
{
	WT_INSERT *ins;
	size_t ins_size;

	/*
	 * Allocate the WT_INSERT structure and skiplist pointers, then copy
	 * the record number into place.
	 */
	ins_size = sizeof(WT_INSERT) + skipdepth * sizeof(WT_INSERT *);
	WT_RET(__wt_calloc(session, 1, ins_size, &ins));

	WT_INSERT_RECNO(ins) = recno;

	*insp = ins;
	*ins_sizep = ins_size;
	return (0);
}
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c new file mode 100644 index 00000000000..e4083e2282f --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -0,0 +1,199 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_col_search --
 *	Search a column-store tree for a specific record-based key.
 *
 * On return, cbt describes the position: compare == 0 on an exact match,
 * -1/1 when positioned before/after recno.  If leaf is non-NULL only that
 * single page is searched (used by eviction splits).
 */
int
__wt_col_search(WT_SESSION_IMPL *session,
    uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_COL *cip;
	WT_DECL_RET;
	WT_INSERT *ins;
	WT_INSERT_HEAD *ins_head;
	WT_PAGE *page;
	WT_PAGE_INDEX *pindex;
	WT_REF *current, *descent;
	uint32_t base, indx, limit;
	int depth;

	btree = S2BT(session);

	__cursor_pos_clear(cbt);

	/*
	 * In the service of eviction splits, we're only searching a single leaf
	 * page, not a full tree.
	 */
	if (leaf != NULL) {
		current = leaf;
		goto leaf_only;
	}

	/* Search the internal pages of the tree. */
	current = &btree->root;
	for (depth = 2;; ++depth) {
restart:	page = current->page;
		if (page->type != WT_PAGE_COL_INT)
			break;

		WT_ASSERT(session, current->key.recno == page->pg_intl_recno);

		pindex = WT_INTL_INDEX_COPY(page);
		base = pindex->entries;
		descent = pindex->index[base - 1];

		/* Fast path appends: past the last child's starting recno. */
		if (recno >= descent->key.recno)
			goto descend;

		/* Binary search of internal pages. */
		for (base = 0,
		    limit = pindex->entries - 1; limit != 0; limit >>= 1) {
			indx = base + (limit >> 1);
			descent = pindex->index[indx];

			if (recno == descent->key.recno)
				break;
			if (recno < descent->key.recno)
				continue;
			base = indx + 1;
			--limit;
		}
descend:	/*
		 * Reference the slot used for next step down the tree.
		 *
		 * Base is the smallest index greater than recno and may be the
		 * (last + 1) index.  The slot for descent is the one before
		 * base.
		 */
		if (recno != descent->key.recno) {
			/*
			 * We don't have to correct for base == 0 because the
			 * only way for base to be 0 is if recno is the page's
			 * starting recno.
			 */
			WT_ASSERT(session, base > 0);
			descent = pindex->index[base - 1];
		}

		/*
		 * Swap the current page for the child page.  If the page splits
		 * while we're retrieving it, restart the search in the current
		 * page; otherwise return on error, the swap call ensures we're
		 * holding nothing on failure.
		 */
		switch (ret = __wt_page_swap(session, current, descent, 0)) {
		case 0:
			current = descent;
			break;
		case WT_RESTART:
			goto restart;
		default:
			return (ret);
		}
	}

	/* Track how deep the tree gets. */
	if (depth > btree->maximum_depth)
		btree->maximum_depth = depth;

leaf_only:
	page = current->page;
	cbt->ref = current;
	cbt->recno = recno;
	cbt->compare = 0;

	/*
	 * Set the on-page slot to an impossible value larger than any possible
	 * slot (it's used to interpret the search function's return after the
	 * search returns an insert list for a page that has no entries).
	 */
	cbt->slot = UINT32_MAX;

	/*
	 * Search the leaf page.  We do not check in the search path for a
	 * record greater than the maximum record in the tree; in that case,
	 * we arrive here with a record that's impossibly large for the page.
	 */
	if (page->type == WT_PAGE_COL_FIX) {
		if (recno >= page->pg_fix_recno + page->pg_fix_entries) {
			cbt->recno = page->pg_fix_recno + page->pg_fix_entries;
			goto past_end;
		} else
			ins_head = WT_COL_UPDATE_SINGLE(page);
	} else
		if ((cip = __col_var_search(page, recno)) == NULL) {
			cbt->recno = __col_var_last_recno(page);
			goto past_end;
		} else {
			cbt->slot = WT_COL_SLOT(page, cip);
			ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		}

	/*
	 * We have a match on the page, check for an update.  Check the page's
	 * update list (fixed-length), or slot's update list (variable-length)
	 * for a better match.  The only better match we can find is an exact
	 * match, otherwise the existing match on the page is the one we want.
	 * For that reason, don't set the cursor's WT_INSERT_HEAD/WT_INSERT pair
	 * until we know we have a useful entry.
	 */
	if ((ins = __col_insert_search(
	    ins_head, cbt->ins_stack, cbt->next_stack, recno)) != NULL)
		if (recno == WT_INSERT_RECNO(ins)) {
			cbt->ins_head = ins_head;
			cbt->ins = ins;
		}
	return (0);

past_end:
	/*
	 * A record past the end of the page's standard information.  Check the
	 * append list; by definition, any record on the append list is closer
	 * than the last record on the page, so it's a better choice for return.
	 * This is a rarely used path: we normally find exact matches, because
	 * column-store files are dense, but in this case the caller searched
	 * past the end of the table.
	 *
	 * Don't bother searching if the caller is appending a new record where
	 * we'll allocate the record number; we're not going to find a match by
	 * definition, and we figure out the position when we do the work.
	 */
	cbt->ins_head = WT_COL_APPEND(page);
	if (recno == UINT64_MAX)
		cbt->ins = NULL;
	else
		cbt->ins = __col_insert_search(
		    cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
	if (cbt->ins == NULL)
		cbt->compare = -1;
	else {
		cbt->recno = WT_INSERT_RECNO(cbt->ins);
		if (recno == cbt->recno)
			cbt->compare = 0;
		else if (recno < cbt->recno)
			cbt->compare = 1;
		else
			cbt->compare = -1;
	}

	/*
	 * Note if the record is past the maximum record in the tree, the cursor
	 * search functions need to know for fixed-length column-stores because
	 * appended records implicitly create any skipped records, and cursor
	 * search functions have to handle that case.
	 */
	if (cbt->compare == -1)
		F_SET(cbt, WT_CBT_MAX_RECORD);
	return (0);
}
diff --git a/src/third_party/wiredtiger/src/btree/rec_evict.c b/src/third_party/wiredtiger/src/btree/rec_evict.c new file mode 100644 index 00000000000..4696e78059e --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/rec_evict.c @@ -0,0 +1,468 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int  __hazard_exclusive(WT_SESSION_IMPL *, WT_REF *, int);
static void __rec_discard_tree(WT_SESSION_IMPL *, WT_REF *, int, int);
static void __rec_excl_clear(WT_SESSION_IMPL *);
static void __rec_page_clean_update(WT_SESSION_IMPL *, WT_REF *);
static int  __rec_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, int);
static int  __rec_review(WT_SESSION_IMPL *, WT_REF *, int, int, int *);

/*
 * __wt_rec_evict --
 *	Reconciliation plus eviction.
 *
 * exclusive is non-zero when the caller holds the whole tree locked
 * (e.g. file close); in that case no per-page locking is required.
 */
int
__wt_rec_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_TXN_STATE *txn_state;
	int istree;

	page = ref->page;
	istree = 0;

	WT_RET(__wt_verbose(session, WT_VERB_EVICT,
	    "page %p (%s)", page, __wt_page_type_string(page->type)));

	/*
	 * Pin the oldest transaction ID: eviction looks at page structures
	 * that are freed when no transaction in the system needs them.
	 * txn_state is left NULL when someone (possibly us) already pinned
	 * it, so we don't clear a pin we don't own on the way out.
	 */
	txn_state = WT_SESSION_TXN_STATE(session);
	if (txn_state->snap_min == WT_TXN_NONE)
		txn_state->snap_min = S2C(session)->txn_global.oldest_id;
	else
		txn_state = NULL;

	/*
	 * Get exclusive access to the page and review the page and its subtree
	 * for conditions that would block our eviction of the page.  If the
	 * check fails (for example, we find a child page that can't be merged),
	 * we're done.  We have to make this check for clean pages, too: while
	 * unlikely eviction would choose an internal page with children, it's
	 * not disallowed anywhere.
	 */
	WT_ERR(__rec_review(session, ref, exclusive, 1, &istree));

	/*
	 * Update the page's modification reference, reconciliation might have
	 * changed it.
	 */
	mod = page->modify;

	/* Count evictions of internal pages during normal operation. */
	if (!exclusive &&
	    (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) {
		WT_STAT_FAST_CONN_INCR(session, cache_eviction_internal);
		WT_STAT_FAST_DATA_INCR(session, cache_eviction_internal);
	}

	/* Discard any subtree rooted in this page. */
	if (istree)
		__rec_discard_tree(session, ref, exclusive, 1);

	/* Update the reference and discard the page. */
	if (mod == NULL || !F_ISSET(mod, WT_PM_REC_MASK)) {
		WT_ASSERT(session, exclusive || ref->state == WT_REF_LOCKED);

		if (__wt_ref_is_root(ref))
			__wt_ref_out(session, ref);
		else
			__rec_page_clean_update(session, ref);

		WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean);
		WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean);
	} else {
		if (__wt_ref_is_root(ref))
			__wt_ref_out(session, ref);
		else
			WT_ERR(
			    __rec_page_dirty_update(session, ref, exclusive));

		WT_STAT_FAST_CONN_INCR(session, cache_eviction_dirty);
		WT_STAT_FAST_DATA_INCR(session, cache_eviction_dirty);
	}

	if (0) {
err:		/*
		 * If unable to evict this page, release exclusive reference(s)
		 * we've acquired.
		 */
		if (!exclusive)
			__rec_excl_clear(session);

		WT_STAT_FAST_CONN_INCR(session, cache_eviction_fail);
		WT_STAT_FAST_DATA_INCR(session, cache_eviction_fail);
	}
	session->excl_next = 0;

	if (txn_state != NULL)
		txn_state->snap_min = WT_TXN_NONE;

	return (ret);
}

/*
 * __rec_page_clean_update --
 *	Update a clean page's reference on eviction.
 */
static void
__rec_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref)
{
	/*
	 * Discard the page and update the reference structure; if the page has
	 * an address, it's a disk page; if it has no address, it's a deleted
	 * page re-instantiated (for example, by searching) and never written.
	 */
	__wt_ref_out(session, ref);
	WT_PUBLISH(ref->state,
	    ref->addr == NULL ? WT_REF_DELETED : WT_REF_DISK);
}

/*
 * __rec_page_dirty_update --
 *	Update a dirty page's reference on eviction, based on how
 *	reconciliation resolved the page (empty, split or replaced).
 */
static int
__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
{
	WT_ADDR *addr;
	WT_PAGE *parent;
	WT_PAGE_MODIFY *mod;

	parent = ref->home;
	mod = ref->page->modify;

	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
	case WT_PM_REC_EMPTY:				/* Page is empty */
		if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
			__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
			__wt_free(session, ref->addr);
		}

		/*
		 * Update the parent to reference a deleted page.  The fact that
		 * reconciliation left the page "empty" means there's no older
		 * transaction in the system that might need to see an earlier
		 * version of the page.  For that reason, we clear the address
		 * of the page, if we're forced to "read" into that namespace,
		 * we'll instantiate a new page instead of trying to read from
		 * the backing store.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		__wt_ref_out(session, ref);
		ref->addr = NULL;
		WT_PUBLISH(ref->state, WT_REF_DELETED);
		break;
	case WT_PM_REC_MULTIBLOCK:			/* Multiple blocks */
		/* Split the page in memory. */
		WT_RET(__wt_split_evict(session, ref, exclusive));
		break;
	case WT_PM_REC_REPLACE:				/* 1-for-1 page swap */
		if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
			__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
			__wt_free(session, ref->addr);
		}

		/*
		 * Update the parent to reference the replacement page.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
		*addr = mod->mod_replace;
		mod->mod_replace.addr = NULL;
		mod->mod_replace.size = 0;

		__wt_ref_out(session, ref);
		ref->addr = addr;
		WT_PUBLISH(ref->state, WT_REF_DISK);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}

/*
 * __rec_discard_tree --
 *	Discard the tree rooted a page (that is, any pages merged into it),
 * then the page itself.
 */
static void
__rec_discard_tree(
    WT_SESSION_IMPL *session, WT_REF *ref, int exclusive, int top)
{
	WT_REF *child;

	switch (ref->page->type) {
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		/* For each entry in the page... */
		WT_INTL_FOREACH_BEGIN(session, ref->page, child) {
			if (child->state == WT_REF_DISK ||
			    child->state == WT_REF_DELETED)
				continue;
			WT_ASSERT(session,
			    exclusive || child->state == WT_REF_LOCKED);
			__rec_discard_tree(session, child, exclusive, 0);
		} WT_INTL_FOREACH_END;
		/* FALLTHROUGH */
	default:
		/* The top-level page is discarded by the caller. */
		if (!top)
			__wt_ref_out(session, ref);
		break;
	}
}

/*
 * __rec_review --
 *	Get exclusive access to the page and review the page and its subtree
 * for conditions that would block its eviction.  Returns EBUSY when the
 * page cannot be evicted right now; *istree is set when the page has an
 * in-memory subtree the caller must discard.
 */
static int
__rec_review(
    WT_SESSION_IMPL *session, WT_REF *ref, int exclusive, int top, int *istree)
{
	WT_BTREE *btree;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *child;
	uint32_t flags;

	btree = S2BT(session);
	page = ref->page;

	/*
	 * Get exclusive access to the page if our caller doesn't have the tree
	 * locked down.
	 */
	if (!exclusive) {
		WT_RET(__hazard_exclusive(session, ref, top));

		/*
		 * Now the page is locked, remove it from the LRU eviction
		 * queue.  We have to do this before freeing the page memory or
		 * otherwise touching the reference because eviction paths
		 * assume a non-NULL reference on the queue is pointing at
		 * valid memory.
		 */
		__wt_evict_list_clear_page(session, ref);
	}

	/*
	 * Recurse through the page's subtree: this happens first because we
	 * have to write pages in depth-first order, otherwise we'll dirty
	 * pages after we've written them.
	 */
	if (WT_PAGE_IS_INTERNAL(page))
		WT_INTL_FOREACH_BEGIN(session, page, child) {
			switch (child->state) {
			case WT_REF_DISK:		/* On-disk */
			case WT_REF_DELETED:		/* On-disk, deleted */
				break;
			case WT_REF_MEM:		/* In-memory */
				/*
				 * Tell our caller if there's a subtree so we
				 * know to do a full walk when discarding the
				 * page.
				 */
				*istree = 1;
				WT_RET(__rec_review(
				    session, child, exclusive, 0, istree));
				break;
			case WT_REF_LOCKED:		/* Being evicted */
			case WT_REF_READING:		/* Being read */
			case WT_REF_SPLIT:		/* Being split */
				return (EBUSY);
			WT_ILLEGAL_VALUE(session);
			}
		} WT_INTL_FOREACH_END;

	mod = page->modify;

	/*
	 * If the tree was deepened, there's a requirement that newly created
	 * internal pages not be evicted until all threads are known to have
	 * exited the original page index array, because evicting an internal
	 * page discards its WT_REF array, and a thread traversing the original
	 * page index array might see an freed WT_REF.  During the split we set
	 * a transaction value, once that's globally visible, we know we can
	 * evict the created page.
	 */
	if (!exclusive && mod != NULL && WT_PAGE_IS_INTERNAL(page) &&
	    !__wt_txn_visible_all(session, mod->mod_split_txn))
		return (EBUSY);

	/*
	 * If the file is being checkpointed, we can't evict dirty pages:
	 * if we write a page and free the previous version of the page, that
	 * previous version might be referenced by an internal page already
	 * been written in the checkpoint, leaving the checkpoint inconsistent.
	 *
	 * Don't rely on new updates being skipped by the transaction used
	 * for transaction reads: (1) there are paths that dirty pages for
	 * artificial reasons; (2) internal pages aren't transactional; and
	 * (3) if an update was skipped during the checkpoint (leaving the page
	 * dirty), then rolled back, we could still successfully overwrite a
	 * page and corrupt the checkpoint.
	 *
	 * Further, we can't race with the checkpoint's reconciliation of
	 * an internal page as we evict a clean child from the page's subtree.
	 * This works in the usual way: eviction locks the page and then checks
	 * for existing hazard pointers, the checkpoint thread reconciling an
	 * internal page acquires hazard pointers on child pages it reads, and
	 * is blocked by the exclusive lock.
	 */
	if (mod != NULL && btree->checkpointing &&
	    (__wt_page_is_modified(page) ||
	    F_ISSET(mod, WT_PM_REC_MULTIBLOCK))) {
		WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
		WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint);
		return (EBUSY);
	}

	/*
	 * Fail if any page in the top-level page's subtree won't be merged into
	 * its parent, the page that cannot be merged must be evicted first.
	 * The test is necessary but should not fire much: the eviction code is
	 * biased for leaf pages, an internal page shouldn't be selected for
	 * eviction until its children have been evicted.
	 *
	 * We have to write dirty pages to know their final state, a page marked
	 * empty may have had records added since reconciliation.  Writing the
	 * page is expensive, do a cheap test first: if it doesn't seem likely a
	 * subtree page can be merged, quit.
	 */
	if (!top && (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY)))
		return (EBUSY);

	/*
	 * If the page is dirty and can possibly change state, write it so we
	 * know the final state.
	 *
	 * If we have an exclusive lock (we're discarding the tree), assert
	 * there are no updates we cannot read.
	 *
	 * Otherwise, if the top-level page we're evicting is a leaf page, set
	 * the update-restore flag, so reconciliation will write blocks it can
	 * write and create a list of skipped updates for blocks it cannot
	 * write.  This is how forced eviction of huge pages works: we take a
	 * big page and reconcile it into blocks, some of which we write and
	 * discard, the rest of which we re-create as smaller in-memory pages,
	 * (restoring the updates that stopped us from writing the block), and
	 * inserting the whole mess into the page's parent.
	 *
	 * Don't set the update-restore flag for internal pages, they don't
	 * have updates that can be saved and restored.
	 *
	 * Don't set the update-restore flag for small pages.  (If a small
	 * page were selected by eviction and then modified, and we configure it
	 * for update-restore, we'll end up splitting one or two pages into the
	 * parent, which is a waste of effort.  If we don't set update-restore,
	 * eviction will return EBUSY, which makes more sense, the page was just
	 * modified.)
	 *
	 * Don't set the update-restore flag for any page other than the
	 * top one; only the reconciled top page goes through the split path
	 * (and child pages are pages we expect to merge into the top page, they
	 * they are not expected to split).
	 */
	if (__wt_page_is_modified(page)) {
		flags = WT_EVICTING;
		if (exclusive)
			LF_SET(WT_SKIP_UPDATE_ERR);
		else if (top && !WT_PAGE_IS_INTERNAL(page) &&
		    page->memory_footprint > 10 * btree->maxleafpage)
			LF_SET(WT_SKIP_UPDATE_RESTORE);
		WT_RET(__wt_rec_write(session, ref, NULL, flags));
		WT_ASSERT(session,
		    !__wt_page_is_modified(page) ||
		    LF_ISSET(WT_SKIP_UPDATE_RESTORE));
	} else {
		/*
		 * If the page was ever modified, make sure all of the updates
		 * on the page are old enough they can be discarded from cache.
		 */
		if (!exclusive && mod != NULL &&
		    !__wt_txn_visible_all(session, mod->rec_max_txn))
			return (EBUSY);
	}

	/*
	 * Repeat the test: fail if any page in the top-level page's subtree
	 * won't be merged into its parent.
	 */
	if (!top && (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY)))
		return (EBUSY);

	return (0);
}

/*
 * __rec_excl_clear --
 *	Discard exclusive access and return a page's subtree to availability.
 */
static void
__rec_excl_clear(WT_SESSION_IMPL *session)
{
	WT_REF *ref;
	uint32_t i;

	for (i = 0; i < session->excl_next; ++i) {
		if ((ref = session->excl[i]) == NULL)
			break;
		WT_ASSERT(session,
		    ref->state == WT_REF_LOCKED && ref->page != NULL);
		ref->state = WT_REF_MEM;
	}
}

/*
 * __hazard_exclusive --
 *	Request exclusive access to a page.
 *
 * Locks the WT_REF (unless it is the already-locked top-level page),
 * records it in the session's exclusive list for later release, then
 * fails with EBUSY if any other thread holds a hazard pointer to it.
 */
static int
__hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
{
	/*
	 * Make sure there is space to track exclusive access so we can unlock
	 * to clean up.
	 */
	WT_RET(__wt_realloc_def(session, &session->excl_allocated,
	    session->excl_next + 1, &session->excl));

	/*
	 * Request exclusive access to the page.  The top-level page should
	 * already be in the locked state, lock child pages in memory.
	 * If another thread already has this page, give up.
	 */
	if (!top && !WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED))
		return (EBUSY);	/* We couldn't change the state. */
	WT_ASSERT(session, ref->state == WT_REF_LOCKED);

	session->excl[session->excl_next++] = ref;

	/* Check for a matching hazard pointer. */
	if (__wt_page_hazard_check(session, ref->page) == NULL)
		return (0);

	WT_STAT_FAST_DATA_INCR(session, cache_eviction_hazard);
	WT_STAT_FAST_CONN_INCR(session, cache_eviction_hazard);

	WT_RET(__wt_verbose(session, WT_VERB_EVICT,
	    "page %p hazard request failed", ref->page));
	return (EBUSY);
}
diff --git a/src/third_party/wiredtiger/src/btree/rec_split.c b/src/third_party/wiredtiger/src/btree/rec_split.c new file mode 100644 index 00000000000..babec2cc295 --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/rec_split.c @@ -0,0 +1,1121 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * Tuning; global variables to allow the binary to be patched, we don't yet have
 * any real understanding of what might be useful to surface to applications.
 */
static u_int __split_deepen_max_internal_image = 100;
static u_int __split_deepen_min_child = 10;
static u_int __split_deepen_per_child = 100;
static u_int __split_deepen_split_child = 100;

/*
 * Track allocation increments, matching the cache calculations, which add an
 * estimate of allocation overhead to every object.
 */
#define	WT_MEMSIZE_ADD(total, len) do {					\
	total += (len) + WT_ALLOC_OVERHEAD;				\
} while (0)
#define	WT_MEMSIZE_TRANSFER(from_decr, to_incr, len) do {		\
	WT_MEMSIZE_ADD(from_decr, len);					\
	WT_MEMSIZE_ADD(to_incr, len);					\
} while (0)

/*
 * __split_oldest_gen --
 *	Calculate the oldest active split generation.
 */
static uint64_t
__split_oldest_gen(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_SESSION_IMPL *s;
	uint64_t gen, oldest;
	u_int i, session_cnt;

	conn = S2C(session);
	WT_ORDERED_READ(session_cnt, conn->session_cnt);
	/* A zero split_gen means the session is not in a split-safe region. */
	for (i = 0, s = conn->sessions, oldest = conn->split_gen + 1;
	    i < session_cnt;
	    i++, s++)
		if (((gen = s->split_gen) != 0) && gen < oldest)
			oldest = gen;

	return (oldest);
}

/*
 * __split_stash_add --
 *	Add a new entry into the session's split stash list.
 *
 * The memory at p cannot be freed immediately because concurrent readers
 * may still reference it; it is stamped with the current split generation
 * and freed once all threads have moved past that generation.
 */
static int
__split_stash_add(WT_SESSION_IMPL *session, void *p, size_t len)
{
	WT_SPLIT_STASH *stash;

	WT_ASSERT(session, p != NULL);

	/* Grow the list as necessary. */
	WT_RET(__wt_realloc_def(session, &session->split_stash_alloc,
	    session->split_stash_cnt + 1, &session->split_stash));

	stash = session->split_stash + session->split_stash_cnt++;
	stash->split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1);
	stash->p = p;
	stash->len = len;

	WT_STAT_FAST_CONN_ATOMIC_INCRV(session, rec_split_stashed_bytes, len);
	WT_STAT_FAST_CONN_ATOMIC_INCR(session, rec_split_stashed_objects);

	/* See if we can free any previous entries. */
	if (session->split_stash_cnt > 1)
		__wt_split_stash_discard(session);

	return (0);
}

/*
 * __wt_split_stash_discard --
 *	Discard any memory from a session's split stash that we can.
 */
void
__wt_split_stash_discard(WT_SESSION_IMPL *session)
{
	WT_SPLIT_STASH *stash;
	uint64_t oldest;
	size_t i;

	/* Get the oldest split generation. */
	oldest = __split_oldest_gen(session);

	for (i = 0, stash = session->split_stash;
	    i < session->split_stash_cnt;
	    ++i, ++stash) {
		if (stash->p == NULL)
			continue;
		else if (stash->split_gen >= oldest)
			break;
		/*
		 * It's a bad thing if another thread is in this memory after
		 * we free it, make sure nothing good happens to that thread.
		 */
		WT_STAT_FAST_CONN_ATOMIC_DECRV(
		    session, rec_split_stashed_bytes, stash->len);
		WT_STAT_FAST_CONN_ATOMIC_DECR(
		    session, rec_split_stashed_objects);
		__wt_overwrite_and_free_len(session, stash->p, stash->len);
	}

	/*
	 * If there are enough free slots at the beginning of the list, shuffle
	 * everything down.
	 */
	if (i > 100 || i == session->split_stash_cnt)
		if ((session->split_stash_cnt -= i) > 0)
			memmove(session->split_stash, stash,
			    session->split_stash_cnt * sizeof(*stash));
}

/*
 * __wt_split_stash_discard_all --
 *	Discard all memory from a session's split stash.
 */
void
__wt_split_stash_discard_all(
    WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session)
{
	WT_SPLIT_STASH *stash;
	size_t i;

	/*
	 * This function is called during WT_CONNECTION.close to discard any
	 * memory that remains.  For that reason, we take two WT_SESSION_IMPL
	 * arguments: session_safe is still linked to the WT_CONNECTION and
	 * can be safely used for calls to other WiredTiger functions, while
	 * session is the WT_SESSION_IMPL we're cleaning up.
	 */
	for (i = 0, stash = session->split_stash;
	    i < session->split_stash_cnt;
	    ++i, ++stash)
		if (stash->p != NULL)
			__wt_free(session_safe, stash->p);

	__wt_free(session_safe, session->split_stash);
	session->split_stash_cnt = session->split_stash_alloc = 0;
}

/*
 * __split_safe_free --
 *	Free a buffer if we can be sure no thread is accessing it, or schedule
 * it to be freed otherwise.
 */
static int
__split_safe_free(WT_SESSION_IMPL *session, int exclusive, void *p, size_t s)
{
	/*
	 * We have swapped something in a page: if we don't have exclusive
	 * access, check whether there are other threads in the same tree.
	 */
	if (!exclusive &&
	    __split_oldest_gen(session) == S2C(session)->split_gen + 1)
		exclusive = 1;

	if (exclusive) {
		__wt_free(session, p);
		return (0);
	}

	return (__split_stash_add(session, p, s));
}

/*
 * __split_should_deepen --
 *	Return if we should deepen the tree.
 */
static int
__split_should_deepen(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_PAGE_INDEX *pindex;

	/*
	 * Splits are based on either the number of child pages that will be
	 * created by the split (splitting an internal page that will be slow
	 * to search), or by the memory footprint of the parent page (avoiding
	 * an internal page that will eat up all of the cache and put eviction
	 * pressure on the system).
	 */
	pindex = WT_INTL_INDEX_COPY(page);

	/*
	 * Deepen the tree if the page's memory footprint is larger than the
	 * maximum size for a page in memory.  We need an absolute minimum
	 * number of entries in order to split the page: if there is a single
	 * huge key, splitting won't help.
	 */
	if (page->memory_footprint > S2BT(session)->maxmempage &&
	    pindex->entries >= __split_deepen_min_child)
		return (1);

	/*
	 * Deepen the tree if the page's memory footprint is at least N
	 * times the maximum internal page size chunk in the backing file and
	 * the split will result in at least N children in the newly created
	 * intermediate layer.
	 */
	if (page->memory_footprint >
	    __split_deepen_max_internal_image * S2BT(session)->maxintlpage &&
	    pindex->entries >=
	    (__split_deepen_per_child * __split_deepen_split_child))
		return (1);

	return (0);
}

/*
 * __split_ovfl_key_cleanup --
 *	Handle cleanup for on-page row-store overflow keys.
 */
static int
__split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref)
{
	WT_CELL *cell;
	WT_CELL_UNPACK kpack;
	WT_IKEY *ikey;
	uint32_t cell_offset;

	/*
	 * A key being discarded (page split) or moved to a different page (page
	 * deepening) may be an on-page overflow key.  Clear any reference to an
	 * underlying disk image, and, if the key hasn't been deleted, delete it
	 * along with any backing blocks.
	 */
	if ((ikey = __wt_ref_key_instantiated(ref)) == NULL)
		return (0);
	if ((cell_offset = ikey->cell_offset) == 0)
		return (0);

	/* Leak blocks rather than try this twice. */
	ikey->cell_offset = 0;

	cell = WT_PAGE_REF_OFFSET(page, cell_offset);
	__wt_cell_unpack(cell, &kpack);
	if (kpack.ovfl && kpack.raw != WT_CELL_KEY_OVFL_RM)
		WT_RET(__wt_ovfl_discard(session, cell));

	return (0);
}

/*
 * __split_ref_instantiate --
 *	Instantiate key/address pairs in memory in service of a split.
 *
 * parent_decrp/child_incrp accumulate the memory-footprint bytes moving
 * from the page being split to its new children.
 */
static int
__split_ref_instantiate(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
{
	WT_ADDR *addr;
	WT_CELL_UNPACK unpack;
	WT_DECL_RET;
	WT_IKEY *ikey;
	size_t size;
	void *key;

	/*
	 * Instantiate row-store keys, and column- and row-store addresses in
	 * the WT_REF structures referenced by a page that's being split (and
	 * deepening the tree).  The WT_REF structures aren't moving, but the
	 * index references are moving from the page we're splitting to a set
	 * of child pages, and so we can no longer reference the block image
	 * that remains with the page being split.
	 *
	 * Track how much memory the parent is losing and the child gaining.
	 *
	 * No locking is required to update the WT_REF structure because we're
	 * the only thread splitting the parent page, and there's no way for
	 * readers to race with our updates of single pointers.  The changes
	 * have to be written before the page goes away, of course, our caller
	 * owns that problem.
	 *
	 * Row-store keys, first.
	 */
	if (page->type == WT_PAGE_ROW_INT) {
		if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) {
			__wt_ref_key(page, ref, &key, &size);
			WT_RET(__wt_row_ikey(session, 0, key, size, &ikey));
			ref->key.ikey = ikey;
		} else {
			WT_RET(__split_ovfl_key_cleanup(session, page, ref));
			WT_MEMSIZE_ADD(*parent_decrp,
			    sizeof(WT_IKEY) + ikey->size);
		}
		WT_MEMSIZE_ADD(*child_incrp, sizeof(WT_IKEY) + ikey->size);
	}

	/*
	 * If there's no address (the page has never been written), or the
	 * address has been instantiated, there's no work to do.  Otherwise,
	 * get the address from the on-page cell.
	 */
	if ((addr = ref->addr) == NULL)
		return (0);
	if (__wt_off_page(page, addr))
		WT_MEMSIZE_TRANSFER(*parent_decrp, *child_incrp,
		    sizeof(WT_ADDR) + addr->size);
	else {
		__wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
		WT_RET(__wt_calloc_def(session, 1, &addr));
		if ((ret = __wt_strndup(
		    session, unpack.data, unpack.size, &addr->addr)) != 0) {
			__wt_free(session, addr);
			return (ret);
		}
		addr->size = (uint8_t)unpack.size;
		addr->type =
		    unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF;
		ref->addr = addr;
		WT_MEMSIZE_ADD(*child_incrp, sizeof(WT_ADDR) + addr->size);
	}
	return (0);
}

#ifdef HAVE_DIAGNOSTIC
/*
 * __split_verify_intl_key_order --
 *	Verify the key order on an internal page after a split, diagnostic only.
+ */ +static void +__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_ITEM *next, _next, *last, _last, *tmp; + WT_REF *ref; + uint64_t recno; + int cmp, first; + + btree = S2BT(session); + + switch (page->type) { + case WT_PAGE_COL_INT: + recno = 0; + WT_INTL_FOREACH_BEGIN(session, page, ref) { + WT_ASSERT(session, ref->key.recno > recno); + recno = ref->key.recno; + } WT_INTL_FOREACH_END; + break; + case WT_PAGE_ROW_INT: + next = &_next; + WT_CLEAR(_next); + last = &_last; + WT_CLEAR(_last); + + first = 1; + WT_INTL_FOREACH_BEGIN(session, page, ref) { + __wt_ref_key(page, ref, &next->data, &next->size); + if (last->size == 0) { + if (first) + first = 0; + else { + WT_ASSERT(session, __wt_compare( + session, btree->collator, last, + next, &cmp) == 0); + WT_ASSERT(session, cmp < 0); + } + } + tmp = last; + last = next; + next = tmp; + } WT_INTL_FOREACH_END; + break; + } +} +#endif + +/* + * __split_deepen -- + * Split an internal page in-memory, deepening the tree. + */ +static int +__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) +{ + WT_DECL_RET; + WT_PAGE *child; + WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex; + WT_REF **alloc_refp; + WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref; + size_t child_incr, parent_decr, parent_incr, size; + uint32_t children, chunk, i, j, remain, slots; + int panic; + void *p; + + alloc_index = NULL; + parent_incr = parent_decr = 0; + panic = 0; + + pindex = WT_INTL_INDEX_COPY(parent); + + /* + * Create N children, unless we are dealing with a large page without + * many entries, in which case split into the minimum number of pages. 
+ */ + children = WT_MAX(pindex->entries / __split_deepen_per_child, + __split_deepen_min_child); + + WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); + WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, + "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children", + parent, pindex->entries, children)); + + /* + * If the workload is prepending/appending to the tree, we could deepen + * without bound. Don't let that happen, keep the first/last pages of + * the tree at their current level. + * + * XXX + * To improve this, we could track which pages were last merged into + * this page by eviction, and leave those pages alone, to prevent any + * sustained insert into the tree from deepening a single location. + */ +#undef SPLIT_CORRECT_1 +#define SPLIT_CORRECT_1 1 /* First page correction */ +#undef SPLIT_CORRECT_2 +#define SPLIT_CORRECT_2 2 /* First/last page correction */ + + /* + * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize + * the first/last slots of the allocated WT_PAGE_INDEX to point to the + * first/last pages we're keeping at the current level, and the rest of + * the slots to point to new WT_REF objects. + */ + size = sizeof(WT_PAGE_INDEX) + + (children + SPLIT_CORRECT_2) * sizeof(WT_REF *); + WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); + WT_MEMSIZE_ADD(parent_incr, size); + alloc_index->index = (WT_REF **)(alloc_index + 1); + alloc_index->entries = children + SPLIT_CORRECT_2; + alloc_index->index[0] = pindex->index[0]; + alloc_index->index[alloc_index->entries - 1] = + pindex->index[pindex->entries - 1]; + for (alloc_refp = alloc_index->index + SPLIT_CORRECT_1, + i = 0; i < children; ++alloc_refp, ++i) { + WT_ERR(__wt_calloc_def(session, 1, alloc_refp)); + WT_MEMSIZE_ADD(parent_incr, sizeof(WT_REF)); + } + + /* Allocate child pages, and connect them into the new page index. 
*/ + chunk = (pindex->entries - SPLIT_CORRECT_2) / children; + remain = (pindex->entries - SPLIT_CORRECT_2) - chunk * (children - 1); + for (parent_refp = pindex->index + SPLIT_CORRECT_1, + alloc_refp = alloc_index->index + SPLIT_CORRECT_1, + i = 0; i < children; ++i) { + slots = i == children - 1 ? remain : chunk; + WT_ERR(__wt_page_alloc( + session, parent->type, 0, slots, 0, &child)); + + /* + * Initialize the parent page's child reference; we need a copy + * of the page's key. + */ + ref = *alloc_refp++; + ref->home = parent; + ref->page = child; + ref->addr = NULL; + if (parent->type == WT_PAGE_ROW_INT) { + __wt_ref_key(parent, *parent_refp, &p, &size); + WT_ERR( + __wt_row_ikey(session, 0, p, size, &ref->key.ikey)); + WT_MEMSIZE_ADD(parent_incr, sizeof(WT_IKEY) + size); + } else + ref->key.recno = (*parent_refp)->key.recno; + ref->state = WT_REF_MEM; + + /* Initialize the child page. */ + if (parent->type == WT_PAGE_COL_INT) + child->pg_intl_recno = (*parent_refp)->key.recno; + child->pg_intl_parent_ref = ref; + + /* Mark it dirty. */ + WT_ERR(__wt_page_modify_init(session, child)); + __wt_page_only_modify_set(session, child); + + /* + * Once the split goes live, the newly created internal pages + * might be evicted and their WT_REF structures freed. If those + * pages are evicted before threads exit the previous page index + * array, a thread might see a freed WT_REF. Set the eviction + * transaction requirement for the newly created internal pages. + */ + child->modify->mod_split_txn = __wt_txn_new_id(session); + + /* + * The newly allocated child's page index references the same + * structures as the parent. (We cannot move WT_REF structures, + * threads may be underneath us right now changing the structure + * state.) However, if the WT_REF structures reference on-page + * information, we have to fix that, because the disk image for + * the page that has an page index entry for the WT_REF is about + * to change. 
+ */ + child_incr = 0; + child_pindex = WT_INTL_INDEX_COPY(child); + for (child_refp = child_pindex->index, j = 0; j < slots; ++j) { + WT_ERR(__split_ref_instantiate(session, + parent, *parent_refp, &parent_decr, &child_incr)); + *child_refp++ = *parent_refp++; + + WT_MEMSIZE_TRANSFER( + parent_decr, child_incr, sizeof(WT_REF)); + } + __wt_cache_page_inmem_incr(session, child, child_incr); + } + WT_ASSERT(session, alloc_refp - + alloc_index->index == alloc_index->entries - SPLIT_CORRECT_1); + WT_ASSERT(session, + parent_refp - pindex->index == pindex->entries - SPLIT_CORRECT_1); + + /* + * Update the parent's index; this is the update which splits the page, + * making the change visible to threads descending the tree. From now + * on, we're committed to the split. If any subsequent work fails, we + * have to panic because we potentially have threads of control using + * the new page index we just swapped in. + * + * A note on error handling: until this point, there's no problem with + * unwinding on error. We allocated a new page index, a new set of + * WT_REFs and a new set of child pages -- if an error occurred, the + * parent remained unchanged, although it may have an incorrect memory + * footprint. From now on we've modified the parent page, attention + * needs to be paid. + */ + WT_INTL_INDEX_SET(parent, alloc_index); + panic = 1; + +#ifdef HAVE_DIAGNOSTIC + __split_verify_intl_key_order(session, parent); +#endif + + /* + * The moved reference structures now reference the wrong parent page, + * and we have to fix that up. The problem is revealed when a thread + * of control searches for a page's reference structure slot, and fails + * to find it because the page it's searching no longer references it. + * When that failure happens, the thread waits for the reference's home + * page to be updated, which we do here: walk the children and fix them + * up. 
+ * + * We're not acquiring hazard pointers on these pages, they cannot be + * evicted because of the eviction transaction value set above. + */ + for (parent_refp = alloc_index->index, + i = alloc_index->entries; i > 0; ++parent_refp, --i) { + parent_ref = *parent_refp; + WT_ASSERT(session, parent_ref->home == parent); + if (parent_ref->state != WT_REF_MEM) + continue; + + /* + * We left the first/last children of the parent at the current + * level to avoid bad split patterns, they might be leaf pages; + * check the page type before we continue. + */ + child = parent_ref->page; + if (!WT_PAGE_IS_INTERNAL(child)) + continue; +#ifdef HAVE_DIAGNOSTIC + __split_verify_intl_key_order(session, child); +#endif + WT_INTL_FOREACH_BEGIN(session, child, child_ref) { + /* + * The page's parent reference may not be wrong, as we + * opened up access from the top of the tree already, + * pages may have been read in since then. Check and + * only update pages that reference the original page, + * they must be wrong. + */ + if (child_ref->home == parent) { + child_ref->home = child; + child_ref->ref_hint = 0; + } + } WT_INTL_FOREACH_END; + } + + /* + * Push out the changes: not required for correctness, but don't let + * threads spin on incorrect page references longer than necessary. + */ + WT_FULL_BARRIER(); + alloc_index = NULL; + + /* + * We can't free the previous parent's index, there may be threads using + * it. Add to the session's discard list, to be freed once we know no + * threads can still be using it. + * + * This change requires care with error handling: we have already + * updated the page with a new index. Even if stashing the old value + * fails, we don't roll back that change, because threads may already + * be using the new index. + */ + size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); + WT_MEMSIZE_ADD(parent_decr, size); + WT_ERR(__split_safe_free(session, 0, pindex, size)); + + /* + * Adjust the parent's memory footprint. 
This may look odd, but we + * have already taken the allocation overhead into account, and an + * increment followed by a decrement will cancel out the normal + * adjustment. + */ + __wt_cache_page_inmem_incr(session, parent, parent_incr); + __wt_cache_page_inmem_decr(session, parent, parent_decr); + + if (0) { +err: __wt_free_ref_index(session, parent, alloc_index, 1); + + /* + * If panic is set, we saw an error after opening up the tree + * to descent through the parent page's new index. There is + * nothing we can do, the tree is inconsistent and there are + * threads potentially active in both versions of the tree. + */ + if (panic) + ret = __wt_panic(session); + } + return (ret); +} + +/* + * __split_inmem_build -- + * Instantiate a page in a multi-block set, when an update couldn't be + * written. + */ +static int +__split_inmem_build( + WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref, WT_MULTI *multi) +{ + WT_CURSOR_BTREE cbt; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_PAGE *page; + WT_UPDATE *upd; + WT_UPD_SKIPPED *skip; + uint64_t recno; + uint32_t i, slot; + + WT_CLEAR(cbt); + cbt.iface.session = &session->iface; + cbt.btree = S2BT(session); + + /* + * We can find unresolved updates when attempting to evict a page, which + * can't be written. This code re-creates the in-memory page and applies + * the unresolved updates to that page. + * + * Clear the disk image and link the page into the passed-in WT_REF to + * simplify error handling: our caller will not discard the disk image + * when discarding the original page, and our caller will discard the + * allocated page on error, when discarding the allocated WT_REF. + */ + WT_RET(__wt_page_inmem( + session, ref, multi->skip_dsk, WT_PAGE_DISK_ALLOC, &page)); + multi->skip_dsk = NULL; + + if (orig->type == WT_PAGE_ROW_LEAF) + WT_RET(__wt_scr_alloc(session, 0, &key)); + + /* Re-create each modification we couldn't write. 
*/ + for (i = 0, skip = multi->skip; i < multi->skip_entries; ++i, ++skip) + switch (orig->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + /* Build a key. */ + upd = skip->ins->upd; + skip->ins->upd = NULL; + recno = WT_INSERT_RECNO(skip->ins); + + /* Search the page. */ + WT_ERR(__wt_col_search(session, recno, ref, &cbt)); + + /* Apply the modification. */ + WT_ERR(__wt_col_modify( + session, &cbt, recno, NULL, upd, 0)); + break; + case WT_PAGE_ROW_LEAF: + /* Build a key. */ + if (skip->ins == NULL) { + slot = WT_ROW_SLOT(orig, skip->rip); + upd = orig->pg_row_upd[slot]; + orig->pg_row_upd[slot] = NULL; + + WT_ERR(__wt_row_leaf_key( + session, orig, skip->rip, key, 0)); + } else { + upd = skip->ins->upd; + skip->ins->upd = NULL; + + key->data = WT_INSERT_KEY(skip->ins); + key->size = WT_INSERT_KEY_SIZE(skip->ins); + } + + /* Search the page. */ + WT_ERR(__wt_row_search(session, key, ref, &cbt, 1)); + + /* Apply the modification. */ + WT_ERR( + __wt_row_modify(session, &cbt, key, NULL, upd, 0)); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* + * We modified the page above, which will have set the first dirty + * transaction to the last transaction current running. However, the + * updates we installed may be older than that. Take the oldest active + * transaction ID to make sure these updates are not skipped by a + * checkpoint. + */ + page->modify->first_dirty_txn = S2C(session)->txn_global.oldest_id; + +err: __wt_scr_free(&key); + /* Free any resources that may have been cached in the cursor. */ + WT_TRET(__wt_btcur_close(&cbt)); + return (ret); +} + +/* + * __wt_multi_to_ref -- + * Move a multi-block list into an array of WT_REF structures. + */ +int +__wt_multi_to_ref(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp) +{ + WT_ADDR *addr; + WT_IKEY *ikey; + WT_REF *ref; + size_t incr; + + addr = NULL; + incr = 0; + + /* In some cases, the underlying WT_REF has not yet been allocated. 
*/ + if (*refp == NULL) { + WT_RET(__wt_calloc_def(session, 1, refp)); + WT_MEMSIZE_ADD(incr, sizeof(WT_REF)); + } + ref = *refp; + + /* + * Any parent reference must be filled in by our caller; the primary + * use of this function is when splitting into a parent page, and we + * aren't holding any locks here that would allow us to know which + * parent we'll eventually split into, if the tree is simultaneously + * being deepened. + */ + ref->home = NULL; + + if (multi->skip == NULL) { + /* + * Copy the address: we could simply take the buffer, but that + * would complicate error handling, freeing the reference array + * would have to avoid freeing the memory, and it's not worth + * the confusion. + */ + WT_RET(__wt_calloc_def(session, 1, &addr)); + WT_MEMSIZE_ADD(incr, sizeof(WT_ADDR)); + ref->addr = addr; + addr->size = multi->addr.size; + addr->type = multi->addr.type; + WT_RET(__wt_strndup(session, + multi->addr.addr, addr->size, &addr->addr)); + /* Need a cast to avoid an implicit conversion warning. */ + WT_MEMSIZE_ADD(incr, addr->size); + } else + WT_RET(__split_inmem_build(session, page, ref, multi)); + + switch (page->type) { + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + ikey = multi->key.ikey; + WT_RET(__wt_row_ikey(session, 0, + WT_IKEY_DATA(ikey), ikey->size, &ref->key.ikey)); + WT_MEMSIZE_ADD(incr, sizeof(WT_IKEY) + ikey->size); + break; + default: + ref->key.recno = multi->key.recno; + break; + } + + ref->state = multi->skip == NULL ? WT_REF_DISK : WT_REF_MEM; + + /* + * If our caller wants to track the memory allocations, we have a return + * reference. + */ + if (incrp != NULL) + *incrp += incr; + return (0); +} + +/* + * __split_evict_multi -- + * Resolve a multi-page split, inserting new information into the parent. 
+ */ +static int +__split_evict_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) +{ + WT_DECL_RET; + WT_IKEY *ikey; + WT_PAGE *parent, *child; + WT_PAGE_INDEX *alloc_index, *pindex; + WT_PAGE_MODIFY *mod; + WT_REF **alloc_refp, *parent_ref, ref_copy, **ref_tmp; + size_t parent_decr, parent_incr, size; + uint32_t i, j, parent_entries, result_entries, split_entries; + int complete, hazard, locked; + + parent = NULL; /* -Wconditional-uninitialized */ + alloc_index = NULL; + parent_ref = NULL; + ref_tmp = NULL; + parent_decr = parent_incr = 0; + complete = hazard = locked = 0; + + child = ref->page; + mod = child->modify; + + /* + * Convert the split page's multiblock reconciliation information into + * an array of page reference structures. + */ + split_entries = mod->mod_multi_entries; + WT_RET(__wt_calloc_def(session, split_entries, &ref_tmp)); + for (i = 0; i < split_entries; ++i) + WT_ERR(__wt_multi_to_ref(session, + child, &mod->mod_multi[i], &ref_tmp[i], &parent_incr)); + + /* + * Get a page-level lock on the parent to single-thread splits into the + * page because we need to single-thread sizing/growing the page index. + * It's OK to queue up multiple splits as the child pages split, but the + * actual split into the parent has to be serialized. Note we allocate + * memory inside of the lock and may want to invest effort in making the + * locked period shorter. + * + * We could race with another thread deepening our parent. To deal + * with that, read the parent pointer each time we try to lock it, and + * check that it's still correct after it is locked. + */ + for (;;) { + parent = ref->home; + F_CAS_ATOMIC(parent, WT_PAGE_SPLITTING, ret); + if (ret == 0) { + if (parent == ref->home) + break; + F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING); + continue; + } + __wt_yield(); + } + locked = 1; + + /* + * We have exclusive access to split the parent, and at this point, the + * child prevents the parent from being evicted. 
However, once we + * update the parent's index, it will no longer refer to the child, and + * could conceivably be evicted. Get a hazard pointer on the parent + * now, so that we can safely access it after updating the index. + */ + if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) { + WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT)); + hazard = 1; + } + + pindex = WT_INTL_INDEX_COPY(parent); + parent_entries = pindex->entries; + result_entries = (parent_entries - 1) + split_entries; + + /* + * Allocate and initialize a new page index array for the parent, then + * copy references from the original index array, plus references from + * the newly created split array, into place. + */ + size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); + WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); + WT_MEMSIZE_ADD(parent_incr, size); + alloc_index->index = (WT_REF **)(alloc_index + 1); + alloc_index->entries = result_entries; + for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) + if (pindex->index[i] == ref) + for (j = 0; j < split_entries; ++j) { + ref_tmp[j]->home = parent; + *alloc_refp++ = ref_tmp[j]; + + /* + * Clear the split reference as it moves to the + * allocated page index, so it never appears on + * both after an error. + */ + ref_tmp[j] = NULL; + } + else + *alloc_refp++ = pindex->index[i]; + __wt_free(session, ref_tmp); + + /* + * Update the parent page's index: this update makes the split visible + * to threads descending the tree. + */ + WT_INTL_INDEX_SET(parent, alloc_index); + alloc_index = NULL; + +#ifdef HAVE_DIAGNOSTIC + __split_verify_intl_key_order(session, parent); +#endif + + /* + * Reset the page's original WT_REF field to split. Threads cursoring + * through the tree were blocked because that WT_REF state was set to + * locked. 
This update changes the locked state to split, unblocking + * those threads and causing them to re-calculate their position based + * on the updated parent page's index. + */ + WT_PUBLISH(ref->state, WT_REF_SPLIT); + + /* + * A note on error handling: failures before we swapped the new page + * index into the parent can be resolved by simply freeing allocated + * memory because the original page is unchanged, we can continue to + * use it and we have not yet modified the parent. (See below for an + * exception, we cannot discard pages referencing unresolved changes.) + * Failures after we swap the new page index into the parent are also + * relatively benign because the split is OK and complete and the page + * is reset so it will be discarded by eviction. For that reason, we + * mostly ignore further errors unless there's a panic. + */ + complete = 1; + + /* + * The previous parent page's key for this child page may have been an + * on-page overflow key. In that case, if the key hasn't been deleted, + * delete it now, including its backing blocks. We are exchanging the + * WT_REF that referenced it for the split page WT_REFs and their keys, + * and there's no longer any reference to it. Done after completing the + * split (if we failed, we'd leak the underlying blocks, but the parent + * page would be unaffected). + */ + if (parent->type == WT_PAGE_ROW_INT) + WT_TRET(__split_ovfl_key_cleanup(session, parent, ref)); + + /* + * We can't free the previous page index, or the page's original WT_REF + * structure and instantiated key, there may be threads using them. Add + * them to the session discard list, to be freed once we know it's safe. 
+ */ + size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); + WT_TRET(__split_safe_free(session, exclusive, pindex, size)); + WT_MEMSIZE_ADD(parent_decr, size); + if (parent->type == WT_PAGE_ROW_INT && + (ikey = __wt_ref_key_instantiated(ref)) != NULL) { + size = sizeof(WT_IKEY) + ikey->size; + WT_TRET(__split_safe_free(session, exclusive, ikey, size)); + WT_MEMSIZE_ADD(parent_decr, size); + } + /* + * Take a copy of the ref in case we can free it immediately: we still + * need to discard the page. + */ + ref_copy = *ref; + WT_TRET(__split_safe_free(session, exclusive, ref, sizeof(WT_REF))); + WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF)); + + /* + * Adjust the parent's memory footprint. This may look odd, but we + * have already taken the allocation overhead into account, and an + * increment followed by a decrement will cancel out the normal + * adjustment. + */ + __wt_cache_page_inmem_incr(session, parent, parent_incr); + __wt_cache_page_inmem_decr(session, parent, parent_decr); + + WT_STAT_FAST_CONN_INCR(session, cache_eviction_split); + WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, + "%p: %s split into parent %p %" PRIu32 " -> %" PRIu32 + " (%" PRIu32 ")", + child, __wt_page_type_string(child->type), parent, parent_entries, + result_entries, result_entries - parent_entries)); + + /* + * Simple page splits trickle up the tree, that is, as leaf pages grow + * large enough and are evicted, they'll split into their parent. And, + * as that parent grows large enough and is evicted, it will split into + * its parent and so on. When the page split wave reaches the root, + * the tree will permanently deepen as multiple root pages are written. + * However, this only helps if first, the pages are evicted (and + * we resist evicting internal pages for obvious reasons), and second, + * if the tree is closed and re-opened from a disk image, which may be + * a rare event. 
+ * To avoid the case of internal pages becoming too large when they + * aren't being evicted, check internal pages each time a leaf page is + * split into them. If it's big enough, deepen the tree at that point. + * Do the check here because we've just grown the parent page and + * are holding it locked. + */ + if (ret == 0 && !exclusive && __split_should_deepen(session, parent)) + ret = __split_deepen(session, parent); + +err: if (locked) + F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING); + + if (hazard) + WT_TRET(__wt_hazard_clear(session, parent)); + + /* + * Discard the child; test for split completion instead of errors, there + * might be a relatively innocuous error, and if we split the parent, we + * want to discard the child. + */ + if (complete) { + /* + * Pages with unresolved changes are not marked clean during + * reconciliation, do it now. + */ + if (__wt_page_is_modified(child)) { + mod->write_gen = 0; + __wt_cache_dirty_decr(session, child); + } + __wt_ref_out(session, &ref_copy); + } + + /* + * A note on error handling: in the case of evicting a page that has + * unresolved changes, we just instantiated some in-memory pages that + * reflect those unresolved changes. The problem is those pages + * reference the same WT_UPDATE chains as the page we're splitting, + * that is, we simply copied references into the new pages. If the + * split fails, the original page is fine, but discarding the created + * page would free those update chains, and that's wrong. There isn't + * an easy solution, there's a lot of small memory allocations in some + * common code paths, and unwinding those changes will be difficult. + * For now, leak the memory by not discarding the instantiated pages. 
+ */ + __wt_free_ref_index(session, NULL, alloc_index, 0); + if (ref_tmp != NULL) { + for (i = 0; i < split_entries; ++i) + __wt_free_ref(session, child, ref_tmp[i], 0); + __wt_free(session, ref_tmp); + } + + /* + * A note on error handling: if we completed the split, return success, + * nothing really bad can have happened. + */ + return (ret == WT_PANIC || !complete ? ret : 0); +} + +/* + * __split_evict_single -- + * Resolve a single page split, replacing a page with a new version. + */ +static int +__split_evict_single(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_PAGE *page; + WT_PAGE_MODIFY *mod; + WT_REF new; + + page = ref->page; + mod = page->modify; + + /* Build the new page. */ + memset(&new, 0, sizeof(new)); + WT_RET(__split_inmem_build(session, page, &new, &mod->mod_multi[0])); + + /* + * Discard the original page. Pages with unresolved changes are not + * marked clean during reconciliation, do it now. + */ + mod->write_gen = 0; + __wt_cache_dirty_decr(session, page); + __wt_page_out(session, &page); + + /* Swap the new page into place. */ + ref->page = new.page; + WT_PUBLISH(ref->state, WT_REF_MEM); + + return (0); +} + +/* + * __wt_split_evict -- + * Resolve a page split. + */ +int +__wt_split_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) +{ + uint32_t split_entries; + + /* + * There are two cases entering this code. First, an in-memory page that + * got too large, we forcibly evicted it, and there wasn't anything to + * write. (Imagine two threads updating a small set keys on a leaf page. + * The page is too large so we try to evict it, but after reconciliation + * there's only a small amount of data (so it's a single page we can't + * split), and because there are two threads, there's some data we can't + * write (so we can't evict it). In that case, we take advantage of the + * fact we have exclusive access to the page and rewrite it in memory.) 
+ * + * Second, a real split where we reconciled a page and it turned into a + * lot of pages. + */ + split_entries = ref->page->modify->mod_multi_entries; + return (split_entries == 1 ? + __split_evict_single(session, ref) : + __split_evict_multi(session, ref, exclusive)); +} diff --git a/src/third_party/wiredtiger/src/btree/rec_track.c b/src/third_party/wiredtiger/src/btree/rec_track.c new file mode 100644 index 00000000000..92282393a23 --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/rec_track.c @@ -0,0 +1,904 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * Estimated memory cost for a structure on the overflow lists, the size of + * the structure plus two pointers (assume the average skip list depth is 2). + */ +#define WT_OVFL_SIZE(s) \ + (sizeof(s) + 2 * sizeof(void *)) + +/* + * __ovfl_track_init -- + * Initialize the overflow tracking structure. + */ +static int +__ovfl_track_init(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + return (__wt_calloc_def(session, 1, &page->modify->ovfl_track)); +} + +/* + * __ovfl_discard_verbose -- + * Dump information about a discard overflow record. + */ +static int +__ovfl_discard_verbose( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, const char *tag) +{ + WT_CELL_UNPACK *unpack, _unpack; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + + WT_RET(__wt_scr_alloc(session, 512, &tmp)); + + unpack = &_unpack; + __wt_cell_unpack(cell, unpack); + + WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW, + "discard: %s%s%p %s", + tag == NULL ? "" : tag, + tag == NULL ? "" : ": ", + page, + __wt_addr_string(session, unpack->data, unpack->size, tmp))); + +err: __wt_scr_free(&tmp); + return (ret); +} + +#if 0 +/* + * __ovfl_discard_dump -- + * Debugging information. 
+ */ +static void +__ovfl_discard_dump(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_CELL **cellp; + WT_OVFL_TRACK *track; + size_t i; + + if (page->modify == NULL || page->modify->ovfl_track == NULL) + return; + + track = page->modify->ovfl_track; + for (i = 0, cellp = track->discard; + i < track->discard_entries; ++i, ++cellp) + (void)__ovfl_discard_verbose(session, page, *cellp, "dump"); +} +#endif + +/* + * __ovfl_discard_wrapup -- + * Resolve the page's overflow discard list after a page is written. + */ +static int +__ovfl_discard_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_CELL **cellp; + WT_DECL_RET; + WT_OVFL_TRACK *track; + uint32_t i; + + track = page->modify->ovfl_track; + for (i = 0, cellp = track->discard; + i < track->discard_entries; ++i, ++cellp) { + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET(__ovfl_discard_verbose( + session, page, *cellp, "free")); + + /* Discard each cell's overflow item. */ + WT_RET(__wt_ovfl_discard(session, *cellp)); + } + + __wt_free(session, track->discard); + track->discard_entries = track->discard_allocated = 0; + + return (ret); +} + +/* + * __ovfl_discard_wrapup_err -- + * Resolve the page's overflow discard list after an error occurs. + */ +static int +__ovfl_discard_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_TRACK *track; + + track = page->modify->ovfl_track; + + __wt_free(session, track->discard); + track->discard_entries = track->discard_allocated = 0; + + return (0); +} + +/* + * __wt_ovfl_discard_add -- + * Add a new entry to the page's list of overflow records that have been + * discarded. 
+ */ +int +__wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell) +{ + WT_OVFL_TRACK *track; + + if (page->modify->ovfl_track == NULL) + WT_RET(__ovfl_track_init(session, page)); + + track = page->modify->ovfl_track; + WT_RET(__wt_realloc_def(session, &track->discard_allocated, + track->discard_entries + 1, &track->discard)); + track->discard[track->discard_entries++] = cell; + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET(__ovfl_discard_verbose(session, page, cell, "add")); + + return (0); +} + +/* + * __wt_ovfl_discard_free -- + * Free the page's list of discarded overflow record addresses. + */ +void +__wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_TRACK *track; + + if (page->modify == NULL || page->modify->ovfl_track == NULL) + return; + + track = page->modify->ovfl_track; + + __wt_free(session, track->discard); + track->discard_entries = track->discard_allocated = 0; +} + +/* + * __ovfl_reuse_verbose -- + * Dump information about a reuse overflow record. + */ +static int +__ovfl_reuse_verbose(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_OVFL_REUSE *reuse, const char *tag) +{ + WT_DECL_ITEM(tmp); + WT_DECL_RET; + + WT_RET(__wt_scr_alloc(session, 64, &tmp)); + + WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW, + "reuse: %s%s%p %s (%s%s%s) {%.*s}", + tag == NULL ? "" : tag, + tag == NULL ? "" : ": ", + page, + __wt_addr_string( + session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size, tmp), + F_ISSET(reuse, WT_OVFL_REUSE_INUSE) ? "inuse" : "", + F_ISSET(reuse, WT_OVFL_REUSE_INUSE) && + F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? ", " : "", + F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? "just-added" : "", + WT_MIN(reuse->value_size, 40), (char *)WT_OVFL_REUSE_VALUE(reuse))); + +err: __wt_scr_free(&tmp); + return (ret); +} + +#if 0 +/* + * __ovfl_reuse_dump -- + * Debugging information. 
+ */ +static void +__ovfl_reuse_dump(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_REUSE **head, *reuse; + + if (page->modify == NULL || page->modify->ovfl_track == NULL) + return; + head = page->modify->ovfl_track->ovfl_reuse; + + for (reuse = head[0]; reuse != NULL; reuse = reuse->next[0]) + (void)__ovfl_reuse_verbose(session, page, reuse, "dump"); +} +#endif + +/* + * __ovfl_reuse_skip_search -- + * Return the first, not in-use, matching value in the overflow reuse list. + */ +static WT_OVFL_REUSE * +__ovfl_reuse_skip_search( + WT_OVFL_REUSE **head, const void *value, size_t value_size) +{ + WT_OVFL_REUSE **e, *next; + size_t len; + int cmp, i; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) { + if (*e == NULL) { /* Empty levels */ + --i; + --e; + continue; + } + + /* + * Values are not unique, and it's possible to have long lists + * of identical overflow items. (We've seen it in benchmarks.) + * Move through a list of identical items at the current level + * as long as the next one is in-use, otherwise, drop down a + * level. When at the bottom level, return items if reusable, + * else NULL. + */ + len = WT_MIN((*e)->value_size, value_size); + cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len); + if (cmp == 0 && (*e)->value_size == value_size) { + if (i == 0) + return (F_ISSET(*e, + WT_OVFL_REUSE_INUSE) ? NULL : *e); + if ((next = (*e)->next[i]) == NULL || + !F_ISSET(next, WT_OVFL_REUSE_INUSE) || + next->value_size != len || memcmp( + WT_OVFL_REUSE_VALUE(next), value, len) != 0) { + --i; /* Drop down a level */ + --e; + } else /* Keep going at this level */ + e = &(*e)->next[i]; + continue; + } + + /* + * If the skiplist value is larger than the search value, or + * they compare equally and the skiplist value is longer than + * the search value, drop down a level, otherwise continue on + * this level. 
+ */ + if (cmp > 0 || (cmp == 0 && (*e)->value_size > value_size)) { + --i; /* Drop down a level */ + --e; + } else /* Keep going at this level */ + e = &(*e)->next[i]; + } + return (NULL); +} + +/* + * __ovfl_reuse_skip_search_stack -- + * Search an overflow reuse skiplist, returning an insert/remove stack. + */ +static void +__ovfl_reuse_skip_search_stack(WT_OVFL_REUSE **head, + WT_OVFL_REUSE ***stack, const void *value, size_t value_size) +{ + WT_OVFL_REUSE **e; + size_t len; + int cmp, i; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) { + if (*e == NULL) { /* Empty levels */ + stack[i--] = e--; + continue; + } + + /* + * If the skiplist value is larger than the search value, or + * they compare equally and the skiplist value is longer than + * the search value, drop down a level, otherwise continue on + * this level. + */ + len = WT_MIN((*e)->value_size, value_size); + cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len); + if (cmp > 0 || (cmp == 0 && (*e)->value_size > value_size)) + stack[i--] = e--; /* Drop down a level */ + else + e = &(*e)->next[i]; /* Keep going at this level */ + } +} + +/* + * __ovfl_reuse_wrapup -- + * Resolve the page's overflow reuse list after a page is written. + */ +static int +__ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BM *bm; + WT_OVFL_REUSE **e, **head, *reuse; + size_t incr, decr; + int i; + + bm = S2BT(session)->bm; + head = page->modify->ovfl_track->ovfl_reuse; + + /* + * Discard any overflow records that aren't in-use, freeing underlying + * blocks. + * + * First, walk the overflow reuse lists (except for the lowest one), + * fixing up skiplist links. 
+ */ + for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) + for (e = &head[i]; *e != NULL;) { + if (F_ISSET(*e, WT_OVFL_REUSE_INUSE)) { + e = &(*e)->next[i]; + continue; + } + *e = (*e)->next[i]; + } + + /* + * Second, discard any overflow record without an in-use flag, clear + * the flags for the next run. + * + * As part of the pass through the lowest level, figure out how much + * space we added/subtracted from the page, and update its footprint. + * We don't get it exactly correct because we don't know the depth of + * the skiplist here, but it's close enough, and figuring out the + * memory footprint change in the reconciliation wrapup code means + * fewer atomic updates and less code overall. + */ + incr = decr = 0; + for (e = &head[0]; (reuse = *e) != NULL;) { + if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) { + if (F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) + incr += WT_OVFL_SIZE(WT_OVFL_REUSE) + + reuse->addr_size + reuse->value_size; + + F_CLR(reuse, + WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED); + e = &(*e)->next[0]; + continue; + } + *e = (*e)->next[0]; + + WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)); + decr += WT_OVFL_SIZE(WT_OVFL_REUSE) + + reuse->addr_size + reuse->value_size; + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET( + __ovfl_reuse_verbose(session, page, reuse, "free")); + WT_RET(bm->free( + bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size)); + __wt_free(session, reuse); + } + + if (incr > decr) + __wt_cache_page_inmem_incr(session, page, incr - decr); + if (decr > incr) + __wt_cache_page_inmem_decr(session, page, decr - incr); + return (0); +} + +/* + * __ovfl_reuse_wrapup_err -- + * Resolve the page's overflow reuse list after an error occurs. 
+ */ +static int +__ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BM *bm; + WT_DECL_RET; + WT_OVFL_REUSE **e, **head, *reuse; + int i; + + bm = S2BT(session)->bm; + head = page->modify->ovfl_track->ovfl_reuse; + + /* + * Discard any overflow records that were just added, freeing underlying + * blocks. + * + * First, walk the overflow reuse lists (except for the lowest one), + * fixing up skiplist links. + */ + for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) + for (e = &head[i]; *e != NULL;) { + if (!F_ISSET(*e, WT_OVFL_REUSE_JUST_ADDED)) { + e = &(*e)->next[i]; + continue; + } + *e = (*e)->next[i]; + } + + /* + * Second, discard any overflow record with a just-added flag, clear the + * flags for the next run. + */ + for (e = &head[0]; (reuse = *e) != NULL;) { + if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) { + F_CLR(reuse, WT_OVFL_REUSE_INUSE); + e = &(*e)->next[0]; + continue; + } + *e = (*e)->next[0]; + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET( + __ovfl_reuse_verbose(session, page, reuse, "free")); + WT_TRET(bm->free( + bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size)); + __wt_free(session, reuse); + } + return (0); +} + +/* + * __wt_ovfl_reuse_search -- + * Search the page's list of overflow records for a match. + */ +int +__wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page, + uint8_t **addrp, size_t *addr_sizep, + const void *value, size_t value_size) +{ + WT_OVFL_REUSE **head, *reuse; + + *addrp = NULL; + *addr_sizep = 0; + + if (page->modify->ovfl_track == NULL) + return (0); + + head = page->modify->ovfl_track->ovfl_reuse; + + /* + * The search function returns the first matching record in the list + * which does not have the in-use flag set, or NULL. 
+ */ + if ((reuse = __ovfl_reuse_skip_search(head, value, value_size)) == NULL) + return (0); + + *addrp = WT_OVFL_REUSE_ADDR(reuse); + *addr_sizep = reuse->addr_size; + F_SET(reuse, WT_OVFL_REUSE_INUSE); + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET(__ovfl_reuse_verbose(session, page, reuse, "reclaim")); + return (1); +} + +/* + * __wt_ovfl_reuse_add -- + * Add a new entry to the page's list of overflow records tracked for + * reuse. + */ +int +__wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, + const uint8_t *addr, size_t addr_size, + const void *value, size_t value_size) +{ + WT_OVFL_REUSE **head, *reuse, **stack[WT_SKIP_MAXDEPTH]; + size_t size; + u_int i, skipdepth; + uint8_t *p; + + if (page->modify->ovfl_track == NULL) + WT_RET(__ovfl_track_init(session, page)); + + head = page->modify->ovfl_track->ovfl_reuse; + + /* Choose a skiplist depth for this insert. */ + skipdepth = __wt_skip_choose_depth(session); + + /* + * Allocate the WT_OVFL_REUSE structure, next pointers for the skip + * list, room for the address and value, then copy everything into + * place. + * + * To minimize the WT_OVFL_REUSE structure size, the address offset + * and size are single bytes: that's safe because the address follows + * the structure (which can't be more than about 100B), and address + * cookies are limited to 255B. + */ + size = sizeof(WT_OVFL_REUSE) + + skipdepth * sizeof(WT_OVFL_REUSE *) + addr_size + value_size; + WT_RET(__wt_calloc(session, 1, size, &reuse)); + p = (uint8_t *)reuse + + sizeof(WT_OVFL_REUSE) + skipdepth * sizeof(WT_OVFL_REUSE *); + reuse->addr_offset = (uint8_t)WT_PTRDIFF(p, reuse); + reuse->addr_size = (uint8_t)addr_size; + memcpy(p, addr, addr_size); + p += addr_size; + reuse->value_offset = WT_PTRDIFF32(p, reuse); + reuse->value_size = WT_STORE_SIZE(value_size); + memcpy(p, value, value_size); + F_SET(reuse, WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED); + + /* Insert the new entry into the skiplist. 
*/ + __ovfl_reuse_skip_search_stack(head, stack, value, value_size); + for (i = 0; i < skipdepth; ++i) { + reuse->next[i] = *stack[i]; + *stack[i] = reuse; + } + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET(__ovfl_reuse_verbose(session, page, reuse, "add")); + + return (0); +} + +/* + * __wt_ovfl_reuse_free -- + * Free the page's list of overflow records tracked for reuse. + */ +void +__wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_REUSE *reuse; + WT_PAGE_MODIFY *mod; + void *next; + + mod = page->modify; + if (mod == NULL || mod->ovfl_track == NULL) + return; + + for (reuse = mod->ovfl_track->ovfl_reuse[0]; + reuse != NULL; reuse = next) { + next = reuse->next[0]; + __wt_free(session, reuse); + } +} + +/* + * __ovfl_txnc_verbose -- + * Dump information about a transaction-cached overflow record. + */ +static int +__ovfl_txnc_verbose(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_OVFL_TXNC *txnc, const char *tag) +{ + WT_DECL_ITEM(tmp); + WT_DECL_RET; + + WT_RET(__wt_scr_alloc(session, 64, &tmp)); + + WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW, + "txn-cache: %s%s%p %s %" PRIu64 " {%.*s}", + tag == NULL ? "" : tag, + tag == NULL ? "" : ": ", + page, + __wt_addr_string( + session, WT_OVFL_TXNC_ADDR(txnc), txnc->addr_size, tmp), + txnc->current, + WT_MIN(txnc->value_size, 40), (char *)WT_OVFL_TXNC_VALUE(txnc))); + +err: __wt_scr_free(&tmp); + return (ret); +} + +#if 0 +/* + * __ovfl_txnc_dump -- + * Debugging information. + */ +static void +__ovfl_txnc_dump(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_TXNC **head, *txnc; + + if (page->modify == NULL || page->modify->ovfl_track == NULL) + return; + head = page->modify->ovfl_track->ovfl_txnc; + + for (txnc = head[0]; txnc != NULL; txnc = txnc->next[0]) + (void)__ovfl_txnc_verbose(session, page, txnc, "dump"); +} +#endif + +/* + * __ovfl_txnc_skip_search -- + * Return the first matching addr in the overflow transaction-cache list. 
+ */ +static WT_OVFL_TXNC * +__ovfl_txnc_skip_search(WT_OVFL_TXNC **head, const void *addr, size_t addr_size) +{ + WT_OVFL_TXNC **e; + size_t len; + int cmp, i; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) { + if (*e == NULL) { /* Empty levels */ + --i; + --e; + continue; + } + + /* + * Return any exact matches: we don't care in what search level + * we found a match. + */ + len = WT_MIN((*e)->addr_size, addr_size); + cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len); + if (cmp == 0 && (*e)->addr_size == addr_size) + return (*e); + + /* + * If the skiplist address is larger than the search address, or + * they compare equally and the skiplist address is longer than + * the search address, drop down a level, otherwise continue on + * this level. + */ + if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size)) { + --i; /* Drop down a level */ + --e; + } else /* Keep going at this level */ + e = &(*e)->next[i]; + } + return (NULL); +} + +/* + * __ovfl_txnc_skip_search_stack -- + * Search an overflow transaction-cache skiplist, returning an + * insert/remove stack. + */ +static void +__ovfl_txnc_skip_search_stack(WT_OVFL_TXNC **head, + WT_OVFL_TXNC ***stack, const void *addr, size_t addr_size) +{ + WT_OVFL_TXNC **e; + size_t len; + int cmp, i; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) { + if (*e == NULL) { /* Empty levels */ + stack[i--] = e--; + continue; + } + + /* + * If the skiplist addr is larger than the search addr, or + * they compare equally and the skiplist addr is longer than + * the search addr, drop down a level, otherwise continue on + * this level. 
+ */ + len = WT_MIN((*e)->addr_size, addr_size); + cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len); + if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size)) + stack[i--] = e--; /* Drop down a level */ + else + e = &(*e)->next[i]; /* Keep going at this level */ + } +} + +/* + * __ovfl_txnc_wrapup -- + * Resolve the page's transaction-cache list. + */ +static int +__ovfl_txnc_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_TXNC **e, **head, *txnc; + size_t decr; + int i; + + head = page->modify->ovfl_track->ovfl_txnc; + + /* + * Discard any transaction-cache records with transaction IDs earlier + * than any in the system. + * + * First, walk the overflow transaction-cache skip lists (except for + * the lowest level), fixing up links. + */ + for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) + for (e = &head[i]; *e != NULL;) { + if (!__wt_txn_visible_all(session, (*e)->current)) { + e = &(*e)->next[i]; + continue; + } + *e = (*e)->next[i]; + } + + /* Second, discard any no longer needed transaction-cache records. */ + decr = 0; + for (e = &head[0]; (txnc = *e) != NULL;) { + if (!__wt_txn_visible_all(session, txnc->current)) { + e = &(*e)->next[0]; + continue; + } + *e = (*e)->next[0]; + + decr += WT_OVFL_SIZE(WT_OVFL_TXNC) + + txnc->addr_size + txnc->value_size; + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET( + __ovfl_txnc_verbose(session, page, txnc, "free")); + __wt_free(session, txnc); + } + + if (decr != 0) + __wt_cache_page_inmem_decr(session, page, decr); + return (0); +} + +/* + * __wt_ovfl_txnc_search -- + * Search the page's list of transaction-cache overflow records for a + * match. 
+ */ +int +__wt_ovfl_txnc_search( + WT_PAGE *page, const uint8_t *addr, size_t addr_size, WT_ITEM *store) +{ + WT_OVFL_TXNC **head, *txnc; + + if (page->modify->ovfl_track == NULL) + return (WT_NOTFOUND); + + head = page->modify->ovfl_track->ovfl_txnc; + + if ((txnc = __ovfl_txnc_skip_search(head, addr, addr_size)) == NULL) + return (WT_NOTFOUND); + + store->data = WT_OVFL_TXNC_VALUE(txnc); + store->size = txnc->value_size; + return (0); +} + +/* + * __wt_ovfl_txnc_add -- + * Add a new entry to the page's list of transaction-cached overflow + * records. + */ +int +__wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page, + const uint8_t *addr, size_t addr_size, + const void *value, size_t value_size) +{ + WT_OVFL_TXNC **head, **stack[WT_SKIP_MAXDEPTH], *txnc; + size_t size; + u_int i, skipdepth; + uint8_t *p; + + if (page->modify->ovfl_track == NULL) + WT_RET(__ovfl_track_init(session, page)); + + head = page->modify->ovfl_track->ovfl_txnc; + + /* Choose a skiplist depth for this insert. */ + skipdepth = __wt_skip_choose_depth(session); + + /* + * Allocate the WT_OVFL_TXNC structure, next pointers for the skip + * list, room for the address and value, then copy everything into + * place. + * + * To minimize the WT_OVFL_TXNC structure size, the address offset + * and size are single bytes: that's safe because the address follows + * the structure (which can't be more than about 100B), and address + * cookies are limited to 255B. 
+ */ + size = sizeof(WT_OVFL_TXNC) + + skipdepth * sizeof(WT_OVFL_TXNC *) + addr_size + value_size; + WT_RET(__wt_calloc(session, 1, size, &txnc)); + p = (uint8_t *)txnc + + sizeof(WT_OVFL_TXNC) + skipdepth * sizeof(WT_OVFL_TXNC *); + txnc->addr_offset = (uint8_t)WT_PTRDIFF(p, txnc); + txnc->addr_size = (uint8_t)addr_size; + memcpy(p, addr, addr_size); + p += addr_size; + txnc->value_offset = WT_PTRDIFF32(p, txnc); + txnc->value_size = WT_STORE_SIZE(value_size); + memcpy(p, value, value_size); + txnc->current = __wt_txn_new_id(session); + + __wt_cache_page_inmem_incr(session, page, + WT_OVFL_SIZE(WT_OVFL_TXNC) + addr_size + value_size); + + /* Insert the new entry into the skiplist. */ + __ovfl_txnc_skip_search_stack(head, stack, addr, addr_size); + for (i = 0; i < skipdepth; ++i) { + txnc->next[i] = *stack[i]; + *stack[i] = txnc; + } + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET(__ovfl_txnc_verbose(session, page, txnc, "add")); + + return (0); +} + +/* + * __wt_ovfl_txnc_free -- + * Free the page's list of transaction-cached overflow records. + */ +void +__wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_TXNC *txnc; + WT_PAGE_MODIFY *mod; + void *next; + + mod = page->modify; + if (mod == NULL || mod->ovfl_track == NULL) + return; + + for (txnc = mod->ovfl_track->ovfl_txnc[0]; + txnc != NULL; txnc = next) { + next = txnc->next[0]; + __wt_free(session, txnc); + } +} + +/* + * __wt_ovfl_track_wrapup -- + * Resolve the page's overflow tracking on reconciliation success. 
+ */ +int +__wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_DECL_RET; + WT_OVFL_TRACK *track; + + if (page->modify == NULL || page->modify->ovfl_track == NULL) + return (0); + + track = page->modify->ovfl_track; + if (track->discard != NULL) + WT_RET(__ovfl_discard_wrapup(session, page)); + + if (track->ovfl_reuse[0] != NULL) + WT_RET(__ovfl_reuse_wrapup(session, page)); + + if (track->ovfl_txnc[0] != NULL) { + WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock)); + ret = __ovfl_txnc_wrapup(session, page); + WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock)); + } + return (0); +} + +/* + * __wt_ovfl_track_wrapup_err -- + * Resolve the page's overflow tracking on reconciliation error. + */ +int +__wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_DECL_RET; + WT_OVFL_TRACK *track; + + if (page->modify == NULL || page->modify->ovfl_track == NULL) + return (0); + + track = page->modify->ovfl_track; + if (track->discard != NULL) + WT_RET(__ovfl_discard_wrapup_err(session, page)); + + if (track->ovfl_reuse[0] != NULL) + WT_RET(__ovfl_reuse_wrapup_err(session, page)); + + if (track->ovfl_txnc[0] != NULL) { + WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock)); + ret = __ovfl_txnc_wrapup(session, page); + WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock)); + } + return (0); +} diff --git a/src/third_party/wiredtiger/src/btree/rec_write.c b/src/third_party/wiredtiger/src/btree/rec_write.c new file mode 100644 index 00000000000..1b3a9a0898f --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/rec_write.c @@ -0,0 +1,5521 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "wt_internal.h" + +struct __rec_boundary; typedef struct __rec_boundary WT_BOUNDARY; +struct __rec_dictionary; typedef struct __rec_dictionary WT_DICTIONARY; +struct __rec_kv; typedef struct __rec_kv WT_KV; + +/* + * Reconciliation is the process of taking an in-memory page, walking each entry + * in the page, building a backing disk image in a temporary buffer representing + * that information, and writing that buffer to disk. What could be simpler? + * + * WT_RECONCILE -- + * Information tracking a single page reconciliation. + */ +typedef struct { + WT_REF *ref; /* Page being reconciled */ + WT_PAGE *page; + uint32_t flags; /* Caller's configuration */ + + WT_ITEM dsk; /* Temporary disk-image buffer */ + + /* Track whether all changes to the page are written. */ + uint64_t max_txn; + uint64_t skipped_txn; + uint32_t orig_write_gen; + + /* + * If page updates are skipped because they are as yet unresolved, or + * the page has updates we cannot discard, the page is left "dirty": + * the page cannot be discarded and a subsequent reconciliation will + * be necessary to discard the page. + */ + int leave_dirty; + + /* + * Raw compression (don't get me started, as if normal reconciliation + * wasn't bad enough). If an application wants absolute control over + * what gets written to disk, we give it a list of byte strings and it + * gives us back an image that becomes a file block. Because we don't + * know the number of items we're storing in a block until we've done + * a lot of work, we turn off most compression: dictionary, copy-cell, + * prefix and row-store internal page suffix compression are all off. 
+ */ + int raw_compression; + uint32_t raw_max_slots; /* Raw compression array sizes */ + uint32_t *raw_entries; /* Raw compression slot entries */ + uint32_t *raw_offsets; /* Raw compression slot offsets */ + uint64_t *raw_recnos; /* Raw compression recno count */ + WT_ITEM raw_destination; /* Raw compression destination buffer */ + + /* + * Track if reconciliation has seen any overflow items. If a leaf page + * with no overflow items is written, the parent page's address cell is + * set to the leaf-no-overflow type. This means we can delete the leaf + * page without reading it because we don't have to discard any overflow + * items it might reference. + * + * The test test is per-page reconciliation, that is, once we see an + * overflow item on the page, all subsequent leaf pages written for the + * page will not be leaf-no-overflow type, regardless of whether or not + * they contain overflow items. In other words, leaf-no-overflow is not + * guaranteed to be set on every page that doesn't contain an overflow + * item, only that if it is set, the page contains no overflow items. + * + * The reason is because of raw compression: there's no easy/fast way to + * figure out if the rows selected by raw compression included overflow + * items, and the optimization isn't worth another pass over the data. + */ + int ovfl_items; + + /* + * Track if reconciliation of a row-store leaf page has seen empty (zero + * length) values. We don't write out anything for empty values, so if + * there are empty values on a page, we have to make two passes over the + * page when it's read to figure out how many keys it has, expensive in + * the common case of no empty values and (entries / 2) keys. Likewise, + * a page with only empty values is another common data set, and keys on + * that page will be equal to the number of entries. In both cases, set + * a flag in the page's on-disk header. + * + * The test is per-page reconciliation as described above for the + * overflow-item test. 
+ */ + int all_empty_value, any_empty_value; + + /* + * Reconciliation gets tricky if we have to split a page, which happens + * when the disk image we create exceeds the page type's maximum disk + * image size. + * + * First, the sizes of the page we're building. If WiredTiger is doing + * page layout, page_size is the same as page_size_max. We accumulate + * the maximum page size of raw data and when we reach that size, we + * split the page into multiple chunks, eventually compressing those + * chunks. When the application is doing page layout (raw compression + * is configured), page_size can continue to grow past page_size_max, + * and we keep accumulating raw data until the raw compression callback + * accepts it. + */ + uint32_t page_size; /* Current page size */ + uint32_t page_size_max; /* Maximum on-disk page size */ + + /* + * Second, the split size: if we're doing the page layout, split to a + * smaller-than-maximum page size when a split is required so we don't + * repeatedly split a packed page. + */ + uint32_t split_size; /* Split page size */ + + /* + * The problem with splits is we've done a lot of work by the time we + * realize we're going to have to split, we don't want to start over. + * + * To keep from having to start over when we hit the maximum page size, + * we track the page information when we approach a split boundary. + * If we eventually have to split, we walk this structure and pretend + * we were splitting all along. After that, we continue to append to + * this structure, and eventually walk it to create a new internal page + * that references all of our split pages. + */ + struct __rec_boundary { + /* + * The start field records location in the initial split buffer, + * that is, the first byte of the split chunk recorded before we + * decide to split a page; the offset between the first byte of + * chunk[0] and the first byte of chunk[1] is chunk[0]'s length. 
+ * + * Once we split a page, we stop filling in the start field, as + * we're writing the split chunks as we find them. + */ + uint8_t *start; /* Split's first byte */ + + /* + * The recno and entries fields are the starting record number + * of the split chunk (for column-store splits), and the number + * of entries in the split chunk. These fields are used both + * to write the split chunk, and to create a new internal page + * to reference the split pages. + */ + uint64_t recno; /* Split's starting record */ + uint32_t entries; /* Split's entries */ + + WT_ADDR addr; /* Split's written location */ + uint32_t size; /* Split's size */ + uint32_t cksum; /* Split's checksum */ + void *dsk; /* Split's disk image */ + + /* + * When busy pages get large, we need to be able to evict them + * even when they contain unresolved updates, or updates which + * cannot be evicted because of running transactions. In such + * cases, break the page into multiple blocks, write the blocks + * that can be evicted, saving lists of updates for blocks that + * cannot be evicted, then re-instantiate the blocks that cannot + * be evicted as new, in-memory pages, restoring the updates on + * those pages. + */ + WT_UPD_SKIPPED *skip; /* Skipped updates */ + uint32_t skip_next; + size_t skip_allocated; + + /* + * The key for a row-store page; no column-store key is needed + * because the page's recno, stored in the recno field, is the + * column-store key. + */ + WT_ITEM key; /* Promoted row-store key */ + + /* + * During wrapup, after reconciling the root page, we write a + * final block as part of a checkpoint. If raw compression + * was configured, that block may have already been compressed. 
+ */ + int already_compressed; + } *bnd; /* Saved boundaries */ + uint32_t bnd_next; /* Next boundary slot */ + uint32_t bnd_next_max; /* Maximum boundary slots used */ + size_t bnd_entries; /* Total boundary slots */ + size_t bnd_allocated; /* Bytes allocated */ + + /* + * We track the total number of page entries copied into split chunks + * so we can easily figure out how many entries in the current split + * chunk. + */ + uint32_t total_entries; /* Total entries in splits */ + + /* + * And there's state information as to where in this process we are: + * (1) tracking split boundaries because we can still fit more split + * chunks into the maximum page size, (2) tracking the maximum page + * size boundary because we can't fit any more split chunks into the + * maximum page size, (3) not performing boundary checks because it's + * either not useful with the current page size configuration, or + * because we've already been forced to split. + */ + enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */ + SPLIT_MAX=1, /* Next: the maximum page boundary */ + SPLIT_TRACKING_OFF=2, /* No boundary checks */ + SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */ + bnd_state; + + /* + * We track current information about the current record number, the + * number of entries copied into the temporary buffer, where we are + * in the temporary buffer, and how much memory remains. Those items + * are packaged here rather than passing pointers to stack locations + * around the code. + */ + uint64_t recno; /* Current record number */ + uint32_t entries; /* Current number of entries */ + uint8_t *first_free; /* Current first free byte */ + size_t space_avail; /* Remaining space in this chunk */ + + /* + * While reviewing updates for each page, we store skipped updates here, + * and then move them to per-block areas as the blocks are defined. 
+ */ + WT_UPD_SKIPPED *skip; /* Skipped updates */ + uint32_t skip_next; + size_t skip_allocated; + + /* + * We don't need to keep the 0th key around on internal pages, the + * search code ignores them as nothing can sort less by definition. + * There's some trickiness here, see the code for comments on how + * these fields work. + */ + int cell_zero; /* Row-store internal page 0th key */ + + /* + * WT_DICTIONARY -- + * We optionally build a dictionary of row-store values for leaf + * pages. Where two value cells are identical, only write the value + * once, the second and subsequent copies point to the original cell. + * The dictionary is fixed size, but organized in a skip-list to make + * searches faster. + */ + struct __rec_dictionary { + uint64_t hash; /* Hash value */ + void *cell; /* Matching cell */ + + u_int depth; /* Skiplist */ + WT_DICTIONARY *next[0]; + } **dictionary; /* Dictionary */ + u_int dictionary_next, dictionary_slots; /* Next, max entries */ + /* Skiplist head. */ + WT_DICTIONARY *dictionary_head[WT_SKIP_MAXDEPTH]; + + /* + * WT_KV-- + * An on-page key/value item we're building. 
+ */ + struct __rec_kv { + WT_ITEM buf; /* Data */ + WT_CELL cell; /* Cell and cell's length */ + size_t cell_len; + size_t len; /* Total length of cell + data */ + } k, v; /* Key/Value being built */ + + WT_ITEM *cur, _cur; /* Key/Value being built */ + WT_ITEM *last, _last; /* Last key/value built */ + + int key_pfx_compress; /* If can prefix-compress next key */ + int key_pfx_compress_conf; /* If prefix compression configured */ + int key_sfx_compress; /* If can suffix-compress next key */ + int key_sfx_compress_conf; /* If suffix compression configured */ + + int is_bulk_load; /* If it's a bulk load */ + + WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */ + + int tested_ref_state; /* Debugging information */ +} WT_RECONCILE; + +static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, int); +static void __rec_cell_build_addr( + WT_RECONCILE *, const void *, size_t, u_int, uint64_t); +static int __rec_cell_build_int_key(WT_SESSION_IMPL *, + WT_RECONCILE *, const void *, size_t, int *); +static int __rec_cell_build_leaf_key(WT_SESSION_IMPL *, + WT_RECONCILE *, const void *, size_t, int *); +static int __rec_cell_build_ovfl(WT_SESSION_IMPL *, + WT_RECONCILE *, WT_KV *, uint8_t, uint64_t); +static int __rec_cell_build_val(WT_SESSION_IMPL *, + WT_RECONCILE *, const void *, size_t, uint64_t); +static int __rec_child_deleted( + WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *, int *); +static int __rec_col_fix(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __rec_col_fix_slvg(WT_SESSION_IMPL *, + WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *); +static int __rec_col_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __rec_col_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __rec_col_var(WT_SESSION_IMPL *, + WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *); +static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *, + WT_SALVAGE_COOKIE *, WT_ITEM *, int, uint8_t, uint64_t); +static int 
__rec_destroy_session(WT_SESSION_IMPL *);
static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t);
static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_row_leaf(WT_SESSION_IMPL *,
    WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
static int __rec_row_leaf_insert(
    WT_SESSION_IMPL *, WT_RECONCILE *, WT_INSERT *);
static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *);
static int __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_split_row_promote(
    WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t);
static int __rec_split_write(WT_SESSION_IMPL *,
    WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, int);
static int __rec_write_init(WT_SESSION_IMPL *,
    WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_write_wrapup_err(
    WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);

static void __rec_dictionary_free(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int);
static int __rec_dictionary_lookup(
    WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **);
static void __rec_dictionary_reset(WT_RECONCILE *);

/*
 * __wt_rec_write --
 *	Reconcile an in-memory page into its on-disk format, and write it.
 */
int
__wt_rec_write(WT_SESSION_IMPL *session,
    WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_RECONCILE *r;
	int locked;

	conn = S2C(session);
	page = ref->page;
	mod = page->modify;

	/* We shouldn't get called with a clean page, that's an error.
*/ + if (!__wt_page_is_modified(page)) + WT_RET_MSG(session, WT_ERROR, + "Attempt to reconcile a clean page."); + + WT_RET(__wt_verbose(session, + WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type))); + WT_STAT_FAST_CONN_INCR(session, rec_pages); + WT_STAT_FAST_DATA_INCR(session, rec_pages); + if (LF_ISSET(WT_EVICTING)) { + WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction); + WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction); + } + + /* Record the most recent transaction ID we will *not* write. */ + mod->disk_snap_min = session->txn.snap_min; + + /* Initialize the reconciliation structure for each new run. */ + WT_RET(__rec_write_init( + session, ref, flags, salvage, &session->reconcile)); + r = session->reconcile; + + /* + * The compaction process looks at the page's modification information; + * if compaction is running, lock the page down. + * + * Otherwise, flip on the scanning flag: obsolete updates cannot be + * freed while reconciliation is in progress. + */ + locked = 0; + if (conn->compact_in_memory_pass) { + locked = 1; + WT_PAGE_LOCK(session, page); + } else + for (;;) { + F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret); + if (ret == 0) + break; + __wt_yield(); + } + + /* Reconcile the page. */ + switch (page->type) { + case WT_PAGE_COL_FIX: + if (salvage != NULL) + ret = __rec_col_fix_slvg(session, r, page, salvage); + else + ret = __rec_col_fix(session, r, page); + break; + case WT_PAGE_COL_INT: + ret = __rec_col_int(session, r, page); + break; + case WT_PAGE_COL_VAR: + ret = __rec_col_var(session, r, page, salvage); + break; + case WT_PAGE_ROW_INT: + ret = __rec_row_int(session, r, page); + break; + case WT_PAGE_ROW_LEAF: + ret = __rec_row_leaf(session, r, page, salvage); + break; + WT_ILLEGAL_VALUE_SET(session); + } + + /* Wrap up the page reconciliation. */ + if (ret == 0) + ret = __rec_write_wrapup(session, r, page); + else + WT_TRET(__rec_write_wrapup_err(session, r, page)); + + /* Release the page lock if we're holding one. 
*/ + if (locked) + WT_PAGE_UNLOCK(session, page); + else + F_CLR_ATOMIC(page, WT_PAGE_SCANNING); + + /* + * Clean up the boundary structures: some workloads result in millions + * of these structures, and if associated with some random session that + * got roped into doing forced eviction, they won't be discarded for the + * life of the session. + */ + __rec_bnd_cleanup(session, r, 0); + + WT_RET(ret); + + /* + * Root pages are special, splits have to be done, we can't put it off + * as the parent's problem any more. + */ + if (__wt_ref_is_root(ref)) + return (__rec_root_write(session, page, flags)); + + /* + * Otherwise, mark the page's parent dirty. + * Don't mark the tree dirty: if this reconciliation is in service of a + * checkpoint, it's cleared the tree's dirty flag, and we don't want to + * set it again as part of that walk. + */ + return (__wt_page_parent_modify_set(session, ref, 1)); +} + +/* + * __rec_root_write -- + * Handle the write of a root page. + */ +static int +__rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) +{ + WT_DECL_RET; + WT_PAGE *next; + WT_PAGE_INDEX *pindex; + WT_PAGE_MODIFY *mod; + WT_REF fake_ref; + uint32_t i; + + mod = page->modify; + + /* + * If a single root page was written (either an empty page or there was + * a 1-for-1 page swap), we've written root and checkpoint, we're done. + * If the root page split, write the resulting WT_REF array. We already + * have an infrastructure for writing pages, create a fake root page and + * write it instead of adding code to write blocks based on the list of + * blocks resulting from a multiblock reconciliation. 
+ */ + switch (F_ISSET(mod, WT_PM_REC_MASK)) { + case WT_PM_REC_EMPTY: /* Page is empty */ + case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ + return (0); + case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ + break; + WT_ILLEGAL_VALUE(session); + } + + WT_RET(__wt_verbose(session, WT_VERB_SPLIT, + "root page split -> %" PRIu32 " pages", mod->mod_multi_entries)); + + /* + * Create a new root page, initialize the array of child references, + * mark it dirty, then write it. + */ + switch (page->type) { + case WT_PAGE_COL_INT: + WT_RET(__wt_page_alloc(session, + WT_PAGE_COL_INT, 1, mod->mod_multi_entries, 1, &next)); + break; + case WT_PAGE_ROW_INT: + WT_RET(__wt_page_alloc(session, + WT_PAGE_ROW_INT, 0, mod->mod_multi_entries, 1, &next)); + break; + WT_ILLEGAL_VALUE(session); + } + + pindex = WT_INTL_INDEX_COPY(next); + for (i = 0; i < mod->mod_multi_entries; ++i) { + WT_ERR(__wt_multi_to_ref(session, + next, &mod->mod_multi[i], &pindex->index[i], NULL)); + pindex->index[i]->home = next; + } + + /* + * We maintain a list of pages written for the root in order to free the + * backing blocks the next time the root is written. + */ + mod->mod_root_split = next; + + WT_ERR(__wt_page_modify_init(session, next)); + __wt_page_only_modify_set(session, next); + + /* + * Fake up a reference structure, and write the next root page. + */ + __wt_root_ref_init(&fake_ref, next, page->type == WT_PAGE_COL_INT); + return (__wt_rec_write(session, &fake_ref, NULL, flags)); + +err: __wt_page_out(session, &next); + return (ret); +} + +/* + * __rec_raw_compression_config -- + * Configure raw compression. + */ +static inline int +__rec_raw_compression_config( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) +{ + WT_BTREE *btree; + + btree = S2BT(session); + + /* Check if raw compression configured. */ + if (btree->compressor == NULL || + btree->compressor->compress_raw == NULL) + return (0); + + /* Only for row-store and variable-length column-store objects. 
*/ + if (page->type == WT_PAGE_COL_FIX) + return (0); + + /* + * Raw compression cannot support dictionary compression. (Technically, + * we could still use the raw callback on column-store variable length + * internal pages with dictionary compression configured, because + * dictionary compression only applies to column-store leaf pages, but + * that seems an unlikely use case.) + */ + if (btree->dictionary != 0) + return (0); + + /* Raw compression cannot support prefix compression. */ + if (btree->prefix_compression != 0) + return (0); + + /* + * Raw compression is also turned off during salvage: we can't allow + * pages to split during salvage, raw compression has no point if it + * can't manipulate the page size. + */ + if (salvage != NULL) + return (0); + + return (1); +} + +/* + * __rec_write_init -- + * Initialize the reconciliation structure. + */ +static int +__rec_write_init(WT_SESSION_IMPL *session, + WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep) +{ + WT_BTREE *btree; + WT_PAGE *page; + WT_RECONCILE *r; + + btree = S2BT(session); + page = ref->page; + + if ((r = *(WT_RECONCILE **)reconcilep) == NULL) { + WT_RET(__wt_calloc_def(session, 1, &r)); + + *(WT_RECONCILE **)reconcilep = r; + session->reconcile_cleanup = __rec_destroy_session; + + /* Connect pointers/buffers. */ + r->cur = &r->_cur; + r->last = &r->_last; + + /* Disk buffers need to be aligned for writing. */ + F_SET(&r->dsk, WT_ITEM_ALIGNED); + } + + /* Remember the configuration. */ + r->ref = ref; + r->page = page; + r->flags = flags; + + /* Track if the page can be marked clean. */ + r->leave_dirty = 0; + + /* Raw compression. */ + r->raw_compression = + __rec_raw_compression_config(session, page, salvage); + r->raw_destination.flags = WT_ITEM_ALIGNED; + + /* Track overflow items. */ + r->ovfl_items = 0; + + /* Track empty values. */ + r->all_empty_value = 1; + r->any_empty_value = 0; + + /* The list of cached, skipped updates. 
*/ + r->skip_next = 0; + + /* + * Dictionary compression only writes repeated values once. We grow + * the dictionary as necessary, always using the largest size we've + * seen. + * + * Reset the dictionary. + * + * Sanity check the size: 100 slots is the smallest dictionary we use. + */ + if (btree->dictionary != 0 && btree->dictionary > r->dictionary_slots) + WT_RET(__rec_dictionary_init(session, + r, btree->dictionary < 100 ? 100 : btree->dictionary)); + __rec_dictionary_reset(r); + + /* + * Suffix compression shortens internal page keys by discarding trailing + * bytes that aren't necessary for tree navigation. We don't do suffix + * compression if there is a custom collator because we don't know what + * bytes a custom collator might use. Some custom collators (for + * example, a collator implementing reverse ordering of strings), won't + * have any problem with suffix compression: if there's ever a reason to + * implement suffix compression for custom collators, we can add a + * setting to the collator, configured when the collator is added, that + * turns on suffix compression. + * + * The raw compression routines don't even consider suffix compression, + * but it doesn't hurt to confirm that. + */ + r->key_sfx_compress_conf = 0; + if (btree->collator == NULL && + btree->internal_key_truncate && !r->raw_compression) + r->key_sfx_compress_conf = 1; + + /* + * Prefix compression discards repeated prefix bytes from row-store leaf + * page keys. + */ + r->key_pfx_compress_conf = 0; + if (btree->prefix_compression && page->type == WT_PAGE_ROW_LEAF) + r->key_pfx_compress_conf = 1; + + r->salvage = salvage; + + /* Save the page's write generation before reading the page. */ + WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen); + + /* + * Running transactions may update the page after we write it, so + * this is the highest ID we can be confident we will see. 
+ */ + r->skipped_txn = S2C(session)->txn_global.last_running; + + return (0); +} + +/* + * __rec_destroy -- + * Clean up the reconciliation structure. + */ +static void +__rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) +{ + WT_RECONCILE *r; + + if ((r = *(WT_RECONCILE **)reconcilep) == NULL) + return; + *(WT_RECONCILE **)reconcilep = NULL; + + __wt_buf_free(session, &r->dsk); + + __wt_free(session, r->raw_entries); + __wt_free(session, r->raw_offsets); + __wt_free(session, r->raw_recnos); + __wt_buf_free(session, &r->raw_destination); + + __rec_bnd_cleanup(session, r, 1); + + __wt_free(session, r->skip); + + __wt_buf_free(session, &r->k.buf); + __wt_buf_free(session, &r->v.buf); + __wt_buf_free(session, &r->_cur); + __wt_buf_free(session, &r->_last); + + __rec_dictionary_free(session, r); + + __wt_free(session, r); +} + +/* + * __rec_destroy_session -- + * Clean up the reconciliation structure, session version. + */ +static int +__rec_destroy_session(WT_SESSION_IMPL *session) +{ + __rec_destroy(session, &session->reconcile); + return (0); +} + +/* + * __rec_bnd_cleanup -- + * Cleanup the boundary structure information. + */ +static void +__rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy) +{ + WT_BOUNDARY *bnd; + uint32_t i, last_used; + + if (r->bnd == NULL) + return; + + /* + * Free the boundary structures' memory. In the case of normal cleanup, + * discard any memory we won't reuse in the next reconciliation; in the + * case of destruction, discard everything. + * + * During some big-page evictions we have seen boundary arrays that have + * millions of elements. That should not be a normal event, but if the + * memory is associated with a random session, it won't be discarded + * until the session is closed. If there are more than 10,000 boundary + * structure elements, destroy the boundary array and we'll start over. 
+ */ + if (destroy || r->bnd_entries > 10 * 1000) { + for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) { + __wt_free(session, bnd->addr.addr); + __wt_free(session, bnd->dsk); + __wt_free(session, bnd->skip); + __wt_buf_free(session, &bnd->key); + } + __wt_free(session, r->bnd); + r->bnd_next = 0; + r->bnd_entries = r->bnd_allocated = 0; + } else { + /* + * The boundary-next field points to the next boundary structure + * we were going to use, but there's no requirement that value + * be incremented before reconciliation updates the structure it + * points to, that is, there's no guarantee elements of the next + * boundary structure are still unchanged. Be defensive, clean + * up the "next" structure as well as the ones we know we used. + */ + last_used = r->bnd_next; + if (last_used < r->bnd_entries) + ++last_used; + for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) { + __wt_free(session, bnd->addr.addr); + __wt_free(session, bnd->dsk); + __wt_free(session, bnd->skip); + } + } +} + +/* + * __rec_skip_update_save -- + * Save a skipped WT_UPDATE list for later restoration. + */ +static int +__rec_skip_update_save( + WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip) +{ + WT_RET(__wt_realloc_def( + session, &r->skip_allocated, r->skip_next + 1, &r->skip)); + r->skip[r->skip_next].ins = ins; + r->skip[r->skip_next].rip = rip; + ++r->skip_next; + return (0); +} + +/* + * __rec_skip_update_move -- + * Move a skipped WT_UPDATE list from the per-page cache to a specific + * block's list. 
 */
static int
__rec_skip_update_move(
    WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_UPD_SKIPPED *skip)
{
	WT_RET(__wt_realloc_def(
	    session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip));
	bnd->skip[bnd->skip_next] = *skip;
	++bnd->skip_next;

	skip->ins = NULL;
	skip->rip = NULL;
	return (0);
}

/*
 * __rec_txn_read --
 *	Return the first visible update in a list (or NULL if none are visible),
 * set a flag if any updates were skipped, track the maximum transaction ID on
 * the page.
 */
static inline int
__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
    WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
{
	WT_ITEM ovfl;
	WT_PAGE *page;
	WT_UPDATE *upd, *upd_list, *upd_ovfl;
	size_t notused;
	uint64_t max_txn, min_txn, txnid;
	int skipped;

	*updp = NULL;

	page = r->page;

	/*
	 * If we're called with a WT_INSERT reference, use its WT_UPDATE
	 * list, else it is an on-page row-store WT_UPDATE list.
	 */
	upd_list = ins == NULL ? WT_ROW_UPDATE(page, rip) : ins->upd;
	skipped = 0;

	for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list;
	    upd != NULL; upd = upd->next) {
		if ((txnid = upd->txnid) == WT_TXN_ABORTED)
			continue;

		/* Track the largest/smallest transaction IDs on the list. */
		if (TXNID_LT(max_txn, txnid))
			max_txn = txnid;
		if (TXNID_LT(txnid, min_txn))
			min_txn = txnid;
		if (TXNID_LT(txnid, r->skipped_txn) &&
		    !__wt_txn_visible_all(session, txnid))
			r->skipped_txn = txnid;

		/*
		 * Record whether any updates were skipped on the way to finding
		 * the first visible update.
		 *
		 * If updates were skipped before the one being written, future
		 * reads without intervening modifications to the page could
		 * see a different value; if no updates were skipped, the page
		 * can safely be marked clean and does not need to be
		 * reconciled until modified again.
		 */
		if (*updp == NULL) {
			if (__wt_txn_visible(session, txnid))
				*updp = upd;
			else
				skipped = 1;
		}
	}

	/*
	 * Track the maximum transaction ID in the page.  We store this in the
	 * page at the end of reconciliation if no updates are skipped, it's
	 * used to avoid evicting clean pages from memory with changes required
	 * to satisfy a snapshot read.
	 */
	if (TXNID_LT(r->max_txn, max_txn))
		r->max_txn = max_txn;

	/*
	 * If all updates are globally visible and no updates were skipped, the
	 * page can be marked clean and we're done, regardless of whether we're
	 * evicting or checkpointing.
	 *
	 * The oldest transaction ID may have moved while we were scanning the
	 * page, so it is possible to skip an update but then find that by the
	 * end of the scan, all updates are stable.
	 */
	if (__wt_txn_visible_all(session, max_txn) && !skipped)
		return (0);

	/*
	 * If some updates are not globally visible, or were skipped, the page
	 * cannot be marked clean.
	 */
	r->leave_dirty = 1;

	/* If we're not evicting, we're done, we know what we'll write. */
	if (!F_ISSET(r, WT_EVICTING))
		return (0);

	/* In some cases, there had better not be any updates we can't write. */
	if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
		WT_PANIC_RET(session, EINVAL,
		    "reconciliation illegally skipped an update");

	/*
	 * If evicting and we aren't able to save/restore the not-yet-visible
	 * updates, the page can't be evicted.
	 */
	if (!F_ISSET(r, WT_SKIP_UPDATE_RESTORE))
		return (EBUSY);

	/*
	 * Evicting a page with not-yet-visible updates: save and restore the
	 * list of updates on a newly instantiated page.
	 *
	 * The order of the updates on the list matters so we can't move only
	 * the unresolved updates, we have to move the entire update list.
	 *
	 * Clear the returned update so our caller ignores the key/value pair
	 * in the case of an insert/append entry (everything we need is in the
	 * update list), and otherwise writes the original on-page key/value
	 * pair to which the update list applies.
	 */
	*updp = NULL;

	/*
	 * Handle the case where we don't want to write an original on-page
	 * value item to disk because it's been updated or removed.
	 *
	 * Here's the deal: an overflow value was updated or removed and its
	 * backing blocks freed.  If any transaction in the system might still
	 * read the value, a copy was cached in page reconciliation tracking
	 * memory, and the page cell set to WT_CELL_VALUE_OVFL_RM.  Eviction
	 * then chose the page and we're splitting it up in order to push parts
	 * of it out of memory.
	 *
	 * We could write the original on-page value item to disk... if we had
	 * a copy.  The cache may not have a copy (a globally visible update
	 * would have kept a value from ever being cached), or an update that
	 * subsequently became globally visible could cause a cached value to
	 * be discarded.  Either way, once there's a globally visible update,
	 * we may not have the value.
	 *
	 * Fortunately, if there's a globally visible update we don't care about
	 * the original version, so we simply ignore it, no transaction can ever
	 * try and read it.  If there isn't a globally visible update, there had
	 * better be a cached value.
	 *
	 * In the latter case, we could write the value out to disk, but (1) we
	 * are planning on re-instantiating this page in memory, it isn't going
	 * to disk, and (2) the value item is eventually going to be discarded,
	 * that seems like a waste of a write.  Instead, find the cached value
	 * and append it to the update list we're saving for later restoration.
+ */ + if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM && + !__wt_txn_visible_all(session, min_txn)) { + WT_RET(__wt_ovfl_txnc_search( + page, vpack->data, vpack->size, &ovfl)); + /* + * Create an update structure with an impossibly low transaction + * ID and append it to the update list we're about to save. + * Restoring that update list when this page is re-instantiated + * creates an update for the key/value pair visible to every + * running transaction in the system, ensuring the on-page value + * will be ignored. + */ + WT_RET(__wt_update_alloc(session, &ovfl, &upd_ovfl, ¬used)); + upd_ovfl->txnid = WT_TXN_NONE; + for (upd = upd_list; upd->next != NULL; upd = upd->next) + ; + upd->next = upd_ovfl; + } + + return (__rec_skip_update_save(session, r, ins, rip)); +} + +/* + * CHILD_RELEASE -- + * Macros to clean up during internal-page reconciliation, releasing the + * hazard pointer we're holding on child pages. + */ +#undef CHILD_RELEASE +#define CHILD_RELEASE(session, hazard, ref) do { \ + if (hazard) { \ + hazard = 0; \ + WT_TRET( \ + __wt_page_release(session, ref, WT_READ_NO_EVICT)); \ + } \ +} while (0) +#undef CHILD_RELEASE_ERR +#define CHILD_RELEASE_ERR(session, hazard, ref) do { \ + CHILD_RELEASE(session, hazard, ref); \ + WT_ERR(ret); \ +} while (0) + +/* + * __rec_child_modify -- + * Return if the internal page's child references any modifications. + */ +static int +__rec_child_modify(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_REF *ref, int *hazardp, int *statep) +{ + WT_DECL_RET; + WT_PAGE_MODIFY *mod; + + /* We may acquire a hazard pointer our caller must release. 
*/ + *hazardp = 0; + +#define WT_CHILD_IGNORE 1 /* Deleted child: ignore */ +#define WT_CHILD_MODIFIED 2 /* Modified child */ +#define WT_CHILD_PROXY 3 /* Deleted child: proxy */ + *statep = 0; + + /* + * This function is called when walking an internal page to decide how + * to handle child pages referenced by the internal page, specifically + * if the child page is to be merged into its parent. + * + * Internal pages are reconciled for two reasons: first, when evicting + * an internal page, second by the checkpoint code when writing internal + * pages. During eviction, the subtree is locked down so all pages + * should be in the WT_REF_DISK or WT_REF_LOCKED state. During + * checkpoint, any eviction that might affect our review of an internal + * page is prohibited, however, as the subtree is not reserved for our + * exclusive use, there are other page states that must be considered. + */ + for (;; __wt_yield()) + switch (r->tested_ref_state = ref->state) { + case WT_REF_DISK: + /* On disk, not modified by definition. */ + goto done; + + case WT_REF_DELETED: + /* + * The child is in a deleted state. + * + * It's possible the state could change underneath us as + * the page is read in, and we can race between checking + * for a deleted state and looking at the transaction ID + * to see if the delete is visible to us. Lock down the + * structure. + */ + if (!WT_ATOMIC_CAS4( + ref->state, WT_REF_DELETED, WT_REF_LOCKED)) + break; + ret = __rec_child_deleted(session, r, ref, statep); + WT_PUBLISH(ref->state, WT_REF_DELETED); + goto done; + + case WT_REF_LOCKED: + /* + * Locked. + * + * If evicting, the evicted page's subtree, including + * this child, was selected for eviction by us and the + * state is stable until we reset it, it's an in-memory + * state. This is the expected state for a child being + * merged into a page (where the page was selected by + * the eviction server for eviction). 
 */
			if (F_ISSET(r, WT_EVICTING))
				goto in_memory;

			/*
			 * If called during checkpoint, the child is being
			 * considered by the eviction server or the child is a
			 * fast-delete page being read.  The eviction may have
			 * started before the checkpoint and so we must wait
			 * for the eviction to be resolved.  I suspect we could
			 * handle fast-delete reads, but we can't distinguish
			 * between the two and fast-delete reads aren't expected
			 * to be common.
			 */
			break;

		case WT_REF_MEM:
			/*
			 * In memory.
			 *
			 * If evicting, the evicted page's subtree, including
			 * this child, was selected for eviction by us and the
			 * state is stable until we reset it, it's an in-memory
			 * state.  This is the expected state for a child being
			 * merged into a page (where the page belongs to a file
			 * being discarded from the cache during close).
			 */
			if (F_ISSET(r, WT_EVICTING))
				goto in_memory;

			/*
			 * If called during checkpoint, acquire a hazard pointer
			 * so the child isn't evicted, it's an in-memory case.
			 *
			 * This call cannot return split/restart, dirty page
			 * eviction is shutout during checkpoint, all splits in
			 * process will have completed before we walk any pages
			 * for checkpoint.
			 */
			if ((ret = __wt_page_in(session, ref,
			    WT_READ_CACHE | WT_READ_NO_EVICT |
			    WT_READ_NO_GEN | WT_READ_NO_WAIT)) == WT_NOTFOUND) {
				ret = 0;
				break;
			}
			*hazardp = 1;
			goto in_memory;

		case WT_REF_READING:
			/*
			 * Being read, not modified by definition.
			 *
			 * We should never be here during eviction: a child page
			 * in this state within an evicted page's subtree would
			 * normally have caused eviction to fail, and exclusive
			 * eviction shouldn't ever see pages being read.
			 */
			WT_ASSERT(session, !F_ISSET(r, WT_EVICTING));
			goto done;

		case WT_REF_SPLIT:
			/*
			 * The page was split out from under us.
+ * + * We should never be here during eviction, a child page + * in this state within an evicted page's subtree would + * have caused eviction to fail. + * + * We should never be here during checkpoint, dirty page + * eviction is shutout during checkpoint, all splits in + * process will have completed before we walk any pages + * for checkpoint. + */ + WT_ASSERT(session, ref->state != WT_REF_SPLIT); + /* FALLTHROUGH */ + + WT_ILLEGAL_VALUE(session); + } + +in_memory: + /* + * In-memory states: the child is potentially modified if the page's + * modify structure has been instantiated. If the modify structure + * exists and the page has actually been modified, set that state. + * If that's not the case, we would normally use the original cell's + * disk address as our reference, but, if we're forced to instantiate + * a deleted child page and it's never modified, we end up here with + * a page that has a modify structure, no modifications, and no disk + * address. Ignore those pages, they're not modified and there is no + * reason to write the cell. + */ + mod = ref->page->modify; + if (mod != NULL && mod->flags != 0) + *statep = WT_CHILD_MODIFIED; + else if (ref->addr == NULL) { + *statep = WT_CHILD_IGNORE; + CHILD_RELEASE(session, *hazardp, ref); + } + +done: WT_HAVE_DIAGNOSTIC_YIELD; + return (ret); +} + +/* + * __rec_child_deleted -- + * Handle pages with leaf pages in the WT_REF_DELETED state. + */ +static int +__rec_child_deleted( + WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, int *statep) +{ + WT_BM *bm; + WT_PAGE_DELETED *page_del; + size_t addr_size; + const uint8_t *addr; + + bm = S2BT(session)->bm; + page_del = ref->page_del; + + /* + * Internal pages with child leaf pages in the WT_REF_DELETED state are + * a special case during reconciliation. First, if the deletion was a + * result of a session truncate call, the deletion may not be visible to + * us. 
In that case, we proceed as with any change that's not visible + * during reconciliation by setting the skipped flag and ignoring the + * change for the purposes of writing the internal page. + * + * In this case, there must be an associated page-deleted structure, and + * it holds the transaction ID we care about. + */ + if (page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) { + /* + * In some cases, there had better not be any updates we can't + * write. + */ + if (F_ISSET(r, WT_SKIP_UPDATE_ERR)) + WT_PANIC_RET(session, EINVAL, + "reconciliation illegally skipped an update"); + + /* If this page cannot be evicted, quit now. */ + if (F_ISSET(r, WT_EVICTING)) + return (EBUSY); + } + + /* + * The deletion is visible to us, deal with any underlying disk blocks. + * + * First, check to see if there is an address associated with this leaf: + * if there isn't, we're done, the underlying page is already gone. If + * the page still exists, check for any transactions in the system that + * might want to see the page's state before it's deleted. + * + * If any such transactions exist, we cannot discard the underlying leaf + * page to the block manager because the transaction may eventually read + * it. However, this write might be part of a checkpoint, and should we + * recover to that checkpoint, we'll need to delete the leaf page, else + * we'd leak it. The solution is to write a proxy cell on the internal + * page ensuring the leaf page is eventually discarded. + * + * If no such transactions exist, we can discard the leaf page to the + * block manager and no cell needs to be written at all. We do this + * outside of the underlying tracking routines because this action is + * permanent and irrevocable. (Clearing the address means we've lost + * track of the disk address in a permanent way. 
This is safe because + * there's no path to reading the leaf page again: if there's ever a + * read into this part of the name space again, the cache read function + * instantiates an entirely new page.) + */ + if (ref->addr != NULL && + (page_del == NULL || + __wt_txn_visible_all(session, page_del->txnid))) { + WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); + WT_RET(bm->free(bm, session, addr, addr_size)); + + if (__wt_off_page(ref->home, ref->addr)) { + __wt_free(session, ((WT_ADDR *)ref->addr)->addr); + __wt_free(session, ref->addr); + } + ref->addr = NULL; + } + + /* + * Minor memory cleanup: if a truncate call deleted this page and we + * were ever forced to instantiate the page in memory, we would have + * built a list of updates in the page reference in order to be able + * to abort the truncate. It's a cheap test to make that memory go + * away, we do it here because there's really nowhere else we do the + * checks. In short, if we have such a list, and the backing address + * blocks are gone, there can't be any transaction that can abort. + */ + if (ref->addr == NULL && page_del != NULL) { + __wt_free(session, ref->page_del->update_list); + __wt_free(session, ref->page_del); + } + + /* + * If there's still a disk address, then we have to write a proxy + * record, otherwise, we can safely ignore this child page. + */ + *statep = ref->addr == NULL ? WT_CHILD_IGNORE : WT_CHILD_PROXY; + return (0); +} + +/* + * __rec_incr -- + * Update the memory tracking structure for a set of new entries. + */ +static inline void +__rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size) +{ + /* + * The buffer code is fragile and prone to off-by-one errors -- check + * for overflow in diagnostic mode. 
+ */
+ WT_ASSERT(session, r->space_avail >= size);
+ WT_ASSERT(session,
+ WT_BLOCK_FITS(r->first_free, size, r->dsk.mem, r->page_size));
+
+ r->entries += v;
+ r->space_avail -= size;
+ r->first_free += size;
+}
+
+/*
+ * __rec_copy_incr --
+ * Copy a key/value cell and buffer pair into the new image.
+ */
+static inline void
+__rec_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *kv)
+{
+ size_t len;
+ uint8_t *p, *t;
+
+ /*
+ * If there's only one chunk of data to copy (because the cell and data
+ * are being copied from the original disk page), the cell length won't
+ * be set, the WT_ITEM data/length will reference the data to be copied.
+ *
+ * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do
+ * the copy in-line.
+ */
+ for (p = (uint8_t *)r->first_free,
+ t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len)
+ *p++ = *t++;
+
+ /* The data can be quite large -- call memcpy. */
+ if (kv->buf.size != 0)
+ memcpy(p, kv->buf.data, kv->buf.size);
+
+ WT_ASSERT(session, kv->len == kv->cell_len + kv->buf.size);
+ /* Account for one new entry and kv->len bytes consumed in the image. */
+ __rec_incr(session, r, 1, kv->len);
+}
+
+/*
+ * __rec_dict_replace --
+ * Check for a dictionary match.
+ */
+static int
+__rec_dict_replace(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, uint64_t rle, WT_KV *val)
+{
+ WT_DICTIONARY *dp;
+ uint64_t offset;
+
+ /*
+ * We optionally create a dictionary of values and only write a unique
+ * value once per page, using a special "copy" cell for all subsequent
+ * copies of the value. We have to do the cell build and resolution at
+ * this low level because we need physical cell offsets for the page.
+ *
+ * Sanity check: short-data cells can be smaller than dictionary-copy
+ * cells. If the data is already small, don't bother doing the work.
+ * This isn't just work avoidance: on-page cells can't grow as a result
+ * of writing a dictionary-copy cell, the reconciliation functions do a
+ * split-boundary test based on the size required by the value's cell;
+ * if we grow the cell after that test we'll potentially write off the
+ * end of the buffer's memory.
+ */
+ if (val->buf.size <= WT_INTPACK32_MAXSIZE)
+ return (0);
+ WT_RET(__rec_dictionary_lookup(session, r, val, &dp));
+ if (dp == NULL)
+ return (0);
+
+ /*
+ * If the dictionary cell reference is not set, we're creating a new
+ * entry in the dictionary, update its location.
+ *
+ * If the dictionary cell reference is set, we have a matching value.
+ * Create a copy cell instead.
+ */
+ if (dp->cell == NULL)
+ dp->cell = r->first_free;
+ else {
+ offset = WT_PTRDIFF(r->first_free, dp->cell);
+ val->len = val->cell_len =
+ __wt_cell_pack_copy(&val->cell, rle, offset);
+ /* The copy cell replaces the data, clear the value buffer. */
+ val->buf.data = NULL;
+ val->buf.size = 0;
+ }
+ return (0);
+}
+
+/*
+ * __rec_key_state_update --
+ * Update prefix and suffix compression based on the last key.
+ */
+static inline void
+__rec_key_state_update(WT_RECONCILE *r, int ovfl_key)
+{
+ WT_ITEM *a;
+
+ /*
+ * If writing an overflow key onto the page, don't update the "last key"
+ * value, and leave the state of prefix compression alone. (If we are
+ * currently doing prefix compression, we have a key state which will
+ * continue to work, we're just skipping the key just created because
+ * it's an overflow key and doesn't participate in prefix compression.
+ * If we are not currently doing prefix compression, we can't start, an
+ * overflow key doesn't give us any state.)
+ *
+ * Additionally, if we wrote an overflow key onto the page, turn off the
+ * suffix compression of row-store internal node keys. (When we split,
+ * "last key" is the largest key on the previous page, and "cur key" is
+ * the first key on the next page, which is being promoted. In some
+ * cases we can discard bytes from the "cur key" that are not needed to
+ * distinguish between the "last key" and "cur key", compressing the
+ * size of keys on internal nodes. If we just built an overflow key,
+ * we're not going to update the "last key", making suffix compression
+ * impossible for the next key. Alternatively, we could remember where
+ * the last key was on the page, detect it's an overflow key, read it
+ * from disk and do suffix compression, but that's too much work for an
+ * unlikely event.)
+ *
+ * If we're not writing an overflow key on the page, update the last-key
+ * value and turn on both prefix and suffix compression.
+ */
+ if (ovfl_key)
+ r->key_sfx_compress = 0;
+ else {
+ /* Swap the current and last key buffers (pointer swap, no copy). */
+ a = r->cur;
+ r->cur = r->last;
+ r->last = a;
+
+ r->key_pfx_compress = r->key_pfx_compress_conf;
+ r->key_sfx_compress = r->key_sfx_compress_conf;
+ }
+}
+
+/*
+ * Macros from fixed-length entries to/from bytes.
+ */
+#define WT_FIX_BYTES_TO_ENTRIES(btree, bytes) \
+ ((uint32_t)((((bytes) * 8) / (btree)->bitcnt)))
+#define WT_FIX_ENTRIES_TO_BYTES(btree, entries) \
+ ((uint32_t)WT_ALIGN((entries) * (btree)->bitcnt, 8))
+
+/*
+ * __rec_leaf_page_max --
+ * Figure out the maximum leaf page size for the reconciliation.
+ */
+static inline uint32_t
+__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ uint32_t page_size;
+
+ btree = S2BT(session);
+ page = r->page;
+
+ page_size = 0;
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ /*
+ * Column-store pages can grow if there are missing records
+ * (that is, we lost a chunk of the range, and have to write
+ * deleted records). Fixed-length objects are a problem, if
+ * there's a big missing range, we could theoretically have to
+ * write large numbers of missing objects.
+ */
+ page_size = (uint32_t)WT_ALIGN(WT_FIX_ENTRIES_TO_BYTES(btree,
+ r->salvage->take + r->salvage->missing), btree->allocsize);
+ break;
+ case WT_PAGE_COL_VAR:
+ /*
+ * Column-store pages can grow if there are missing records
+ * (that is, we lost a chunk of the range, and have to write
+ * deleted records). Variable-length objects aren't usually a
+ * problem because we can write any number of deleted records
+ * in a single page entry because of the RLE, we just need to
+ * ensure that additional entry fits.
+ */
+ break;
+ case WT_PAGE_ROW_LEAF:
+ default:
+ /*
+ * Row-store pages can't grow, salvage never does anything
+ * other than reduce the size of a page read from disk.
+ */
+ break;
+ }
+
+ /*
+ * Default size for variable-length column-store and row-store pages
+ * during salvage is the maximum leaf page size.
+ */
+ if (page_size < btree->maxleafpage)
+ page_size = btree->maxleafpage;
+
+ /*
+ * The page we read from the disk should be smaller than the page size
+ * we just calculated, check out of paranoia.
+ */
+ if (page_size < page->dsk->mem_size)
+ page_size = page->dsk->mem_size;
+
+ /*
+ * Salvage is the backup plan: don't let this fail.
+ */
+ return (page_size * 2);
+}
+
+/*
+ * __rec_split_bnd_init --
+ * Initialize a single boundary structure.
+ */
+static void
+__rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
+{
+ bnd->start = NULL;
+
+ bnd->recno = 0;
+ bnd->entries = 0;
+
+ /* Release any memory left over from a previous use of this slot. */
+ __wt_free(session, bnd->addr.addr);
+ WT_CLEAR(bnd->addr);
+ bnd->size = 0;
+ bnd->cksum = 0;
+ __wt_free(session, bnd->dsk);
+
+ __wt_free(session, bnd->skip);
+ bnd->skip_next = 0;
+ bnd->skip_allocated = 0;
+
+ /* Ignore the key, we re-use that memory in each new reconciliation. */
+
+ bnd->already_compressed = 0;
+}
+
+/*
+ * __rec_split_bnd_grow --
+ * Grow the boundary array as necessary.
+ */
+static int
+__rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ /*
+ * Make sure there's enough room for another boundary. The calculation
+ * is +2, because when filling in the current boundary's information,
+ * we save the start point of the next boundary (for example, a record
+ * number or key), in the (current + 1) slot.
+ *
+ * For the same reason, we're always initializing one ahead.
+ */
+ WT_RET(__wt_realloc_def(
+ session, &r->bnd_allocated, r->bnd_next + 2, &r->bnd));
+ r->bnd_entries = r->bnd_allocated / sizeof(r->bnd[0]);
+
+ __rec_split_bnd_init(session, &r->bnd[r->bnd_next + 1]);
+
+ return (0);
+}
+
+/*
+ * __rec_split_init --
+ * Initialization for the reconciliation split functions.
+ */
+static int
+__rec_split_init(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, uint64_t recno, uint32_t max)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_PAGE_HEADER *dsk;
+ size_t corrected_page_size;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ /*
+ * The maximum leaf page size governs when an in-memory leaf page splits
+ * into multiple on-disk pages; however, salvage can't be allowed to
+ * split, there's no parent page yet. If we're doing salvage, override
+ * the caller's selection of a maximum page size, choosing a page size
+ * that ensures we won't split.
+ */
+ if (r->salvage != NULL)
+ max = __rec_leaf_page_max(session, r);
+
+ /*
+ * Set the page sizes. If we're doing the page layout, the maximum page
+ * size is the same as the page size. If the application is doing page
+ * layout (raw compression is configured), we accumulate some amount of
+ * additional data because we don't know how well it will compress, and
+ * we don't want to increment our way up to the amount of data needed by
+ * the application to successfully compress to the target page size.
+ */
+ r->page_size = r->page_size_max = max;
+ if (r->raw_compression)
+ r->page_size *= 10;
+
+ /*
+ * Ensure the disk image buffer is large enough for the max object, as
+ * corrected by the underlying block manager.
+ */
+ corrected_page_size = r->page_size;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_init(session, &r->dsk, corrected_page_size));
+
+ /*
+ * Clear the disk page's header and block-manager space, set the page
+ * type (the type doesn't change, and setting it later would require
+ * additional code in a few different places).
+ */
+ dsk = r->dsk.mem;
+ memset(dsk, 0, WT_PAGE_HEADER_BYTE_SIZE(btree));
+ dsk->type = page->type;
+
+ /*
+ * If we have to split, we want to choose a smaller page size for the
+ * split pages, because otherwise we could end up splitting one large
+ * packed page over and over. We don't want to pick the minimum size
+ * either, because that penalizes an application that did a bulk load
+ * and subsequently inserted a few items into packed pages. Currently
+ * defaulted to 75%, but I have no empirical evidence that's "correct".
+ *
+ * The maximum page size may be a multiple of the split page size (for
+ * example, there's a maximum page size of 128KB, but because the table
+ * is active and we don't want to split a lot, the split size is 20KB).
+ * The maximum page size may NOT be an exact multiple of the split page
+ * size.
+ *
+ * It's lots of work to build these pages and don't want to start over
+ * when we reach the maximum page size (it's painful to restart after
+ * creating overflow items and compacted data, for example, as those
+ * items have already been written to disk). So, the loop calls the
+ * helper functions when approaching a split boundary, and we save the
+ * information at that point. That allows us to go back and split the
+ * page at the boundary points if we eventually overflow the maximum
+ * page size.
+ *
+ * Finally, all this doesn't matter for fixed-size column-store pages,
+ * raw compression, and salvage. Fixed-size column store pages can
+ * split under (very) rare circumstances, but they're allocated at a
+ * fixed page size, never anything smaller. In raw compression, the
+ * underlying compression routine decides when we split, so it's not
+ * our problem. In salvage, as noted above, we can't split at all.
+ */
+ if (r->raw_compression || r->salvage != NULL) {
+ r->split_size = 0;
+ r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ }
+ else if (page->type == WT_PAGE_COL_FIX) {
+ r->split_size = r->page_size_max;
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ } else {
+ r->split_size = __wt_split_page_size(btree, r->page_size_max);
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ }
+ /* First byte available for data, past the page header. */
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+
+ /* Initialize the first boundary. */
+ r->bnd_next = 0;
+ WT_RET(__rec_split_bnd_grow(session, r));
+ __rec_split_bnd_init(session, &r->bnd[0]);
+ r->bnd[0].recno = recno;
+ r->bnd[0].start = WT_PAGE_HEADER_BYTE(btree, dsk);
+
+ /*
+ * If the maximum page size is the same as the split page size, either
+ * because of the object type or application configuration, there isn't
+ * any need to maintain split boundaries within a larger page.
+ *
+ * No configuration for salvage here, because salvage can't split.
+ */
+ if (r->raw_compression)
+ r->bnd_state = SPLIT_TRACKING_RAW;
+ else if (max == r->split_size)
+ r->bnd_state = SPLIT_TRACKING_OFF;
+ else
+ r->bnd_state = SPLIT_BOUNDARY;
+
+ /* Initialize the entry counters. */
+ r->entries = r->total_entries = 0;
+
+ /* Initialize the starting record number. */
+ r->recno = recno;
+
+ /* New page, compression off. */
+ r->key_pfx_compress = r->key_sfx_compress = 0;
+
+ return (0);
+}
+
+/*
+ * __rec_is_checkpoint --
+ * Return if we're writing a checkpoint.
+ */
+static int
+__rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd)
+{
+ /*
+ * Check to see if we're going to create a checkpoint.
+ *
+ * This function exists as a place to hang this comment.
+ *
+ * Any time we write the root page of the tree without splitting we are
+ * creating a checkpoint (and have to tell the underlying block manager
+ * so it creates and writes the additional information checkpoints
+ * require). However, checkpoints are completely consistent, and so we
+ * have to resolve information about the blocks we're expecting to free
+ * as part of the checkpoint, before writing the checkpoint. In short,
+ * we don't do checkpoint writes here; clear the boundary information as
+ * a reminder and create the checkpoint during wrapup.
+ */
+ if (bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) {
+ bnd->addr.addr = NULL;
+ bnd->addr.size = 0;
+ bnd->addr.type = 0;
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * __rec_split_row_promote_cell --
+ * Get a key from a cell for the purposes of promotion.
+ */
+static int
+__rec_split_row_promote_cell(
+ WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, WT_ITEM *key)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *kpack, _kpack;
+
+ btree = S2BT(session);
+ kpack = &_kpack;
+
+ /*
+ * The cell had better have a zero-length prefix and not be a copy cell;
+ * the first cell on a page cannot refer an earlier cell on the page.
+ */
+ cell = WT_PAGE_HEADER_BYTE(btree, dsk);
+ __wt_cell_unpack(cell, kpack);
+ WT_ASSERT(session,
+ kpack->prefix == 0 && kpack->raw != WT_CELL_VALUE_COPY);
+
+ WT_RET(__wt_cell_data_copy(session, dsk->type, kpack, key));
+ return (0);
+}
+
+/*
+ * __rec_split_row_promote --
+ * Key promotion for a row-store.
+ */
+static int
+__rec_split_row_promote(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ITEM *key, uint8_t type)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(update);
+ WT_DECL_RET;
+ WT_ITEM *max;
+ WT_UPD_SKIPPED *skip;
+ size_t cnt, len, size;
+ uint32_t i;
+ const uint8_t *pa, *pb;
+ int cmp;
+
+ /*
+ * For a column-store, the promoted key is the recno and we already have
+ * a copy. For a row-store, it's the first key on the page, a variable-
+ * length byte string, get a copy.
+ *
+ * This function is called from the split code at each split boundary,
+ * but that means we're not called before the first boundary, and we
+ * will eventually have to get the first key explicitly when splitting
+ * a page.
+ *
+ * For the current slot, take the last key we built, after doing suffix
+ * compression. The "last key we built" describes some process: before
+ * calling the split code, we must place the last key on the page before
+ * the boundary into the "last" key structure, and the first key on the
+ * page after the boundary into the "current" key structure, we're going
+ * to compare them for suffix compression.
+ *
+ * Suffix compression is a hack to shorten keys on internal pages. We
+ * only need enough bytes in the promoted key to ensure searches go to
+ * the correct page: the promoted key has to be larger than the last key
+ * on the leaf page preceding it, but we don't need any more bytes than
+ * that. In other words, we can discard any suffix bytes not required
+ * to distinguish between the key being promoted and the last key on the
+ * leaf page preceding it. This can only be done for the first level of
+ * internal pages, you cannot repeat suffix truncation as you split up
+ * the tree, it loses too much information.
+ *
+ * Note #1: if the last key on the previous page was an overflow key,
+ * we don't have the in-memory key against which to compare, and don't
+ * try to do suffix compression. The code for that case turns suffix
+ * compression off for the next key, we don't have to deal with it here.
+ */
+ if (type != WT_PAGE_ROW_LEAF || !r->key_sfx_compress)
+ return (__wt_buf_set(session, key, r->cur->data, r->cur->size));
+
+ btree = S2BT(session);
+ WT_RET(__wt_scr_alloc(session, 0, &update));
+
+ /*
+ * Note #2: if we skipped updates, an update key may be larger than the
+ * last key stored in the previous block (probable for append-centric
+ * workloads). If there are skipped updates, check for one larger than
+ * the last key and smaller than the current key.
+ */
+ /* By default, compare against the last key on the previous block. */
+ max = r->last;
+ for (i = r->skip_next; i > 0; --i) {
+ skip = &r->skip[i - 1];
+ if (skip->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, r->page, skip->rip, update, 0));
+ else {
+ update->data = WT_INSERT_KEY(skip->ins);
+ update->size = WT_INSERT_KEY_SIZE(skip->ins);
+ }
+
+ /* Compare against the current key, it must be less. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->cur, &cmp));
+ if (cmp >= 0)
+ continue;
+
+ /* Compare against the last key, it must be greater. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->last, &cmp));
+ if (cmp >= 0)
+ max = update;
+
+ /*
+ * The skipped updates are in key-sort order so the entry we're
+ * looking for is either the last one or the next-to-last one
+ * in the list. Once we've compared an entry against the last
+ * key on the page, we're done.
+ */
+ break;
+ }
+
+ /*
+ * The largest key on the last block must sort before the current key,
+ * so we'll either find a larger byte value in the current key, or the
+ * current key will be a longer key, and the interesting byte is one
+ * past the length of the shorter key.
+ */
+ pa = max->data;
+ pb = r->cur->data;
+ len = WT_MIN(max->size, r->cur->size);
+ size = len + 1;
+ /* Walk the common prefix; keep bytes through the first difference. */
+ for (cnt = 1; len > 0; ++cnt, --len, ++pa, ++pb)
+ if (*pa != *pb) {
+ if (size != cnt) {
+ WT_STAT_FAST_DATA_INCRV(session,
+ rec_suffix_compression, size - cnt);
+ size = cnt;
+ }
+ break;
+ }
+ ret = __wt_buf_set(session, key, r->cur->data, size);
+
+err: __wt_scr_free(&update);
+ return (ret);
+}
+
+/*
+ * __rec_split --
+ * Handle the page reconciliation bookkeeping. (Did you know "bookkeeper"
+ * has 3 doubled letters in a row? Sweet-tooth does, too.)
+ */
+static int
+__rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BTREE *btree;
+ WT_BOUNDARY *last, *next;
+ WT_PAGE_HEADER *dsk;
+ uint32_t len;
+
+ /*
+ * We should never split during salvage, and we're about to drop core
+ * because there's no parent page.
+ */
+ if (r->salvage != NULL)
+ WT_PANIC_RET(session, WT_PANIC,
+ "%s page too large, attempted split during salvage",
+ __wt_page_type_string(r->page->type));
+
+ /*
+ * Handle page-buffer size tracking; we have to do this work in every
+ * reconciliation loop, and I don't want to repeat the code that many
+ * times.
+ */
+ btree = S2BT(session);
+ dsk = r->dsk.mem;
+
+ /* Hitting a page boundary resets the dictionary, in all cases. */
+ __rec_dictionary_reset(r);
+
+ /*
+ * There are 3 cases we have to handle.
+ *
+ * #1
+ * About to cross a split boundary: save current boundary information
+ * and return.
+ *
+ * #2
+ * About to cross the maximum boundary: use saved boundary information
+ * to write all of the split pages.
+ *
+ * #3
+ * About to cross a split boundary, but we've either already done the
+ * split thing when we approached the maximum boundary, in which
+ * case we write the page and keep going, or we were never tracking
+ * split boundaries at all.
+ *
+ * Cases #1 and #2 are the hard ones: we're called when we're about to
+ * cross each split boundary, and we save information away so we can
+ * split if we have to. We're also called when we're about to cross
+ * the maximum page boundary: in that case, we do the actual split and
+ * clean up all the previous boundaries, then keep going.
+ */
+ switch (r->bnd_state) {
+ case SPLIT_BOUNDARY: /* Case #1 */
+ /*
+ * Save the information about where we are when the split would
+ * have happened.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next++];
+ next = last + 1;
+
+ /* Set the number of entries for the just finished chunk. */
+ last->entries = r->entries - r->total_entries;
+ r->total_entries = r->entries;
+
+ /* Set the key for the next chunk. */
+ next->recno = r->recno;
+ if (dsk->type == WT_PAGE_ROW_INT ||
+ dsk->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__rec_split_row_promote(
+ session, r, &next->key, dsk->type));
+
+ /*
+ * Set the starting buffer address and clear the entries (the
+ * latter not required, but cleaner).
+ */
+ next->start = r->first_free;
+ next->entries = 0;
+
+ /*
+ * Set the space available to another split-size chunk, if we
+ * have one. If we don't have room for another split chunk,
+ * add whatever space remains in the maximum page size, and
+ * hope it's enough.
+ */
+ /* Bytes consumed so far in the page image. */
+ len = WT_PTRDIFF32(r->first_free, dsk);
+ if (len + r->split_size <= r->page_size)
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ else {
+ r->bnd_state = SPLIT_MAX;
+ r->space_avail = r->page_size -
+ (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+ }
+ break;
+ case SPLIT_MAX: /* Case #2 */
+ /*
+ * It didn't all fit into a single page.
+ *
+ * Cycle through the saved split-point information, writing the
+ * split chunks we have tracked.
+ */
+ WT_RET(__rec_split_fixup(session, r));
+
+ /* We're done saving split chunks. */
+ r->bnd_state = SPLIT_TRACKING_OFF;
+ break;
+ case SPLIT_TRACKING_OFF: /* Case #3 */
+ /*
+ * It didn't all fit, but either we've already noticed it and
+ * are now processing the rest of the page at the split-size
+ * boundaries, or the split size was the same as the page size,
+ * so we never bothered with saving split-point information.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next++];
+ next = last + 1;
+
+ /*
+ * Set the key for the next chunk (before writing the block, a
+ * key range is needed in that code).
+ */
+ next->recno = r->recno;
+ if (dsk->type == WT_PAGE_ROW_INT ||
+ dsk->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__rec_split_row_promote(
+ session, r, &next->key, dsk->type));
+
+ /* Clear the entries (not required, but cleaner). */
+ next->entries = 0;
+
+ /* Finalize the header information and write the page. */
+ dsk->recno = last->recno;
+ dsk->u.entries = r->entries;
+ dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+ WT_RET(__rec_split_write(session, r, last, &r->dsk, 0));
+
+ /*
+ * Set the caller's entry count and buffer information for the
+ * next chunk. We only get here if we're not splitting or have
+ * already split, so it's split-size chunks from here on out.
+ */
+ r->entries = 0;
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ break;
+ case SPLIT_TRACKING_RAW:
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * __rec_split_raw_worker --
+ * Handle the raw compression page reconciliation bookkeeping.
+ */
+static int
+__rec_split_raw_worker(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, int no_more_rows)
+{
+ WT_BM *bm;
+ WT_BOUNDARY *last, *next;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_COMPRESSOR *compressor;
+ WT_DECL_RET;
+ WT_ITEM *dst, *write_ref;
+ WT_PAGE_HEADER *dsk, *dsk_dst;
+ WT_SESSION *wt_session;
+ size_t corrected_page_size, len, result_len;
+ uint64_t recno;
+ uint32_t entry, i, result_slots, slots;
+ int last_block;
+ uint8_t *dsk_start;
+
+ wt_session = (WT_SESSION *)session;
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ unpack = &_unpack;
+ compressor = btree->compressor;
+ dst = &r->raw_destination;
+ dsk = r->dsk.mem;
+
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next];
+ next = last + 1;
+
+ /*
+ * Build arrays of offsets and cumulative counts of cells and rows in
+ * the page: the offset is the byte offset to the possible split-point
+ * (adjusted for an initial chunk that cannot be compressed), entries
+ * is the cumulative page entries covered by the byte offset, recnos is
+ * the cumulative rows covered by the byte offset.
+ */
+ if (r->entries >= r->raw_max_slots) {
+ __wt_free(session, r->raw_entries);
+ __wt_free(session, r->raw_offsets);
+ __wt_free(session, r->raw_recnos);
+ r->raw_max_slots = 0;
+
+ /* Over-allocate to reduce reallocation on subsequent calls. */
+ i = r->entries + 100;
+ WT_RET(__wt_calloc_def(session, i, &r->raw_entries));
+ WT_RET(__wt_calloc_def(session, i, &r->raw_offsets));
+ if (dsk->type == WT_PAGE_COL_INT ||
+ dsk->type == WT_PAGE_COL_VAR)
+ WT_RET(__wt_calloc_def(session, i, &r->raw_recnos));
+ r->raw_max_slots = i;
+ }
+
+ /*
+ * We're going to walk the disk image, which requires setting the
+ * number of entries.
+ */
+ dsk->u.entries = r->entries;
+
+ /*
+ * We track the record number at each column-store split point, set an
+ * initial value.
+ */
+ recno = 0;
+ if (dsk->type == WT_PAGE_COL_VAR)
+ recno = last->recno;
+
+ entry = slots = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++entry;
+
+ /*
+ * Row-store pages can split at keys, but not at values,
+ * column-store pages can split at values.
+ */
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_SHORT:
+ break;
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_DEL:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_SHORT:
+ if (dsk->type == WT_PAGE_COL_INT) {
+ recno = unpack->v;
+ break;
+ }
+ if (dsk->type == WT_PAGE_COL_VAR) {
+ recno += __wt_cell_rle(unpack);
+ break;
+ }
+ /* Row-store value: no split here, record the count. */
+ r->raw_entries[slots] = entry;
+ continue;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * We can't compress the first 64B of the block (it must be
+ * written without compression), and a possible split point
+ * may appear in that 64B; keep it simple, ignore the first
+ * allocation size of data, anybody splitting smaller than
+ * that (as calculated before compression), is doing it wrong.
+ */
+ if ((len = WT_PTRDIFF(cell, dsk)) > btree->allocsize)
+ r->raw_offsets[++slots] =
+ WT_STORE_SIZE(len - WT_BLOCK_COMPRESS_SKIP);
+
+ if (dsk->type == WT_PAGE_COL_INT ||
+ dsk->type == WT_PAGE_COL_VAR)
+ r->raw_recnos[slots] = recno;
+ r->raw_entries[slots] = entry;
+ }
+
+ /*
+ * If we haven't managed to find at least one split point, we're done,
+ * don't bother calling the underlying compression function.
+ */
+ if (slots == 0) {
+ result_len = 0;
+ result_slots = 0;
+ goto no_slots;
+ }
+
+ /* The slot at array's end is the total length of the data. */
+ r->raw_offsets[++slots] =
+ WT_STORE_SIZE(WT_PTRDIFF(cell, dsk) - WT_BLOCK_COMPRESS_SKIP);
+
+ /*
+ * Allocate a destination buffer. If there's a pre-size function, use
+ * it to determine the destination buffer's minimum size, otherwise the
+ * destination buffer is documented to be at least the maximum object
+ * size.
+ *
+ * The destination buffer really only needs to be large enough for the
+ * target block size, corrected for the requirements of the underlying
+ * block manager. If the target block size is 8KB, that's a multiple
+ * of 512B and so the underlying block manager is fine with it. But...
+ * we don't control what the pre_size method returns us as a required
+ * size, and we don't want to document the compress_raw method has to
+ * skip bytes in the buffer because that's confusing, so do something
+ * more complicated. First, find out how much space the compress_raw
+ * function might need, either the value returned from pre_size, or the
+ * maximum object size. Add the compress-skip bytes, and then correct
+ * that value for the underlying block manager. As a result, we have
+ * a destination buffer that's the right "object" size when calling the
+ * compress_raw method, and there are bytes in the header just for us.
+ */
+ if (compressor->pre_size == NULL)
+ result_len = r->page_size_max;
+ else
+ WT_RET(compressor->pre_size(compressor, wt_session,
+ (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ (size_t)r->raw_offsets[slots], &result_len));
+ corrected_page_size = result_len + WT_BLOCK_COMPRESS_SKIP;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_init(session, dst, corrected_page_size));
+
+ /*
+ * Copy the header bytes into the destination buffer, then call the
+ * compression function.
+ */
+ memcpy(dst->mem, dsk, WT_BLOCK_COMPRESS_SKIP);
+ ret = compressor->compress_raw(compressor, wt_session,
+ r->page_size_max, btree->split_pct,
+ WT_BLOCK_COMPRESS_SKIP, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ r->raw_offsets, slots,
+ (uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
+ result_len, no_more_rows, &result_len, &result_slots);
+ switch (ret) {
+ case EAGAIN:
+ /*
+ * The compression function wants more rows; accumulate and
+ * retry.
+ *
+ * Reset the resulting slots count, just in case the compression
+ * function modified it before giving up.
+ */
+ result_slots = 0;
+ break;
+ case 0:
+ /*
+ * If the compression function returned zero result slots, it's
+ * giving up and we write the original data. (This is a pretty
+ * bad result: we've not done compression on a block much larger
+ * than the maximum page size, but once compression gives up,
+ * there's not much else we can do.)
+ *
+ * If the compression function returned non-zero result slots,
+ * we were successful and have a block to write.
+ */
+ if (result_slots == 0) {
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail);
+
+ /*
+ * If there are no more rows, we can write the original
+ * data from the original buffer.
+ */
+ if (no_more_rows)
+ break;
+
+ /*
+ * Copy the original data to the destination buffer, as
+ * if the compression function simply copied it. Take
+ * all but the last row of the original data (the last
+ * row has to be set as the key for the next block).
+ */
+ result_slots = slots - 1;
+ result_len = r->raw_offsets[result_slots];
+ WT_RET(__wt_buf_grow(
+ session, dst, result_len + WT_BLOCK_COMPRESS_SKIP));
+ memcpy((uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ result_len);
+
+ /*
+ * Mark it as uncompressed so the standard compression
+ * function is called before the buffer is written.
+ */
+ last->already_compressed = 0;
+ } else {
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_ok);
+
+ /*
+ * If there are more rows and the compression function
+ * consumed all of the current data, there are problems:
+ * First, with row-store objects, we're potentially
+ * skipping updates, we must have a key for the next
+ * block so we know with what block a skipped update is
+ * associated. Second, if the compression function
+ * compressed all of the data, we're not pushing it
+ * hard enough (unless we got lucky and gave it exactly
+ * the right amount to work with, which is unlikely).
+ * Handle both problems by accumulating more data any
+ * time we're not writing the last block and compression
+ * ate all of the rows.
+ */
+ if (result_slots == slots && !no_more_rows)
+ result_slots = 0;
+ else
+ last->already_compressed = 1;
+ }
+ break;
+ default:
+ return (ret);
+ }
+
+no_slots:
+ /*
+ * Check for the last block we're going to write: if no more rows and
+ * we failed to compress anything, or we compressed everything, it's
+ * the last block.
+ */
+ last_block = no_more_rows &&
+ (result_slots == 0 || result_slots == slots);
+
+ if (result_slots != 0) {
+ /*
+ * We have a block, finalize the header information.
+ */
+ dst->size = result_len + WT_BLOCK_COMPRESS_SKIP;
+ dsk_dst = dst->mem;
+ dsk_dst->recno = last->recno;
+ dsk_dst->mem_size =
+ r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP;
+ dsk_dst->u.entries = r->raw_entries[result_slots - 1];
+
+ /*
+ * There is likely a remnant in the working buffer that didn't
+ * get compressed; copy it down to the start of the buffer and
+ * update the starting record number, free space and so on.
+ * !!!
+ * Note use of memmove, the source and destination buffers can
+ * overlap.
+ */
+ len = WT_PTRDIFF(r->first_free, (uint8_t *)dsk +
+ r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP);
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ (void)memmove(dsk_start, (uint8_t *)r->first_free - len, len);
+
+ r->entries -= r->raw_entries[result_slots - 1];
+ r->first_free = dsk_start + len;
+ r->space_avail =
+ r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+
+ /*
+ * Set the key for the next block (before writing the block, a
+ * key range is needed in that code).
+ */
+ switch (dsk->type) {
+ case WT_PAGE_COL_INT:
+ next->recno = r->raw_recnos[result_slots];
+ break;
+ case WT_PAGE_COL_VAR:
+ next->recno = r->raw_recnos[result_slots - 1];
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ next->recno = 0;
+ if (!last_block) {
+ /*
+ * Confirm there was uncompressed data remaining
+ * in the buffer, we're about to read it for the
+ * next chunk's initial key.
+ */
+ WT_ASSERT(session, len > 0);
+ WT_RET(__rec_split_row_promote_cell(
+ session, dsk, &next->key));
+ }
+ break;
+ }
+ /* The compressed destination buffer is what gets written. */
+ write_ref = dst;
+ } else if (no_more_rows) {
+ /*
+ * Compression failed and there are no more rows to accumulate,
+ * write the original buffer instead.
+ */
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail);
+
+ dsk->recno = last->recno;
+ dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+ dsk->u.entries = r->entries;
+
+ r->entries = 0;
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+ r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+
+ write_ref = &r->dsk;
+ last->already_compressed = 0;
+ } else {
+ /*
+ * Compression failed, there are more rows to accumulate and the
+ * compression function wants to try again; increase the size of
+ * the "page" and try again after we accumulate some more rows.
+ */
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail_temporary);
+
+ len = WT_PTRDIFF(r->first_free, r->dsk.mem);
+ corrected_page_size = r->page_size * 2;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_grow(session, &r->dsk, corrected_page_size));
+ r->page_size *= 2;
+ r->first_free = (uint8_t *)r->dsk.mem + len;
+ r->space_avail =
+ r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+ return (0);
+ }
+
+ /* We have a block, update the boundary counter. */
+ ++r->bnd_next;
+
+ /*
+ * If we are writing the whole page in our first/only attempt, it might
+ * be a checkpoint (checkpoints are only a single page, by definition).
+ * Further, checkpoints aren't written here, the wrapup functions do the
+ * write, and they do the write from the original buffer location. If
+ * it's a checkpoint and the block isn't in the right buffer, copy it.
+ *
+ * If it's not a checkpoint, write the block.
+ */
+ if (r->bnd_next == 1 && last_block && __rec_is_checkpoint(r, last)) {
+ if (write_ref == dst)
+ WT_RET(__wt_buf_set(
+ session, &r->dsk, dst->mem, dst->size));
+ } else
+ WT_RET(
+ __rec_split_write(session, r, last, write_ref, last_block));
+ return (0);
+}
+
+/*
+ * __rec_raw_decompress --
+ * Decompress a raw-compressed image.
+ */
+static int
+__rec_raw_decompress(
+ WT_SESSION_IMPL *session, const void *image, size_t size, void *retp)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE_HEADER const *dsk;
+ size_t result_len;
+
+ btree = S2BT(session);
+ dsk = image;
+
+ /*
+ * We skipped an update and we can't write a block, but unfortunately,
+ * the block has already been compressed. Decompress the block so we
+ * can subsequently re-instantiate it in memory.
+ */ + WT_RET(__wt_scr_alloc(session, dsk->mem_size, &tmp)); + memcpy(tmp->mem, image, WT_BLOCK_COMPRESS_SKIP); + WT_ERR(btree->compressor->decompress(btree->compressor, + &session->iface, + (uint8_t *)image + WT_BLOCK_COMPRESS_SKIP, + size - WT_BLOCK_COMPRESS_SKIP, + (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP, + dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, + &result_len)); + if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) + WT_ERR(__wt_illegal_value(session, btree->dhandle->name)); + + WT_ERR(__wt_strndup(session, tmp->data, dsk->mem_size, retp)); + WT_ASSERT(session, __wt_verify_dsk_image( + session, "[raw evict split]", tmp->data, dsk->mem_size) == 0); + +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * __rec_split_raw -- + * Raw compression split routine. + */ +static inline int +__rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + return (__rec_split_raw_worker(session, r, 0)); +} + +/* + * __rec_split_finish_std -- + * Finish processing a page, standard version. + */ +static int +__rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_BOUNDARY *bnd; + WT_PAGE_HEADER *dsk; + + /* Adjust the boundary information based on our split status. */ + switch (r->bnd_state) { + case SPLIT_BOUNDARY: + case SPLIT_MAX: + /* + * We never split, the reconciled page fit into a maximum page + * size. Change the first boundary slot to represent the full + * page (the first boundary slot is largely correct, just update + * the number of entries). + */ + r->bnd_next = 0; + break; + case SPLIT_TRACKING_OFF: + /* + * If we have already split, or aren't tracking boundaries, put + * the remaining data in the next boundary slot. + */ + WT_RET(__rec_split_bnd_grow(session, r)); + break; + case SPLIT_TRACKING_RAW: + /* + * We were configured for raw compression, but never actually + * wrote anything. 
+ */ + break; + WT_ILLEGAL_VALUE(session); + } + + /* + * We only arrive here with no entries to write if the page was entirely + * empty, and if the page is empty, we merge it into its parent during + * the parent's reconciliation. A page with skipped updates isn't truly + * empty, continue on. + */ + if (r->entries == 0 && r->skip_next == 0) + return (0); + + /* Set the boundary reference and increment the count. */ + bnd = &r->bnd[r->bnd_next++]; + bnd->entries = r->entries; + + /* Finalize the header information. */ + dsk = r->dsk.mem; + dsk->recno = bnd->recno; + dsk->u.entries = r->entries; + dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk); + + /* If this is a checkpoint, we're done, otherwise write the page. */ + return ( + __rec_is_checkpoint(r, bnd) ? 0 : + __rec_split_write(session, r, bnd, &r->dsk, 1)); +} + +/* + * __rec_split_finish -- + * Finish processing a page. + */ +static int +__rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + /* We're done reconciling - write the final page */ + if (r->raw_compression && r->entries != 0) { + while (r->entries != 0) + WT_RET(__rec_split_raw_worker(session, r, 1)); + } else + WT_RET(__rec_split_finish_std(session, r)); + + return (0); +} + +/* + * __rec_split_fixup -- + * Fix up after crossing the maximum page boundary. + */ +static int +__rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_BOUNDARY *bnd; + WT_BTREE *btree; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_PAGE_HEADER *dsk; + uint32_t i, len; + uint8_t *dsk_start; + + /* + * When we overflow physical limits of the page, we walk the list of + * split chunks we've created and write those pages out, then update + * the caller's information. + */ + btree = S2BT(session); + + /* + * The data isn't laid out on a page boundary or nul padded; copy it to + * a clean, aligned, padded buffer before writing it. + * + * Allocate a scratch buffer to hold the new disk image. 
Copy the + * WT_PAGE_HEADER header onto the scratch buffer, most of the header + * information remains unchanged between the pages. + */ + WT_RET(__wt_scr_alloc(session, r->page_size_max, &tmp)); + dsk = tmp->mem; + memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE); + + /* + * For each split chunk we've created, update the disk image and copy + * it into place. + */ + dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); + for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) { + /* Copy the page contents to the temporary buffer. */ + len = WT_PTRDIFF32((bnd + 1)->start, bnd->start); + memcpy(dsk_start, bnd->start, len); + + /* Finalize the header information and write the page. */ + dsk->recno = bnd->recno; + dsk->u.entries = bnd->entries; + dsk->mem_size = + tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len; + WT_ERR(__rec_split_write(session, r, bnd, tmp, 0)); + } + + /* + * There is probably a remnant in the working buffer that didn't get + * written; copy it down to the beginning of the working buffer, and + * update the starting record number. + * + * Confirm the remnant is no larger than the available split buffer. + * + * Fix up our caller's information. + */ + len = WT_PTRDIFF32(r->first_free, bnd->start); + if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) + WT_PANIC_ERR(session, EINVAL, + "Reconciliation remnant too large for the split buffer"); + + dsk = r->dsk.mem; + dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); + (void)memmove(dsk_start, bnd->start, len); + + r->entries -= r->total_entries; + r->first_free = dsk_start + len; + r->space_avail = + (r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) - len; + +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * __rec_split_write -- + * Write a disk block out for the split helper functions. 
+ */ +static int +__rec_split_write(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_BOUNDARY *bnd, WT_ITEM *buf, int last_block) +{ + WT_BTREE *btree; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_MULTI *multi; + WT_PAGE *page; + WT_PAGE_HEADER *dsk; + WT_PAGE_MODIFY *mod; + WT_UPD_SKIPPED *skip; + size_t addr_size; + uint32_t bnd_slot, i, j; + int cmp; + uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE]; + + btree = S2BT(session); + dsk = buf->mem; + page = r->page; + mod = page->modify; + + WT_RET(__wt_scr_alloc(session, 0, &key)); + + /* Set the zero-length value flag in the page header. */ + if (dsk->type == WT_PAGE_ROW_LEAF) { + F_CLR(dsk, WT_PAGE_EMPTY_V_ALL | WT_PAGE_EMPTY_V_NONE); + + if (r->entries != 0 && r->all_empty_value) + F_SET(dsk, WT_PAGE_EMPTY_V_ALL); + if (r->entries != 0 && !r->any_empty_value) + F_SET(dsk, WT_PAGE_EMPTY_V_NONE); + } + + /* Initialize the address (set the page type for the parent). */ + switch (dsk->type) { + case WT_PAGE_COL_FIX: + bnd->addr.type = WT_ADDR_LEAF_NO; + break; + case WT_PAGE_COL_VAR: + case WT_PAGE_ROW_LEAF: + bnd->addr.type = r->ovfl_items ? WT_ADDR_LEAF : WT_ADDR_LEAF_NO; + break; + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + bnd->addr.type = WT_ADDR_INT; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + bnd->size = (uint32_t)buf->size; + bnd->cksum = 0; + + /* + * Check if we've skipped updates that belong to this block, and move + * any to the per-block structure. Quit as soon as we find a skipped + * update that doesn't belong to the block, they're in sorted order. + * + * This code requires a key be filled in for the next block (or the + * last block flag be set, if there's no next block). + */ + for (i = 0, skip = r->skip; i < r->skip_next; ++i, ++skip) { + /* The last block gets all remaining skipped updates. */ + if (last_block) { + WT_ERR(__rec_skip_update_move(session, bnd, skip)); + continue; + } + + /* + * Get the skipped update's key and compare it with this block's + * key range. 
If the skipped update list belongs with the block + * we're about to write, move it to the per-block memory. Check + * only to the first update that doesn't go with the block, they + * must be in sorted order. + */ + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + if (WT_INSERT_RECNO(skip->ins) >= (bnd + 1)->recno) + goto skip_check_complete; + break; + case WT_PAGE_ROW_LEAF: + if (skip->ins == NULL) + WT_ERR(__wt_row_leaf_key( + session, page, skip->rip, key, 0)); + else { + key->data = WT_INSERT_KEY(skip->ins); + key->size = WT_INSERT_KEY_SIZE(skip->ins); + } + WT_ERR(__wt_compare(session, + btree->collator, key, &(bnd + 1)->key, &cmp)); + if (cmp >= 0) + goto skip_check_complete; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + WT_ERR(__rec_skip_update_move(session, bnd, skip)); + } + +skip_check_complete: + /* + * If there are updates that weren't moved to the block, shuffle them to + * the beginning of the cached list (we maintain the skipped updates in + * sorted order, new skipped updates must be appended to the list). + */ + for (j = 0; i < r->skip_next; ++j, ++i) + r->skip[j] = r->skip[i]; + r->skip_next = j; + + /* + * If we had to skip updates in order to build this disk image, we can't + * actually write it. Instead, we will re-instantiate the page using the + * disk image and the list of updates we skipped. + * + * If the buffer is compressed (raw compression was configured), we have + * to decompress it so we can instantiate it later. + */ + if (bnd->skip != NULL) { + if (bnd->already_compressed) + WT_ERR(__rec_raw_decompress( + session, buf->data, buf->size, &bnd->dsk)); + else { + WT_ERR(__wt_strndup( + session, buf->data, buf->size, &bnd->dsk)); + WT_ASSERT(session, __wt_verify_dsk_image(session, + "[evict split]", buf->data, buf->size) == 0); + } + goto done; + } + + /* + * If we wrote this block before, re-use it. Pages get written in the + * same block order every time, only check the appropriate slot. 
The + * expensive part of this test is the checksum, only do that work when + * there has been or will be a reconciliation of this page involving + * split pages. This test isn't perfect: we're doing a checksum if a + * previous reconciliation of the page split or if we will split this + * time, but that test won't calculate a checksum on the first block + * the first time the page splits. + */ + bnd_slot = (uint32_t)(bnd - r->bnd); + if (bnd_slot > 1 || + (F_ISSET(mod, WT_PM_REC_MULTIBLOCK) && mod->mod_multi != NULL)) { + /* + * There are page header fields which need to be cleared to get + * consistent checksums: specifically, the write generation and + * the memory owned by the block manager. We are reusing the + * same buffer space each time, clear it before calculating the + * checksum. + */ + dsk->write_gen = 0; + memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header); + bnd->cksum = __wt_cksum(buf->data, buf->size); + + if (F_ISSET(mod, WT_PM_REC_MULTIBLOCK) && + mod->mod_multi_entries > bnd_slot) { + multi = &mod->mod_multi[bnd_slot]; + if (multi->size == bnd->size && + multi->cksum == bnd->cksum) { + multi->addr.reuse = 1; + bnd->addr = multi->addr; + + WT_STAT_FAST_DATA_INCR(session, rec_page_match); + goto done; + } + } + } + + WT_ERR(__wt_bt_write(session, + buf, addr, &addr_size, 0, bnd->already_compressed)); + WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr)); + bnd->addr.size = (uint8_t)addr_size; + +done: +err: __wt_scr_free(&key); + return (ret); +} + +/* + * __wt_bulk_init -- + * Bulk insert initialization. + */ +int +__wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_PAGE_INDEX *pindex; + WT_RECONCILE *r; + uint64_t recno; + + btree = S2BT(session); + /* + * Bulk-load is only permitted on newly created files, not any empty + * file -- see the checkpoint code for a discussion. 
+ */ + if (!btree->bulk_load_ok) + WT_RET_MSG(session, EINVAL, + "bulk-load is only possible for newly created trees"); + + /* Set a reference to the empty leaf page. */ + pindex = WT_INTL_INDEX_COPY(btree->root.page); + cbulk->ref = pindex->index[0]; + cbulk->leaf = cbulk->ref->page; + + WT_RET( + __rec_write_init(session, cbulk->ref, 0, NULL, &cbulk->reconcile)); + r = cbulk->reconcile; + r->is_bulk_load = 1; + + switch (btree->type) { + case BTREE_COL_FIX: + case BTREE_COL_VAR: + recno = 1; + break; + case BTREE_ROW: + recno = 0; + break; + WT_ILLEGAL_VALUE(session); + } + + return (__rec_split_init( + session, r, cbulk->leaf, recno, btree->maxleafpage)); +} + +/* + * __wt_bulk_wrapup -- + * Bulk insert cleanup. + */ +int +__wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_PAGE *parent; + WT_RECONCILE *r; + + r = cbulk->reconcile; + btree = S2BT(session); + + switch (btree->type) { + case BTREE_COL_FIX: + if (cbulk->entry != 0) + __rec_incr(session, r, cbulk->entry, + __bitstr_size( + (size_t)cbulk->entry * btree->bitcnt)); + break; + case BTREE_COL_VAR: + if (cbulk->rle != 0) + WT_RET(__wt_bulk_insert_var(session, cbulk)); + break; + case BTREE_ROW: + break; + WT_ILLEGAL_VALUE(session); + } + + WT_RET(__rec_split_finish(session, r)); + WT_RET(__rec_write_wrapup(session, r, r->page)); + + /* Mark the page's parent dirty. */ + parent = r->ref->home; + WT_RET(__wt_page_modify_init(session, parent)); + __wt_page_modify_set(session, parent); + + __rec_destroy(session, &cbulk->reconcile); + + return (0); +} + +/* + * __wt_bulk_insert_row -- + * Row-store bulk insert. 
+ */ +int +__wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_CURSOR *cursor; + WT_KV *key, *val; + WT_RECONCILE *r; + int ovfl_key; + + r = cbulk->reconcile; + btree = S2BT(session); + cursor = &cbulk->cbt.iface; + + key = &r->k; + val = &r->v; + WT_RET(__rec_cell_build_leaf_key(session, r, /* Build key cell */ + cursor->key.data, cursor->key.size, &ovfl_key)); + WT_RET(__rec_cell_build_val(session, r, /* Build value cell */ + cursor->value.data, cursor->value.size, (uint64_t)0)); + + /* Boundary: split or write the page. */ + while (key->len + val->len > r->space_avail) + if (r->raw_compression) + WT_RET(__rec_split_raw(session, r)); + else { + WT_RET(__rec_split(session, r)); + + /* + * Turn off prefix compression until a full key written + * to the new page, and (unless we're already working + * with an overflow key), rebuild the key without prefix + * compression. + */ + if (r->key_pfx_compress_conf) { + r->key_pfx_compress = 0; + if (!ovfl_key) + WT_RET(__rec_cell_build_leaf_key( + session, r, NULL, 0, &ovfl_key)); + } + } + + /* Copy the key/value pair onto the page. */ + __rec_copy_incr(session, r, key); + if (val->len == 0) + r->any_empty_value = 1; + else { + r->all_empty_value = 0; + if (btree->dictionary) + WT_RET(__rec_dict_replace(session, r, 0, val)); + __rec_copy_incr(session, r, val); + } + + /* Update compression state. */ + __rec_key_state_update(r, ovfl_key); + + return (0); +} + +/* + * __rec_col_fix_bulk_insert_split_check -- + * Check if a bulk-loaded fixed-length column store page needs to split. + */ +static inline int +__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_RECONCILE *r; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session; + r = cbulk->reconcile; + btree = S2BT(session); + + if (cbulk->entry == cbulk->nrecs) { + if (cbulk->entry != 0) { + /* + * If everything didn't fit, update the counters and + * split. 
+ * + * Boundary: split or write the page. + */ + __rec_incr(session, r, cbulk->entry, + __bitstr_size( + (size_t)cbulk->entry * btree->bitcnt)); + WT_RET(__rec_split(session, r)); + } + cbulk->entry = 0; + cbulk->nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail); + } + return (0); +} + +/* + * __wt_bulk_insert_fix -- + * Fixed-length column-store bulk insert. + */ +int +__wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_CURSOR *cursor; + WT_RECONCILE *r; + uint32_t entries, offset, page_entries, page_size; + const uint8_t *data; + + r = cbulk->reconcile; + btree = S2BT(session); + cursor = &cbulk->cbt.iface; + + if (cbulk->bitmap) { + if (((r->recno - 1) * btree->bitcnt) & 0x7) + WT_RET_MSG(session, EINVAL, + "Bulk bitmap load not aligned on a byte boundary"); + for (data = cursor->value.data, + entries = (uint32_t)cursor->value.size; + entries > 0; + entries -= page_entries, data += page_size) { + WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); + + page_entries = + WT_MIN(entries, cbulk->nrecs - cbulk->entry); + page_size = __bitstr_size(page_entries * btree->bitcnt); + offset = __bitstr_size(cbulk->entry * btree->bitcnt); + memcpy(r->first_free + offset, data, page_size); + cbulk->entry += page_entries; + r->recno += page_entries; + } + return (0); + } + + WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); + + __bit_setv(r->first_free, + cbulk->entry, btree->bitcnt, ((uint8_t *)cursor->value.data)[0]); + ++cbulk->entry; + ++r->recno; + + return (0); +} + +/* + * __wt_bulk_insert_var -- + * Variable-length column-store bulk insert. + */ +int +__wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_KV *val; + WT_RECONCILE *r; + + r = cbulk->reconcile; + btree = S2BT(session); + + /* + * Store the bulk cursor's last buffer, not the current value, we're + * creating a duplicate count, which means we want the previous value + * seen, not the current value. 
+ */ + val = &r->v; + WT_RET(__rec_cell_build_val( + session, r, cbulk->last.data, cbulk->last.size, cbulk->rle)); + + /* Boundary: split or write the page. */ + while (val->len > r->space_avail) + if (r->raw_compression) + WT_RET(__rec_split_raw(session, r)); + else + WT_RET(__rec_split(session, r)); + + /* Copy the value onto the page. */ + if (btree->dictionary) + WT_RET(__rec_dict_replace(session, r, cbulk->rle, val)); + __rec_copy_incr(session, r, val); + + /* Update the starting record number in case we split. */ + r->recno += cbulk->rle; + + return (0); +} + +/* + * __rec_vtype -- + * Return a value cell's address type. + */ +static inline u_int +__rec_vtype(WT_ADDR *addr) +{ + if (addr->type == WT_ADDR_INT) + return (WT_CELL_ADDR_INT); + if (addr->type == WT_ADDR_LEAF) + return (WT_CELL_ADDR_LEAF); + return (WT_CELL_ADDR_LEAF_NO); +} + +/* + * __rec_col_int -- + * Reconcile a column-store internal page. + */ +static int +__rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_ADDR *addr; + WT_BTREE *btree; + WT_CELL_UNPACK *vpack, _vpack; + WT_DECL_RET; + WT_KV *val; + WT_PAGE *child; + WT_REF *ref; + int hazard, state; + + btree = S2BT(session); + child = NULL; + hazard = 0; + + val = &r->v; + vpack = &_vpack; + + WT_RET(__rec_split_init( + session, r, page, page->pg_intl_recno, btree->maxintlpage)); + + /* For each entry in the in-memory page... */ + WT_INTL_FOREACH_BEGIN(session, page, ref) { + /* Update the starting record number in case we split. */ + r->recno = ref->key.recno; + + /* + * Modified child. + * The page may be emptied or internally created during a split. + * Deleted/split pages are merged into the parent and discarded. 
+ */ + WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state)); + addr = NULL; + child = ref->page; + if (state != 0) { + /* + * Currently the only non-zero returned stated possible + * for a column-store page is child-modified (all other + * states are part of the fast-truncate support, which + * is row-store only). + */ + WT_ASSERT(session, state == WT_CHILD_MODIFIED); + + switch (F_ISSET(child->modify, WT_PM_REC_MASK)) { + case WT_PM_REC_EMPTY: + /* + * Column-store pages are almost never empty, as + * discarding a page would remove a chunk of the + * name space. The exceptions are pages created + * when the tree is created, and never filled. + */ + CHILD_RELEASE_ERR(session, hazard, ref); + continue; + case WT_PM_REC_MULTIBLOCK: + WT_ERR(__rec_col_merge(session, r, child)); + CHILD_RELEASE_ERR(session, hazard, ref); + continue; + case WT_PM_REC_REPLACE: + addr = &child->modify->mod_replace; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + } + + /* + * Build the value cell. The child page address is in one of 3 + * places: if the page was replaced, the page's modify structure + * references it and we built the value cell just above in the + * switch statement. Else, the WT_REF->addr reference points to + * an on-page cell or an off-page WT_ADDR structure: if it's an + * on-page cell and we copy it from the page, else build a new + * cell. + */ + if (addr == NULL && __wt_off_page(page, ref->addr)) + addr = ref->addr; + if (addr == NULL) { + __wt_cell_unpack(ref->addr, vpack); + val->buf.data = ref->addr; + val->buf.size = __wt_cell_total_len(vpack); + val->cell_len = 0; + val->len = val->buf.size; + } else + __rec_cell_build_addr(r, addr->addr, addr->size, + __rec_vtype(addr), ref->key.recno); + CHILD_RELEASE_ERR(session, hazard, ref); + + /* Boundary: split or write the page. */ + while (val->len > r->space_avail) + if (r->raw_compression) + WT_ERR(__rec_split_raw(session, r)); + else + WT_ERR(__rec_split(session, r)); + + /* Copy the value onto the page. 
*/ + __rec_copy_incr(session, r, val); + } WT_INTL_FOREACH_END; + + /* Write the remnant page. */ + return (__rec_split_finish(session, r)); + +err: CHILD_RELEASE(session, hazard, ref); + return (ret); +} + +/* + * __rec_col_merge -- + * Merge in a split page. + */ +static int +__rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_ADDR *addr; + WT_KV *val; + WT_MULTI *multi; + WT_PAGE_MODIFY *mod; + uint32_t i; + + mod = page->modify; + + val = &r->v; + + /* For each entry in the split array... */ + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) { + /* Update the starting record number in case we split. */ + r->recno = multi->key.recno; + + /* Build the value cell. */ + addr = &multi->addr; + __rec_cell_build_addr(r, + addr->addr, addr->size, __rec_vtype(addr), r->recno); + + /* Boundary: split or write the page. */ + while (val->len > r->space_avail) + if (r->raw_compression) + WT_RET(__rec_split_raw(session, r)); + else + WT_RET(__rec_split(session, r)); + + /* Copy the value onto the page. */ + __rec_copy_incr(session, r, val); + } + return (0); +} + +/* + * __rec_col_fix -- + * Reconcile a fixed-width, column-store leaf page. + */ +static int +__rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_INSERT *ins; + WT_UPDATE *upd; + uint64_t recno; + uint32_t entry, nrecs; + + btree = S2BT(session); + + WT_RET(__rec_split_init( + session, r, page, page->pg_fix_recno, btree->maxleafpage)); + + /* Update any changes to the original on-page data items. */ + WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) { + WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); + if (upd != NULL) + __bit_setv_recno(page, WT_INSERT_RECNO(ins), + btree->bitcnt, ((uint8_t *)WT_UPDATE_DATA(upd))[0]); + } + + /* Copy the updated, disk-image bytes into place. 
*/ + memcpy(r->first_free, page->pg_fix_bitf, + __bitstr_size((size_t)page->pg_fix_entries * btree->bitcnt)); + + /* Calculate the number of entries per page remainder. */ + entry = page->pg_fix_entries; + nrecs = WT_FIX_BYTES_TO_ENTRIES( + btree, r->space_avail) - page->pg_fix_entries; + r->recno += entry; + + /* Walk any append list. */ + WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) { + WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); + if (upd == NULL) + continue; + for (;;) { + /* + * The application may have inserted records which left + * gaps in the name space. + */ + for (recno = WT_INSERT_RECNO(ins); + nrecs > 0 && r->recno < recno; + --nrecs, ++entry, ++r->recno) + __bit_setv( + r->first_free, entry, btree->bitcnt, 0); + + if (nrecs > 0) { + __bit_setv(r->first_free, entry, btree->bitcnt, + ((uint8_t *)WT_UPDATE_DATA(upd))[0]); + --nrecs; + ++entry; + ++r->recno; + break; + } + + /* + * If everything didn't fit, update the counters and + * split. + * + * Boundary: split or write the page. + */ + __rec_incr(session, r, entry, + __bitstr_size((size_t)entry * btree->bitcnt)); + WT_RET(__rec_split(session, r)); + + /* Calculate the number of entries per page. */ + entry = 0; + nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail); + } + } + + /* Update the counters. */ + __rec_incr( + session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt)); + + /* Write the remnant page. */ + return (__rec_split_finish(session, r)); +} + +/* + * __rec_col_fix_slvg -- + * Reconcile a fixed-width, column-store leaf page created during salvage. + */ +static int +__rec_col_fix_slvg(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) +{ + WT_BTREE *btree; + uint64_t page_start, page_take; + uint32_t entry, nrecs; + + btree = S2BT(session); + + /* + * !!! + * It's vanishingly unlikely and probably impossible for fixed-length + * column-store files to have overlapping key ranges. 
It's possible + * for an entire key range to go missing (if a page is corrupted and + * lost), but because pages can't split, it shouldn't be possible to + * find pages where the key ranges overlap. That said, we check for + * it during salvage and clean up after it here because it doesn't + * cost much and future column-store formats or operations might allow + * for fixed-length format ranges to overlap during salvage, and I + * don't want to have to retrofit the code later. + */ + WT_RET(__rec_split_init( + session, r, page, page->pg_fix_recno, btree->maxleafpage)); + + /* We may not be taking all of the entries on the original page. */ + page_take = salvage->take == 0 ? page->pg_fix_entries : salvage->take; + page_start = salvage->skip == 0 ? 0 : salvage->skip; + + /* Calculate the number of entries per page. */ + entry = 0; + nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail); + + for (; nrecs > 0 && salvage->missing > 0; + --nrecs, --salvage->missing, ++entry) + __bit_setv(r->first_free, entry, btree->bitcnt, 0); + + for (; nrecs > 0 && page_take > 0; + --nrecs, --page_take, ++page_start, ++entry) + __bit_setv(r->first_free, entry, btree->bitcnt, + __bit_getv(page->pg_fix_bitf, + (uint32_t)page_start, btree->bitcnt)); + + r->recno += entry; + __rec_incr(session, r, entry, + __bitstr_size((size_t)entry * btree->bitcnt)); + + /* + * We can't split during salvage -- if everything didn't fit, it's + * all gone wrong. + */ + if (salvage->missing != 0 || page_take != 0) + WT_PANIC_RET(session, WT_PANIC, + "%s page too large, attempted split during salvage", + __wt_page_type_string(page->type)); + + /* Write the page. */ + return (__rec_split_finish(session, r)); +} + +/* + * __rec_col_var_helper -- + * Create a column-store variable length record cell and write it onto a + * page. 
+ */ +static int +__rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, + WT_SALVAGE_COOKIE *salvage, + WT_ITEM *value, int deleted, uint8_t overflow_type, uint64_t rle) +{ + WT_BTREE *btree; + WT_KV *val; + + btree = S2BT(session); + + val = &r->v; + + /* + * Occasionally, salvage needs to discard records from the beginning or + * end of the page, and because the items may be part of a RLE cell, do + * the adjustments here. It's not a mistake we don't bother telling + * our caller we've handled all the records from the page we care about, + * and can quit processing the page: salvage is a rare operation and I + * don't want to complicate our caller's loop. + */ + if (salvage != NULL) { + if (salvage->done) + return (0); + if (salvage->skip != 0) { + if (rle <= salvage->skip) { + salvage->skip -= rle; + return (0); + } + rle -= salvage->skip; + salvage->skip = 0; + } + if (salvage->take != 0) { + if (rle <= salvage->take) + salvage->take -= rle; + else { + rle = salvage->take; + salvage->take = 0; + } + if (salvage->take == 0) + salvage->done = 1; + } + } + + if (deleted) { + val->cell_len = __wt_cell_pack_del(&val->cell, rle); + val->buf.data = NULL; + val->buf.size = 0; + val->len = val->cell_len; + } else if (overflow_type) { + val->cell_len = __wt_cell_pack_ovfl( + &val->cell, overflow_type, rle, value->size); + val->buf.data = value->data; + val->buf.size = value->size; + val->len = val->cell_len + value->size; + } else + WT_RET(__rec_cell_build_val( + session, r, value->data, value->size, rle)); + + /* Boundary: split or write the page. */ + while (val->len > r->space_avail) + if (r->raw_compression) + WT_RET(__rec_split_raw(session, r)); + else + WT_RET(__rec_split(session, r)); + + /* Copy the value onto the page. */ + if (!deleted && !overflow_type && btree->dictionary) + WT_RET(__rec_dict_replace(session, r, rle, val)); + __rec_copy_incr(session, r, val); + + /* Update the starting record number in case we split. 
*/ + r->recno += rle; + + return (0); +} + +/* + * __rec_col_var -- + * Reconcile a variable-width column-store leaf page. + */ +static int +__rec_col_var(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) +{ + enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state; + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *vpack, _vpack; + WT_COL *cip; + WT_DECL_ITEM(orig); + WT_DECL_RET; + WT_INSERT *ins; + WT_ITEM *last; + WT_UPDATE *upd; + uint64_t n, nrepeat, repeat_count, rle, src_recno; + uint32_t i, size; + int deleted, last_deleted, orig_deleted, update_no_copy; + const void *data; + + btree = S2BT(session); + last = r->last; + vpack = &_vpack; + + WT_RET(__wt_scr_alloc(session, 0, &orig)); + data = NULL; + size = 0; + upd = NULL; + + WT_RET(__rec_split_init( + session, r, page, page->pg_var_recno, btree->maxleafpage)); + + /* + * The salvage code may be calling us to reconcile a page where there + * were missing records in the column-store name space. If taking the + * first record from on the page, it might be a deleted record, so we + * have to give the RLE code a chance to figure that out. Else, if + * not taking the first record from the page, write a single element + * representing the missing records onto a new page. (Don't pass the + * salvage cookie to our helper function in this case, we're handling + * one of the salvage cookie fields on our own, and we don't need the + * helper function's assistance.) + */ + rle = 0; + last_deleted = 0; + if (salvage != NULL && salvage->missing != 0) { + if (salvage->skip == 0) { + rle = salvage->missing; + last_deleted = 1; + + /* + * Correct the number of records we're going to "take", + * pretending the missing records were on the page. 
+ */ + salvage->take += salvage->missing; + } else + WT_ERR(__rec_col_var_helper( + session, r, NULL, NULL, 1, 0, salvage->missing)); + } + + /* + * We track two data items through this loop: the previous (last) item + * and the current item: if the last item is the same as the current + * item, we increment the RLE count for the last item; if the last item + * is different from the current item, we write the last item onto the + * page, and replace it with the current item. The r->recno counter + * tracks records written to the page, and is incremented by the helper + * function immediately after writing records to the page. The record + * number of our source record, that is, the current item, is maintained + * in src_recno. + */ + src_recno = r->recno + rle; + + /* For each entry in the in-memory page... */ + WT_COL_FOREACH(page, cip, i) { + ovfl_state = OVFL_IGNORE; + if ((cell = WT_COL_PTR(page, cip)) == NULL) { + nrepeat = 1; + ins = NULL; + orig_deleted = 1; + } else { + __wt_cell_unpack(cell, vpack); + nrepeat = __wt_cell_rle(vpack); + ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip)); + + /* + * If the original value is "deleted", there's no value + * to compare, we're done. + */ + orig_deleted = vpack->type == WT_CELL_DEL ? 1 : 0; + if (orig_deleted) + goto record_loop; + + /* + * Overflow items are tricky: we don't know until we're + * finished processing the set of values if we need the + * overflow value or not. If we don't use the overflow + * item at all, we have to discard it from the backing + * file, otherwise we'll leak blocks on the checkpoint. + * That's safe because if the backing overflow value is + * still needed by any running transaction, we'll cache + * a copy in the reconciliation tracking structures. 
+ * + * Regardless, we avoid copying in overflow records: if + * there's a WT_INSERT entry that modifies a reference + * counted overflow record, we may have to write copies + * of the overflow record, and in that case we'll do the + * comparisons, but we don't read overflow items just to + * see if they match records on either side. + */ + if (vpack->ovfl) { + ovfl_state = OVFL_UNUSED; + goto record_loop; + } + + /* + * If data is Huffman encoded, we have to decode it in + * order to compare it with the last item we saw, which + * may have been an update string. This guarantees we + * find every single pair of objects we can RLE encode, + * including applications updating an existing record + * where the new value happens (?) to match a Huffman- + * encoded value in a previous or next record. + */ + WT_ERR(__wt_dsk_cell_data_ref( + session, WT_PAGE_COL_VAR, vpack, orig)); + } + +record_loop: /* + * Generate on-page entries: loop repeat records, looking for + * WT_INSERT entries matching the record number. The WT_INSERT + * lists are in sorted order, so only need check the next one. + */ + for (n = 0; + n < nrepeat; n += repeat_count, src_recno += repeat_count) { + upd = NULL; + if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) { + WT_ERR(__rec_txn_read( + session, r, ins, NULL, vpack, &upd)); + ins = WT_SKIP_NEXT(ins); + } + if (upd != NULL) { + update_no_copy = 1; /* No data copy */ + repeat_count = 1; /* Single record */ + + deleted = WT_UPDATE_DELETED_ISSET(upd); + if (!deleted) { + data = WT_UPDATE_DATA(upd); + size = upd->size; + } + } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) { + update_no_copy = 1; /* No data copy */ + repeat_count = 1; /* Single record */ + + deleted = 0; + + /* + * If doing update save and restore, there's an + * update that's not globally visible, and the + * underlying value is a removed overflow value, + * we end up here. 
+ * + * When the update save/restore code noticed the + * removed overflow value, it appended a copy of + * the cached, original overflow value to the + * update list being saved (ensuring the on-page + * item will never be accessed after the page is + * re-instantiated), then returned a NULL update + * to us. + * + * Assert the case: if we remove an underlying + * overflow object, checkpoint reconciliation + * should never see it again, there should be a + * visible update in the way. + * + * Write a placeholder. + */ + WT_ASSERT(session, + F_ISSET(r, WT_SKIP_UPDATE_RESTORE)); + + data = "@"; + size = 1; + } else { + update_no_copy = 0; /* Maybe data copy */ + + /* + * The repeat count is the number of records up + * to the next WT_INSERT record, or up to the + * end of the entry if we have no more WT_INSERT + * records. + */ + if (ins == NULL) + repeat_count = nrepeat - n; + else + repeat_count = + WT_INSERT_RECNO(ins) - src_recno; + + deleted = orig_deleted; + if (deleted) + goto compare; + + /* + * If we are handling overflow items, use the + * overflow item itself exactly once, after + * which we have to copy it into a buffer and + * from then on use a complete copy because we + * are re-creating a new overflow record each + * time. + */ + switch (ovfl_state) { + case OVFL_UNUSED: + /* + * An as-yet-unused overflow item. + * + * We're going to copy the on-page cell, + * write out any record we're tracking. + */ + if (rle != 0) { + WT_ERR(__rec_col_var_helper( + session, r, salvage, last, + last_deleted, 0, rle)); + rle = 0; + } + + last->data = vpack->data; + last->size = vpack->size; + WT_ERR(__rec_col_var_helper( + session, r, salvage, last, 0, + WT_CELL_VALUE_OVFL, repeat_count)); + + /* Track if page has overflow items. */ + r->ovfl_items = 1; + + ovfl_state = OVFL_USED; + continue; + case OVFL_USED: + /* + * Original is an overflow item; we used + * it for a key and now we need another + * copy; read it into memory. 
+ */ + WT_ERR(__wt_dsk_cell_data_ref(session, + WT_PAGE_COL_VAR, vpack, orig)); + + ovfl_state = OVFL_IGNORE; + /* FALLTHROUGH */ + case OVFL_IGNORE: + /* + * Original is an overflow item and we + * were forced to copy it into memory, + * or the original wasn't an overflow + * item; use the data copied into orig. + */ + data = orig->data; + size = (uint32_t)orig->size; + break; + } + } + +compare: /* + * If we have a record against which to compare, and + * the records compare equal, increment the rle counter + * and continue. If the records don't compare equal, + * output the last record and swap the last and current + * buffers: do NOT update the starting record number, + * we've been doing that all along. + */ + if (rle != 0) { + if ((deleted && last_deleted) || + (!last_deleted && !deleted && + last->size == size && + memcmp(last->data, data, size) == 0)) { + rle += repeat_count; + continue; + } + WT_ERR(__rec_col_var_helper(session, r, + salvage, last, last_deleted, 0, rle)); + } + + /* + * Swap the current/last state. + * + * Reset RLE counter and turn on comparisons. + */ + if (!deleted) { + /* + * We can't simply assign the data values into + * the last buffer because they may have come + * from a copy built from an encoded/overflow + * cell and creating the next record is going + * to overwrite that memory. Check, because + * encoded/overflow cells aren't that common + * and we'd like to avoid the copy. If data + * was taken from the current unpack structure + * (which points into the page), or was taken + * from an update structure, we can just use + * the pointers, they're not moving. + */ + if (data == vpack->data || update_no_copy) { + last->data = data; + last->size = size; + } else + WT_ERR(__wt_buf_set( + session, last, data, size)); + } + last_deleted = deleted; + rle = repeat_count; + } + + /* + * If we had a reference to an overflow record we never used, + * discard the underlying blocks, they're no longer useful. 
+ * + * One complication: we must cache a copy before discarding the + * on-disk version if there's a transaction in the system that + * might read the original value. + */ + if (ovfl_state == OVFL_UNUSED && + vpack->raw != WT_CELL_VALUE_OVFL_RM) + WT_ERR(__wt_ovfl_cache(session, page, upd, vpack)); + } + + /* Walk any append list. */ + WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) { + WT_ERR(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); + if (upd == NULL) + continue; + for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) { + /* + * The application may have inserted records which left + * gaps in the name space. + */ + if (src_recno < n) + deleted = 1; + else { + deleted = WT_UPDATE_DELETED_ISSET(upd); + if (!deleted) { + data = WT_UPDATE_DATA(upd); + size = upd->size; + } + } + + /* + * Handle RLE accounting and comparisons -- see comment + * above, this code fragment does the same thing. + */ + if (rle != 0) { + if ((deleted && last_deleted) || + (!last_deleted && !deleted && + last->size == size && + memcmp(last->data, data, size) == 0)) { + ++rle; + continue; + } + WT_ERR(__rec_col_var_helper(session, r, + salvage, last, last_deleted, 0, rle)); + } + + /* + * Swap the current/last state. We always assign the + * data values to the buffer because they can only be + * the data from a WT_UPDATE structure. + * + * Reset RLE counter and turn on comparisons. + */ + if (!deleted) { + last->data = data; + last->size = size; + } + last_deleted = deleted; + rle = 1; + } + } + + /* If we were tracking a record, write it. */ + if (rle != 0) + WT_ERR(__rec_col_var_helper( + session, r, salvage, last, last_deleted, 0, rle)); + + /* Write the remnant page. */ + ret = __rec_split_finish(session, r); + +err: __wt_scr_free(&orig); + return (ret); +} + +/* + * __rec_row_int -- + * Reconcile a row-store internal page. 
+ */ +static int +__rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_ADDR *addr; + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack; + WT_DECL_RET; + WT_IKEY *ikey; + WT_KV *key, *val; + WT_PAGE *child; + WT_REF *ref; + size_t size; + u_int vtype; + int hazard, key_onpage_ovfl, ovfl_key, state; + const void *p; + + btree = S2BT(session); + child = NULL; + hazard = 0; + + key = &r->k; + kpack = &_kpack; + WT_CLEAR(*kpack); /* -Wuninitialized */ + val = &r->v; + vpack = &_vpack; + WT_CLEAR(*vpack); /* -Wuninitialized */ + + WT_RET(__rec_split_init(session, r, page, 0ULL, btree->maxintlpage)); + + /* + * Ideally, we'd never store the 0th key on row-store internal pages + * because it's never used during tree search and there's no reason + * to waste the space. The problem is how we do splits: when we split, + * we've potentially picked out several "split points" in the buffer + * which is overflowing the maximum page size, and when the overflow + * happens, we go back and physically split the buffer, at those split + * points, into new pages. It would be both difficult and expensive + * to re-process the 0th key at each split point to be an empty key, + * so we don't do that. However, we are reconciling an internal page + * for whatever reason, and the 0th key is known to be useless. We + * truncate the key to a single byte, instead of removing it entirely, + * it simplifies various things in other parts of the code (we don't + * have to special case transforming the page from its disk image to + * its in-memory version, for example). + */ + r->cell_zero = 1; + + /* For each entry in the in-memory page... */ + WT_INTL_FOREACH_BEGIN(session, page, ref) { + /* + * There are different paths if the key is an overflow item vs. + * a straight-forward on-page value. If an overflow item, we + * would have instantiated it, and we can use that fact to set + * things up. 
+ * + * Note the cell reference and unpacked key cell are available + * only in the case of an instantiated, off-page key. + */ + ikey = __wt_ref_key_instantiated(ref); + if (ikey == NULL || ikey->cell_offset == 0) { + cell = NULL; + key_onpage_ovfl = 0; + } else { + cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset); + __wt_cell_unpack(cell, kpack); + key_onpage_ovfl = + kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM; + } + + WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state)); + addr = ref->addr; + child = ref->page; + vtype = 0; + + /* Deleted child we don't have to write. */ + if (state == WT_CHILD_IGNORE) { + /* + * Overflow keys referencing discarded pages are no + * longer useful, schedule them for discard. Don't + * worry about instantiation, internal page keys are + * always instantiated. Don't worry about reuse, + * reusing this key in this reconciliation is unlikely. + */ + if (key_onpage_ovfl) + WT_ERR(__wt_ovfl_discard_add( + session, page, kpack->cell)); + CHILD_RELEASE_ERR(session, hazard, ref); + continue; + } + + /* Deleted child requiring a proxy cell. */ + if (state == WT_CHILD_PROXY) + vtype = WT_CELL_ADDR_DEL; + + /* + * Modified child. Empty pages are merged into the parent and + * discarded. + */ + if (state == WT_CHILD_MODIFIED) + switch (F_ISSET(child->modify, WT_PM_REC_MASK)) { + case WT_PM_REC_EMPTY: + /* + * Overflow keys referencing empty pages are no + * longer useful, schedule them for discard. + * Don't worry about instantiation, internal + * page keys are always instantiated. Don't + * worry about reuse, reusing this key in this + * reconciliation is unlikely. + */ + if (key_onpage_ovfl) + WT_ERR(__wt_ovfl_discard_add( + session, page, kpack->cell)); + CHILD_RELEASE_ERR(session, hazard, ref); + continue; + case WT_PM_REC_MULTIBLOCK: + /* + * Overflow keys referencing split pages are no + * longer useful (the split page's key is the + * interesting key); schedule them for discard. 
+ * Don't worry about instantiation, internal + * page keys are always instantiated. Don't + * worry about reuse, reusing this key in this + * reconciliation is unlikely. + */ + if (key_onpage_ovfl) + WT_ERR(__wt_ovfl_discard_add( + session, page, kpack->cell)); + + WT_ERR(__rec_row_merge(session, r, child)); + CHILD_RELEASE_ERR(session, hazard, ref); + continue; + case WT_PM_REC_REPLACE: + /* + * If the page is replaced, the page's modify + * structure has the page's address. + */ + addr = &child->modify->mod_replace; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* + * Build the value cell, the child page's address. Addr points + * to an on-page cell or an off-page WT_ADDR structure. The + * cell type has been set in the case of page deletion requiring + * a proxy cell, otherwise use the information from the addr or + * original cell. + */ + if (__wt_off_page(page, addr)) { + p = addr->addr; + size = addr->size; + if (vtype == 0) + vtype = __rec_vtype(addr); + } else { + __wt_cell_unpack(ref->addr, vpack); + p = vpack->data; + size = vpack->size; + if (vtype == 0) + vtype = vpack->raw; + } + __rec_cell_build_addr(r, p, size, vtype, 0); + CHILD_RELEASE_ERR(session, hazard, ref); + + /* + * Build key cell. + * Truncate any 0th key, internal pages don't need 0th keys. + */ + if (key_onpage_ovfl) { + key->buf.data = cell; + key->buf.size = __wt_cell_total_len(kpack); + key->cell_len = 0; + key->len = key->buf.size; + ovfl_key = 1; + } else { + __wt_ref_key(page, ref, &p, &size); + WT_ERR(__rec_cell_build_int_key( + session, r, p, r->cell_zero ? 1 : size, &ovfl_key)); + } + r->cell_zero = 0; + + /* Boundary: split or write the page. */ + while (key->len + val->len > r->space_avail) { + if (r->raw_compression) { + WT_ERR(__rec_split_raw(session, r)); + continue; + } + + /* + * In one path above, we copied address blocks from the + * page rather than building the actual key. 
In that
+ * case, we have to build the actual key now because we
+ * are about to promote it.
+ */
+ if (key_onpage_ovfl) {
+ WT_ERR(__wt_buf_set(session,
+ r->cur, WT_IKEY_DATA(ikey), ikey->size));
+ key_onpage_ovfl = 0;
+ }
+ WT_ERR(__rec_split(session, r));
+ }
+
+ /* Copy the key and value onto the page. */
+ __rec_copy_incr(session, r, key);
+ __rec_copy_incr(session, r, val);
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ } WT_INTL_FOREACH_END;
+
+ /* Write the remnant page. */
+ return (__rec_split_finish(session, r));
+
+/* On error, release the hazard pointer still held on the current child. */
+err: CHILD_RELEASE(session, hazard, ref);
+ return (ret);
+}
+
+/*
+ * __rec_row_merge --
+ * Merge in a split page.
+ * Writes one key/address cell pair into the parent being reconciled for
+ * each replacement block created by the child page's split.
+ */
+static int
+__rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_KV *key, *val;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+ int ovfl_key;
+
+ /* The split's replacement-block list lives in the page's modify info. */
+ mod = page->modify;
+
+ /* Build cells in the reconciliation's scratch key/value slots. */
+ key = &r->k;
+ val = &r->v;
+
+ /* For each entry in the split array... */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ /* Build the key and value cells. */
+ /*
+ * The 0th key of an internal page is never used by search; write
+ * a 1-byte key in that slot (see __rec_row_int for the rationale).
+ */
+ WT_RET(__rec_cell_build_int_key(session, r,
+ WT_IKEY_DATA(multi->key.ikey),
+ r->cell_zero ? 1 : multi->key.ikey->size, &ovfl_key));
+ r->cell_zero = 0;
+
+ /* The value cell is the replacement block's address. */
+ addr = &multi->addr;
+ __rec_cell_build_addr(
+ r, addr->addr, addr->size, __rec_vtype(addr), 0);
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /* Copy the key and value onto the page. */
+ __rec_copy_incr(session, r, key);
+ __rec_copy_incr(session, r, val);
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ }
+ return (0);
+}
+
+/*
+ * __rec_row_leaf --
+ * Reconcile a row-store leaf page. 
+ */ +static int +__rec_row_leaf(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) +{ + WT_BTREE *btree; + WT_CELL *cell, *val_cell; + WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack; + WT_DECL_ITEM(tmpkey); + WT_DECL_ITEM(tmpval); + WT_DECL_RET; + WT_IKEY *ikey; + WT_INSERT *ins; + WT_KV *key, *val; + WT_ROW *rip; + WT_UPDATE *upd; + size_t size; + uint64_t slvg_skip; + uint32_t i; + int dictionary, onpage_ovfl, ovfl_key; + const void *p; + void *copy; + + btree = S2BT(session); + slvg_skip = salvage == NULL ? 0 : salvage->skip; + + key = &r->k; + val = &r->v; + + WT_RET(__rec_split_init(session, r, page, 0ULL, btree->maxleafpage)); + + /* + * Write any K/V pairs inserted into the page before the first from-disk + * key on the page. + */ + if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) + WT_RET(__rec_row_leaf_insert(session, r, ins)); + + /* + * Temporary buffers in which to instantiate any uninstantiated keys + * or value items we need. + */ + WT_RET(__wt_scr_alloc(session, 0, &tmpkey)); + WT_RET(__wt_scr_alloc(session, 0, &tmpval)); + + /* For each entry in the page... */ + WT_ROW_FOREACH(page, rip, i) { + /* + * The salvage code, on some rare occasions, wants to reconcile + * a page but skip some leading records on the page. Because + * the row-store leaf reconciliation function copies keys from + * the original disk page, this is non-trivial -- just changing + * the in-memory pointers isn't sufficient, we have to change + * the WT_CELL structures on the disk page, too. It's ugly, but + * we pass in a value that tells us how many records to skip in + * this case. + */ + if (slvg_skip != 0) { + --slvg_skip; + continue; + } + + /* + * Figure out the key: set any cell reference (and unpack it), + * set any instantiated key reference. 
+ */ + copy = WT_ROW_KEY_COPY(rip); + (void)__wt_row_leaf_key_info( + page, copy, &ikey, &cell, NULL, NULL); + if (cell == NULL) + kpack = NULL; + else { + kpack = &_kpack; + __wt_cell_unpack(cell, kpack); + } + + /* Unpack the on-page value cell, and look for an update. */ + if ((val_cell = + __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) + vpack = NULL; + else { + vpack = &_vpack; + __wt_cell_unpack(val_cell, vpack); + } + WT_ERR(__rec_txn_read(session, r, NULL, rip, vpack, &upd)); + + /* Build value cell. */ + dictionary = 0; + if (upd == NULL) { + /* + * When the page was read into memory, there may not + * have been a value item. + * + * If there was a value item, check if it's a dictionary + * cell (a copy of another item on the page). If it's a + * copy, we have to create a new value item as the old + * item might have been discarded from the page. + */ + if (vpack == NULL) { + val->buf.data = NULL; + val->cell_len = val->len = val->buf.size = 0; + } else if (vpack->raw == WT_CELL_VALUE_COPY) { + /* If the item is Huffman encoded, decode it. */ + if (btree->huffman_value == NULL) { + p = vpack->data; + size = vpack->size; + } else { + WT_ERR(__wt_huffman_decode(session, + btree->huffman_value, + vpack->data, vpack->size, + tmpval)); + p = tmpval->data; + size = tmpval->size; + } + WT_ERR(__rec_cell_build_val( + session, r, p, size, (uint64_t)0)); + dictionary = 1; + } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) { + /* + * If doing update save and restore in service + * of eviction, there's an update that's not + * globally visible, and the underlying value + * is a removed overflow value, we end up here. + * + * When the update save/restore code noticed the + * removed overflow value, it appended a copy of + * the cached, original overflow value to the + * update list being saved (ensuring any on-page + * item will never be accessed after the page is + * re-instantiated), then returned a NULL update + * to us. + * + * Assert the case. 
+ */ + WT_ASSERT(session, + F_ISSET(r, WT_SKIP_UPDATE_RESTORE)); + + /* + * If the key is also a removed overflow item, + * don't write anything at all. + * + * We don't have to write anything because the + * code re-instantiating the page gets the key + * to match the saved list of updates from the + * original page. By not putting the key on + * the page, we'll move the key/value set from + * a row-store leaf page slot to an insert list, + * but that shouldn't matter. + * + * The reason we bother with the test is because + * overflows are expensive to write. It's hard + * to imagine a real workload where this test is + * worth the effort, but it's a simple test. + */ + if (kpack != NULL && + kpack->raw == WT_CELL_KEY_OVFL_RM) + goto leaf_insert; + + /* + * The on-page value will never be accessed, + * write a placeholder record. + */ + WT_ERR(__rec_cell_build_val( + session, r, "@", 1, (uint64_t)0)); + } else { + val->buf.data = val_cell; + val->buf.size = __wt_cell_total_len(vpack); + val->cell_len = 0; + val->len = val->buf.size; + + /* Track if page has overflow items. */ + if (vpack->ovfl) + r->ovfl_items = 1; + } + } else { + /* + * If the original value was an overflow and we've not + * already done so, discard it. One complication: we + * must cache a copy before discarding the on-disk + * version if there's a transaction in the system that + * might read the original value. + */ + if (vpack != NULL && + vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM) + WT_ERR( + __wt_ovfl_cache(session, page, rip, vpack)); + + /* If this key/value pair was deleted, we're done. */ + if (WT_UPDATE_DELETED_ISSET(upd)) { + /* + * Overflow keys referencing discarded values + * are no longer useful, discard the backing + * blocks. Don't worry about reuse, reusing + * keys from a row-store page reconciliation + * seems unlikely enough to ignore. 
+ */ + if (kpack != NULL && kpack->ovfl && + kpack->raw != WT_CELL_KEY_OVFL_RM) { + /* + * Keys are part of the name-space, we + * can't remove them from the in-memory + * tree; if an overflow key was deleted + * without being instantiated (for + * example, cursor-based truncation, do + * it now. + */ + if (ikey == NULL) + WT_ERR(__wt_row_leaf_key( + session, + page, rip, tmpkey, 1)); + + WT_ERR(__wt_ovfl_discard_add( + session, page, kpack->cell)); + } + + /* + * We aren't actually creating the key so we + * can't use bytes from this key to provide + * prefix information for a subsequent key. + */ + tmpkey->size = 0; + + /* Proceed with appended key/value pairs. */ + goto leaf_insert; + } + + /* + * If no value, nothing needs to be copied. Otherwise, + * build the value's WT_CELL chunk from the most recent + * update value. + */ + if (upd->size == 0) { + val->buf.data = NULL; + val->cell_len = val->len = val->buf.size = 0; + } else { + WT_ERR(__rec_cell_build_val(session, r, + WT_UPDATE_DATA(upd), upd->size, + (uint64_t)0)); + dictionary = 1; + } + } + + /* + * Build key cell. + * + * If the key is an overflow key that hasn't been removed, use + * the original backing blocks. + */ + onpage_ovfl = kpack != NULL && + kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM; + if (onpage_ovfl) { + key->buf.data = cell; + key->buf.size = __wt_cell_total_len(kpack); + key->cell_len = 0; + key->len = key->buf.size; + ovfl_key = 1; + + /* + * We aren't creating a key so we can't use this key as + * a prefix for a subsequent key. + */ + tmpkey->size = 0; + + /* Track if page has overflow items. */ + r->ovfl_items = 1; + } else { + /* + * Get the key from the page or an instantiated key, or + * inline building the key from a previous key (it's a + * fast path for simple, prefix-compressed keys), or by + * by building the key from scratch. 
+ */ + if (__wt_row_leaf_key_info(page, copy, + NULL, &cell, &tmpkey->data, &tmpkey->size)) + goto build; + + kpack = &_kpack; + __wt_cell_unpack(cell, kpack); + if (btree->huffman_key == NULL && + kpack->type == WT_CELL_KEY && + tmpkey->size >= kpack->prefix) { + /* + * The previous clause checked for a prefix of + * zero, which means the temporary buffer must + * have a non-zero size, and it references a + * valid key. + */ + WT_ASSERT(session, tmpkey->size != 0); + + /* + * Grow the buffer as necessary, ensuring data + * data has been copied into local buffer space, + * then append the suffix to the prefix already + * in the buffer. + * + * Don't grow the buffer unnecessarily or copy + * data we don't need, truncate the item's data + * length to the prefix bytes. + */ + tmpkey->size = kpack->prefix; + WT_ERR(__wt_buf_grow(session, + tmpkey, tmpkey->size + kpack->size)); + memcpy((uint8_t *)tmpkey->mem + tmpkey->size, + kpack->data, kpack->size); + tmpkey->size += kpack->size; + } else + WT_ERR(__wt_row_leaf_key_copy( + session, page, rip, tmpkey)); +build: + WT_ERR(__rec_cell_build_leaf_key(session, r, + tmpkey->data, tmpkey->size, &ovfl_key)); + } + + /* Boundary: split or write the page. */ + while (key->len + val->len > r->space_avail) { + if (r->raw_compression) { + WT_ERR(__rec_split_raw(session, r)); + continue; + } + + /* + * In one path above, we copied address blocks from the + * page rather than building the actual key. In that + * case, we have to build the actual key now because we + * are about to promote it. + */ + if (onpage_ovfl) { + WT_ERR(__wt_dsk_cell_data_ref( + session, WT_PAGE_ROW_LEAF, kpack, r->cur)); + onpage_ovfl = 0; + } + WT_ERR(__rec_split(session, r)); + + /* + * Turn off prefix compression until a full key written + * to the new page, and (unless we're already working + * with an overflow key), rebuild the key without prefix + * compression. 
+ */ + if (r->key_pfx_compress_conf) { + r->key_pfx_compress = 0; + if (!ovfl_key) + WT_ERR(__rec_cell_build_leaf_key( + session, r, NULL, 0, &ovfl_key)); + } + } + + /* Copy the key/value pair onto the page. */ + __rec_copy_incr(session, r, key); + if (val->len == 0) + r->any_empty_value = 1; + else { + r->all_empty_value = 0; + if (dictionary && btree->dictionary) + WT_ERR(__rec_dict_replace(session, r, 0, val)); + __rec_copy_incr(session, r, val); + } + + /* Update compression state. */ + __rec_key_state_update(r, ovfl_key); + +leaf_insert: /* Write any K/V pairs inserted into the page after this key. */ + if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT(page, rip))) != NULL) + WT_ERR(__rec_row_leaf_insert(session, r, ins)); + } + + /* Write the remnant page. */ + ret = __rec_split_finish(session, r); + +err: __wt_scr_free(&tmpkey); + __wt_scr_free(&tmpval); + return (ret); +} + +/* + * __rec_row_leaf_insert -- + * Walk an insert chain, writing K/V pairs. + */ +static int +__rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) +{ + WT_BTREE *btree; + WT_KV *key, *val; + WT_UPDATE *upd; + int ovfl_key; + + btree = S2BT(session); + + key = &r->k; + val = &r->v; + + for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) { + /* Look for an update. */ + WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); + if (upd == NULL || WT_UPDATE_DELETED_ISSET(upd)) + continue; + + if (upd->size == 0) /* Build value cell. */ + val->len = 0; + else + WT_RET(__rec_cell_build_val(session, r, + WT_UPDATE_DATA(upd), upd->size, (uint64_t)0)); + + /* Build key cell. */ + WT_RET(__rec_cell_build_leaf_key(session, r, + WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); + + /* Boundary: split or write the page. 
*/ + while (key->len + val->len > r->space_avail) { + if (r->raw_compression) { + WT_RET(__rec_split_raw(session, r)); + continue; + } + WT_RET(__rec_split(session, r)); + + /* + * Turn off prefix compression until a full key written + * to the new page, and (unless we're already working + * with an overflow key), rebuild the key without prefix + * compression. + */ + if (r->key_pfx_compress_conf) { + r->key_pfx_compress = 0; + if (!ovfl_key) + WT_RET(__rec_cell_build_leaf_key( + session, r, NULL, 0, &ovfl_key)); + } + } + + /* Copy the key/value pair onto the page. */ + __rec_copy_incr(session, r, key); + if (val->len == 0) + r->any_empty_value = 1; + else { + r->all_empty_value = 0; + if (btree->dictionary) + WT_RET(__rec_dict_replace(session, r, 0, val)); + __rec_copy_incr(session, r, val); + } + + /* Update compression state. */ + __rec_key_state_update(r, ovfl_key); + } + + return (0); +} + +/* + * __rec_split_discard -- + * Discard the pages resulting from a previous split. + */ +static int +__rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BM *bm; + WT_DECL_RET; + WT_PAGE_MODIFY *mod; + WT_MULTI *multi; + uint32_t i; + + bm = S2BT(session)->bm; + mod = page->modify; + + /* + * A page that split is being reconciled for the second, or subsequent + * time; discard underlying block space used in the last reconciliation + * that is not being reused for this reconciliation. 
+ */ + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) { + switch (page->type) { + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + __wt_free(session, multi->key.ikey); + break; + } + if (multi->skip == NULL) { + if (multi->addr.reuse) + multi->addr.addr = NULL; + else { + WT_RET(bm->free(bm, session, + multi->addr.addr, multi->addr.size)); + __wt_free(session, multi->addr.addr); + } + } else { + __wt_free(session, multi->skip); + __wt_free(session, multi->skip_dsk); + } + } + __wt_free(session, mod->mod_multi); + mod->mod_multi_entries = 0; + + /* + * This routine would be trivial, and only walk a single page freeing + * any blocks written to support the split, except for root splits. + * In the case of root splits, we have to cope with multiple pages in + * a linked list, and we also have to discard overflow items written + * for the page. + */ + switch (page->type) { + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + if (mod->mod_root_split == NULL) + break; + WT_RET(__rec_split_discard(session, mod->mod_root_split)); + WT_RET(__wt_ovfl_track_wrapup(session, mod->mod_root_split)); + __wt_page_out(session, &mod->mod_root_split); + break; + } + + return (ret); +} + +/* + * __rec_write_wrapup -- + * Finish the reconciliation. + */ +static int +__rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_BM *bm; + WT_BOUNDARY *bnd; + WT_BTREE *btree; + WT_MULTI *multi; + WT_PAGE_MODIFY *mod; + WT_REF *ref; + size_t addr_size; + const uint8_t *addr; + + btree = S2BT(session); + bm = btree->bm; + mod = page->modify; + ref = r->ref; + + /* + * This page may have previously been reconciled, and that information + * is now about to be replaced. Make sure it's discarded at some point, + * and clear the underlying modification information, we're creating a + * new reality. 
+ */ + switch (F_ISSET(mod, WT_PM_REC_MASK)) { + case 0: /* + * The page has never been reconciled before, free the original + * address blocks (if any). The "if any" is for empty trees + * created when a new tree is opened or previously deleted pages + * instantiated in memory. + * + * The exception is root pages are never tracked or free'd, they + * are checkpoints, and must be explicitly dropped. + */ + if (__wt_ref_is_root(ref)) + break; + if (ref->addr != NULL) { + /* + * Free the page and clear the address (so we don't free + * it twice). + */ + WT_RET(__wt_ref_info( + session, ref, &addr, &addr_size, NULL)); + WT_RET(bm->free(bm, session, addr, addr_size)); + if (__wt_off_page(ref->home, ref->addr)) { + __wt_free( + session, ((WT_ADDR *)ref->addr)->addr); + __wt_free(session, ref->addr); + } + ref->addr = NULL; + } + break; + case WT_PM_REC_EMPTY: /* Page deleted */ + break; + case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ + /* + * Discard the multiple replacement blocks. + */ + WT_RET(__rec_split_discard(session, page)); + break; + case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ + /* + * Discard the replacement leaf page's blocks. + * + * The exception is root pages are never tracked or free'd, they + * are checkpoints, and must be explicitly dropped. + */ + if (!__wt_ref_is_root(ref)) + WT_RET(bm->free(bm, session, + mod->mod_replace.addr, mod->mod_replace.size)); + + /* Discard the replacement page's address. */ + __wt_free(session, mod->mod_replace.addr); + mod->mod_replace.size = 0; + break; + WT_ILLEGAL_VALUE(session); + } + F_CLR(mod, WT_PM_REC_MASK); + + /* + * Wrap up overflow tracking. If we are about to create a checkpoint, + * the system must be entirely consistent at that point (the underlying + * block manager is presumably going to do some action to resolve the + * list of allocated/free/whatever blocks that are associated with the + * checkpoint). 
+ */ + WT_RET(__wt_ovfl_track_wrapup(session, page)); + + switch (r->bnd_next) { + case 0: /* Page delete */ + WT_RET(__wt_verbose( + session, WT_VERB_RECONCILE, "page %p empty", page)); + WT_STAT_FAST_DATA_INCR(session, rec_page_delete); + + /* If this is the root page, we need to create a sync point. */ + ref = r->ref; + if (__wt_ref_is_root(ref)) + WT_RET( + bm->checkpoint(bm, session, NULL, btree->ckpt, 0)); + + /* + * If the page was empty, we want to discard it from the tree + * by discarding the parent's key when evicting the parent. + * Mark the page as deleted, then return success, leaving the + * page in memory. If the page is subsequently modified, that + * is OK, we'll just reconcile it again. + */ + F_SET(mod, WT_PM_REC_EMPTY); + break; + case 1: /* 1-for-1 page swap */ + /* + * Because WiredTiger's pages grow without splitting, we're + * replacing a single page with another single page most of + * the time. + */ + bnd = &r->bnd[0]; + + /* + * If we're saving/restoring changes for this page, there's + * nothing to write. Allocate, then initialize the array of + * replacement blocks. + */ + if (bnd->skip != NULL) { + WT_RET(__wt_calloc_def( + session, r->bnd_next, &mod->mod_multi)); + multi = mod->mod_multi; + multi->skip = bnd->skip; + multi->skip_entries = bnd->skip_next; + bnd->skip = NULL; + multi->skip_dsk = bnd->dsk; + bnd->dsk = NULL; + mod->mod_multi_entries = 1; + + F_SET(mod, WT_PM_REC_MULTIBLOCK); + break; + } + + /* + * If this is a root page, then we don't have an address and we + * have to create a sync point. The address was cleared when + * we were about to write the buffer so we know what to do here. 
+ */ + if (bnd->addr.addr == NULL) + WT_RET(__wt_bt_write(session, + &r->dsk, NULL, NULL, 1, bnd->already_compressed)); + else { + mod->mod_replace = bnd->addr; + bnd->addr.addr = NULL; + } + + F_SET(mod, WT_PM_REC_REPLACE); + break; + default: /* Page split */ + WT_RET(__wt_verbose(session, WT_VERB_RECONCILE, + "page %p reconciled into %" PRIu32 " pages", + page, r->bnd_next)); + + switch (page->type) { + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + WT_STAT_FAST_DATA_INCR( + session, rec_multiblock_internal); + break; + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + case WT_PAGE_ROW_LEAF: + WT_STAT_FAST_DATA_INCR(session, rec_multiblock_leaf); + break; + WT_ILLEGAL_VALUE(session); + } + + /* Display the actual split keys. */ + if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT)) { + WT_DECL_ITEM(tkey); + WT_DECL_RET; + uint32_t i; + + if (page->type == WT_PAGE_ROW_INT || + page->type == WT_PAGE_ROW_LEAF) + WT_RET(__wt_scr_alloc(session, 0, &tkey)); + for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i) + switch (page->type) { + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + WT_ERR(__wt_buf_set_printable( + session, tkey, + bnd->key.data, bnd->key.size)); + WT_ERR(__wt_verbose( + session, WT_VERB_SPLIT, + "split: starting key " + "%.*s", + (int)tkey->size, + (const char *)tkey->data)); + break; + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_INT: + case WT_PAGE_COL_VAR: + WT_ERR(__wt_verbose( + session, WT_VERB_SPLIT, + "split: starting recno %" PRIu64, + bnd->recno)); + break; + WT_ILLEGAL_VALUE_ERR(session); + } +err: __wt_scr_free(&tkey); + WT_RET(ret); + } + if (r->bnd_next > r->bnd_next_max) { + r->bnd_next_max = r->bnd_next; + WT_STAT_FAST_DATA_SET( + session, rec_multiblock_max, r->bnd_next_max); + } + + switch (page->type) { + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + WT_RET(__rec_split_row(session, r, page)); + break; + case WT_PAGE_COL_INT: + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + WT_RET(__rec_split_col(session, r, page)); + break; + 
WT_ILLEGAL_VALUE(session); + } + F_SET(mod, WT_PM_REC_MULTIBLOCK); + break; + } + + /* + * If updates were skipped, the tree isn't clean. The checkpoint call + * cleared the tree's modified value before calling the eviction thread, + * so we must explicitly reset the tree's modified flag. We insert a + * barrier after the change for clarity (the requirement is the value + * be set before a subsequent checkpoint reads it, and because the + * current checkpoint is waiting on this reconciliation to complete, + * there's no risk of that happening). + * + * Otherwise, if no updates were skipped, we have a new maximum + * transaction written for the page (used to decide if a clean page can + * be evicted). The page only might be clean; if the write generation + * is unchanged since reconciliation started, clear it and update cache + * dirty statistics, if the write generation changed, then the page has + * been written since we started reconciliation, it cannot be + * discarded. + */ + if (r->leave_dirty) { + mod->first_dirty_txn = r->skipped_txn; + + btree->modified = 1; + WT_FULL_BARRIER(); + } else { + mod->rec_max_txn = r->max_txn; + + if (WT_ATOMIC_CAS4(mod->write_gen, r->orig_write_gen, 0)) + __wt_cache_dirty_decr(session, page); + } + + return (0); +} + +/* + * __rec_write_wrapup_err -- + * Finish the reconciliation on error. + */ +static int +__rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_BM *bm; + WT_BOUNDARY *bnd; + WT_DECL_RET; + WT_MULTI *multi; + WT_PAGE_MODIFY *mod; + uint32_t i; + + bm = S2BT(session)->bm; + mod = page->modify; + + /* + * Clear the address-reused flag from the multiblock reconciliation + * information (otherwise we might think the backing block is being + * reused on a subsequent reconciliation where we want to free it). 
 */
	if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_MULTIBLOCK)
		for (multi = mod->mod_multi,
		    i = 0; i < mod->mod_multi_entries; ++multi, ++i)
			multi->addr.reuse = 0;

	/*
	 * On error, discard blocks we've written, they're unreferenced by the
	 * tree. This is not a question of correctness, we're avoiding block
	 * leaks.
	 *
	 * Don't discard backing blocks marked for reuse, they remain part of
	 * a previous reconciliation.
	 */
	WT_TRET(__wt_ovfl_track_wrapup_err(session, page));
	for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i)
		if (bnd->addr.addr != NULL) {
			if (bnd->addr.reuse)
				bnd->addr.addr = NULL;
			else {
				/*
				 * Freshly written block: return it to the
				 * block manager, then free our copy of the
				 * address cookie.
				 */
				WT_TRET(bm->free(bm, session,
				    bnd->addr.addr, bnd->addr.size));
				__wt_free(session, bnd->addr.addr);
			}
		}

	return (ret);
}

/*
 * __rec_split_row --
 *	Split a row-store page into a set of replacement blocks.
 */
static int
__rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
{
	WT_BOUNDARY *bnd;
	WT_MULTI *multi;
	WT_PAGE_MODIFY *mod;
	WT_REF *ref;
	uint32_t i;
	size_t size;
	void *p;

	mod = page->modify;

	/* We never set the first page's key, grab it from the original page. */
	ref = r->ref;
	if (__wt_ref_is_root(ref))
		/* Root page: use a single zero byte as the smallest key. */
		WT_RET(__wt_buf_set(session, &r->bnd[0].key, "", 1));
	else {
		__wt_ref_key(ref->home, ref, &p, &size);
		WT_RET(__wt_buf_set(session, &r->bnd[0].key, p, size));
	}

	/* Allocate, then initialize the array of replacement blocks.
*/
	WT_RET(__wt_calloc_def(session, r->bnd_next, &mod->mod_multi));

	for (multi = mod->mod_multi,
	    bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
		/* Row-store: each chunk is keyed by an instantiated key. */
		WT_RET(__wt_row_ikey(session, 0,
		    bnd->key.data, bnd->key.size, &multi->key.ikey));

		if (bnd->skip == NULL) {
			/* Written block: transfer the address cookie. */
			multi->addr = bnd->addr;
			multi->addr.reuse = 0;
			multi->size = bnd->size;
			multi->cksum = bnd->cksum;
			bnd->addr.addr = NULL;
		} else {
			/* Saved updates: transfer the in-memory image. */
			multi->skip = bnd->skip;
			multi->skip_entries = bnd->skip_next;
			bnd->skip = NULL;
			multi->skip_dsk = bnd->dsk;
			bnd->dsk = NULL;
		}
	}
	mod->mod_multi_entries = r->bnd_next;

	return (0);
}

/*
 * __rec_split_col --
 *	Split a column-store page into a set of replacement blocks.
 *
 * This mirrors __rec_split_row, except each chunk is keyed by a starting
 * record number rather than an instantiated key.
 */
static int
__rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
{
	WT_BOUNDARY *bnd;
	WT_MULTI *multi;
	WT_PAGE_MODIFY *mod;
	uint32_t i;

	mod = page->modify;

	/* Allocate, then initialize the array of replacement blocks. */
	WT_RET(__wt_calloc_def(session, r->bnd_next, &mod->mod_multi));

	for (multi = mod->mod_multi,
	    bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
		multi->key.recno = bnd->recno;

		if (bnd->skip == NULL) {
			multi->addr = bnd->addr;
			multi->addr.reuse = 0;
			multi->size = bnd->size;
			multi->cksum = bnd->cksum;
			bnd->addr.addr = NULL;
		} else {
			multi->skip = bnd->skip;
			multi->skip_entries = bnd->skip_next;
			bnd->skip = NULL;
			multi->skip_dsk = bnd->dsk;
			bnd->dsk = NULL;
		}
	}
	mod->mod_multi_entries = r->bnd_next;

	return (0);
}

/*
 * __rec_cell_build_int_key --
 *	Process a key and return a WT_CELL structure and byte string to be
 * stored on a row-store internal page.
 */
static int
__rec_cell_build_int_key(WT_SESSION_IMPL *session,
    WT_RECONCILE *r, const void *data, size_t size, int *is_ovflp)
{
	WT_BTREE *btree;
	WT_KV *key;

	/* Cleared up front so callers may rely on it on every return path. */
	*is_ovflp = 0;

	btree = S2BT(session);

	key = &r->k;

	/* Copy the bytes into the "current" and key buffers.
*/ + WT_RET(__wt_buf_set(session, r->cur, data, size)); + WT_RET(__wt_buf_set(session, &key->buf, data, size)); + + /* Create an overflow object if the data won't fit. */ + if (size > btree->maxintlitem) { + WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_internal); + + *is_ovflp = 1; + return (__rec_cell_build_ovfl( + session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0)); + } + + key->cell_len = __wt_cell_pack_int_key(&key->cell, key->buf.size); + key->len = key->cell_len + key->buf.size; + + return (0); +} + +/* + * __rec_cell_build_leaf_key -- + * Process a key and return a WT_CELL structure and byte string to be + * stored on a row-store leaf page. + */ +static int +__rec_cell_build_leaf_key(WT_SESSION_IMPL *session, + WT_RECONCILE *r, const void *data, size_t size, int *is_ovflp) +{ + WT_BTREE *btree; + WT_KV *key; + size_t pfx_max; + uint8_t pfx; + const uint8_t *a, *b; + + *is_ovflp = 0; + + btree = S2BT(session); + + key = &r->k; + + pfx = 0; + if (data == NULL) + /* + * When data is NULL, our caller has a prefix compressed key + * they can't use (probably because they just crossed a split + * point). Use the full key saved when last called, instead. + */ + WT_RET(__wt_buf_set( + session, &key->buf, r->cur->data, r->cur->size)); + else { + /* + * Save a copy of the key for later reference: we use the full + * key for prefix-compression comparisons, and if we are, for + * any reason, unable to use the compressed key we generate. + */ + WT_RET(__wt_buf_set(session, r->cur, data, size)); + + /* + * Do prefix compression on the key. We know by definition the + * previous key sorts before the current key, which means the + * keys must differ and we just need to compare up to the + * shorter of the two keys. + */ + if (r->key_pfx_compress) { + /* + * We can't compress out more than 256 bytes, limit the + * comparison to that. 
			 */
			pfx_max = UINT8_MAX;
			if (size < pfx_max)
				pfx_max = size;
			if (r->last->size < pfx_max)
				pfx_max = r->last->size;
			for (a = data, b = r->last->data; pfx < pfx_max; ++pfx)
				if (*a++ != *b++)
					break;

			/*
			 * Prefix compression may cost us CPU and memory when
			 * the page is re-loaded, don't do it unless there's
			 * reasonable gain.
			 */
			if (pfx < btree->prefix_compression_min)
				pfx = 0;
			else
				WT_STAT_FAST_DATA_INCRV(
				    session, rec_prefix_compression, pfx);
		}

		/* Copy the non-prefix bytes into the key buffer. */
		WT_RET(__wt_buf_set(
		    session, &key->buf, (uint8_t *)data + pfx, size - pfx));
	}

	/* Optionally compress the key using the Huffman engine. */
	if (btree->huffman_key != NULL)
		WT_RET(__wt_huffman_encode(session, btree->huffman_key,
		    key->buf.data, (uint32_t)key->buf.size, &key->buf));

	/* Create an overflow object if the data won't fit. */
	if (key->buf.size > btree->maxleafitem) {
		/*
		 * Overflow objects aren't prefix compressed -- rebuild any
		 * object that was prefix compressed.
		 */
		if (pfx == 0) {
			WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_leaf);

			*is_ovflp = 1;
			return (__rec_cell_build_ovfl(
			    session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
		}
		/*
		 * Recurse with NULL data: the full, uncompressed key was
		 * saved in r->cur above, the retry uses that copy.
		 */
		return (
		    __rec_cell_build_leaf_key(session, r, NULL, 0, is_ovflp));
	}

	key->cell_len = __wt_cell_pack_leaf_key(&key->cell, pfx, key->buf.size);
	key->len = key->cell_len + key->buf.size;

	return (0);
}

/*
 * __rec_cell_build_addr --
 *	Process an address reference and return a cell structure to be stored
 * on the page.
 */
static void
__rec_cell_build_addr(WT_RECONCILE *r,
    const void *addr, size_t size, u_int cell_type, uint64_t recno)
{
	WT_KV *val;

	val = &r->v;

	/*
	 * We don't check the address size because we can't store an address on
	 * an overflow page: if the address won't fit, the overflow page's
	 * address won't fit either.
This possibility must be handled by Btree
	 * configuration, we have to disallow internal page sizes that are too
	 * small with respect to the largest address cookie the underlying block
	 * manager might return.
	 */

	/*
	 * We don't copy the data into the buffer, it's not necessary; just
	 * re-point the buffer's data/length fields.
	 */
	val->buf.data = addr;
	val->buf.size = size;
	val->cell_len =
	    __wt_cell_pack_addr(&val->cell, cell_type, recno, val->buf.size);
	val->len = val->cell_len + val->buf.size;
}

/*
 * __rec_cell_build_val --
 *	Process a data item and return a WT_CELL structure and byte string to
 * be stored on the page.
 *
 * The "rle" count is passed through to the cell packing routines; zero-length
 * values take the fast path below with no encoding or overflow checks.
 */
static int
__rec_cell_build_val(WT_SESSION_IMPL *session,
    WT_RECONCILE *r, const void *data, size_t size, uint64_t rle)
{
	WT_BTREE *btree;
	WT_KV *val;

	btree = S2BT(session);

	val = &r->v;

	/*
	 * We don't copy the data into the buffer, it's not necessary; just
	 * re-point the buffer's data/length fields.
	 */
	val->buf.data = data;
	val->buf.size = size;

	/* Handle zero-length cells quickly. */
	if (size != 0) {
		/*
		 * Optionally compress the data using the Huffman engine.
		 * Encoding happens before the overflow check so the size
		 * tested below is the size actually written to the page.
		 */
		if (btree->huffman_value != NULL)
			WT_RET(__wt_huffman_encode(
			    session, btree->huffman_value,
			    val->buf.data, (uint32_t)val->buf.size, &val->buf));

		/* Create an overflow object if the data won't fit. */
		if (val->buf.size > btree->maxleafitem) {
			WT_STAT_FAST_DATA_INCR(session, rec_overflow_value);

			return (__rec_cell_build_ovfl(
			    session, r, val, WT_CELL_VALUE_OVFL, rle));
		}
	}
	val->cell_len = __wt_cell_pack_data(&val->cell, rle, val->buf.size);
	val->len = val->cell_len + val->buf.size;

	return (0);
}

/*
 * __rec_cell_build_ovfl --
 *	Store overflow items in the file, returning the address cookie.
 */
static int
__rec_cell_build_ovfl(WT_SESSION_IMPL *session,
    WT_RECONCILE *r, WT_KV *kv, uint8_t type, uint64_t rle)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_HEADER *dsk;
	size_t size;
	uint8_t *addr, buf[WT_BTREE_MAX_ADDR_COOKIE];

	btree = S2BT(session);
	bm = btree->bm;
	page = r->page;

	/* Track if page has overflow items. */
	r->ovfl_items = 1;

	/*
	 * See if this overflow record has already been written and reuse it if
	 * possible. Else, write a new overflow record.
	 */
	if (!__wt_ovfl_reuse_search(session, page,
	    &addr, &size, kv->buf.data, kv->buf.size)) {
		/* Allocate a buffer big enough to write the overflow record. */
		size = kv->buf.size;
		WT_RET(bm->write_size(bm, session, &size));
		WT_RET(__wt_scr_alloc(session, size, &tmp));

		/* Initialize the buffer: disk header and overflow record. */
		dsk = tmp->mem;
		memset(dsk, 0, WT_PAGE_HEADER_SIZE);
		dsk->type = WT_PAGE_OVFL;
		dsk->u.datalen = (uint32_t)kv->buf.size;
		memcpy(WT_PAGE_HEADER_BYTE(btree, dsk),
		    kv->buf.data, kv->buf.size);
		dsk->mem_size = tmp->size =
		    WT_PAGE_HEADER_BYTE_SIZE(btree) + (uint32_t)kv->buf.size;

		/*
		 * Write the buffer. The on-stack cookie buffer backs "addr"
		 * only until the copy into kv->buf below.
		 */
		addr = buf;
		WT_ERR(__wt_bt_write(session, tmp, addr, &size, 0, 0));

		/*
		 * Track the overflow record (unless it's a bulk load, which
		 * by definition won't ever reuse a record).
		 */
		if (!r->is_bulk_load)
			WT_ERR(__wt_ovfl_reuse_add(session, page,
			    addr, size, kv->buf.data, kv->buf.size));
	}

	/* Set the callers K/V to reference the overflow record's address. */
	WT_ERR(__wt_buf_set(session, &kv->buf, addr, size));

	/* Build the cell and return. */
	kv->cell_len = __wt_cell_pack_ovfl(&kv->cell, type, rle, kv->buf.size);
	kv->len = kv->cell_len + kv->buf.size;

err:	__wt_scr_free(&tmp);
	return (ret);
}

/*
 * The dictionary --
 *	The rest of this file is support for dictionaries.
 *
 * It's difficult to write generic skiplist functions without turning a single
 * memory allocation into two, or requiring a function call instead of a simple
 * comparison. Fortunately, skiplists are relatively simple things and we can
 * include them in-place. If you need generic skip-list functions to modify,
 * this set wouldn't be a bad place to start.
 *
 * __rec_dictionary_skip_search --
 *	Search a dictionary skiplist.
 */
static WT_DICTIONARY *
__rec_dictionary_skip_search(WT_DICTIONARY **head, uint64_t hash)
{
	WT_DICTIONARY **e;
	int i;

	/*
	 * Start at the highest skip level, then go as far as possible at each
	 * level before stepping down to the next.
	 *
	 * Returns the first entry found with a matching hash; different
	 * values can hash equal, callers resolve collisions by walking the
	 * level-0 list from the returned entry.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
		if (*e == NULL) {		/* Empty levels */
			--i;
			--e;
			continue;
		}

		/*
		 * Return any exact matches: we don't care in what search level
		 * we found a match.
		 */
		if ((*e)->hash == hash)		/* Exact match */
			return (*e);
		if ((*e)->hash > hash) {	/* Drop down a level */
			--i;
			--e;
		} else				/* Keep going at this level */
			e = &(*e)->next[i];
	}
	return (NULL);
}

/*
 * __rec_dictionary_skip_search_stack --
 *	Search a dictionary skiplist, returning an insert/remove stack.
 */
static void
__rec_dictionary_skip_search_stack(
    WT_DICTIONARY **head, WT_DICTIONARY ***stack, uint64_t hash)
{
	WT_DICTIONARY **e;
	int i;

	/*
	 * Start at the highest skip level, then go as far as possible at each
	 * level before stepping down to the next.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;)
		if (*e == NULL || (*e)->hash > hash)
			stack[i--] = e--;	/* Drop down a level */
		else
			e = &(*e)->next[i];	/* Keep going at this level */
}

/*
 * __rec_dictionary_skip_insert --
 *	Insert an entry into the dictionary skip-list.
 */
static void
__rec_dictionary_skip_insert(
    WT_DICTIONARY **head, WT_DICTIONARY *e, uint64_t hash)
{
	WT_DICTIONARY **stack[WT_SKIP_MAXDEPTH];
	u_int i;

	/* Insert the new entry into the skiplist. */
	__rec_dictionary_skip_search_stack(head, stack, hash);
	for (i = 0; i < e->depth; ++i) {
		e->next[i] = *stack[i];
		*stack[i] = e;
	}
}

/*
 * __rec_dictionary_init --
 *	Allocate and initialize the dictionary.
 *
 * On allocation failure the dictionary array may be partially filled; that's
 * safe, __rec_dictionary_free handles a partially-allocated array.
 */
static int
__rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots)
{
	u_int depth, i;

	/* Free any previous dictionary. */
	__rec_dictionary_free(session, r);

	r->dictionary_slots = slots;
	WT_RET(__wt_calloc(session,
	    r->dictionary_slots, sizeof(WT_DICTIONARY *), &r->dictionary));
	for (i = 0; i < r->dictionary_slots; ++i) {
		depth = __wt_skip_choose_depth(session);
		WT_RET(__wt_calloc(session, 1,
		    sizeof(WT_DICTIONARY) + depth * sizeof(WT_DICTIONARY *),
		    &r->dictionary[i]));
		r->dictionary[i]->depth = depth;
	}
	return (0);
}

/*
 * __rec_dictionary_free --
 *	Free the dictionary.
 */
static void
__rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
	u_int i;

	if (r->dictionary == NULL)
		return;

	/*
	 * We don't correct dictionary_slots when we fail during allocation,
	 * but that's OK, the value is either NULL or a memory reference to
	 * be free'd.
	 */
	for (i = 0; i < r->dictionary_slots; ++i)
		__wt_free(session, r->dictionary[i]);
	__wt_free(session, r->dictionary);
}

/*
 * __rec_dictionary_reset --
 *	Reset the dictionary when reconciliation restarts and when crossing a
 * page boundary (a potential split).
 *
 * The entry allocations are kept; only the skiplist heads and the next-free
 * index are cleared, so the slots are reused for the next page image.
 */
static void
__rec_dictionary_reset(WT_RECONCILE *r)
{
	if (r->dictionary_slots) {
		r->dictionary_next = 0;
		memset(r->dictionary_head, 0, sizeof(r->dictionary_head));
	}
}

/*
 * __rec_dictionary_lookup --
 *	Check the dictionary for a matching value on this page.
 */
static int
__rec_dictionary_lookup(
    WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *val, WT_DICTIONARY **dpp)
{
	WT_DICTIONARY *dp, *next;
	uint64_t hash;
	int match;

	*dpp = NULL;

	/*
	 * Search the dictionary, and return any match we find. Hash
	 * collisions are resolved by comparing the candidate's packed cell
	 * against the value's bytes, not just the 64-bit hash.
	 */
	hash = __wt_hash_fnv64(val->buf.data, val->buf.size);
	for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash);
	    dp != NULL && dp->hash == hash; dp = dp->next[0]) {
		WT_RET(__wt_cell_pack_data_match(
		    dp->cell, &val->cell, val->buf.data, &match));
		if (match) {
			WT_STAT_FAST_DATA_INCR(session, rec_dictionary);
			*dpp = dp;
			return (0);
		}
	}

	/*
	 * We're not doing value replacement in the dictionary. We stop adding
	 * new entries if we run out of empty dictionary slots (but continue to
	 * use the existing entries). I can't think of any reason a leaf page
	 * value is more likely to be seen because it was seen more recently
	 * than some other value: if we find working sets where that's not the
	 * case, it shouldn't be too difficult to maintain a pointer which is
	 * the next dictionary slot to re-use.
	 */
	if (r->dictionary_next >= r->dictionary_slots)
		return (0);

	/*
	 * Set the hash value, we'll add this entry into the dictionary when we
	 * write it into the page's disk image buffer (because that's when we
	 * know where on the page it will be written).
	 */
	next = r->dictionary[r->dictionary_next++];
	next->cell = NULL;		/* Not necessary, just cautious. */
	next->hash = hash;
	__rec_dictionary_skip_insert(r->dictionary_head, next, hash);
	*dpp = next;
	return (0);
}
diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c
new file mode 100644
index 00000000000..308bc1f0dc5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/row_key.c
@@ -0,0 +1,500 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static void __inmem_row_leaf_slots(uint8_t *, uint32_t, uint32_t, uint32_t);

/*
 * __wt_row_leaf_keys --
 *	Instantiate the interesting keys for random search of a page.
 */
int
__wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BTREE *btree;
	WT_DECL_ITEM(key);
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_ROW *rip;
	uint32_t gap, i;

	btree = S2BT(session);

	if (page->pg_row_entries == 0) {		/* Just checking... */
		F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
		return (0);
	}

	/*
	 * Row-store leaf pages are written as one big prefix-compressed chunk,
	 * that is, only the first key on the page is not prefix-compressed, and
	 * to instantiate the last key on the page, you have to take the first
	 * key on the page and roll it forward to the end of the page. We don't
	 * want to do that on every page access, of course, so we instantiate a
	 * set of keys, essentially creating prefix chunks on the page, where we
	 * can roll forward from the closest, previous, instantiated key. The
	 * complication is that not all keys on a page are equal: we're doing a
	 * binary search on the page, which means there are keys we look at a
	 * lot (every time we search the page), and keys we never look at unless
	 * they are actually being searched for. This function figures out the
	 * "interesting" keys on a page, and then we sequentially walk that list
	 * instantiating those keys.
	 *
	 * Allocate a bit array and figure out the set of "interesting" keys,
	 * marking up the array.
	 */
	WT_RET(__wt_scr_alloc(session, 0, &key));
	WT_RET(__wt_scr_alloc(session,
	    (uint32_t)__bitstr_size(page->pg_row_entries), &tmp));

	/* Guard against a zero key_gap configuration value. */
	if ((gap = btree->key_gap) == 0)
		gap = 1;
	__inmem_row_leaf_slots(tmp->mem, 0, page->pg_row_entries, gap);

	/* Instantiate the keys.
*/
	for (rip = page->pg_row_d, i = 0; i < page->pg_row_entries; ++rip, ++i)
		if (__bit_test(tmp->mem, i))
			WT_ERR(__wt_row_leaf_key_work(
			    session, page, rip, key, 1));

	F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);

err:	__wt_scr_free(&key);
	__wt_scr_free(&tmp);
	return (ret);
}

/*
 * __inmem_row_leaf_slots --
 *	Figure out the interesting slots of a page for random search, up to
 * the specified depth.
 *
 * Marks the midpoint of each binary-search subtree, recursing into both
 * halves; recursion terminates once a subtree is smaller than the gap.
 */
static void
__inmem_row_leaf_slots(
    uint8_t *list, uint32_t base, uint32_t entries, uint32_t gap)
{
	uint32_t indx, limit;

	if (entries < gap)
		return;

	/*
	 * !!!
	 * Don't clean this code up -- it deliberately looks like the binary
	 * search code.
	 *
	 * !!!
	 * There's got to be a function that would give me this information, but
	 * I don't see any performance reason we can't just do this recursively.
	 */
	limit = entries;
	indx = base + (limit >> 1);
	__bit_set(list, indx);

	__inmem_row_leaf_slots(list, base, limit >> 1, gap);

	base = indx + 1;
	--limit;
	__inmem_row_leaf_slots(list, base, limit >> 1, gap);
}

/*
 * __wt_row_leaf_key_copy --
 *	Get a copy of a row-store leaf-page key.
 */
int
__wt_row_leaf_key_copy(
    WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key)
{
	WT_RET(__wt_row_leaf_key(session, page, rip, key, 0));

	/* The return buffer may only hold a reference to a key, copy it. */
	if (!WT_DATA_IN_ITEM(key))
		WT_RET(__wt_buf_set(session, key, key->data, key->size));

	return (0);
}

/*
 * __wt_row_leaf_key_work --
 *	Return a reference to a row-store leaf-page key; optionally instantiate
 * the key into the in-memory page.
 */
int
__wt_row_leaf_key_work(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, int instantiate)
{
	enum { FORWARD, BACKWARD } direction;
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_IKEY *ikey;
	WT_ROW *rip, *jump_rip;
	size_t size;
	u_int last_prefix;
	int jump_slot_offset, slot_offset;
	void *copy;
	const void *p;

	/*
	 * !!!
	 * It is unusual to call this function: most code should be calling the
	 * front-end, __wt_row_leaf_key, be careful if you're calling this code
	 * directly.
	 */

	btree = S2BT(session);
	unpack = &_unpack;
	rip = rip_arg;

	jump_rip = NULL;
	jump_slot_offset = 0;
	last_prefix = 0;

	p = NULL;			/* -Werror=maybe-uninitialized */
	size = 0;			/* -Werror=maybe-uninitialized */

	/*
	 * Roll backward from the requested slot until a usable key is found,
	 * then roll forward rebuilding the prefix-compressed key; slot_offset
	 * tracks the distance from the slot we were asked for.
	 */
	direction = BACKWARD;
	for (slot_offset = 0;;) {
		if (0) {
switch_and_jump:	/* Switching to a forward roll. */
			WT_ASSERT(session, direction == BACKWARD);
			direction = FORWARD;

			/* Skip list of keys with compatible prefixes. */
			rip = jump_rip;
			slot_offset = jump_slot_offset;
		}
		copy = WT_ROW_KEY_COPY(rip);

		/*
		 * Figure out what the key looks like.
		 */
		(void)__wt_row_leaf_key_info(
		    page, copy, &ikey, &cell, &p, &size);

		/* 1: the test for a directly referenced on-page key. */
		if (cell == NULL) {
			keyb->data = p;
			keyb->size = size;

			/*
			 * If this is the key we originally wanted, we don't
			 * care if we're rolling forward or backward, or if
			 * it's an overflow key or not, it's what we wanted.
			 * This shouldn't normally happen, the fast-path code
			 * that front-ends this function will have figured it
			 * out before we were called.
			 *
			 * The key doesn't need to be instantiated, skip past
			 * that test.
			 */
			if (slot_offset == 0)
				goto done;

			/*
			 * This key is not an overflow key by definition and
			 * isn't compressed in any way, we can use it to roll
			 * forward.
			 *	If rolling backward, switch directions.
			 *	If rolling forward: there's a bug somewhere,
			 * we should have hit this key when rolling backward.
			 */
			goto switch_and_jump;
		}

		/* 2: the test for an instantiated off-page key. */
		if (ikey != NULL) {
			/*
			 * If this is the key we originally wanted, we don't
			 * care if we're rolling forward or backward, or if
			 * it's an overflow key or not, it's what we wanted.
			 * Take a copy and wrap up.
			 *
			 * The key doesn't need to be instantiated, skip past
			 * that test.
			 */
			if (slot_offset == 0) {
				keyb->data = p;
				keyb->size = size;
				goto done;
			}

			/*
			 * If we wanted a different key and this key is an
			 * overflow key:
			 *	If we're rolling backward, this key is useless
			 * to us because it doesn't have a valid prefix: keep
			 * rolling backward.
			 *	If we're rolling forward, there's no work to be
			 * done because prefixes skip overflow keys: keep
			 * rolling forward.
			 */
			if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL)
				goto next;

			/*
			 * If we wanted a different key and this key is not an
			 * overflow key, it has a valid prefix, we can use it.
			 *	If rolling backward, take a copy of the key and
			 * switch directions, we can roll forward from this key.
			 *	If rolling forward, replace the key we've been
			 * building with this key, it's what we would have built
			 * anyway.
			 *	In short: if it's not an overflow key, take a copy
			 * and roll forward.
			 */
			keyb->data = p;
			keyb->size = size;
			direction = FORWARD;
			goto next;
		}

		/*
		 * It must be an on-page cell, unpack it.
		 */
		__wt_cell_unpack(cell, unpack);

		/* 3: the test for an on-page reference to an overflow key. */
		if (unpack->type == WT_CELL_KEY_OVFL) {
			/*
			 * If this is the key we wanted from the start, we don't
			 * care if it's an overflow key, get a copy and wrap up.
			 *
			 * Avoid racing with reconciliation deleting overflow
			 * keys. Deleted overflow keys must be instantiated
			 * first, acquire the overflow lock and check. Read
			 * the key if we still need to do so, but holding the
			 * overflow lock. Note we are not using the version of
			 * the cell-data-ref calls that acquire the overflow
			 * lock and do a look-aside into the tracking cache:
			 * this is an overflow key, not a value, meaning it's
			 * instantiated before being deleted, not copied into
			 * the tracking cache.
			 */
			if (slot_offset == 0) {
				WT_ERR(
				    __wt_readlock(session, btree->ovfl_lock));
				copy = WT_ROW_KEY_COPY(rip);
				if (!__wt_row_leaf_key_info(page, copy,
				    NULL, &cell, &keyb->data, &keyb->size)) {
					__wt_cell_unpack(cell, unpack);
					ret = __wt_dsk_cell_data_ref(session,
					    WT_PAGE_ROW_LEAF, unpack, keyb);
				}
				WT_TRET(
				    __wt_readunlock(session, btree->ovfl_lock));
				WT_ERR(ret);
				break;
			}

			/*
			 * If we wanted a different key:
			 *	If we're rolling backward, this key is useless
			 * to us because it doesn't have a valid prefix: keep
			 * rolling backward.
			 *	If we're rolling forward, there's no work to be
			 * done because prefixes skip overflow keys: keep
			 * rolling forward.
			 */
			goto next;
		}

		/*
		 * 4: the test for an on-page reference to a key that isn't
		 * prefix compressed.
		 */
		if (unpack->prefix == 0) {
			/*
			 * The only reason to be here is a Huffman encoded key,
			 * a non-encoded key with no prefix compression should
			 * have been directly referenced, and we should not have
			 * needed to unpack its cell.
			 */
			WT_ASSERT(session, btree->huffman_key != NULL);

			/*
			 * If this is the key we originally wanted, we don't
			 * care if we're rolling forward or backward, it's
			 * what we want. Take a copy and wrap up.
			 *
			 * If we wanted a different key, this key has a valid
			 * prefix, we can use it.
			 *	If rolling backward, take a copy of the key and
			 * switch directions, we can roll forward from this key.
			 *	If rolling forward there's a bug, we should have
			 * found this key while rolling backwards and switched
			 * directions then.
			 *
			 * The key doesn't need to be instantiated, skip past
			 * that test.
			 */
			WT_ERR(__wt_dsk_cell_data_ref(
			    session, WT_PAGE_ROW_LEAF, unpack, keyb));
			if (slot_offset == 0)
				goto done;
			goto switch_and_jump;
		}

		/*
		 * 5: an on-page reference to a key that's prefix compressed.
		 *	If rolling backward, keep looking for something we can
		 * use.
		 *	If rolling forward, build the full key and keep rolling
		 * forward.
		 */
		if (direction == BACKWARD) {
			/*
			 * If there's a set of keys with identical prefixes, we
			 * don't want to instantiate each one, the prefixes are
			 * all the same.
			 *
			 * As we roll backward through the page, track the last
			 * time the prefix decreased in size, so we can start
			 * with that key during our roll-forward. For a page
			 * populated with a single key prefix, we'll be able to
			 * instantiate the key we want as soon as we find a key
			 * without a prefix.
			 */
			if (slot_offset == 0)
				last_prefix = unpack->prefix;
			if (slot_offset == 0 || last_prefix > unpack->prefix) {
				jump_rip = rip;
				jump_slot_offset = slot_offset;
				last_prefix = unpack->prefix;
			}
		}
		if (direction == FORWARD) {
			/*
			 * Get a reference to the current key's bytes. Usually
			 * we want bytes from the page, fast-path that case.
			 */
			if (btree->huffman_key == NULL) {
				p = unpack->data;
				size = unpack->size;
			} else {
				if (tmp == NULL)
					WT_ERR(
					    __wt_scr_alloc(session, 0, &tmp));
				WT_ERR(__wt_dsk_cell_data_ref(
				    session, WT_PAGE_ROW_LEAF, unpack, tmp));
				p = tmp->data;
				size = tmp->size;
			}

			/*
			 * Grow the buffer as necessary as well as ensure data
			 * has been copied into local buffer space, then append
			 * the suffix to the prefix already in the buffer.
			 *
			 * Don't grow the buffer unnecessarily or copy data we
			 * don't need, truncate the item's data length to the
			 * prefix bytes.
			 */
			keyb->size = unpack->prefix;
			WT_ERR(__wt_buf_grow(session, keyb, keyb->size + size));
			memcpy((uint8_t *)keyb->data + keyb->size, p, size);
			keyb->size += size;

			if (slot_offset == 0)
				break;
		}

next:		switch (direction) {
		case BACKWARD:
			--rip;
			++slot_offset;
			break;
		case FORWARD:
			++rip;
			--slot_offset;
			break;
		}
	}

	/*
	 * Optionally instantiate the key: there's a cost to figuring out a key
	 * value in a leaf page with prefix-compressed or Huffman encoded keys,
	 * amortize the cost by instantiating a copy of the calculated key in
	 * allocated memory. We don't instantiate keys when pages are first
	 * brought into memory because it's wasted effort if the page is only
	 * read by a cursor in sorted order. If, instead, the page is read by a
	 * cursor in reverse order, we immediately instantiate periodic keys for
	 * the page (otherwise the reverse walk would be insanely slow). If,
	 * instead, the page is randomly searched, we instantiate keys as they
	 * are accessed (meaning, for example, as long as the binary search only
	 * touches one-half of the page, the only keys we instantiate will be in
	 * that half of the page).
	 */
	if (instantiate) {
		copy = WT_ROW_KEY_COPY(rip_arg);
		(void)__wt_row_leaf_key_info(
		    page, copy, &ikey, &cell, NULL, NULL);
		if (ikey == NULL) {
			WT_ERR(__wt_row_ikey(session,
			    WT_PAGE_DISK_OFFSET(page, cell),
			    keyb->data, keyb->size, &ikey));

			/*
			 * Serialize the swap of the key into place: on success,
			 * update the page's memory footprint, on failure, free
			 * the allocated memory.
			 */
			if (WT_ATOMIC_CAS8(WT_ROW_KEY_COPY(rip), copy, ikey))
				__wt_cache_page_inmem_incr(session,
				    page, sizeof(WT_IKEY) + ikey->size);
			else
				__wt_free(session, ikey);
		}
	}

done:
err:	__wt_scr_free(&tmp);
	return (ret);
}

/*
 * __wt_row_ikey_incr --
 *	Instantiate a key in a WT_IKEY structure and increment the page's
 * memory footprint.
 */
int
__wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page,
    uint32_t cell_offset, const void *key, size_t size, void *ikeyp)
{
	WT_RET(__wt_row_ikey(session, cell_offset, key, size, ikeyp));

	__wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + size);

	return (0);
}

/*
 * __wt_row_ikey --
 *	Instantiate a key in a WT_IKEY structure.
 *
 * On success, *ikeyp references a single allocation (structure plus trailing
 * key bytes) owned by the caller.
 */
int
__wt_row_ikey(WT_SESSION_IMPL *session,
    uint32_t cell_offset, const void *key, size_t size, void *ikeyp)
{
	WT_IKEY *ikey;

	/*
	 * Allocate memory for the WT_IKEY structure and the key, then copy
	 * the key into place.
	 */
	WT_RET(__wt_calloc(session, 1, sizeof(WT_IKEY) + size, &ikey));
	ikey->size = WT_STORE_SIZE(size);
	ikey->cell_offset = cell_offset;
	memcpy(WT_IKEY_DATA(ikey), key, size);

	*(WT_IKEY **)ikeyp = ikey;
	return (0);
}
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
new file mode 100644
index 00000000000..e0036d14cbb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -0,0 +1,346 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_page_modify_alloc --
 *	Allocate a page's modification structure.
 */
int
__wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_CONNECTION_IMPL *conn;
	WT_PAGE_MODIFY *modify;

	conn = S2C(session);

	WT_RET(__wt_calloc_def(session, 1, &modify));

	/*
	 * Select a spinlock for the page; let the barrier immediately below
	 * keep things from racing too badly.
	 */
	modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS(conn);

	/*
	 * Multiple threads of control may be searching and deciding to modify
	 * a page. If our modify structure is used, update the page's memory
	 * footprint, else discard the modify structure, another thread did the
	 * work.
+ */ + if (WT_ATOMIC_CAS8(page->modify, NULL, modify)) + __wt_cache_page_inmem_incr(session, page, sizeof(*modify)); + else + __wt_free(session, modify); + return (0); +} + +/* + * __wt_row_modify -- + * Row-store insert, update and delete. + */ +int +__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, + WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove) +{ + WT_DECL_RET; + WT_INSERT *ins; + WT_INSERT_HEAD *ins_head, **ins_headp; + WT_PAGE *page; + WT_UPDATE *old_upd, **upd_entry; + size_t ins_size, upd_size; + uint32_t ins_slot; + u_int i, skipdepth; + int logged; + + ins = NULL; + page = cbt->ref->page; + logged = 0; + + /* This code expects a remove to have a NULL value. */ + if (is_remove) + value = NULL; + + /* If we don't yet have a modify structure, we'll need one. */ + WT_RET(__wt_page_modify_init(session, page)); + + /* + * Modify: allocate an update array as necessary, build a WT_UPDATE + * structure, and call a serialized function to insert the WT_UPDATE + * structure. + * + * Insert: allocate an insert array as necessary, build a WT_INSERT + * and WT_UPDATE structure pair, and call a serialized function to + * insert the WT_INSERT structure. + */ + if (cbt->compare == 0) { + if (cbt->ins == NULL) { + /* Allocate an update array as necessary. */ + WT_PAGE_ALLOC_AND_SWAP(session, page, + page->pg_row_upd, upd_entry, page->pg_row_entries); + + /* Set the WT_UPDATE array reference. */ + upd_entry = &page->pg_row_upd[cbt->slot]; + } else + upd_entry = &cbt->ins->upd; + + if (upd == NULL) { + /* Make sure the update can proceed. */ + WT_ERR(__wt_txn_update_check( + session, old_upd = *upd_entry)); + + /* Allocate a WT_UPDATE structure and transaction ID. */ + WT_ERR( + __wt_update_alloc(session, value, &upd, &upd_size)); + WT_ERR(__wt_txn_modify(session, upd)); + logged = 1; + + /* Avoid WT_CURSOR.update data copy. */ + cbt->modify_update = upd; + } else { + upd_size = sizeof(WT_UPDATE) + + (WT_UPDATE_DELETED_ISSET(upd) ? 
0 : upd->size); + + /* + * We are restoring updates that couldn't be evicted, + * there should only be one update list per key. + */ + WT_ASSERT(session, *upd_entry == NULL); + /* + * Set the "old" entry to the second update in the list + * so that the serialization function succeeds in + * swapping the first update into place. + */ + old_upd = *upd_entry = upd->next; + } + + /* + * Point the new WT_UPDATE item to the next element in the list. + * If we get it right, the serialization function lock acts as + * our memory barrier to flush this write. + */ + upd->next = old_upd; + + /* Serialize the update. */ + WT_ERR(__wt_update_serial( + session, page, upd_entry, &upd, upd_size)); + } else { + /* + * Allocate the insert array as necessary. + * + * We allocate an additional insert array slot for insert keys + * sorting less than any key on the page. The test to select + * that slot is baroque: if the search returned the first page + * slot, we didn't end up processing an insert list, and the + * comparison value indicates the search key was smaller than + * the returned slot, then we're using the smallest-key insert + * slot. That's hard, so we set a flag. + */ + WT_PAGE_ALLOC_AND_SWAP(session, page, + page->pg_row_ins, ins_headp, page->pg_row_entries + 1); + + ins_slot = F_ISSET(cbt, WT_CBT_SEARCH_SMALLEST) ? + page->pg_row_entries: cbt->slot; + ins_headp = &page->pg_row_ins[ins_slot]; + + /* Allocate the WT_INSERT_HEAD structure as necessary. */ + WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1); + ins_head = *ins_headp; + + /* Choose a skiplist depth for this insert. */ + skipdepth = __wt_skip_choose_depth(session); + + /* + * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and + * update the cursor to reference it (the WT_INSERT_HEAD might + * be allocated, the WT_INSERT was allocated). 
+ */ + WT_ERR(__wt_row_insert_alloc( + session, key, skipdepth, &ins, &ins_size)); + cbt->ins_head = ins_head; + cbt->ins = ins; + + if (upd == NULL) { + WT_ERR( + __wt_update_alloc(session, value, &upd, &upd_size)); + WT_ERR(__wt_txn_modify(session, upd)); + logged = 1; + + /* Avoid WT_CURSOR.update data copy. */ + cbt->modify_update = upd; + } else + upd_size = sizeof(WT_UPDATE) + + (WT_UPDATE_DELETED_ISSET(upd) ? 0 : upd->size); + + ins->upd = upd; + ins_size += upd_size; + + /* + * If there was no insert list during the search, the cursor's + * information cannot be correct, search couldn't have + * initialized it. + * + * Otherwise, point the new WT_INSERT item's skiplist to the + * next elements in the insert list (which we will check are + * still valid inside the serialization function). + * + * The serial mutex acts as our memory barrier to flush these + * writes before inserting them into the list. + */ + if (WT_SKIP_FIRST(ins_head) == NULL) + for (i = 0; i < skipdepth; i++) { + cbt->ins_stack[i] = &ins_head->head[i]; + ins->next[i] = cbt->next_stack[i] = NULL; + } + else + for (i = 0; i < skipdepth; i++) + ins->next[i] = cbt->next_stack[i]; + + /* Insert the WT_INSERT structure. */ + WT_ERR(__wt_insert_serial( + session, page, cbt->ins_head, cbt->ins_stack, + &ins, ins_size, skipdepth)); + } + + if (logged) + WT_ERR(__wt_txn_log_op(session, cbt)); + + if (0) { +err: /* + * Remove the update from the current transaction, so we don't + * try to modify it on rollback. + */ + if (logged) + __wt_txn_unmodify(session); + __wt_free(session, ins); + cbt->ins = NULL; + __wt_free(session, upd); + } + + return (ret); +} + +/* + * __wt_row_insert_alloc -- + * Row-store insert: allocate a WT_INSERT structure and fill it in. 
+ */ +int +__wt_row_insert_alloc(WT_SESSION_IMPL *session, + WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep) +{ + WT_INSERT *ins; + size_t ins_size; + + /* + * Allocate the WT_INSERT structure, next pointers for the skip list, + * and room for the key. Then copy the key into place. + */ + ins_size = sizeof(WT_INSERT) + + skipdepth * sizeof(WT_INSERT *) + key->size; + WT_RET(__wt_calloc(session, 1, ins_size, &ins)); + + ins->u.key.offset = WT_STORE_SIZE(ins_size - key->size); + WT_INSERT_KEY_SIZE(ins) = WT_STORE_SIZE(key->size); + memcpy(WT_INSERT_KEY(ins), key->data, key->size); + + *insp = ins; + if (ins_sizep != NULL) + *ins_sizep = ins_size; + return (0); +} + +/* + * __wt_update_alloc -- + * Allocate a WT_UPDATE structure and associated value and fill it in. + */ +int +__wt_update_alloc( + WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep) +{ + WT_UPDATE *upd; + size_t size; + + /* + * Allocate the WT_UPDATE structure and room for the value, then copy + * the value into place. + */ + size = value == NULL ? 0 : value->size; + WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, &upd)); + if (value == NULL) + WT_UPDATE_DELETED_SET(upd); + else { + upd->size = WT_STORE_SIZE(size); + memcpy(WT_UPDATE_DATA(upd), value->data, size); + } + + *updp = upd; + *sizep = sizeof(WT_UPDATE) + size; + return (0); +} + +/* + * __wt_update_obsolete_check -- + * Check for obsolete updates. + */ +WT_UPDATE * +__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd) +{ + WT_UPDATE *first, *next; + + /* + * This function identifies obsolete updates, and truncates them from + * the rest of the chain; because this routine is called from inside + * a serialization function, the caller has responsibility for actually + * freeing the memory. + * + * Walk the list of updates, looking for obsolete updates at the end. 
+ */ + for (first = NULL; upd != NULL; upd = upd->next) + if (__wt_txn_visible_all(session, upd->txnid)) { + if (first == NULL) + first = upd; + } else if (upd->txnid != WT_TXN_ABORTED) + first = NULL; + + /* + * We cannot discard this WT_UPDATE structure, we can only discard + * WT_UPDATE structures subsequent to it, other threads of control will + * terminate their walk in this element. Save a reference to the list + * we will discard, and terminate the list. + */ + if (first != NULL && + (next = first->next) != NULL && + WT_ATOMIC_CAS8(first->next, next, NULL)) + return (next); + + return (NULL); +} + +/* + * __wt_update_obsolete_free -- + * Free an obsolete update list. + */ +void +__wt_update_obsolete_free( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd) +{ + WT_UPDATE *next; + size_t size; + + /* Free a WT_UPDATE list. */ + for (size = 0; upd != NULL; upd = next) { + /* Deleted items have a dummy size: don't include that. */ + size += sizeof(WT_UPDATE) + + (WT_UPDATE_DELETED_ISSET(upd) ? 0 : upd->size); + + next = upd->next; + __wt_free(session, upd); + } + if (size != 0) + __wt_cache_page_inmem_decr(session, page, size); +} diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c new file mode 100644 index 00000000000..b190aaaded5 --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -0,0 +1,553 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_search_insert_append -- + * Fast append search of a row-store insert list, creating a skiplist stack + * as we go. 
+ */ +static inline int +__wt_search_insert_append(WT_SESSION_IMPL *session, + WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, int *donep) +{ + WT_BTREE *btree; + WT_COLLATOR *collator; + WT_INSERT *ins; + WT_INSERT_HEAD *inshead; + WT_ITEM key; + int cmp, i; + + btree = S2BT(session); + collator = btree->collator; + *donep = 0; + + inshead = cbt->ins_head; + if ((ins = WT_SKIP_LAST(inshead)) == NULL) + return (0); + key.data = WT_INSERT_KEY(ins); + key.size = WT_INSERT_KEY_SIZE(ins); + + WT_RET(__wt_compare(session, collator, srch_key, &key, &cmp)); + if (cmp >= 0) { + /* + * !!! + * We may race with another appending thread. + * + * To catch that case, rely on the atomic pointer read above + * and set the next stack to NULL here. If we have raced with + * another thread, one of the next pointers will not be NULL by + * the time they are checked against the next stack inside the + * serialized insert function. + */ + for (i = WT_SKIP_MAXDEPTH - 1; i >= 0; i--) { + cbt->ins_stack[i] = (i == 0) ? &ins->next[0] : + (inshead->tail[i] != NULL) ? + &inshead->tail[i]->next[i] : &inshead->head[i]; + cbt->next_stack[i] = NULL; + } + cbt->compare = -cmp; + cbt->ins = ins; + *donep = 1; + } + return (0); +} + +/* + * __wt_search_insert -- + * Search a row-store insert list, creating a skiplist stack as we go. + */ +int +__wt_search_insert( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key) +{ + WT_BTREE *btree; + WT_COLLATOR *collator; + WT_INSERT *ins, **insp, *last_ins; + WT_INSERT_HEAD *inshead; + WT_ITEM key; + size_t match, skiphigh, skiplow; + int cmp, i; + + btree = S2BT(session); + collator = btree->collator; + inshead = cbt->ins_head; + cmp = 0; /* -Wuninitialized */ + + /* + * The insert list is a skip list: start at the highest skip level, then + * go as far as possible at each level before stepping down to the next. 
+ */ + match = skiphigh = skiplow = 0; + ins = last_ins = NULL; + for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) { + if ((ins = *insp) == NULL) { + cbt->next_stack[i] = NULL; + cbt->ins_stack[i--] = insp--; + continue; + } + + /* + * Comparisons may be repeated as we drop down skiplist levels; + * don't repeat comparisons, they might be expensive. + */ + if (ins != last_ins) { + last_ins = ins; + key.data = WT_INSERT_KEY(ins); + key.size = WT_INSERT_KEY_SIZE(ins); + match = WT_MIN(skiplow, skiphigh); + WT_RET(__wt_compare_skip( + session, collator, srch_key, &key, &cmp, &match)); + } + + if (cmp > 0) { /* Keep going at this level */ + insp = &ins->next[i]; + skiplow = match; + } else if (cmp < 0) { /* Drop down a level */ + cbt->next_stack[i] = ins; + cbt->ins_stack[i--] = insp--; + skiphigh = match; + } else + for (; i >= 0; i--) { + cbt->next_stack[i] = ins->next[i]; + cbt->ins_stack[i] = &ins->next[i]; + } + } + + /* + * For every insert element we review, we're getting closer to a better + * choice; update the compare field to its new value. If we went past + * the last item in the list, return the last one: that is used to + * decide whether we are positioned in a skiplist. + */ + cbt->compare = -cmp; + cbt->ins = (ins != NULL) ? ins : last_ins; + return (0); +} + +/* + * __wt_row_search -- + * Search a row-store tree for a specific key. + */ +int +__wt_row_search(WT_SESSION_IMPL *session, + WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert) +{ + WT_BTREE *btree; + WT_COLLATOR *collator; + WT_DECL_RET; + WT_ITEM *item; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + WT_REF *current, *descent; + WT_ROW *rip; + size_t match, skiphigh, skiplow; + uint32_t base, indx, limit; + int append_check, cmp, depth, descend_right, done; + + btree = S2BT(session); + collator = btree->collator; + item = &cbt->search_key; + + __cursor_pos_clear(cbt); + + /* + * The row-store search routine uses a different comparison API. 
+ * The assumption is we're comparing more than a few keys with + * matching prefixes, and it's a win to avoid the memory fetches + * by skipping over those prefixes. That's done by tracking the + * length of the prefix match for the lowest and highest keys we + * compare as we descend the tree. + */ + skiphigh = skiplow = 0; + + /* + * If a cursor repeatedly appends to the tree, compare the search key + * against the last key on each internal page during insert before + * doing the full binary search. + * + * Track if the descent is to the right-side of the tree, used to set + * the cursor's append history. + */ + append_check = insert && cbt->append_tree; + descend_right = 1; + + /* + * In the service of eviction splits, we're only searching a single leaf + * page, not a full tree. + */ + if (leaf != NULL) { + current = leaf; + goto leaf_only; + } + + /* Search the internal pages of the tree. */ + cmp = -1; + current = &btree->root; + for (depth = 2;; ++depth) { +restart: page = current->page; + if (page->type != WT_PAGE_ROW_INT) + break; + + pindex = WT_INTL_INDEX_COPY(page); + + /* + * Fast-path internal pages with one child, a common case for + * the root page in new trees. + */ + if (pindex->entries == 1) { + descent = pindex->index[0]; + goto descend; + } + + /* Fast-path appends. */ + if (append_check) { + descent = pindex->index[pindex->entries - 1]; + __wt_ref_key(page, descent, &item->data, &item->size); + WT_ERR(__wt_compare( + session, collator, srch_key, item, &cmp)); + if (cmp >= 0) + goto descend; + + /* A failed append check turns off append checks. */ + append_check = 0; + } + + /* + * Binary search of the internal page. There are two versions + * (a default loop and an application-specified collation loop), + * because moving the collation test and error handling inside + * the loop costs about 5%. + * + * The 0th key on an internal page is a problem for a couple of + * reasons. 
First, we have to force the 0th key to sort less + * than any application key, so internal pages don't have to be + * updated if the application stores a new, "smallest" key in + * the tree. Second, reconciliation is aware of this and will + * store a byte of garbage in the 0th key, so the comparison of + * an application key and a 0th key is meaningless (but doing + * the comparison could still incorrectly modify our tracking + * of the leading bytes in each key that we can skip during the + * comparison). For these reasons, skip the 0th key. + */ + base = 1; + limit = pindex->entries - 1; + if (collator == NULL) + for (; limit != 0; limit >>= 1) { + indx = base + (limit >> 1); + descent = pindex->index[indx]; + __wt_ref_key( + page, descent, &item->data, &item->size); + + match = WT_MIN(skiplow, skiphigh); + cmp = __wt_lex_compare_skip( + srch_key, item, &match); + if (cmp > 0) { + skiplow = match; + base = indx + 1; + --limit; + } else if (cmp < 0) + skiphigh = match; + else + goto descend; + } + else + for (; limit != 0; limit >>= 1) { + indx = base + (limit >> 1); + descent = pindex->index[indx]; + __wt_ref_key( + page, descent, &item->data, &item->size); + + WT_ERR(__wt_compare( + session, collator, srch_key, item, &cmp)); + if (cmp > 0) { + base = indx + 1; + --limit; + } else if (cmp == 0) + goto descend; + } + + /* + * Set the slot to descend the tree: descent is already set if + * there was an exact match on the page, otherwise, base is + * the smallest index greater than key, possibly (last + 1). + */ + descent = pindex->index[base - 1]; + + /* + * If we end up somewhere other than the last slot, it's not a + * right-side descent. + */ + if (pindex->entries != base - 1) + descend_right = 0; + +descend: /* + * Swap the current page for the child page. If the page splits + * while we're retrieving it, restart the search in the current + * page; otherwise return on error, the swap call ensures we're + * holding nothing on failure. 
+ */ + switch (ret = __wt_page_swap(session, current, descent, 0)) { + case 0: + current = descent; + break; + case WT_RESTART: + skiphigh = skiplow = 0; + goto restart; + default: + return (ret); + } + } + + /* Track how deep the tree gets. */ + if (depth > btree->maximum_depth) + btree->maximum_depth = depth; + +leaf_only: + page = current->page; + cbt->ref = current; + + /* + * In the case of a right-side tree descent during an insert, do a fast + * check for an append to the page, try to catch cursors appending data + * into the tree. + * + * It's tempting to make this test more rigorous: if a cursor inserts + * randomly into a two-level tree (a root referencing a single child + * that's empty except for an insert list), the right-side descent flag + * will be set and this comparison wasted. The problem resolves itself + * as the tree grows larger: either we're no longer doing right-side + * descent, or we'll avoid additional comparisons in internal pages, + * making up for the wasted comparison here. Similarly, the cursor's + * history is set any time it's an insert and a right-side descent, + * both to avoid a complicated/expensive test, and, in the case of + * multiple threads appending to the tree, we want to mark them all as + * appending, even if this test doesn't work. + */ + if (insert && descend_right) { + cbt->append_tree = 1; + + if (page->pg_row_entries == 0) { + cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); + + F_SET(cbt, WT_CBT_SEARCH_SMALLEST); + cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); + } else { + cbt->slot = WT_ROW_SLOT(page, + page->pg_row_d + (page->pg_row_entries - 1)); + + cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); + } + + WT_ERR( + __wt_search_insert_append(session, cbt, srch_key, &done)); + if (done) + return (0); + + /* + * Don't leave the insert list head set, code external to the + * search uses it. + */ + cbt->ins_head = NULL; + } + + /* + * Binary search of the leaf page. 
There are two versions (a default + * loop and an application-specified collation loop), because moving + * the collation test and error handling inside the loop costs about 5%. + */ + base = 0; + limit = page->pg_row_entries; + if (collator == NULL) + for (; limit != 0; limit >>= 1) { + indx = base + (limit >> 1); + rip = page->pg_row_d + indx; + WT_ERR(__wt_row_leaf_key(session, page, rip, item, 1)); + + match = WT_MIN(skiplow, skiphigh); + cmp = __wt_lex_compare_skip(srch_key, item, &match); + if (cmp > 0) { + skiplow = match; + base = indx + 1; + --limit; + } else if (cmp < 0) + skiphigh = match; + else + goto leaf_match; + } + else + for (; limit != 0; limit >>= 1) { + indx = base + (limit >> 1); + rip = page->pg_row_d + indx; + WT_ERR(__wt_row_leaf_key(session, page, rip, item, 1)); + + WT_ERR(__wt_compare( + session, collator, srch_key, item, &cmp)); + if (cmp > 0) { + base = indx + 1; + --limit; + } else if (cmp == 0) + goto leaf_match; + } + + /* + * The best case is finding an exact match in the leaf page's WT_ROW + * array, probable for any read-mostly workload. Check that case and + * get out fast. + */ + if (0) { +leaf_match: cbt->compare = 0; + cbt->slot = WT_ROW_SLOT(page, rip); + return (0); + } + + /* + * We didn't find an exact match in the WT_ROW array. + * + * Base is the smallest index greater than key and may be the 0th index + * or the (last + 1) index. Set the slot to be the largest index less + * than the key if that's possible (if base is the 0th index it means + * the application is inserting a key before any key found on the page). + * + * It's still possible there is an exact match, but it's on an insert + * list. Figure out which insert chain to search and then set up the + * return information assuming we'll find nothing in the insert list + * (we'll correct as needed inside the search routine, depending on + * what we find). 
+ * + * If inserting a key smaller than any key found in the WT_ROW array, + * use the extra slot of the insert array, otherwise the insert array + * maps one-to-one to the WT_ROW array. + */ + if (base == 0) { + cbt->compare = 1; + cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); + + F_SET(cbt, WT_CBT_SEARCH_SMALLEST); + cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); + } else { + cbt->compare = -1; + cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (base - 1)); + + cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); + } + + /* If there's no insert list, we're done. */ + if (WT_SKIP_FIRST(cbt->ins_head) == NULL) + return (0); + + /* + * Test for an append first when inserting onto an insert list, try to + * catch cursors repeatedly inserting at a single point. + */ + if (insert) { + WT_ERR( + __wt_search_insert_append(session, cbt, srch_key, &done)); + if (done) + return (0); + } + WT_ERR(__wt_search_insert(session, cbt, srch_key)); + + return (0); + +err: if (leaf != NULL) + WT_TRET(__wt_page_release(session, current, 0)); + return (ret); +} + +/* + * __wt_row_random -- + * Return a random key from a row-store tree. + */ +int +__wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_INSERT *p, *t; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + WT_REF *current, *descent; + + btree = S2BT(session); + + __cursor_pos_clear(cbt); + +restart: + /* Walk the internal pages of the tree. */ + current = &btree->root; + for (;;) { + page = current->page; + if (page->type != WT_PAGE_ROW_INT) + break; + + pindex = WT_INTL_INDEX_COPY(page); + descent = pindex->index[ + __wt_random(session->rnd) % pindex->entries]; + + /* + * Swap the parent page for the child page; return on error, + * the swap function ensures we're holding nothing on failure. 
+ */ + if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) { + current = descent; + continue; + } + /* + * Restart is returned if we find a page that's been split; the + * held page isn't discarded when restart is returned, discard + * it and restart the search from the top of the tree. + */ + if (ret == WT_RESTART && + (ret = __wt_page_release(session, current, 0)) == 0) + goto restart; + return (ret); + } + + if (page->pg_row_entries != 0) { + /* + * The use case for this call is finding a place to split the + * tree. Cheat (it's not like this is "random", anyway), and + * make things easier by returning the first key on the page. + * If the caller is attempting to split a newly created tree, + * or a tree with just one big page, that's not going to work, + * check for that. + */ + cbt->ref = current; + cbt->compare = 0; + pindex = WT_INTL_INDEX_COPY(btree->root.page); + cbt->slot = pindex->entries < 2 ? + __wt_random(session->rnd) % page->pg_row_entries : 0; + + return (__wt_row_leaf_key(session, + page, page->pg_row_d + cbt->slot, &cbt->search_key, 0)); + } + + /* + * If the tree is new (and not empty), it might have a large insert + * list, pick the key in the middle of that insert list. + */ + F_SET(cbt, WT_CBT_SEARCH_SMALLEST); + if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) + WT_ERR(WT_NOTFOUND); + for (p = t = WT_SKIP_FIRST(cbt->ins_head);;) { + if ((p = WT_SKIP_NEXT(p)) == NULL) + break; + if ((p = WT_SKIP_NEXT(p)) == NULL) + break; + t = WT_SKIP_NEXT(t); + } + cbt->ref = current; + cbt->compare = 0; + cbt->ins = t; + + return (0); + +err: WT_TRET(__wt_page_release(session, current, 0)); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/config/config.c b/src/third_party/wiredtiger/src/config/config.c new file mode 100644 index 00000000000..c792cb4fcf2 --- /dev/null +++ b/src/third_party/wiredtiger/src/config/config.c @@ -0,0 +1,745 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. 
+ * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __config_err -- + * Error message and return for config string parse failures. + */ +static int +__config_err(WT_CONFIG *conf, const char *msg, int err) +{ + WT_RET_MSG(conf->session, err, + "Error parsing '%.*s' at byte %u: %s", + (int)(conf->end - conf->orig), conf->orig, + (u_int)(conf->cur - conf->orig), msg); +} + +/* + * __wt_config_initn -- + * Initialize a config handle, used to iterate through a config string of + * specified length. + */ +int +__wt_config_initn( + WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len) +{ + conf->session = session; + conf->orig = conf->cur = str; + conf->end = str + len; + conf->depth = 0; + conf->top = -1; + conf->go = NULL; + + return (0); +} + +/* + * __wt_config_init -- + * Initialize a config handle, used to iterate through a NUL-terminated + * config string. + */ +int +__wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str) +{ + size_t len; + + len = (str == NULL) ? 0 : strlen(str); + + return (__wt_config_initn(session, conf, str, len)); +} + +/* + * __wt_config_subinit -- + * Initialize a config handle, used to iterate through a config string + * extracted from another config string (used for parsing nested + * structures). 
+ */ +int +__wt_config_subinit( + WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item) +{ + return (__wt_config_initn(session, conf, item->str, item->len)); +} + +#define PUSH(i, t) do { \ + if (conf->top == -1) \ + conf->top = conf->depth; \ + if (conf->depth == conf->top) { \ + if (out->len > 0) \ + return (__config_err(conf, \ + "New value starts without a separator", \ + EINVAL)); \ + out->type = (t); \ + out->str = (conf->cur + (i)); \ + } \ +} while (0) + +#define CAP(i) do { \ + if (conf->depth == conf->top) \ + out->len = (size_t)((conf->cur + (i) + 1) - out->str); \ +} while (0) + +typedef enum { + A_LOOP, A_BAD, A_DOWN, A_UP, A_VALUE, A_NEXT, A_QDOWN, A_QUP, + A_ESC, A_UNESC, A_BARE, A_NUMBARE, A_UNBARE, A_UTF8_2, + A_UTF8_3, A_UTF8_4, A_UTF_CONTINUE +} CONFIG_ACTION; + +/* + * static void *gostruct[] = { + * [0 ... 255] = &&l_bad, + * ['\t'] = &&l_loop, [' '] = &&l_loop, + * ['\r'] = &&l_loop, ['\n'] = &&l_loop, + * ['"'] = &&l_qup, + * [':'] = &&l_value, ['='] = &&l_value, + * [','] = &&l_next, + * // tracking [] and {} individually would allow fuller + * // validation but is really messy + * ['('] = &&l_up, [')'] = &&l_down, + * ['['] = &&l_up, [']'] = &&l_down, + * ['{'] = &&l_up, ['}'] = &&l_down, + * // bare identifiers + * ['-'] = &&l_numbare, + * ['0' ... '9'] = &&l_numbare, + * ['_'] = &&l_bare, + * ['A' ... 'Z'] = &&l_bare, ['a' ... 
'z'] = &&l_bare, + * ['/'] = &&l_bare, + * }; + */ +static const int8_t gostruct[256] = { + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_LOOP, A_LOOP, A_BAD, A_BAD, A_LOOP, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_LOOP, A_BAD, A_QUP, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UP, A_DOWN, A_BAD, A_BAD, + A_NEXT, A_NUMBARE, A_BARE, A_BARE, A_NUMBARE, A_NUMBARE, + A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE, + A_NUMBARE, A_NUMBARE, A_NUMBARE, A_VALUE, A_BAD, A_BAD, + A_VALUE, A_BAD, A_BAD, A_BAD, A_BARE, A_BARE, A_BARE, A_BARE, + A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, + A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, + A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_UP, A_BAD, + A_DOWN, A_BAD, A_BARE, A_BAD, A_BARE, A_BARE, A_BARE, A_BARE, + A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, + A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, + A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_UP, A_BAD, + A_DOWN, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, 
A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD +}; + +/* + * static void *gobare[] = + * { + * [0 ... 31] = &&l_bad, + * // could be more pedantic/validation-checking + * [32 ... 126] = &&l_loop, + * ['\t'] = &&l_unbare, [' '] = &&l_unbare, + * ['\r'] = &&l_unbare, ['\n'] = &&l_unbare, + * [':'] = &&l_unbare, ['='] = &&l_unbare, + * [','] = &&l_unbare, + * [')'] = &&l_unbare, [']'] = &&l_unbare, ['}'] = &&l_unbare, + * [127 ... 255] = &&l_bad + * }; + */ +static const int8_t gobare[256] = { + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_UNBARE, A_UNBARE, A_BAD, A_BAD, A_UNBARE, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UNBARE, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_UNBARE, A_LOOP, A_LOOP, A_UNBARE, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_UNBARE, A_LOOP, A_LOOP, A_UNBARE, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_UNBARE, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_UNBARE, A_LOOP, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, 
A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD +}; + +/* + * static void *gostring[] = + * { + * [0 ... 31] = &&l_bad, [127] = &&l_bad, + * [32 ... 126] = &&l_loop, + * ['\\'] = &&l_esc, ['"'] = &&l_qdown, + * [128 ... 191] = &&l_bad, + * [192 ... 223] = &&l_utf8_2, + * [224 ... 239] = &&l_utf8_3, + * [240 ... 247] = &&l_utf8_4, + * [248 ... 255] = &&l_bad + * }; + */ +static const int8_t gostring[256] = { + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_LOOP, A_LOOP, A_QDOWN, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_ESC, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, + A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, 
A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UTF8_2, + A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, + A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, + A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, + A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, + A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, + A_UTF8_2, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, + A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, + A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_4, + A_UTF8_4, A_UTF8_4, A_UTF8_4, A_UTF8_4, A_UTF8_4, A_UTF8_4, + A_UTF8_4, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD +}; + +/* + * static void *goutf8_continue[] = + * { + * [0 ... 127] = &&l_bad, + * [128 ... 191] = &&l_utf_continue, + * [192 ... 
255] = &&l_bad + * }; + */ +static const int8_t goutf8_continue[256] = { + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, 
A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, + A_UTF_CONTINUE, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD +}; + +/* + * static void *goesc[] = + * { + * [0 ... 255] = &&l_bad, + * ['"'] = &&l_unesc, ['\\'] = &&l_unesc, + * ['/'] = &&l_unesc, ['b'] = &&l_unesc, + * ['f'] = &&l_unesc, ['n'] = &&l_unesc, + * ['r'] = &&l_unesc, ['t'] = &&l_unesc, ['u'] = &&l_unesc + * }; + */ +static const int8_t goesc[256] = { + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UNESC, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_UNESC, A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD, + A_BAD, A_BAD, A_UNESC, A_BAD, A_UNESC, A_UNESC, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, 
A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, + A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD +}; + +/* + * __config_next -- + * Get the next config item in the string without processing the value. + */ +static int +__config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value) +{ + WT_CONFIG_ITEM *out = key; + int utf8_remain = 0; + static const WT_CONFIG_ITEM true_value = { + "", 0, 1, WT_CONFIG_ITEM_BOOL + }; + + key->len = 0; + /* Keys with no value default to true. */ + *value = true_value; + + if (conf->go == NULL) + conf->go = gostruct; + + while (conf->cur < conf->end) { + switch (conf->go[(int)*conf->cur]) { + case A_LOOP: + break; + + case A_BAD: + return (__config_err( + conf, "Unexpected character", EINVAL)); + + case A_DOWN: + --conf->depth; + CAP(0); + break; + + case A_UP: + if (conf->top == -1) + conf->top = 1; + PUSH(0, WT_CONFIG_ITEM_STRUCT); + ++conf->depth; + break; + + case A_VALUE: + if (conf->depth == conf->top) { + /* + * Special case: ':' is permitted in unquoted + * values. + */ + if (out == value && *conf->cur != ':') + return (__config_err(conf, + "Value already complete", EINVAL)); + out = value; + } + break; + + case A_NEXT: + /* + * If we're at the top level and we have a complete + * key (and optional value), we're done. 
 */
			if (conf->depth == conf->top && key->len > 0) {
				++conf->cur;
				return (0);
			} else
				break;

		case A_QDOWN:
			/* Closing quote: capture the string (excluding it). */
			CAP(-1);
			conf->go = gostruct;
			break;

		case A_QUP:
			/* Opening quote: start a quoted-string item. */
			PUSH(1, WT_CONFIG_ITEM_STRING);
			conf->go = gostring;
			break;

		case A_ESC:
			/* Backslash in a string: validate the next byte. */
			conf->go = goesc;
			break;

		case A_UNESC:
			/* Escape sequence done, back to string scanning. */
			conf->go = gostring;
			break;

		case A_BARE:
			/* Start of an unquoted identifier. */
			PUSH(0, WT_CONFIG_ITEM_ID);
			conf->go = gobare;
			break;

		case A_NUMBARE:
			/* Start of an unquoted number. */
			PUSH(0, WT_CONFIG_ITEM_NUM);
			conf->go = gobare;
			break;

		case A_UNBARE:
			/*
			 * End of a bare token: capture it, then reprocess the
			 * terminating character in struct mode -- note the
			 * "continue", the cursor must not advance here.
			 */
			CAP(-1);
			conf->go = gostruct;
			continue;

		case A_UTF8_2:
			conf->go = goutf8_continue;
			utf8_remain = 1;
			break;

		case A_UTF8_3:
			conf->go = goutf8_continue;
			utf8_remain = 2;
			break;

		case A_UTF8_4:
			conf->go = goutf8_continue;
			utf8_remain = 3;
			break;

		case A_UTF_CONTINUE:
			/* Count down multi-byte continuation bytes. */
			if (!--utf8_remain)
				conf->go = gostring;
			break;
		}

		conf->cur++;
	}

	/* Might have a trailing key/value without a closing brace */
	if (conf->go == gobare) {
		CAP(-1);
		conf->go = gostruct;
	}

	/* Did we find something? */
	if (conf->depth <= conf->top && key->len > 0)
		return (0);

	/* We're either at the end of the string or we failed to parse. */
	if (conf->depth == 0)
		return (WT_NOTFOUND);

	return (__config_err(conf,
	    "Closing brackets missing from config string", EINVAL));
}

/*
 * Arithmetic shift of a negative number is undefined by ISO/IEC 9899, and the
 * WiredTiger API supports negative numbers.  Check it's not a negative number,
 * and then cast the shift out of paranoia.
 */
#define	WT_SHIFT_INT64(v, s) do {					\
	if ((v) < 0)							\
		goto range;						\
	(v) = (int64_t)(((uint64_t)(v)) << (s));			\
} while (0)

/*
 * __config_process_value --
 *	Deal with special config values like true / false.
 */
static int
__config_process_value(WT_CONFIG *conf, WT_CONFIG_ITEM *value)
{
	char *endptr;

	/* Empty values are okay: we can't do anything interesting with them.
*/ + if (value->len == 0) + return (0); + + if (value->type == WT_CONFIG_ITEM_ID) { + if (strncasecmp(value->str, "true", value->len) == 0) { + value->type = WT_CONFIG_ITEM_BOOL; + value->val = 1; + } else if (strncasecmp(value->str, "false", value->len) == 0) { + value->type = WT_CONFIG_ITEM_BOOL; + value->val = 0; + } + } else if (value->type == WT_CONFIG_ITEM_NUM) { + errno = 0; + value->val = strtoll(value->str, &endptr, 10); + + /* Check any leftover characters. */ + while (endptr < value->str + value->len) + switch (*endptr++) { + case 'b': + case 'B': + /* Byte: no change. */ + break; + case 'k': + case 'K': + WT_SHIFT_INT64(value->val, 10); + break; + case 'm': + case 'M': + WT_SHIFT_INT64(value->val, 20); + break; + case 'g': + case 'G': + WT_SHIFT_INT64(value->val, 30); + break; + case 't': + case 'T': + WT_SHIFT_INT64(value->val, 40); + break; + case 'p': + case 'P': + WT_SHIFT_INT64(value->val, 50); + break; + default: + /* + * We didn't get a well-formed number. That + * might be okay, the required type will be + * checked by __wt_config_check. + */ + value->type = WT_CONFIG_ITEM_ID; + break; + } + + /* + * If we parsed the whole string but the number is out of range, + * report an error. Don't report an error for strings that + * aren't well-formed integers: if an integer is expected, that + * will be caught by __wt_config_check. + */ + if (value->type == WT_CONFIG_ITEM_NUM && errno == ERANGE) + goto range; + } + + return (0); + +range: + return (__config_err(conf, "Number out of range", ERANGE)); +} + +/* + * __wt_config_next -- + * Get the next config item in the string and process the value. + */ +int +__wt_config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value) +{ + WT_RET(__config_next(conf, key, value)); + return (__config_process_value(conf, value)); +} + +/* + * __config_getraw -- + * Given a config parser, find the final value for a given key. 
 */
static int
__config_getraw(
    WT_CONFIG *cparser, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, int top)
{
	WT_CONFIG sparser;
	WT_CONFIG_ITEM k, v, subk;
	WT_DECL_RET;
	int found;

	found = 0;
	/* Walk every entry, keeping the last match (later values win). */
	while ((ret = __config_next(cparser, &k, &v)) == 0) {
		if (k.type != WT_CONFIG_ITEM_STRING &&
		    k.type != WT_CONFIG_ITEM_ID)
			continue;
		if (k.len == key->len &&
		    strncasecmp(key->str, k.str, k.len) == 0) {
			*value = v;
			found = 1;
		} else if (k.len < key->len && key->str[k.len] == '.' &&
		    strncasecmp(key->str, k.str, k.len) == 0) {
			/*
			 * Dotted key ("a.b"): recurse into the nested struct
			 * value with the remainder of the key.
			 */
			subk.str = key->str + k.len + 1;
			subk.len = (key->len - k.len) - 1;
			WT_RET(__wt_config_initn(
			    cparser->session, &sparser, v.str, v.len));
			if ((ret =
			    __config_getraw(&sparser, &subk, value, 0)) == 0)
				found = 1;
			WT_RET_NOTFOUND_OK(ret);
		}
	}
	WT_RET_NOTFOUND_OK(ret);

	if (!found)
		return (WT_NOTFOUND);
	/* Only post-process (true/false, suffixes) at the top level. */
	return (top ? __config_process_value(cparser, value) : 0);
}

/*
 * __wt_config_get --
 *	Given a NULL-terminated list of configuration strings, find
 *	the final value for a given key.  Returns WT_NOTFOUND if no
 *	string contains the key.
 */
int
__wt_config_get(WT_SESSION_IMPL *session,
    const char **cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
{
	WT_CONFIG cparser;
	WT_DECL_RET;
	int found;

	for (found = 0; *cfg != NULL; cfg++) {
		WT_RET(__wt_config_init(session, &cparser, *cfg));
		if ((ret = __config_getraw(&cparser, key, value, 1)) == 0)
			found = 1;
		else if (ret != WT_NOTFOUND)
			return (ret);
	}

	return (found ? 0 : WT_NOTFOUND);
}

/*
 * __wt_config_gets --
 *	Given a NULL-terminated list of configuration strings, find the final
 *	value for a given string key.
 */
int
__wt_config_gets(WT_SESSION_IMPL *session,
    const char **cfg, const char *key, WT_CONFIG_ITEM *value)
{
	WT_CONFIG_ITEM key_item =
	    { key, strlen(key), 0, WT_CONFIG_ITEM_STRING };

	return (__wt_config_get(session, cfg, &key_item, value));
}

/*
 * __wt_config_getone --
 *	Get the value for a given key from a single config string.
 */
int
__wt_config_getone(WT_SESSION_IMPL *session,
    const char *config, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
{
	WT_CONFIG cparser;

	WT_RET(__wt_config_init(session, &cparser, config));
	return (__config_getraw(&cparser, key, value, 1));
}

/*
 * __wt_config_getones --
 *	Get the value for a given string key from a single config string.
 */
int
__wt_config_getones(WT_SESSION_IMPL *session,
    const char *config, const char *key, WT_CONFIG_ITEM *value)
{
	WT_CONFIG cparser;
	WT_CONFIG_ITEM key_item =
	    { key, strlen(key), 0, WT_CONFIG_ITEM_STRING };

	WT_RET(__wt_config_init(session, &cparser, config));
	return (__config_getraw(&cparser, &key_item, value, 1));
}

/*
 * __wt_config_gets_def --
 *	Performance hack: skip parsing config strings by hard-coding defaults.
 *
 * It's expensive to repeatedly parse configuration strings, so don't do
 * it unless it's necessary in performance paths like cursor creation.
 * Assume the second configuration string is the application's
 * configuration string, and if it's not set (which is true most of the
 * time), then use the supplied default value.  This makes it faster to
 * open cursors when checking for obscure open configuration strings like
 * "next_random".
 */
int
__wt_config_gets_def(WT_SESSION_IMPL *session,
    const char **cfg, const char *key, int def, WT_CONFIG_ITEM *value)
{
	static const WT_CONFIG_ITEM false_value = {
		"", 0, 0, WT_CONFIG_ITEM_NUM
	};

	/* Preload the caller's default; it stands if no override exists. */
	*value = false_value;
	value->val = def;
	if (cfg == NULL || cfg[0] == NULL || cfg[1] == NULL)
		return (0);
	else if (cfg[2] == NULL)
		/* One application string: search it alone, keep default. */
		WT_RET_NOTFOUND_OK(
		    __wt_config_getones(session, cfg[1], key, value));
	/* Multiple strings: fall back to the full (slow) search. */
	return (__wt_config_gets(session, cfg, key, value));
}

/*
 * __wt_config_subgetraw --
 *	Get the value for a given key from a config string in a WT_CONFIG_ITEM.
 *	This is useful for dealing with nested structs in config strings.
 */
int
__wt_config_subgetraw(WT_SESSION_IMPL *session,
    WT_CONFIG_ITEM *cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
{
	WT_CONFIG cparser;

	WT_RET(__wt_config_initn(session, &cparser, cfg->str, cfg->len));
	return (__config_getraw(&cparser, key, value, 1));
}

/*
 * __wt_config_subgets --
 *	Get the value for a given key from a config string in a WT_CONFIG_ITEM.
 *	This is useful for dealing with nested structs in config strings.
 */
int
__wt_config_subgets(WT_SESSION_IMPL *session,
    WT_CONFIG_ITEM *cfg, const char *key, WT_CONFIG_ITEM *value)
{
	WT_CONFIG_ITEM key_item =
	    { key, strlen(key), 0, WT_CONFIG_ITEM_STRING };

	return (__wt_config_subgetraw(session, cfg, &key_item, value));
}
diff --git a/src/third_party/wiredtiger/src/config/config_api.c b/src/third_party/wiredtiger/src/config/config_api.c
new file mode 100644
index 00000000000..42f4c117b81
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_api.c
@@ -0,0 +1,105 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __config_parser_close --
 *	WT_CONFIG_PARSER->close method.  Frees the parser handle; the
 *	underlying configuration string belongs to the application.
 */
static int
__config_parser_close(WT_CONFIG_PARSER *wt_config_parser)
{
	WT_CONFIG_PARSER_IMPL *config_parser;

	config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser;

	if (config_parser == NULL)
		return (EINVAL);

	__wt_free(config_parser->session, config_parser);
	return (0);
}

/*
 * __config_parser_get --
 *	WT_CONFIG_PARSER->search method.
 */
static int
__config_parser_get(WT_CONFIG_PARSER *wt_config_parser,
    const char *key, WT_CONFIG_ITEM *cval)
{
	WT_CONFIG_PARSER_IMPL *config_parser;

	config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser;

	if (config_parser == NULL)
		return (EINVAL);

	/* Search the whole string, independent of the iteration cursor. */
	return (__wt_config_subgets(config_parser->session,
	    &config_parser->config_item, key, cval));
}

/*
 * __config_parser_next --
 *	WT_CONFIG_PARSER->next method.  Advances the iteration cursor and
 *	returns the next key/value pair, WT_NOTFOUND at the end.
 */
static int
__config_parser_next(WT_CONFIG_PARSER *wt_config_parser,
    WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *cval)
{
	WT_CONFIG_PARSER_IMPL *config_parser;

	config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser;

	if (config_parser == NULL)
		return (EINVAL);

	return (__wt_config_next(&config_parser->config, key, cval));
}

/*
 * wiredtiger_config_parser_open --
 *	Create a configuration parser.
 */
int
wiredtiger_config_parser_open(WT_SESSION *wt_session,
    const char *config, size_t len, WT_CONFIG_PARSER **config_parserp)
{
	static const WT_CONFIG_PARSER stds = {
		__config_parser_close,
		__config_parser_next,
		__config_parser_get
	};
	WT_CONFIG_ITEM config_item =
	    { config, len, 0, WT_CONFIG_ITEM_STRING };
	WT_CONFIG_PARSER_IMPL *config_parser;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	*config_parserp = NULL;
	session = (WT_SESSION_IMPL *)wt_session;

	WT_RET(__wt_calloc_def(session, 1, &config_parser));
	config_parser->iface = stds;
	config_parser->session = session;

	/*
	 * Setup a WT_CONFIG_ITEM to be used for get calls and a WT_CONFIG
	 * structure for iterations through the configuration string.
 */
	memcpy(&config_parser->config_item, &config_item, sizeof(config_item));
	WT_ERR(__wt_config_initn(
	    session, &config_parser->config, config, len));

	if (ret == 0)
		*config_parserp = (WT_CONFIG_PARSER *)config_parser;
	else
		/* Initialization failed: release the half-built handle. */
err:		__wt_free(session, config_parser);

	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/config/config_check.c b/src/third_party/wiredtiger/src/config/config_check.c
new file mode 100644
index 00000000000..310e54c3349
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_check.c
@@ -0,0 +1,370 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int config_check(
    WT_SESSION_IMPL *, const WT_CONFIG_CHECK *, const char *, size_t);

/*
 * __conn_foc_add --
 *	Add a new entry into the connection's free-on-close list.
 */
static int
__conn_foc_add(WT_SESSION_IMPL *session, const void *p)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);

	/*
	 * Our caller is expected to be holding any locks we need.
	 */
	WT_RET(__wt_realloc_def(
	    session, &conn->foc_size, conn->foc_cnt + 1, &conn->foc));

	conn->foc[conn->foc_cnt++] = (void *)p;
	return (0);
}

/*
 * __wt_conn_foc_discard --
 *	Discard any memory the connection accumulated.
 */
void
__wt_conn_foc_discard(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	size_t i;

	conn = S2C(session);

	/*
	 * If we have a list of chunks to free, run through the list, then
	 * free the list itself.
	 */
	for (i = 0; i < conn->foc_cnt; ++i)
		__wt_free(session, conn->foc[i]);
	__wt_free(session, conn->foc);
}

/*
 * __wt_configure_method --
 *	WT_CONNECTION.configure_method.
 */
int
__wt_configure_method(WT_SESSION_IMPL *session,
    const char *method, const char *uri,
    const char *config, const char *type, const char *check)
{
	const WT_CONFIG_CHECK *cp;
	WT_CONFIG_CHECK *checks, *newcheck;
	const WT_CONFIG_ENTRY **epp;
	WT_CONFIG_ENTRY *entry;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	size_t cnt;
	char *newcheck_name, *p;

	/*
	 * !!!
	 * We ignore the specified uri, that is, all new configuration options
	 * will be valid for all data sources.  That's shouldn't be too bad
	 * as the worst that can happen is an application might specify some
	 * configuration option and not get an error -- the option should be
	 * ignored by the underlying implementation since it's unexpected, so
	 * there shouldn't be any real problems.  Eventually I expect we will
	 * get the whole data-source thing sorted, at which time there may be
	 * configuration arrays for each data source, and that's when the uri
	 * will matter.
	 */
	WT_UNUSED(uri);

	conn = S2C(session);
	checks = newcheck = NULL;
	entry = NULL;
	newcheck_name = NULL;

	/* Argument checking; we only support a limited number of types. */
	if (config == NULL)
		WT_RET_MSG(session, EINVAL, "no configuration specified");
	if (type == NULL)
		WT_RET_MSG(session, EINVAL, "no configuration type specified");
	if (strcmp(type, "boolean") != 0 && strcmp(type, "int") != 0 &&
	    strcmp(type, "list") != 0 && strcmp(type, "string") != 0)
		WT_RET_MSG(session, EINVAL,
		    "type must be one of \"boolean\", \"int\", \"list\" or "
		    "\"string\"");

	/* Find a match for the method name. */
	for (epp = conn->config_entries; (*epp)->method != NULL; ++epp)
		if (strcmp((*epp)->method, method) == 0)
			break;
	if ((*epp)->method == NULL)
		WT_RET_MSG(session,
		    WT_NOTFOUND, "no method matching %s found", method);

	/*
	 * Technically possible for threads to race, lock the connection while
	 * adding the new configuration information.  We're holding the lock
	 * for an extended period of time, but configuration changes should be
	 * rare and only happen during startup.
	 */
	__wt_spin_lock(session, &conn->api_lock);

	/*
	 * Allocate new configuration entry and fill it in.
	 *
	 * The new base value is the previous base value, a separator and the
	 * new configuration string.
	 */
	WT_ERR(__wt_calloc_def(session, 1, &entry));
	entry->method = (*epp)->method;
	WT_ERR(__wt_calloc_def(session,
	    strlen((*epp)->base) + strlen(",") + strlen(config) + 1, &p));
	(void)strcpy(p, (*epp)->base);
	(void)strcat(p, ",");
	(void)strcat(p, config);
	entry->base = p;

	/*
	 * There may be a default value in the config argument passed in (for
	 * example, (kvs_parallelism=64").  The default value isn't part of the
	 * name, build a new one.
	 */
	WT_ERR(__wt_strdup(session, config, &newcheck_name));
	if ((p = strchr(newcheck_name, '=')) != NULL)
		*p = '\0';

	/*
	 * The new configuration name may replace an existing check with new
	 * information, in that case skip the old version.
	 */
	cnt = 0;
	if ((*epp)->checks != NULL)
		for (cp = (*epp)->checks; cp->name != NULL; ++cp)
			++cnt;
	/* cnt + 2: room for the new check plus the NULL-name terminator. */
	WT_ERR(__wt_calloc_def(session, cnt + 2, &checks));
	cnt = 0;
	if ((*epp)->checks != NULL)
		for (cp = (*epp)->checks; cp->name != NULL; ++cp)
			if (strcmp(newcheck_name, cp->name) != 0)
				checks[cnt++] = *cp;
	newcheck = &checks[cnt];
	newcheck->name = newcheck_name;
	WT_ERR(__wt_strdup(session, type, &newcheck->type));
	if (check != NULL)
		WT_ERR(__wt_strdup(session, check, &newcheck->checks));
	entry->checks = checks;

	/*
	 * Confirm the configuration string passes the new set of
	 * checks.
	 */
	WT_ERR(config_check(session, entry->checks, config, 0));

	/*
	 * The next time this configuration is updated, we don't want to figure
	 * out which of these pieces of memory were allocated and will need to
	 * be free'd on close (this isn't a heavily used API and it's too much
	 * work); add them all to the free-on-close list now.  We don't check
	 * for errors deliberately, we'd have to figure out which elements have
	 * already been added to the free-on-close array and which have not in
	 * order to avoid freeing chunks of memory twice.  Again, this isn't a
	 * commonly used API and it shouldn't ever happen, just leak it.
	 */
	(void)__conn_foc_add(session, entry->base);
	(void)__conn_foc_add(session, entry);
	(void)__conn_foc_add(session, checks);
	(void)__conn_foc_add(session, newcheck->type);
	(void)__conn_foc_add(session, newcheck->checks);
	(void)__conn_foc_add(session, newcheck_name);

	/*
	 * Instead of using locks to protect configuration information, assume
	 * we can atomically update a pointer to a chunk of memory, and because
	 * a pointer is never partially written, readers will correctly see the
	 * original or new versions of the memory.  Readers might be using the
	 * old version as it's being updated, though, which means we cannot free
	 * the old chunk of memory until all possible readers have finished.
	 * Currently, that's on connection close: in other words, we can use
	 * this because it's small amounts of memory, and we really, really do
	 * not want to acquire locks every time we access configuration strings,
	 * since that's done on every API call.
	 */
	WT_PUBLISH(*epp, entry);

	if (0) {
		/* Error path: unwind the partially-built entry. */
err:		if (entry != NULL) {
			__wt_free(session, entry->base);
			__wt_free(session, entry);
		}
		__wt_free(session, checks);
		if (newcheck != NULL) {
			__wt_free(session, newcheck->type);
			__wt_free(session, newcheck->checks);
		}
		__wt_free(session, newcheck_name);
	}

	__wt_spin_unlock(session, &conn->api_lock);
	return (ret);
}

/*
 * __wt_config_check --
 *	Check the keys in an application-supplied config string match what is
 *	specified in an array of check strings.
 */
int
__wt_config_check(WT_SESSION_IMPL *session,
    const WT_CONFIG_ENTRY *entry, const char *config, size_t config_len)
{
	/*
	 * Callers don't check, it's a fast call without a configuration or
	 * check array.
	 */
	return (config == NULL || entry->checks == NULL ?
	    0 : config_check(session, entry->checks, config, config_len));
}

/*
 * config_check --
 *	Check the keys in an application-supplied config string match what is
 *	specified in an array of check strings.
 */
static int
config_check(WT_SESSION_IMPL *session,
    const WT_CONFIG_CHECK *checks, const char *config, size_t config_len)
{
	WT_CONFIG parser, cparser, sparser;
	WT_CONFIG_ITEM k, v, ck, cv, dummy;
	WT_DECL_RET;
	int badtype, found, i;

	/*
	 * The config_len parameter is optional, and allows passing in strings
	 * that are not nul-terminated.
	 */
	if (config_len == 0)
		WT_RET(__wt_config_init(session, &parser, config));
	else
		WT_RET(__wt_config_initn(session, &parser, config, config_len));
	while ((ret = __wt_config_next(&parser, &k, &v)) == 0) {
		if (k.type != WT_CONFIG_ITEM_STRING &&
		    k.type != WT_CONFIG_ITEM_ID)
			WT_RET_MSG(session, EINVAL,
			    "Invalid configuration key found: '%.*s'",
			    (int)k.len, k.str);

		/* Search for a matching entry.
 */
		for (i = 0; checks[i].name != NULL; i++)
			if (WT_STRING_MATCH(checks[i].name, k.str, k.len))
				break;
		if (checks[i].name == NULL)
			WT_RET_MSG(session, EINVAL,
			    "unknown configuration key: '%.*s'",
			    (int)k.len, k.str);

		/* Validate the value against the check's declared type. */
		if (strcmp(checks[i].type, "boolean") == 0) {
			badtype = (v.type != WT_CONFIG_ITEM_BOOL &&
			    (v.type != WT_CONFIG_ITEM_NUM ||
			    (v.val != 0 && v.val != 1)));
		} else if (strcmp(checks[i].type, "category") == 0) {
			/* Deal with categories of the form: XXX=(XXX=blah). */
			ret = config_check(session,
			    checks[i].subconfigs,
			    k.str + strlen(checks[i].name) + 1, v.len);
			if (ret != EINVAL)
				badtype = 0;
			else
				badtype = 1;
		} else if (strcmp(checks[i].type, "format") == 0) {
			badtype = 0;
		} else if (strcmp(checks[i].type, "int") == 0) {
			badtype = (v.type != WT_CONFIG_ITEM_NUM);
		} else if (strcmp(checks[i].type, "list") == 0) {
			badtype = (v.len > 0 &&
			    v.type != WT_CONFIG_ITEM_STRUCT);
		} else if (strcmp(checks[i].type, "string") == 0) {
			badtype = 0;
		} else
			WT_RET_MSG(session, EINVAL,
			    "unknown configuration type: '%s'",
			    checks[i].type);

		if (badtype)
			WT_RET_MSG(session, EINVAL,
			    "Invalid value for key '%.*s': expected a %s",
			    (int)k.len, k.str, checks[i].type);

		if (checks[i].checks == NULL)
			continue;

		/* Setup an iterator for the check string.
 */
		WT_RET(__wt_config_init(session, &cparser, checks[i].checks));
		while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
			if (WT_STRING_MATCH("min", ck.str, ck.len)) {
				if (v.val < cv.val)
					WT_RET_MSG(session, EINVAL,
					    "Value too small for key '%.*s' "
					    "the minimum is %.*s",
					    (int)k.len, k.str,
					    (int)cv.len, cv.str);
			} else if (WT_STRING_MATCH("max", ck.str, ck.len)) {
				if (v.val > cv.val)
					WT_RET_MSG(session, EINVAL,
					    "Value too large for key '%.*s' "
					    "the maximum is %.*s",
					    (int)k.len, k.str,
					    (int)cv.len, cv.str);
			} else if (WT_STRING_MATCH("choices", ck.str, ck.len)) {
				if (v.len == 0)
					WT_RET_MSG(session, EINVAL,
					    "Key '%.*s' requires a value",
					    (int)k.len, k.str);
				if (v.type == WT_CONFIG_ITEM_STRUCT) {
					/*
					 * Handle the 'verbose' case of a list
					 * containing restricted choices.
					 */
					WT_RET(__wt_config_subinit(session,
					    &sparser, &v));
					found = 1;
					while (found &&
					    (ret = __wt_config_next(&sparser,
					    &v, &dummy)) == 0) {
						ret = __wt_config_subgetraw(
						    session, &cv, &v, &dummy);
						found = (ret == 0);
					}
				} else {
					ret = __wt_config_subgetraw(session,
					    &cv, &v, &dummy);
					found = (ret == 0);
				}

				if (ret != 0 && ret != WT_NOTFOUND)
					return (ret);
				if (!found)
					WT_RET_MSG(session, EINVAL,
					    "Value '%.*s' not a "
					    "permitted choice for key '%.*s'",
					    (int)v.len, v.str,
					    (int)k.len, k.str);
			} else
				WT_RET_MSG(session, EINVAL,
				    "unexpected configuration description "
				    "keyword %.*s", (int)ck.len, ck.str);
		}
	}

	/* Normal termination of the key loop reports success. */
	if (ret == WT_NOTFOUND)
		ret = 0;

	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/config/config_collapse.c b/src/third_party/wiredtiger/src/config/config_collapse.c
new file mode 100644
index 00000000000..3e4c539cbe9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_collapse.c
@@ -0,0 +1,380 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
+ */ + +#include "wt_internal.h" + +/* + * __wt_config_collapse -- + * Collapse a set of configuration strings into newly allocated memory. + * + * This function takes a NULL-terminated list of configuration strings (where + * the first one contains all the defaults and the values are in order from + * least to most preferred, that is, the default values are least preferred), + * and collapses them into newly allocated memory. The algorithm is to walk + * the first of the configuration strings, and for each entry, search all of + * the configuration strings for a final value, keeping the last value found. + * + * Notes: + * Any key not appearing in the first configuration string is discarded + * from the final result, because we'll never search for it. + * + * Nested structures aren't parsed. For example, imagine a configuration + * string contains "key=(k2=v2,k3=v3)", and a subsequent string has + * "key=(k4=v4)", the result will be "key=(k4=v4)", as we search for and + * use the final value of "key", regardless of field overlap or missing + * fields in the nested value. + */ +int +__wt_config_collapse( + WT_SESSION_IMPL *session, const char **cfg, const char **config_ret) +{ + WT_CONFIG cparser; + WT_CONFIG_ITEM k, v; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + + WT_ERR(__wt_config_init(session, &cparser, cfg[0])); + while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) { + if (k.type != WT_CONFIG_ITEM_STRING && + k.type != WT_CONFIG_ITEM_ID) + WT_ERR_MSG(session, EINVAL, + "Invalid configuration key found: '%s'\n", k.str); + WT_ERR(__wt_config_get(session, cfg, &k, &v)); + /* Include the quotes around string keys/values. 
 */
		if (k.type == WT_CONFIG_ITEM_STRING) {
			/* Back up over the opening quote, count both quotes. */
			--k.str;
			k.len += 2;
		}
		if (v.type == WT_CONFIG_ITEM_STRING) {
			--v.str;
			v.len += 2;
		}
		WT_ERR(__wt_buf_catfmt(session, tmp, "%.*s=%.*s,",
		    (int)k.len, k.str, (int)v.len, v.str));
	}
	if (ret != WT_NOTFOUND)
		goto err;

	/*
	 * If the caller passes us no valid configuration strings, we get here
	 * with no bytes to copy -- that's OK, the underlying string copy can
	 * handle empty strings.
	 *
	 * Strip any trailing comma.
	 */
	if (tmp->size != 0)
		--tmp->size;
	ret = __wt_strndup(session, tmp->data, tmp->size, config_ret);

err:	__wt_scr_free(&tmp);
	return (ret);
}

/*
 * We need a character that can't appear in a key as a separator.
 */
#undef	SEP				/* separator key, character */
#define	SEP	"["
#undef	SEPC
#define	SEPC	'['

/*
 * Individual configuration entries, including a generation number used to make
 * the qsort stable.
 */
typedef struct {
	char *k, *v;			/* key, value */
	size_t gen;			/* generation */
} WT_CONFIG_MERGE_ENTRY;

/*
 * The array of configuration entries.
 */
typedef struct {
	size_t entries_allocated;	/* allocated */
	size_t entries_next;		/* next slot */

	WT_CONFIG_MERGE_ENTRY *entries;	/* array of entries */
} WT_CONFIG_MERGE;

/*
 * __config_merge_scan --
 *	Walk a configuration string, inserting entries into the merged array.
+ */ +static int +__config_merge_scan(WT_SESSION_IMPL *session, + const char *key, const char *value, WT_CONFIG_MERGE *cp) +{ + WT_CONFIG cparser; + WT_CONFIG_ITEM k, v; + WT_DECL_ITEM(kb); + WT_DECL_ITEM(vb); + WT_DECL_RET; + size_t len; + + WT_ERR(__wt_scr_alloc(session, 0, &kb)); + WT_ERR(__wt_scr_alloc(session, 0, &vb)); + + WT_ERR(__wt_config_init(session, &cparser, value)); + while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) { + if (k.type != WT_CONFIG_ITEM_STRING && + k.type != WT_CONFIG_ITEM_ID) + WT_ERR_MSG(session, EINVAL, + "Invalid configuration key found: '%s'\n", k.str); + + /* Include the quotes around string keys/values. */ + if (k.type == WT_CONFIG_ITEM_STRING) { + --k.str; + k.len += 2; + } + if (v.type == WT_CONFIG_ITEM_STRING) { + --v.str; + v.len += 2; + } + + /* + * !!! + * We're using a JSON quote character to separate the names we + * create for nested structures. That's not completely safe as + * it's possible to quote characters in JSON such that a quote + * character appears as a literal character in a key name. In + * a few cases, applications can create their own key namespace + * (for example, shared library extension names), and therefore + * it's possible for an application to confuse us. Error if we + * we ever see a key with a magic character. + */ + for (len = 0; len < k.len; ++len) + if (k.str[len] == SEPC) + WT_ERR_MSG(session, EINVAL, + "key %.*s contains a '%c' separator " + "character", + (int)k.len, (char *)k.str, SEPC); + + /* Build the key/value strings. */ + WT_ERR(__wt_buf_fmt(session, + kb, "%s%s%.*s", + key == NULL ? "" : key, + key == NULL ? "" : SEP, + (int)k.len, k.str)); + WT_ERR(__wt_buf_fmt(session, + vb, "%.*s", (int)v.len, v.str)); + + /* + * If the value is a structure, recursively parse it. + * + * !!! + * Don't merge unless the structure has field names. 
WiredTiger + * stores checkpoint LSNs in the metadata file using nested + * structures without field names: "checkpoint_lsn=(1,0)", not + * "checkpoint_lsn=(file=1,offset=0)". The value type is still + * WT_CONFIG_ITEM_STRUCT, so we check for a field name in the + * value. + */ + if (v.type == WT_CONFIG_ITEM_STRUCT && + strchr(vb->data, '=') != NULL) { + WT_ERR(__config_merge_scan( + session, kb->data, vb->data, cp)); + continue; + } + + /* Insert the value into the array. */ + WT_ERR(__wt_realloc_def(session, + &cp->entries_allocated, + cp->entries_next + 1, &cp->entries)); + WT_ERR(__wt_strndup(session, + kb->data, kb->size, &cp->entries[cp->entries_next].k)); + WT_ERR(__wt_strndup(session, + vb->data, vb->size, &cp->entries[cp->entries_next].v)); + cp->entries[cp->entries_next].gen = cp->entries_next; + ++cp->entries_next; + } + WT_ERR_NOTFOUND_OK(ret); + +err: __wt_scr_free(&kb); + __wt_scr_free(&vb); + return (ret); +} + +/* + * __strip_comma -- + * Strip a trailing comma. + */ +static void +__strip_comma(WT_ITEM *buf) +{ + if (buf->size != 0 && ((char *)buf->data)[buf->size - 1] == ',') + --buf->size; +} + +/* + * __config_merge_format_next -- + * Walk the array, building entries. + */ +static int +__config_merge_format_next(WT_SESSION_IMPL *session, const char *prefix, + size_t plen, size_t *enp, WT_CONFIG_MERGE *cp, WT_ITEM *build) +{ + WT_CONFIG_MERGE_ENTRY *ep; + size_t len1, len2, next; + char *p; + + for (; *enp < cp->entries_next; ++*enp) { + ep = &cp->entries[*enp]; + len1 = strlen(ep->k); + + /* + * The entries are in sorted order, take the last entry for any + * key. + */ + if (*enp < (cp->entries_next - 1)) { + len2 = strlen((ep + 1)->k); + + /* Choose the last of identical keys. */ + if (len1 == len2 && + memcmp(ep->k, (ep + 1)->k, len1) == 0) + continue; + + /* + * The test is complicated by matching empty entries + * "foo=" against nested structures "foo,bar=", where + * the latter is a replacement for the former. 
+ */
+			if (len2 > len1 &&
+			    (ep + 1)->k[len1] == SEPC &&
+			    memcmp(ep->k, (ep + 1)->k, len1) == 0)
+				continue;
+		}
+
+		/*
+		 * If we're skipping a prefix and this entry doesn't match it,
+		 * back off one entry and pop up a level.
+		 *
+		 * The decrement compensates for the caller's loop increment:
+		 * after we return, the outer level re-examines this entry.
+		 */
+		if (plen != 0 &&
+		    (plen > len1 || memcmp(ep->k, prefix, plen) != 0)) {
+			--*enp;
+			break;
+		}
+
+		/*
+		 * If the entry introduces a new level, recurse through that
+		 * new level.
+		 */
+		if ((p = strchr(ep->k + plen, SEPC)) != NULL) {
+			next = WT_PTRDIFF(p, ep->k);
+			WT_RET(__wt_buf_catfmt(session,
+			    build, "%.*s=(", (int)(next - plen), ep->k + plen));
+			/* next + 1 skips past the SEPC separator itself. */
+			WT_RET(__config_merge_format_next(
+			    session, ep->k, next + 1, enp, cp, build));
+			__strip_comma(build);
+			WT_RET(__wt_buf_catfmt(session, build, "),"));
+			continue;
+		}
+
+		/* Append the entry to the buffer. */
+		WT_RET(__wt_buf_catfmt(
+		    session, build, "%s=%s,", ep->k + plen, ep->v));
+	}
+
+	return (0);
+}
+
+/*
+ * __config_merge_format --
+ *	Take the sorted array of entries, and format them into allocated memory.
+ */
+static int
+__config_merge_format(
+    WT_SESSION_IMPL *session, WT_CONFIG_MERGE *cp, const char **config_ret)
+{
+	WT_DECL_ITEM(build);
+	WT_DECL_RET;
+	size_t entries;
+
+	/* 4KB initial buffer; __wt_buf_catfmt grows it as needed. */
+	WT_RET(__wt_scr_alloc(session, 4 * 1024, &build));
+
+	entries = 0;
+	WT_ERR(__config_merge_format_next(session, "", 0, &entries, cp, build));
+
+	__strip_comma(build);
+
+	ret = __wt_strndup(session, build->data, build->size, config_ret);
+
+err:	__wt_scr_free(&build);
+	return (ret);
+}
+
+/*
+ * __config_merge_cmp --
+ *	Qsort function: sort the config merge array.
+ *
+ * Generation numbers are unique, so identical keys never compare equal and
+ * the sort order is fully determined even though qsort itself isn't stable.
+ */
+static int
+__config_merge_cmp(const void *a, const void *b)
+{
+	WT_CONFIG_MERGE_ENTRY *ae, *be;
+	int cmp;
+
+	ae = (WT_CONFIG_MERGE_ENTRY *)a;
+	be = (WT_CONFIG_MERGE_ENTRY *)b;
+
+	if ((cmp = strcmp(ae->k, be->k)) != 0)
+		return (cmp);
+	return (ae->gen > be->gen ? 1 : -1);
+}
+
+/*
+ * __wt_config_merge --
+ *	Merge a set of configuration strings into newly allocated memory.
+ *
+ * This function takes a NULL-terminated list of configuration strings (where
+ * the values are in order from least to most preferred), and merges them into
+ * newly allocated memory.  The algorithm is to walk the configuration strings
+ * and build a table of each key/value pair.  The pairs are sorted based on the
+ * name and the configuration string in which they were found, and a final
+ * configuration string is built from the result.
+ *
+ * Note:
+ *	Nested structures are parsed and merged.  For example, if configuration
+ *	strings "key=(k1=v1,k2=v2)" and "key=(k1=v2)" appear, the result will
+ *	be "key=(k1=v2,k2=v2)" because the nested values are merged.
+ */
+int
+__wt_config_merge(
+    WT_SESSION_IMPL *session, const char **cfg, const char **config_ret)
+{
+	WT_CONFIG_MERGE merge;
+	WT_DECL_RET;
+	size_t i;
+
+	/* Start out with a reasonable number of entries. */
+	WT_CLEAR(merge);
+
+	WT_RET(__wt_realloc_def(
+	    session, &merge.entries_allocated, 100, &merge.entries));
+
+	/* Scan the configuration strings, entering them into the array. */
+	for (; *cfg != NULL; ++cfg)
+		WT_ERR(__config_merge_scan(session, NULL, *cfg, &merge));
+
+	/*
+	 * Sort the array by key and, in the case of identical keys, by
+	 * generation.
+	 */
+	qsort(merge.entries, merge.entries_next,
+	    sizeof(WT_CONFIG_MERGE_ENTRY), __config_merge_cmp);
+
+	/* Convert the array of entries into a string. */
+	ret = __config_merge_format(session, &merge, config_ret);
+
+	/* The err label is reached on both success and failure: always
+	 * discard the scratch entry array before returning. */
+err:	for (i = 0; i < merge.entries_next; ++i) {
+		__wt_free(session, merge.entries[i].k);
+		__wt_free(session, merge.entries[i].v);
+	}
+	__wt_free(session, merge.entries);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/config/config_concat.c b/src/third_party/wiredtiger/src/config/config_concat.c
new file mode 100644
index 00000000000..99475ef6f47
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_concat.c
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_config_concat -- + * Given a NULL-terminated list of configuration strings, concatenate them + * into newly allocated memory. Nothing special is assumed about any of + * the config strings, they are simply combined in order. + * + * This code deals with the case where some of the config strings are + * wrapped in brackets but others aren't: the resulting string does not + * have brackets. + */ +int +__wt_config_concat( + WT_SESSION_IMPL *session, const char **cfg, const char **config_ret) +{ + WT_CONFIG cparser; + WT_CONFIG_ITEM k, v; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + const char **cp; + + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + + for (cp = cfg; *cp != NULL; ++cp) { + WT_ERR(__wt_config_init(session, &cparser, *cp)); + while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) { + if (k.type != WT_CONFIG_ITEM_STRING && + k.type != WT_CONFIG_ITEM_ID) + WT_ERR_MSG(session, EINVAL, + "Invalid configuration key found: '%s'\n", + k.str); + /* Include the quotes around string keys/values. */ + if (k.type == WT_CONFIG_ITEM_STRING) { + --k.str; + k.len += 2; + } + if (v.type == WT_CONFIG_ITEM_STRING) { + --v.str; + v.len += 2; + } + WT_ERR(__wt_buf_catfmt(session, tmp, "%.*s%s%.*s,", + (int)k.len, k.str, + (v.len > 0) ? "=" : "", + (int)v.len, v.str)); + } + if (ret != WT_NOTFOUND) + goto err; + } + + /* + * If the caller passes us no valid configuration strings, we get here + * with no bytes to copy -- that's OK, the underlying string copy can + * handle empty strings. + * + * Strip any trailing comma. 
+ */ + if (tmp->size != 0) + --tmp->size; + ret = __wt_strndup(session, tmp->data, tmp->size, config_ret); + +err: __wt_scr_free(&tmp); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c new file mode 100644 index 00000000000..0cd2d32df57 --- /dev/null +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -0,0 +1,744 @@ +/* DO NOT EDIT: automatically built by dist/config.py. */ + +#include "wt_internal.h" + +static const WT_CONFIG_CHECK confchk_colgroup_meta[] = { + { "app_metadata", "string", NULL, NULL }, + { "columns", "list", NULL, NULL }, + { "source", "string", NULL, NULL }, + { "type", "string", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_connection_async_new_op[] = { + { "append", "boolean", NULL, NULL }, + { "overwrite", "boolean", NULL, NULL }, + { "raw", "boolean", NULL, NULL }, + { "timeout", "int", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_connection_close[] = { + { "leak_memory", "boolean", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_connection_load_extension[] = { + { "config", "string", NULL, NULL }, + { "entry", "string", NULL, NULL }, + { "terminate", "string", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_connection_open_session[] = { + { "isolation", "string", + "choices=[\"read-uncommitted\",\"read-committed\",\"snapshot\"]", + NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_async_subconfigs[] = { + { "enabled", "boolean", NULL, NULL }, + { "ops_max", "int", "min=10,max=4096", NULL }, + { "threads", "int", "min=1,max=20", NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_checkpoint_subconfigs[] = { + { "log_size", "int", "min=0,max=2GB", NULL }, + { "name", "string", NULL, NULL }, + { "wait", "int", "min=0,max=100000", NULL 
}, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_eviction_subconfigs[] = { + { "threads_max", "int", "min=1,max=20", NULL }, + { "threads_min", "int", "min=1,max=20", NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_lsm_manager_subconfigs[] = { + { "merge", "boolean", NULL, NULL }, + { "worker_thread_max", "int", "min=3,max=20", NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_shared_cache_subconfigs[] = { + { "chunk", "int", "min=1MB,max=10TB", NULL }, + { "name", "string", NULL, NULL }, + { "reserve", "int", NULL, NULL }, + { "size", "int", "min=1MB,max=10TB", NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_statistics_log_subconfigs[] = { + { "on_close", "boolean", NULL, NULL }, + { "path", "string", NULL, NULL }, + { "sources", "list", NULL, NULL }, + { "timestamp", "string", NULL, NULL }, + { "wait", "int", "min=0,max=100000", NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_connection_reconfigure[] = { + { "async", "category", NULL, confchk_async_subconfigs }, + { "cache_size", "int", "min=1MB,max=10TB", NULL }, + { "checkpoint", "category", NULL, + confchk_checkpoint_subconfigs }, + { "error_prefix", "string", NULL, NULL }, + { "eviction", "category", NULL, confchk_eviction_subconfigs }, + { "eviction_dirty_target", "int", "min=10,max=99", NULL }, + { "eviction_target", "int", "min=10,max=99", NULL }, + { "eviction_trigger", "int", "min=10,max=99", NULL }, + { "lsm_manager", "category", NULL, + confchk_lsm_manager_subconfigs }, + { "lsm_merge", "boolean", NULL, NULL }, + { "shared_cache", "category", NULL, + confchk_shared_cache_subconfigs }, + { "statistics", "list", + "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL }, + { "statistics_log", "category", NULL, + confchk_statistics_log_subconfigs }, + { "verbose", "list", + "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\"" + 
",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\"," + "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," + "\"transaction\",\"verify\",\"version\",\"write\"]", + NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_file_meta[] = { + { "allocation_size", "int", "min=512B,max=128MB", NULL }, + { "app_metadata", "string", NULL, NULL }, + { "block_allocation", "string", + "choices=[\"first\",\"best\"]", + NULL }, + { "block_compressor", "string", NULL, NULL }, + { "cache_resident", "boolean", NULL, NULL }, + { "checkpoint", "string", NULL, NULL }, + { "checkpoint_lsn", "string", NULL, NULL }, + { "checksum", "string", + "choices=[\"on\",\"off\",\"uncompressed\"]", + NULL }, + { "collator", "string", NULL, NULL }, + { "columns", "list", NULL, NULL }, + { "dictionary", "int", "min=0", NULL }, + { "format", "string", "choices=[\"btree\"]", NULL }, + { "huffman_key", "string", NULL, NULL }, + { "huffman_value", "string", NULL, NULL }, + { "id", "string", NULL, NULL }, + { "internal_item_max", "int", "min=0", NULL }, + { "internal_key_truncate", "boolean", NULL, NULL }, + { "internal_page_max", "int", "min=512B,max=512MB", NULL }, + { "key_format", "format", NULL, NULL }, + { "key_gap", "int", "min=0", NULL }, + { "leaf_item_max", "int", "min=0", NULL }, + { "leaf_page_max", "int", "min=512B,max=512MB", NULL }, + { "memory_page_max", "int", "min=512B,max=10TB", NULL }, + { "os_cache_dirty_max", "int", "min=0", NULL }, + { "os_cache_max", "int", "min=0", NULL }, + { "prefix_compression", "boolean", NULL, NULL }, + { "prefix_compression_min", "int", "min=0", NULL }, + { "split_pct", "int", "min=25,max=100", NULL }, + { "value_format", "format", NULL, NULL }, + { "version", "string", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_index_meta[] = { + { "app_metadata", "string", NULL, NULL }, + { "columns", "list", NULL, NULL }, + { 
"key_format", "format", NULL, NULL }, + { "source", "string", NULL, NULL }, + { "type", "string", NULL, NULL }, + { "value_format", "format", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_session_begin_transaction[] = { + { "isolation", "string", + "choices=[\"read-uncommitted\",\"read-committed\",\"snapshot\"]", + NULL }, + { "name", "string", NULL, NULL }, + { "priority", "int", "min=-100,max=100", NULL }, + { "sync", "boolean", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_session_checkpoint[] = { + { "drop", "list", NULL, NULL }, + { "force", "boolean", NULL, NULL }, + { "name", "string", NULL, NULL }, + { "target", "list", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_session_compact[] = { + { "timeout", "int", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_lsm_subconfigs[] = { + { "auto_throttle", "boolean", NULL, NULL }, + { "bloom", "boolean", NULL, NULL }, + { "bloom_bit_count", "int", "min=2,max=1000", NULL }, + { "bloom_config", "string", NULL, NULL }, + { "bloom_hash_count", "int", "min=2,max=100", NULL }, + { "bloom_oldest", "boolean", NULL, NULL }, + { "chunk_max", "int", "min=100MB,max=10TB", NULL }, + { "chunk_size", "int", "min=512K,max=500MB", NULL }, + { "merge_max", "int", "min=2,max=100", NULL }, + { "merge_min", "int", "max=100", NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_session_create[] = { + { "allocation_size", "int", "min=512B,max=128MB", NULL }, + { "app_metadata", "string", NULL, NULL }, + { "block_allocation", "string", + "choices=[\"first\",\"best\"]", + NULL }, + { "block_compressor", "string", NULL, NULL }, + { "cache_resident", "boolean", NULL, NULL }, + { "checksum", "string", + "choices=[\"on\",\"off\",\"uncompressed\"]", + NULL }, + { "colgroups", "list", NULL, NULL }, + { "collator", "string", NULL, NULL }, + { "columns", "list", 
NULL, NULL }, + { "dictionary", "int", "min=0", NULL }, + { "exclusive", "boolean", NULL, NULL }, + { "format", "string", "choices=[\"btree\"]", NULL }, + { "huffman_key", "string", NULL, NULL }, + { "huffman_value", "string", NULL, NULL }, + { "internal_item_max", "int", "min=0", NULL }, + { "internal_key_truncate", "boolean", NULL, NULL }, + { "internal_page_max", "int", "min=512B,max=512MB", NULL }, + { "key_format", "format", NULL, NULL }, + { "key_gap", "int", "min=0", NULL }, + { "leaf_item_max", "int", "min=0", NULL }, + { "leaf_page_max", "int", "min=512B,max=512MB", NULL }, + { "lsm", "category", NULL, confchk_lsm_subconfigs }, + { "memory_page_max", "int", "min=512B,max=10TB", NULL }, + { "os_cache_dirty_max", "int", "min=0", NULL }, + { "os_cache_max", "int", "min=0", NULL }, + { "prefix_compression", "boolean", NULL, NULL }, + { "prefix_compression_min", "int", "min=0", NULL }, + { "source", "string", NULL, NULL }, + { "split_pct", "int", "min=25,max=100", NULL }, + { "type", "string", NULL, NULL }, + { "value_format", "format", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_session_drop[] = { + { "force", "boolean", NULL, NULL }, + { "remove_files", "boolean", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_session_open_cursor[] = { + { "append", "boolean", NULL, NULL }, + { "bulk", "string", NULL, NULL }, + { "checkpoint", "string", NULL, NULL }, + { "dump", "string", + "choices=[\"hex\",\"json\",\"print\"]", + NULL }, + { "next_random", "boolean", NULL, NULL }, + { "overwrite", "boolean", NULL, NULL }, + { "raw", "boolean", NULL, NULL }, + { "readonly", "boolean", NULL, NULL }, + { "skip_sort_check", "boolean", NULL, NULL }, + { "statistics", "list", + "choices=[\"all\",\"fast\",\"clear\"]", + NULL }, + { "target", "list", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_session_reconfigure[] = { + { "isolation", "string", + 
"choices=[\"read-uncommitted\",\"read-committed\",\"snapshot\"]", + NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_session_salvage[] = { + { "force", "boolean", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_session_verify[] = { + { "dump_address", "boolean", NULL, NULL }, + { "dump_blocks", "boolean", NULL, NULL }, + { "dump_offsets", "list", NULL, NULL }, + { "dump_pages", "boolean", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_table_meta[] = { + { "app_metadata", "string", NULL, NULL }, + { "colgroups", "list", NULL, NULL }, + { "columns", "list", NULL, NULL }, + { "key_format", "format", NULL, NULL }, + { "value_format", "format", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_log_subconfigs[] = { + { "archive", "boolean", NULL, NULL }, + { "enabled", "boolean", NULL, NULL }, + { "file_max", "int", "min=100KB,max=2GB", NULL }, + { "path", "string", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_transaction_sync_subconfigs[] = { + { "enabled", "boolean", NULL, NULL }, + { "method", "string", + "choices=[\"dsync\",\"fsync\",\"none\"]", + NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { + { "async", "category", NULL, confchk_async_subconfigs }, + { "buffer_alignment", "int", "min=-1,max=1MB", NULL }, + { "cache_size", "int", "min=1MB,max=10TB", NULL }, + { "checkpoint", "category", NULL, + confchk_checkpoint_subconfigs }, + { "checkpoint_sync", "boolean", NULL, NULL }, + { "config_base", "boolean", NULL, NULL }, + { "create", "boolean", NULL, NULL }, + { "direct_io", "list", + "choices=[\"checkpoint\",\"data\",\"log\"]", + NULL }, + { "error_prefix", "string", NULL, NULL }, + { "eviction", "category", NULL, confchk_eviction_subconfigs }, + { "eviction_dirty_target", "int", "min=10,max=99", NULL }, + { 
"eviction_target", "int", "min=10,max=99", NULL }, + { "eviction_trigger", "int", "min=10,max=99", NULL }, + { "exclusive", "boolean", NULL, NULL }, + { "extensions", "list", NULL, NULL }, + { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL }, + { "hazard_max", "int", "min=15", NULL }, + { "log", "category", NULL, confchk_log_subconfigs }, + { "lsm_manager", "category", NULL, + confchk_lsm_manager_subconfigs }, + { "lsm_merge", "boolean", NULL, NULL }, + { "mmap", "boolean", NULL, NULL }, + { "multiprocess", "boolean", NULL, NULL }, + { "session_max", "int", "min=1", NULL }, + { "shared_cache", "category", NULL, + confchk_shared_cache_subconfigs }, + { "statistics", "list", + "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL }, + { "statistics_log", "category", NULL, + confchk_statistics_log_subconfigs }, + { "transaction_sync", "category", NULL, + confchk_transaction_sync_subconfigs }, + { "use_environment_priv", "boolean", NULL, NULL }, + { "verbose", "list", + "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\"" + ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\"," + "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," + "\"transaction\",\"verify\",\"version\",\"write\"]", + NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { + { "async", "category", NULL, confchk_async_subconfigs }, + { "buffer_alignment", "int", "min=-1,max=1MB", NULL }, + { "cache_size", "int", "min=1MB,max=10TB", NULL }, + { "checkpoint", "category", NULL, + confchk_checkpoint_subconfigs }, + { "checkpoint_sync", "boolean", NULL, NULL }, + { "config_base", "boolean", NULL, NULL }, + { "create", "boolean", NULL, NULL }, + { "direct_io", "list", + "choices=[\"checkpoint\",\"data\",\"log\"]", + NULL }, + { "error_prefix", "string", NULL, NULL }, + { "eviction", "category", NULL, confchk_eviction_subconfigs }, + { "eviction_dirty_target", 
"int", "min=10,max=99", NULL }, + { "eviction_target", "int", "min=10,max=99", NULL }, + { "eviction_trigger", "int", "min=10,max=99", NULL }, + { "exclusive", "boolean", NULL, NULL }, + { "extensions", "list", NULL, NULL }, + { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL }, + { "hazard_max", "int", "min=15", NULL }, + { "log", "category", NULL, confchk_log_subconfigs }, + { "lsm_manager", "category", NULL, + confchk_lsm_manager_subconfigs }, + { "lsm_merge", "boolean", NULL, NULL }, + { "mmap", "boolean", NULL, NULL }, + { "multiprocess", "boolean", NULL, NULL }, + { "session_max", "int", "min=1", NULL }, + { "shared_cache", "category", NULL, + confchk_shared_cache_subconfigs }, + { "statistics", "list", + "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL }, + { "statistics_log", "category", NULL, + confchk_statistics_log_subconfigs }, + { "transaction_sync", "category", NULL, + confchk_transaction_sync_subconfigs }, + { "use_environment_priv", "boolean", NULL, NULL }, + { "verbose", "list", + "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\"" + ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\"," + "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," + "\"transaction\",\"verify\",\"version\",\"write\"]", + NULL }, + { "version", "string", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { + { "async", "category", NULL, confchk_async_subconfigs }, + { "buffer_alignment", "int", "min=-1,max=1MB", NULL }, + { "cache_size", "int", "min=1MB,max=10TB", NULL }, + { "checkpoint", "category", NULL, + confchk_checkpoint_subconfigs }, + { "checkpoint_sync", "boolean", NULL, NULL }, + { "direct_io", "list", + "choices=[\"checkpoint\",\"data\",\"log\"]", + NULL }, + { "error_prefix", "string", NULL, NULL }, + { "eviction", "category", NULL, confchk_eviction_subconfigs }, + { "eviction_dirty_target", 
"int", "min=10,max=99", NULL }, + { "eviction_target", "int", "min=10,max=99", NULL }, + { "eviction_trigger", "int", "min=10,max=99", NULL }, + { "extensions", "list", NULL, NULL }, + { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL }, + { "hazard_max", "int", "min=15", NULL }, + { "log", "category", NULL, confchk_log_subconfigs }, + { "lsm_manager", "category", NULL, + confchk_lsm_manager_subconfigs }, + { "lsm_merge", "boolean", NULL, NULL }, + { "mmap", "boolean", NULL, NULL }, + { "multiprocess", "boolean", NULL, NULL }, + { "session_max", "int", "min=1", NULL }, + { "shared_cache", "category", NULL, + confchk_shared_cache_subconfigs }, + { "statistics", "list", + "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL }, + { "statistics_log", "category", NULL, + confchk_statistics_log_subconfigs }, + { "transaction_sync", "category", NULL, + confchk_transaction_sync_subconfigs }, + { "verbose", "list", + "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\"" + ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\"," + "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," + "\"transaction\",\"verify\",\"version\",\"write\"]", + NULL }, + { "version", "string", NULL, NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { + { "async", "category", NULL, confchk_async_subconfigs }, + { "buffer_alignment", "int", "min=-1,max=1MB", NULL }, + { "cache_size", "int", "min=1MB,max=10TB", NULL }, + { "checkpoint", "category", NULL, + confchk_checkpoint_subconfigs }, + { "checkpoint_sync", "boolean", NULL, NULL }, + { "direct_io", "list", + "choices=[\"checkpoint\",\"data\",\"log\"]", + NULL }, + { "error_prefix", "string", NULL, NULL }, + { "eviction", "category", NULL, confchk_eviction_subconfigs }, + { "eviction_dirty_target", "int", "min=10,max=99", NULL }, + { "eviction_target", "int", "min=10,max=99", NULL }, + { 
"eviction_trigger", "int", "min=10,max=99", NULL }, + { "extensions", "list", NULL, NULL }, + { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL }, + { "hazard_max", "int", "min=15", NULL }, + { "log", "category", NULL, confchk_log_subconfigs }, + { "lsm_manager", "category", NULL, + confchk_lsm_manager_subconfigs }, + { "lsm_merge", "boolean", NULL, NULL }, + { "mmap", "boolean", NULL, NULL }, + { "multiprocess", "boolean", NULL, NULL }, + { "session_max", "int", "min=1", NULL }, + { "shared_cache", "category", NULL, + confchk_shared_cache_subconfigs }, + { "statistics", "list", + "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL }, + { "statistics_log", "category", NULL, + confchk_statistics_log_subconfigs }, + { "transaction_sync", "category", NULL, + confchk_transaction_sync_subconfigs }, + { "verbose", "list", + "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\"" + ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\"," + "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," + "\"transaction\",\"verify\",\"version\",\"write\"]", + NULL }, + { NULL, NULL, NULL, NULL } +}; + +static const WT_CONFIG_ENTRY config_entries[] = { + { "colgroup.meta", + "app_metadata=,columns=,source=,type=file", + confchk_colgroup_meta + }, + { "connection.add_collator", + "", + NULL + }, + { "connection.add_compressor", + "", + NULL + }, + { "connection.add_data_source", + "", + NULL + }, + { "connection.add_extractor", + "", + NULL + }, + { "connection.async_new_op", + "append=0,overwrite=,raw=0,timeout=1200", + confchk_connection_async_new_op + }, + { "connection.close", + "leak_memory=0", + confchk_connection_close + }, + { "connection.load_extension", + "config=,entry=wiredtiger_extension_init," + "terminate=wiredtiger_extension_terminate", + confchk_connection_load_extension + }, + { "connection.open_session", + "isolation=read-committed", + confchk_connection_open_session + }, + 
{ "connection.reconfigure", + "async=(enabled=0,ops_max=1024,threads=2),cache_size=100MB," + "checkpoint=(log_size=0,name=\"WiredTigerCheckpoint\",wait=0)," + "error_prefix=,eviction=(threads_max=1,threads_min=1)," + "eviction_dirty_target=80,eviction_target=80,eviction_trigger=95," + "lsm_manager=(merge=,worker_thread_max=4),lsm_merge=," + "shared_cache=(chunk=10MB,name=,reserve=0,size=500MB)," + "statistics=none,statistics_log=(on_close=0," + "path=\"WiredTigerStat.%d.%H\",sources=," + "timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=", + confchk_connection_reconfigure + }, + { "cursor.close", + "", + NULL + }, + { "file.meta", + "allocation_size=4KB,app_metadata=,block_allocation=best," + "block_compressor=,cache_resident=0,checkpoint=,checkpoint_lsn=," + "checksum=uncompressed,collator=,columns=,dictionary=0," + "format=btree,huffman_key=,huffman_value=,id=,internal_item_max=0" + ",internal_key_truncate=,internal_page_max=4KB,key_format=u," + "key_gap=10,leaf_item_max=0,leaf_page_max=32KB," + "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0," + "prefix_compression=0,prefix_compression_min=4,split_pct=75," + "value_format=u,version=(major=0,minor=0)", + confchk_file_meta + }, + { "index.meta", + "app_metadata=,columns=,key_format=u,source=,type=file," + "value_format=u", + confchk_index_meta + }, + { "session.begin_transaction", + "isolation=,name=,priority=0,sync=", + confchk_session_begin_transaction + }, + { "session.checkpoint", + "drop=,force=0,name=,target=", + confchk_session_checkpoint + }, + { "session.close", + "", + NULL + }, + { "session.commit_transaction", + "", + NULL + }, + { "session.compact", + "timeout=1200", + confchk_session_compact + }, + { "session.create", + "allocation_size=4KB,app_metadata=,block_allocation=best," + "block_compressor=,cache_resident=0,checksum=uncompressed," + "colgroups=,collator=,columns=,dictionary=0,exclusive=0," + "format=btree,huffman_key=,huffman_value=,internal_item_max=0," + 
"internal_key_truncate=,internal_page_max=4KB,key_format=u," + "key_gap=10,leaf_item_max=0,leaf_page_max=32KB," + "lsm=(auto_throttle=,bloom=,bloom_bit_count=16,bloom_config=," + "bloom_hash_count=8,bloom_oldest=0,chunk_max=5GB,chunk_size=10MB," + "merge_max=15,merge_min=0),memory_page_max=5MB," + "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0," + "prefix_compression_min=4,source=,split_pct=75,type=file," + "value_format=u", + confchk_session_create + }, + { "session.drop", + "force=0,remove_files=", + confchk_session_drop + }, + { "session.log_printf", + "", + NULL + }, + { "session.open_cursor", + "append=0,bulk=0,checkpoint=,dump=,next_random=0,overwrite=,raw=0" + ",readonly=0,skip_sort_check=0,statistics=,target=", + confchk_session_open_cursor + }, + { "session.reconfigure", + "isolation=read-committed", + confchk_session_reconfigure + }, + { "session.rename", + "", + NULL + }, + { "session.rollback_transaction", + "", + NULL + }, + { "session.salvage", + "force=0", + confchk_session_salvage + }, + { "session.truncate", + "", + NULL + }, + { "session.upgrade", + "", + NULL + }, + { "session.verify", + "dump_address=0,dump_blocks=0,dump_offsets=,dump_pages=0", + confchk_session_verify + }, + { "table.meta", + "app_metadata=,colgroups=,columns=,key_format=u,value_format=u", + confchk_table_meta + }, + { "wiredtiger_open", + "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," + "cache_size=100MB,checkpoint=(log_size=0," + "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=," + "config_base=,create=0,direct_io=,error_prefix=," + "eviction=(threads_max=1,threads_min=1),eviction_dirty_target=80," + "eviction_target=80,eviction_trigger=95,exclusive=0,extensions=," + "file_extend=,hazard_max=1000,log=(archive=,enabled=0," + "file_max=100MB,path=\"\"),lsm_manager=(merge=," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0," + 
"size=500MB),statistics=none,statistics_log=(on_close=0," + "path=\"WiredTigerStat.%d.%H\",sources=," + "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" + ",method=fsync),use_environment_priv=0,verbose=", + confchk_wiredtiger_open + }, + { "wiredtiger_open_all", + "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," + "cache_size=100MB,checkpoint=(log_size=0," + "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=," + "config_base=,create=0,direct_io=,error_prefix=," + "eviction=(threads_max=1,threads_min=1),eviction_dirty_target=80," + "eviction_target=80,eviction_trigger=95,exclusive=0,extensions=," + "file_extend=,hazard_max=1000,log=(archive=,enabled=0," + "file_max=100MB,path=\"\"),lsm_manager=(merge=," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0," + "size=500MB),statistics=none,statistics_log=(on_close=0," + "path=\"WiredTigerStat.%d.%H\",sources=," + "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" + ",method=fsync),use_environment_priv=0,verbose=,version=(major=0," + "minor=0)", + confchk_wiredtiger_open_all + }, + { "wiredtiger_open_basecfg", + "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," + "cache_size=100MB,checkpoint=(log_size=0," + "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=," + "direct_io=,error_prefix=,eviction=(threads_max=1,threads_min=1)," + "eviction_dirty_target=80,eviction_target=80,eviction_trigger=95," + "extensions=,file_extend=,hazard_max=1000,log=(archive=,enabled=0" + ",file_max=100MB,path=\"\"),lsm_manager=(merge=," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0," + "size=500MB),statistics=none,statistics_log=(on_close=0," + "path=\"WiredTigerStat.%d.%H\",sources=," + "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" + ",method=fsync),verbose=,version=(major=0,minor=0)", + 
confchk_wiredtiger_open_basecfg + }, + { "wiredtiger_open_usercfg", + "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," + "cache_size=100MB,checkpoint=(log_size=0," + "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=," + "direct_io=,error_prefix=,eviction=(threads_max=1,threads_min=1)," + "eviction_dirty_target=80,eviction_target=80,eviction_trigger=95," + "extensions=,file_extend=,hazard_max=1000,log=(archive=,enabled=0" + ",file_max=100MB,path=\"\"),lsm_manager=(merge=," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0," + "size=500MB),statistics=none,statistics_log=(on_close=0," + "path=\"WiredTigerStat.%d.%H\",sources=," + "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" + ",method=fsync),verbose=", + confchk_wiredtiger_open_usercfg + }, + { NULL, NULL, NULL } +}; + +int +__wt_conn_config_init(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + const WT_CONFIG_ENTRY *ep, **epp; + + conn = S2C(session); + + /* Build a list of pointers to the configuration information. */ + WT_RET(__wt_calloc_def(session, + sizeof(config_entries) / sizeof(config_entries[0]), &epp)); + conn->config_entries = epp; + + /* Fill in the list to reference the default information. */ + for (ep = config_entries;;) { + *epp++ = ep++; + if (ep->method == NULL) + break; + } + return (0); +} + +void +__wt_conn_config_discard(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + __wt_free(session, conn->config_entries); +} diff --git a/src/third_party/wiredtiger/src/config/config_ext.c b/src/third_party/wiredtiger/src/config/config_ext.c new file mode 100644 index 00000000000..26b3799d61c --- /dev/null +++ b/src/third_party/wiredtiger/src/config/config_ext.c @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ext_config_parser_open --
+ * WT_EXTENSION_API->config_parser_open implementation
+ */
+int
+__wt_ext_config_parser_open(WT_EXTENSION_API *wt_ext, WT_SESSION *wt_session,
+    const char *config, size_t len, WT_CONFIG_PARSER **config_parserp)
+{
+ /* Thin wrapper: the extension handle itself is not needed. */
+ WT_UNUSED(wt_ext);
+ return (wiredtiger_config_parser_open(
+ wt_session, config, len, config_parserp));
+}
+
+/*
+ * __wt_ext_config_get --
+ * Given a NULL-terminated list of configuration strings, find the final
+ * value for a given string key (external API version).
+ */
+int
+__wt_ext_config_get(WT_EXTENSION_API *wt_api,
+    WT_SESSION *wt_session, WT_CONFIG_ARG *cfg_arg, const char *key,
+    WT_CONFIG_ITEM *cval)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+ const char **cfg;
+
+ /* Extensions may pass a NULL session; fall back to the default. */
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+  session = conn->default_session;
+
+ /* No configuration stack at all: the key cannot be present. */
+ if ((cfg = (const char **)cfg_arg) == NULL)
+  return (WT_NOTFOUND);
+ return (__wt_config_gets(session, cfg, key, cval));
+} diff --git a/src/third_party/wiredtiger/src/config/config_upgrade.c b/src/third_party/wiredtiger/src/config/config_upgrade.c new file mode 100644 index 00000000000..24297df839b --- /dev/null +++ b/src/third_party/wiredtiger/src/config/config_upgrade.c @@ -0,0 +1,32 @@ +/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_config_upgrade --
+ * Upgrade a configuration string by appending the replacement version.
+ */
+int
+__wt_config_upgrade(WT_SESSION_IMPL *session, WT_ITEM *buf)
+{
+ WT_CONFIG_ITEM v;
+ const char *config;
+
+ config = buf->data;
+
+ /*
+ * wiredtiger_open:
+ * lsm_merge=boolean -> lsm_manager=(merge=boolean)
+ */
+ if (__wt_config_getones(
+ session, config, "lsm_merge", &v) != WT_NOTFOUND)
+  WT_RET(__wt_buf_catfmt(session, buf,
+   ",lsm_manager=(merge=%s)", v.val ?
"true" : "false")); + + return (0); +} diff --git a/src/third_party/wiredtiger/src/conn/api_strerror.c b/src/third_party/wiredtiger/src/conn/api_strerror.c new file mode 100644 index 00000000000..1ad136eae12 --- /dev/null +++ b/src/third_party/wiredtiger/src/conn/api_strerror.c @@ -0,0 +1,43 @@ +/* DO NOT EDIT: automatically built by dist/api_err.py. */ + +#include "wt_internal.h" + +/* + * wiredtiger_strerror -- + * Return a string for any error value. + */ +const char * +wiredtiger_strerror(int error) +{ + static char errbuf[64]; + char *p; + + if (error == 0) + return ("Successful return: 0"); + + switch (error) { + case WT_DUPLICATE_KEY: + return ("WT_DUPLICATE_KEY: attempt to insert an existing key"); + case WT_ERROR: + return ("WT_ERROR: non-specific WiredTiger error"); + case WT_NOTFOUND: + return ("WT_NOTFOUND: item not found"); + case WT_PANIC: + return ("WT_PANIC: WiredTiger library panic"); + case WT_RESTART: + return ("WT_RESTART: restart the operation (internal)"); + case WT_ROLLBACK: + return ("WT_ROLLBACK: conflict between concurrent operations"); + default: + if (error > 0 && (p = strerror(error)) != NULL) + return (p); + break; + } + + /* + * !!! + * Not thread-safe, but this is never supposed to happen. + */ + (void)snprintf(errbuf, sizeof(errbuf), "Unknown error: %d", error); + return (errbuf); +} diff --git a/src/third_party/wiredtiger/src/conn/api_version.c b/src/third_party/wiredtiger/src/conn/api_version.c new file mode 100644 index 00000000000..1355220c585 --- /dev/null +++ b/src/third_party/wiredtiger/src/conn/api_version.c @@ -0,0 +1,24 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * wiredtiger_version -- + * Return library version information. 
+ */ +const char * +wiredtiger_version(int *majorp, int *minorp, int *patchp) +{ + if (majorp != NULL) + *majorp = WIREDTIGER_VERSION_MAJOR; + if (minorp != NULL) + *minorp = WIREDTIGER_VERSION_MINOR; + if (patchp != NULL) + *patchp = WIREDTIGER_VERSION_PATCH; + return (WIREDTIGER_VERSION_STRING); +} diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c new file mode 100644 index 00000000000..c7562ab94c3 --- /dev/null +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -0,0 +1,1573 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +static int __conn_statistics_config(WT_SESSION_IMPL *, const char *[]); + +/* + * ext_collate -- + * Call the collation function (external API version). + */ +static int +ext_collate(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, + WT_COLLATOR *collator, WT_ITEM *first, WT_ITEM *second, int *cmpp) +{ + WT_CONNECTION_IMPL *conn; + WT_SESSION_IMPL *session; + + conn = (WT_CONNECTION_IMPL *)wt_api->conn; + if ((session = (WT_SESSION_IMPL *)wt_session) == NULL) + session = conn->default_session; + + WT_RET(__wt_compare(session, collator, first, second, cmpp)); + + return (0); +} + +/* + * ext_collator_config -- + * Given a configuration, configure the collator (external API version). + */ +static int +ext_collator_config(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, + WT_CONFIG_ARG *cfg_arg, WT_COLLATOR **collatorp, int *ownp) +{ + WT_CONNECTION_IMPL *conn; + WT_SESSION_IMPL *session; + const char **cfg; + + conn = (WT_CONNECTION_IMPL *)wt_api->conn; + if ((session = (WT_SESSION_IMPL *)wt_session) == NULL) + session = conn->default_session; + + /* The default is a standard lexicographic comparison. 
*/
+ if ((cfg = (const char **)cfg_arg) == NULL)
+  return (0);
+
+ return (__wt_collator_config(session, cfg, collatorp, ownp));
+}
+
+/*
+ * __wt_collator_config --
+ * Given a configuration, configure the collator.
+ *
+ * Looks up the "collator" configuration key and, if it names a collator
+ * on the connection's registered list, returns it via collatorp. *ownp
+ * is set to 1 only when the collator's customize method returned a new
+ * instance; otherwise the shared registered collator is returned and
+ * *ownp stays 0.
+ */
+int
+__wt_collator_config(WT_SESSION_IMPL *session, const char **cfg,
+    WT_COLLATOR **collatorp, int *ownp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_NAMED_COLLATOR *ncoll;
+
+ *collatorp = NULL;
+ *ownp = 0;
+
+ conn = S2C(session);
+
+ /* A missing "collator" key simply means no collator is configured. */
+ if ((ret = __wt_config_gets(session, cfg, "collator", &cval)) != 0)
+  return (ret == WT_NOTFOUND ? 0 : ret);
+
+ if (cval.len > 0) {
+  TAILQ_FOREACH(ncoll, &conn->collqh, q)
+   if (WT_STRING_MATCH(ncoll->name, cval.str, cval.len))
+    break;
+
+  if (ncoll == NULL)
+   WT_RET_MSG(session, EINVAL,
+   "unknown collator '%.*s'", (int)cval.len, cval.str);
+
+  /*
+  * Give the collator a chance to customize itself based on the
+  * data handle's "app_metadata" value. Note cval is reused for
+  * that lookup, so the collator name is unavailable afterward.
+  */
+  if (ncoll->collator->customize != NULL) {
+   WT_RET(__wt_config_gets(session,
+   session->dhandle->cfg, "app_metadata", &cval));
+   WT_RET(ncoll->collator->customize(
+   ncoll->collator, &session->iface,
+   session->dhandle->name, &cval, collatorp));
+  }
+  if (*collatorp == NULL)
+   *collatorp = ncoll->collator;
+  else
+   *ownp = 1;
+ }
+
+ return (0);
+}
+
+/*
+ * __conn_get_extension_api --
+ * WT_CONNECTION.get_extension_api method. 
+ */ +static WT_EXTENSION_API * +__conn_get_extension_api(WT_CONNECTION *wt_conn) +{ + WT_CONNECTION_IMPL *conn; + + conn = (WT_CONNECTION_IMPL *)wt_conn; + + conn->extension_api.conn = wt_conn; + conn->extension_api.err_printf = __wt_ext_err_printf; + conn->extension_api.msg_printf = __wt_ext_msg_printf; + conn->extension_api.strerror = wiredtiger_strerror; + conn->extension_api.scr_alloc = __wt_ext_scr_alloc; + conn->extension_api.scr_free = __wt_ext_scr_free; + conn->extension_api.collator_config = ext_collator_config; + conn->extension_api.collate = ext_collate; + conn->extension_api.config_parser_open = __wt_ext_config_parser_open; + conn->extension_api.config_get = __wt_ext_config_get; + conn->extension_api.metadata_insert = __wt_ext_metadata_insert; + conn->extension_api.metadata_remove = __wt_ext_metadata_remove; + conn->extension_api.metadata_search = __wt_ext_metadata_search; + conn->extension_api.metadata_update = __wt_ext_metadata_update; + conn->extension_api.struct_pack = __wt_ext_struct_pack; + conn->extension_api.struct_size = __wt_ext_struct_size; + conn->extension_api.struct_unpack = __wt_ext_struct_unpack; + conn->extension_api.transaction_id = __wt_ext_transaction_id; + conn->extension_api.transaction_isolation_level = + __wt_ext_transaction_isolation_level; + conn->extension_api.transaction_notify = __wt_ext_transaction_notify; + conn->extension_api.transaction_oldest = __wt_ext_transaction_oldest; + conn->extension_api.transaction_visible = __wt_ext_transaction_visible; + conn->extension_api.version = wiredtiger_version; + + return (&conn->extension_api); +} + +#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY + extern int snappy_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); +#endif +#ifdef HAVE_BUILTIN_EXTENSION_ZLIB + extern int zlib_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); +#endif + +/* + * __conn_load_default_extensions -- + * Load extensions that are enabled via --with-builtins + */ +static int 
+__conn_load_default_extensions(WT_CONNECTION_IMPL *conn) +{ + WT_UNUSED(conn); +#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY + WT_RET(snappy_extension_init(&conn->iface, NULL)); +#endif +#ifdef HAVE_BUILTIN_EXTENSION_ZLIB + WT_RET(zlib_extension_init(&conn->iface, NULL)); +#endif + return (0); +} + +/* + * __conn_load_extension -- + * WT_CONNECTION->load_extension method. + */ +static int +__conn_load_extension( + WT_CONNECTION *wt_conn, const char *path, const char *config) +{ + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_DLH *dlh; + WT_SESSION_IMPL *session; + int (*load)(WT_CONNECTION *, WT_CONFIG_ARG *); + int is_local; + const char *init_name, *terminate_name; + + dlh = NULL; + init_name = terminate_name = NULL; + is_local = (strcmp(path, "local") == 0); + + conn = (WT_CONNECTION_IMPL *)wt_conn; + CONNECTION_API_CALL(conn, session, load_extension, config, cfg); + + /* + * This assumes the underlying shared libraries are reference counted, + * that is, that re-opening a shared library simply increments a ref + * count, and closing it simply decrements the ref count, and the last + * close discards the reference entirely -- in other words, we do not + * check to see if we've already opened this shared library. + */ + WT_ERR(__wt_dlopen(session, is_local ? NULL : path, &dlh)); + + /* + * Find the load function, remember the unload function for when we + * close. + */ + WT_ERR(__wt_config_gets(session, cfg, "entry", &cval)); + WT_ERR(__wt_strndup(session, cval.str, cval.len, &init_name)); + WT_ERR(__wt_dlsym(session, dlh, init_name, 1, &load)); + + WT_ERR(__wt_config_gets(session, cfg, "terminate", &cval)); + WT_ERR(__wt_strndup(session, cval.str, cval.len, &terminate_name)); + WT_ERR(__wt_dlsym(session, dlh, terminate_name, 0, &dlh->terminate)); + + /* Call the load function last, it simplifies error handling. */ + WT_ERR(load(wt_conn, (WT_CONFIG_ARG *)cfg)); + + /* Link onto the environment's list of open libraries. 
*/
+ __wt_spin_lock(session, &conn->api_lock);
+ TAILQ_INSERT_TAIL(&conn->dlhqh, dlh, q);
+ __wt_spin_unlock(session, &conn->api_lock);
+ dlh = NULL;
+
+err: if (dlh != NULL)
+  WT_TRET(__wt_dlclose(session, dlh));
+ __wt_free(session, init_name);
+ __wt_free(session, terminate_name);
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_load_extensions --
+ * Load the list of application-configured extensions.
+ *
+ * Walks the "extensions" configuration list: each entry's key is the
+ * shared-library path and its optional value is the configuration
+ * string handed to that extension. Loading is delegated to the public
+ * WT_CONNECTION->load_extension method.
+ */
+static int
+__conn_load_extensions(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONFIG subconfig;
+ WT_CONFIG_ITEM cval, skey, sval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(exconfig);
+ WT_DECL_ITEM(expath);
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ WT_ERR(__conn_load_default_extensions(conn));
+
+ WT_ERR(__wt_config_gets(session, cfg, "extensions", &cval));
+ WT_ERR(__wt_config_subinit(session, &subconfig, &cval));
+ while ((ret = __wt_config_next(&subconfig, &skey, &sval)) == 0) {
+  /* Scratch buffers are allocated lazily, reused per entry. */
+  if (expath == NULL)
+   WT_ERR(__wt_scr_alloc(session, 0, &expath));
+  WT_ERR(__wt_buf_fmt(
+  session, expath, "%.*s", (int)skey.len, skey.str));
+  if (sval.len > 0) {
+   if (exconfig == NULL)
+    WT_ERR(__wt_scr_alloc(session, 0, &exconfig));
+   WT_ERR(__wt_buf_fmt(session,
+   exconfig, "%.*s", (int)sval.len, sval.str));
+  }
+  WT_ERR(conn->iface.load_extension(&conn->iface,
+  expath->data, (sval.len > 0) ? exconfig->data : NULL));
+ }
+ /* Exhausting the list returns WT_NOTFOUND: the success path. */
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: __wt_scr_free(&expath);
+ __wt_scr_free(&exconfig);
+
+ return (ret);
+}
+
+/*
+ * __conn_add_collator --
+ * WT_CONNECTION->add_collator method. 
+ */ +static int +__conn_add_collator(WT_CONNECTION *wt_conn, + const char *name, WT_COLLATOR *collator, const char *config) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_NAMED_COLLATOR *ncoll; + WT_SESSION_IMPL *session; + + ncoll = NULL; + + conn = (WT_CONNECTION_IMPL *)wt_conn; + CONNECTION_API_CALL(conn, session, add_collator, config, cfg); + WT_UNUSED(cfg); + + WT_ERR(__wt_calloc_def(session, 1, &ncoll)); + WT_ERR(__wt_strdup(session, name, &ncoll->name)); + ncoll->collator = collator; + + __wt_spin_lock(session, &conn->api_lock); + TAILQ_INSERT_TAIL(&conn->collqh, ncoll, q); + ncoll = NULL; + __wt_spin_unlock(session, &conn->api_lock); + +err: if (ncoll != NULL) { + __wt_free(session, ncoll->name); + __wt_free(session, ncoll); + } + + API_END_RET_NOTFOUND_MAP(session, ret); +} + +/* + * __wt_conn_remove_collator -- + * Remove collator added by WT_CONNECTION->add_collator, only used + * internally. + */ +int +__wt_conn_remove_collator(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_NAMED_COLLATOR *ncoll; + + conn = S2C(session); + + while ((ncoll = TAILQ_FIRST(&conn->collqh)) != NULL) { + /* Call any termination method. */ + if (ncoll->collator->terminate != NULL) + WT_TRET(ncoll->collator->terminate( + ncoll->collator, (WT_SESSION *)session)); + + /* Remove from the connection's list, free memory. */ + TAILQ_REMOVE(&conn->collqh, ncoll, q); + __wt_free(session, ncoll->name); + __wt_free(session, ncoll); + } + + return (ret); +} + +/* + * __conn_add_compressor -- + * WT_CONNECTION->add_compressor method. 
+ */ +static int +__conn_add_compressor(WT_CONNECTION *wt_conn, + const char *name, WT_COMPRESSOR *compressor, const char *config) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_NAMED_COMPRESSOR *ncomp; + WT_SESSION_IMPL *session; + + WT_UNUSED(name); + WT_UNUSED(compressor); + ncomp = NULL; + + conn = (WT_CONNECTION_IMPL *)wt_conn; + CONNECTION_API_CALL(conn, session, add_compressor, config, cfg); + WT_UNUSED(cfg); + + WT_ERR(__wt_calloc_def(session, 1, &ncomp)); + WT_ERR(__wt_strdup(session, name, &ncomp->name)); + ncomp->compressor = compressor; + + __wt_spin_lock(session, &conn->api_lock); + TAILQ_INSERT_TAIL(&conn->compqh, ncomp, q); + ncomp = NULL; + __wt_spin_unlock(session, &conn->api_lock); + +err: if (ncomp != NULL) { + __wt_free(session, ncomp->name); + __wt_free(session, ncomp); + } + + API_END_RET_NOTFOUND_MAP(session, ret); +} + +/* + * __wt_conn_remove_compressor -- + * remove compressor added by WT_CONNECTION->add_compressor, only used + * internally. + */ +int +__wt_conn_remove_compressor(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_NAMED_COMPRESSOR *ncomp; + + conn = S2C(session); + + while ((ncomp = TAILQ_FIRST(&conn->compqh)) != NULL) { + /* Call any termination method. */ + if (ncomp->compressor->terminate != NULL) + WT_TRET(ncomp->compressor->terminate( + ncomp->compressor, (WT_SESSION *)session)); + + /* Remove from the connection's list, free memory. */ + TAILQ_REMOVE(&conn->compqh, ncomp, q); + __wt_free(session, ncomp->name); + __wt_free(session, ncomp); + } + + return (ret); +} + +/* + * __conn_add_data_source -- + * WT_CONNECTION->add_data_source method. 
+ */ +static int +__conn_add_data_source(WT_CONNECTION *wt_conn, + const char *prefix, WT_DATA_SOURCE *dsrc, const char *config) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_NAMED_DATA_SOURCE *ndsrc; + WT_SESSION_IMPL *session; + + ndsrc = NULL; + + conn = (WT_CONNECTION_IMPL *)wt_conn; + CONNECTION_API_CALL(conn, session, add_data_source, config, cfg); + WT_UNUSED(cfg); + + WT_ERR(__wt_calloc_def(session, 1, &ndsrc)); + WT_ERR(__wt_strdup(session, prefix, &ndsrc->prefix)); + ndsrc->dsrc = dsrc; + + /* Link onto the environment's list of data sources. */ + __wt_spin_lock(session, &conn->api_lock); + TAILQ_INSERT_TAIL(&conn->dsrcqh, ndsrc, q); + ndsrc = NULL; + __wt_spin_unlock(session, &conn->api_lock); + +err: if (ndsrc != NULL) { + __wt_free(session, ndsrc->prefix); + __wt_free(session, ndsrc); + } + + API_END_RET_NOTFOUND_MAP(session, ret); +} + +/* + * __wt_conn_remove_data_source -- + * Remove data source added by WT_CONNECTION->add_data_source. + */ +int +__wt_conn_remove_data_source(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_NAMED_DATA_SOURCE *ndsrc; + + conn = S2C(session); + + while ((ndsrc = TAILQ_FIRST(&conn->dsrcqh)) != NULL) { + /* Call any termination method. */ + if (ndsrc->dsrc->terminate != NULL) + WT_TRET(ndsrc->dsrc->terminate( + ndsrc->dsrc, (WT_SESSION *)session)); + + /* Remove from the connection's list, free memory. */ + TAILQ_REMOVE(&conn->dsrcqh, ndsrc, q); + __wt_free(session, ndsrc->prefix); + __wt_free(session, ndsrc); + } + + return (ret); +} + +/* + * __conn_add_extractor -- + * WT_CONNECTION->add_extractor method. 
+ */
+static int
+__conn_add_extractor(WT_CONNECTION *wt_conn,
+    const char *name, WT_EXTRACTOR *extractor, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(name);
+ WT_UNUSED(extractor);
+ /* Custom extractors are not supported: always fail with ENOTSUP. */
+ ret = ENOTSUP;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, add_extractor, config, cfg);
+ WT_UNUSED(cfg);
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_async_flush --
+ * WT_CONNECTION.async_flush method.
+ */
+static int
+__conn_async_flush(WT_CONNECTION *wt_conn)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL_NOCONF(conn, session, async_flush);
+ WT_ERR(__wt_async_flush(session));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_async_new_op --
+ * WT_CONNECTION.async_new_op method.
+ */
+static int
+__conn_async_new_op(WT_CONNECTION *wt_conn, const char *uri, const char *config,
+    WT_ASYNC_CALLBACK *callback, WT_ASYNC_OP **asyncopp)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, async_new_op, config, cfg);
+ WT_ERR(__wt_async_new_op(session, uri, config, cfg, callback, &op));
+
+ /* Hand back the public handle embedded in the internal op. */
+ *asyncopp = &op->iface;
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_get_home --
+ * WT_CONNECTION.get_home method.
+ */
+static const char *
+__conn_get_home(WT_CONNECTION *wt_conn)
+{
+ return (((WT_CONNECTION_IMPL *)wt_conn)->home);
+}
+
+/*
+ * __conn_configure_method --
+ * WT_CONNECTION.configure_method method. 
+ */ +static int +__conn_configure_method(WT_CONNECTION *wt_conn, const char *method, + const char *uri, const char *config, const char *type, const char *check) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + conn = (WT_CONNECTION_IMPL *)wt_conn; + CONNECTION_API_CALL_NOCONF(conn, session, configure_method); + + ret = __wt_configure_method(session, method, uri, config, type, check); + +err: API_END_RET_NOTFOUND_MAP(session, ret); +} + +/* + * __conn_is_new -- + * WT_CONNECTION->is_new method. + */ +static int +__conn_is_new(WT_CONNECTION *wt_conn) +{ + return (((WT_CONNECTION_IMPL *)wt_conn)->is_new); +} + +/* + * __conn_close -- + * WT_CONNECTION->close method. + */ +static int +__conn_close(WT_CONNECTION *wt_conn, const char *config) +{ + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION *wt_session; + WT_SESSION_IMPL *s, *session; + uint32_t i; + + conn = (WT_CONNECTION_IMPL *)wt_conn; + + CONNECTION_API_CALL(conn, session, close, config, cfg); + + WT_TRET(__wt_config_gets(session, cfg, "leak_memory", &cval)); + if (cval.val != 0) + F_SET(conn, WT_CONN_LEAK_MEMORY); + +err: /* + * Rollback all running transactions. + * We do this as a separate pass because an active transaction in one + * session could cause trouble when closing a file, even if that + * session never referenced that file. + */ + for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i) + if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL) && + F_ISSET(&s->txn, TXN_RUNNING)) { + wt_session = &s->iface; + WT_TRET(wt_session->rollback_transaction( + wt_session, NULL)); + } + + /* Close open, external sessions. */ + for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i) + if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL)) { + wt_session = &s->iface; + /* + * Notify the user that we are closing the session + * handle via the registered close callback. 
+ */ + if (s->event_handler->handle_close != NULL) + WT_TRET(s->event_handler->handle_close( + s->event_handler, wt_session, NULL)); + WT_TRET(wt_session->close(wt_session, config)); + } + + WT_TRET(__wt_connection_close(conn)); + + /* We no longer have a session, don't try to update it. */ + session = NULL; + + API_END_RET_NOTFOUND_MAP(session, ret); +} + +/* + * __conn_reconfigure -- + * WT_CONNECTION->reconfigure method. + */ +static int +__conn_reconfigure(WT_CONNECTION *wt_conn, const char *config) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION_IMPL *session; + const char *p, *config_cfg[] = { NULL, NULL, NULL }; + + conn = (WT_CONNECTION_IMPL *)wt_conn; + + CONNECTION_API_CALL(conn, session, reconfigure, config, cfg); + WT_UNUSED(cfg); + + /* Serialize reconfiguration. */ + __wt_spin_lock(session, &conn->reconfig_lock); + + /* + * The configuration argument has been checked for validity, replace the + * previous connection configuration. + * + * DO NOT merge the configuration before the reconfigure calls. Some + * of the underlying reconfiguration functions do explicit checks with + * the second element of the configuration array, knowing the defaults + * are in slot #1 and the application's modifications are in slot #2. 
+ */
+ config_cfg[0] = conn->cfg;
+ config_cfg[1] = config;
+
+ WT_ERR(__conn_statistics_config(session, config_cfg));
+ WT_ERR(__wt_async_reconfig(session, config_cfg));
+ WT_ERR(__wt_cache_config(session, config_cfg));
+ WT_ERR(__wt_cache_pool_config(session, config_cfg));
+ WT_ERR(__wt_checkpoint_server_create(session, config_cfg));
+ WT_ERR(__wt_lsm_manager_reconfig(session, config_cfg));
+ WT_ERR(__wt_statlog_create(session, config_cfg));
+ WT_ERR(__wt_verbose_config(session, config_cfg));
+
+ /* Only after all reconfigure calls succeed, merge and swap in the
+ * new connection configuration string. */
+ WT_ERR(__wt_config_merge(session, config_cfg, &p));
+ __wt_free(session, conn->cfg);
+ conn->cfg = p;
+
+err: __wt_spin_unlock(session, &conn->reconfig_lock);
+
+ API_END_RET(session, ret);
+}
+
+/*
+ * __conn_open_session --
+ * WT_CONNECTION->open_session method.
+ */
+static int
+__conn_open_session(WT_CONNECTION *wt_conn,
+    WT_EVENT_HANDLER *event_handler, const char *config,
+    WT_SESSION **wt_sessionp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session, *session_ret;
+
+ *wt_sessionp = NULL;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ session_ret = NULL;
+
+ CONNECTION_API_CALL(conn, session, open_session, config, cfg);
+ WT_UNUSED(cfg);
+
+ WT_ERR(__wt_open_session(conn, event_handler, config, &session_ret));
+
+ *wt_sessionp = &session_ret->iface;
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_config_append --
+ * Append an entry to a config stack.
+ *
+ * There is no bounds check here: the terminating NULL slot is simply
+ * overwritten, so the caller must size the array with a spare slot.
+ */
+static void
+__conn_config_append(const char *cfg[], const char *config)
+{
+ while (*cfg != NULL)
+  ++cfg;
+ *cfg = config;
+}
+
+/*
+ * __conn_config_check_version --
+ * Check if a configuration version isn't compatible.
+ */
+static int
+__conn_config_check_version(WT_SESSION_IMPL *session, const char *config)
+{
+ WT_CONFIG_ITEM vmajor, vminor;
+
+ /*
+ * Version numbers aren't included in all configuration strings, but
+ * we check all of them just in case. Ignore configurations without
+ * a version. 
+ */ + if (__wt_config_getones( + session, config, "version.major", &vmajor) == WT_NOTFOUND) + return (0); + WT_RET(__wt_config_getones(session, config, "version.minor", &vminor)); + + if (vmajor.val > WIREDTIGER_VERSION_MAJOR || + (vmajor.val == WIREDTIGER_VERSION_MAJOR && + vminor.val > WIREDTIGER_VERSION_MINOR)) + WT_RET_MSG(session, ENOTSUP, + "WiredTiger configuration is from an incompatible release " + "of the WiredTiger engine"); + + return (0); +} + +/* + * __conn_config_file -- + * Read WiredTiger config files from the home directory. + */ +static int +__conn_config_file(WT_SESSION_IMPL *session, + const char *filename, int is_user, const char **cfg, WT_ITEM *cbuf) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_FH *fh; + size_t len; + wt_off_t size; + int exist, quoted; + char *p, *t; + + conn = S2C(session); + fh = NULL; + + /* Configuration files are always optional. */ + WT_RET(__wt_exist(session, filename, &exist)); + if (!exist) + return (0); + + /* + * The base configuration should not exist if we are creating this + * database. + */ + if (!is_user && conn->is_new) + WT_RET_MSG(session, EINVAL, + "%s exists before database creation", filename); + + /* Open the configuration file. */ + WT_RET(__wt_open(session, filename, 0, 0, 0, &fh)); + WT_ERR(__wt_filesize(session, fh, &size)); + if (size == 0) + goto err; + + /* + * Sanity test: a 100KB configuration file would be insane. (There's + * no practical reason to limit the file size, but I can either limit + * the file size to something rational, or add code to test if the + * wt_off_t size is larger than a uint32_t, which is more complicated + * and a waste of time.) + */ + if (size > 100 * 1024) + WT_ERR_MSG( + session, EFBIG, "Configuration file too big: %s", filename); + len = (size_t)size; + + /* + * Copy the configuration file into memory, with a little slop, I'm not + * interested in debugging off-by-ones. 
+ * + * The beginning of a file is the same as if we run into an unquoted + * newline character, simplify the parsing loop by pretending that's + * what we're doing. + */ + WT_ERR(__wt_buf_init(session, cbuf, len + 10)); + WT_ERR(__wt_read( + session, fh, (wt_off_t)0, len, ((uint8_t *)cbuf->mem) + 1)); + ((uint8_t *)cbuf->mem)[0] = '\n'; + cbuf->size = len + 1; + + /* + * Collapse the file's lines into a single string: newline characters + * are replaced with commas unless the newline is quoted or backslash + * escaped. Comment lines (an unescaped newline where the next non- + * white-space character is a hash), are discarded. + */ + for (quoted = 0, p = t = cbuf->mem; len > 0;) { + /* + * Backslash pairs pass through untouched, unless immediately + * preceding a newline, in which case both the backslash and + * the newline are discarded. Backslash characters escape + * quoted characters, too, that is, a backslash followed by a + * quote doesn't start or end a quoted string. + */ + if (*p == '\\' && len > 1) { + if (p[1] != '\n') { + *t++ = p[0]; + *t++ = p[1]; + } + p += 2; + len -= 2; + continue; + } + + /* + * If we're in a quoted string, or starting a quoted string, + * take all characters, including white-space and newlines. + */ + if (quoted || *p == '"') { + if (*p == '"') + quoted = !quoted; + *t++ = *p++; + --len; + continue; + } + + /* Everything else gets taken, except for newline characters. */ + if (*p != '\n') { + *t++ = *p++; + --len; + continue; + } + + /* + * Replace any newline characters with commas (and strings of + * commas are safe). + * + * After any newline, skip to a non-white-space character; if + * the next character is a hash mark, skip to the next newline. + */ + for (;;) { + for (*t++ = ','; --len > 0 && isspace(*++p);) + ; + if (len == 0) + break; + if (*p != '#') + break; + while (--len > 0 && *++p != '\n') + ; + if (len == 0) + break; + } + } + *t = '\0'; + cbuf->size = WT_PTRDIFF(t, cbuf->data); + + /* Check any version. 
*/
+ WT_ERR(__conn_config_check_version(session, cbuf->data));
+
+ /* Upgrade the configuration string. */
+ WT_ERR(__wt_config_upgrade(session, cbuf));
+
+ /* Check the configuration information. */
+ WT_ERR(__wt_config_check(session, is_user ?
+    WT_CONFIG_REF(session, wiredtiger_open_usercfg) :
+    WT_CONFIG_REF(session, wiredtiger_open_basecfg), cbuf->data, 0));
+
+ /* Append it to the stack. */
+ __conn_config_append(cfg, cbuf->data);
+
+err: if (fh != NULL)
+  WT_TRET(__wt_close(session, fh));
+ return (ret);
+}
+
+/*
+ * __conn_config_env --
+ * Read configuration from an environment variable, if set.
+ */
+static int
+__conn_config_env(WT_SESSION_IMPL *session, const char *cfg[], WT_ITEM *cbuf)
+{
+ WT_CONFIG_ITEM cval;
+ const char *env_config;
+ size_t len;
+
+ /* An unset or empty WIREDTIGER_CONFIG is not an error. */
+ if ((env_config = getenv("WIREDTIGER_CONFIG")) == NULL)
+  return (0);
+ len = strlen(env_config);
+ if (len == 0)
+  return (0);
+ WT_RET(__wt_buf_set(session, cbuf, env_config, len + 1));
+
+ /*
+ * Security stuff:
+ *
+ * If the "use_environment_priv" configuration string is set, use the
+ * environment variable if the process has appropriate privileges.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "use_environment_priv", &cval));
+ if (cval.val == 0 && __wt_has_priv())
+  WT_RET_MSG(session, WT_ERROR, "%s",
+  "WIREDTIGER_CONFIG environment variable set but process "
+  "lacks privileges to use that environment variable");
+
+ /* Check any version. */
+ WT_RET(__conn_config_check_version(session, env_config));
+
+ /* Upgrade the configuration string. */
+ WT_RET(__wt_config_upgrade(session, cbuf));
+
+ /*
+ * NOTE(review): the upgrade above is applied to the cbuf copy, yet the
+ * version/config checks and the appended entry use env_config, the raw
+ * getenv() pointer -- confirm the upgraded copy is intentionally
+ * unused here, unlike __conn_config_file which appends cbuf->data.
+ */
+ /* Check the configuration information. */
+ WT_RET(__wt_config_check(session,
+    WT_CONFIG_REF(session, wiredtiger_open), env_config, 0));
+
+ /* Append it to the stack. */
+ __conn_config_append(cfg, env_config);
+
+ return (0);
+}
+
+/*
+ * __conn_home --
+ * Set the database home directory. 
 */
static int
__conn_home(WT_SESSION_IMPL *session, const char *home, const char *cfg[])
{
	WT_CONFIG_ITEM cval;

	/* If the application specifies a home directory, use it. */
	if (home != NULL)
		goto copy;

	/* If there's no WIREDTIGER_HOME environment variable, use ".". */
	if ((home = getenv("WIREDTIGER_HOME")) == NULL || strlen(home) == 0) {
		home = ".";
		goto copy;
	}

	/*
	 * Security stuff:
	 *
	 * Unless the "use_environment_priv" configuration string is set,
	 * fail if the process is running with special privileges.
	 */
	WT_RET(__wt_config_gets(session, cfg, "use_environment_priv", &cval));
	if (cval.val == 0 && __wt_has_priv())
		WT_RET_MSG(session, WT_ERROR, "%s",
		    "WIREDTIGER_HOME environment variable set but process "
		    "lacks privileges to use that environment variable");

	/* The connection owns an allocated copy of the home path. */
copy:	return (__wt_strdup(session, home, &S2C(session)->home));
}

/*
 * __conn_single --
 *	Confirm that no other thread of control is using this database.
 */
static int
__conn_single(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn, *t;
	WT_DECL_RET;
	WT_FH *fh;
	size_t len;
	wt_off_t size;
	char buf[256];

	conn = S2C(session);
	fh = NULL;

	/* Hold the process-wide lock across the whole ownership check. */
	__wt_spin_lock(session, &__wt_process.spinlock);

	/*
	 * We first check for other threads of control holding a lock on this
	 * database, because the byte-level locking functions are based on the
	 * POSIX 1003.1 fcntl APIs, which require all locks associated with a
	 * file for a given process are removed when any file descriptor for
	 * the file is closed by that process. In other words, we can't open a
	 * file handle on the lock file until we are certain that closing that
	 * handle won't discard the owning thread's lock. Applications hopefully
	 * won't open a database in multiple threads, but we don't want to have
	 * it fail the first time, but succeed the second.
 */
	TAILQ_FOREACH(t, &__wt_process.connqh, q)
		if (t->home != NULL &&
		    t != conn && strcmp(t->home, conn->home) == 0) {
			ret = EBUSY;
			break;
		}
	if (ret != 0)
		WT_ERR_MSG(session, EBUSY,
		    "WiredTiger database is already being managed by another "
		    "thread in this process");

	/*
	 * !!!
	 * Be careful changing this code.
	 *
	 * We locked the WiredTiger file before release 2.3.2; a separate lock
	 * file was added after 2.3.1 because hot backup has to copy the
	 * WiredTiger file and system utilities on Windows can't copy locked
	 * files.
	 *
	 * For this reason, we don't use the lock file's existence to decide if
	 * we're creating the database or not, use the WiredTiger file instead,
	 * it has existed in every version of WiredTiger.
	 *
	 * Additionally, avoid an upgrade race: a 2.3.1 release process might
	 * have the WiredTiger file locked, and we're going to create the lock
	 * file and lock it instead. For this reason, first acquire a lock on
	 * the lock file and then a lock on the WiredTiger file, then release
	 * the latter so hot backups can proceed. (If someone were to run a
	 * current release and subsequently a historic release, we could still
	 * fail because the historic release will ignore our lock file and will
	 * then successfully lock the WiredTiger file, but I can't think of any
	 * way to fix that.)
	 *
	 * Open the WiredTiger lock file, creating it if it doesn't exist. (I'm
	 * not removing the lock file if we create it and subsequently fail, it
	 * isn't simple to detect that case, and there's no risk other than a
	 * useless file being left in the directory.)
	 */
	WT_ERR(__wt_open(session, WT_SINGLETHREAD, 1, 0, 0, &conn->lock_fh));

	/*
	 * Lock a byte of the file: if we don't get the lock, some other process
	 * is holding it, we're done. The file may be zero-length, and that's
	 * OK, the underlying call supports locking past the end-of-file.
	 */
	if (__wt_bytelock(conn->lock_fh, (wt_off_t)0, 1) != 0)
		WT_ERR_MSG(session, EBUSY,
		    "WiredTiger database is already being managed by another "
		    "process");

	/*
	 * If the size of the lock file is 0, we created it (or we won a locking
	 * race with the thread that created it, it doesn't matter).
	 *
	 * Write something into the file, zero-length files make me nervous.
	 */
	WT_ERR(__wt_filesize(session, conn->lock_fh, &size));
	if (size == 0) {
#define WT_SINGLETHREAD_STRING "WiredTiger lock file\n"
		WT_ERR(__wt_write(session, conn->lock_fh, (wt_off_t)0,
		    strlen(WT_SINGLETHREAD_STRING), WT_SINGLETHREAD_STRING));
	}

	/* We own the lock file, optionally create the WiredTiger file. */
	WT_ERR(__wt_config_gets(session, cfg, "create", &cval));
	WT_ERR(__wt_open(session,
	    WT_WIREDTIGER, cval.val == 0 ? 0 : 1, 0, 0, &fh));

	/*
	 * Lock the WiredTiger file (for backward compatibility reasons as
	 * described above). Immediately release the lock, it's just a test.
	 */
	if (__wt_bytelock(fh, (wt_off_t)0, 1) != 0) {
		WT_ERR_MSG(session, EBUSY,
		    "WiredTiger database is already being managed by another "
		    "process");
	}
	WT_ERR(__wt_bytelock(fh, (wt_off_t)0, 0));

	/*
	 * If the size of the file is zero, we created it, fill it in. If the
	 * size of the file is non-zero, fail if configured for exclusivity.
	 */
	WT_ERR(__wt_filesize(session, fh, &size));
	if (size == 0) {
		/*
		 * NOTE(review): the snprintf result isn't checked for
		 * truncation; buf appears ample for the name and version
		 * strings -- confirm if either can grow.
		 */
		len = (size_t)snprintf(buf, sizeof(buf),
		    "%s\n%s\n", WT_WIREDTIGER, WIREDTIGER_VERSION_STRING);
		WT_ERR(__wt_write(session, fh, (wt_off_t)0, len, buf));

		conn->is_new = 1;
	} else {
		WT_ERR(__wt_config_gets(session, cfg, "exclusive", &cval));
		if (cval.val != 0)
			WT_ERR_MSG(session, EEXIST,
			    "WiredTiger database already exists and exclusive "
			    "option configured");

		conn->is_new = 0;
	}

err:	/*
	 * We ignore the connection's lock file handle on error, it will be
	 * closed when the connection structure is destroyed.
 */
	if (fh != NULL)
		WT_TRET(__wt_close(session, fh));

	__wt_spin_unlock(session, &__wt_process.spinlock);
	return (ret);
}

/*
 * __conn_statistics_config --
 *	Set statistics configuration.
 */
static int
__conn_statistics_config(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval, sval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	uint32_t flags;	/* LF_SET accumulates into this local. */
	int set;	/* Count of mutually-exclusive modes seen. */

	conn = S2C(session);

	WT_RET(__wt_config_gets(session, cfg, "statistics", &cval));

	flags = 0;
	set = 0;
	if ((ret = __wt_config_subgets(
	    session, &cval, "none", &sval)) == 0 && sval.val != 0) {
		LF_SET(WT_CONN_STAT_NONE);
		++set;
	}
	WT_RET_NOTFOUND_OK(ret);

	if ((ret = __wt_config_subgets(
	    session, &cval, "fast", &sval)) == 0 && sval.val != 0) {
		LF_SET(WT_CONN_STAT_FAST);
		++set;
	}
	WT_RET_NOTFOUND_OK(ret);

	if ((ret = __wt_config_subgets(
	    session, &cval, "all", &sval)) == 0 && sval.val != 0) {
		/* "all" implies "fast" as well. */
		LF_SET(WT_CONN_STAT_ALL | WT_CONN_STAT_FAST);
		++set;
	}
	WT_RET_NOTFOUND_OK(ret);

	/* "clear" is a modifier, it doesn't count against exclusivity. */
	if ((ret = __wt_config_subgets(
	    session, &cval, "clear", &sval)) == 0 && sval.val != 0)
		LF_SET(WT_CONN_STAT_CLEAR);
	WT_RET_NOTFOUND_OK(ret);

	if (set > 1)
		WT_RET_MSG(session, EINVAL,
		    "only one statistics configuration value may be specified");

	/* Configuring statistics clears any existing values. */
	conn->stat_flags = flags;

	return (0);
}

/* Simple structure for name and flag configuration searches. */
typedef struct {
	const char *name;	/* Configuration sub-key. */
	uint32_t flag;		/* Flag bit(s) to set when present. */
} WT_NAME_FLAG;

/*
 * __wt_verbose_config --
 *	Set verbose configuration.
 */
int
__wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
{
	/* The table must stay NULL-terminated, the loop depends on it. */
	static const WT_NAME_FLAG verbtypes[] = {
		{ "api", WT_VERB_API },
		{ "block", WT_VERB_BLOCK },
		{ "checkpoint", WT_VERB_CHECKPOINT },
		{ "compact", WT_VERB_COMPACT },
		{ "evict", WT_VERB_EVICT },
		{ "evictserver", WT_VERB_EVICTSERVER },
		{ "fileops", WT_VERB_FILEOPS },
		{ "log", WT_VERB_LOG },
		{ "lsm", WT_VERB_LSM },
		{ "metadata", WT_VERB_METADATA },
		{ "mutex", WT_VERB_MUTEX },
		{ "overflow", WT_VERB_OVERFLOW },
		{ "read", WT_VERB_READ },
		{ "reconcile", WT_VERB_RECONCILE },
		{ "recovery", WT_VERB_RECOVERY },
		{ "salvage", WT_VERB_SALVAGE },
		{ "shared_cache", WT_VERB_SHARED_CACHE },
		{ "split", WT_VERB_SPLIT },
		{ "temporary", WT_VERB_TEMPORARY },
		{ "transaction", WT_VERB_TRANSACTION },
		{ "verify", WT_VERB_VERIFY },
		{ "version", WT_VERB_VERSION },
		{ "write", WT_VERB_WRITE },
		{ NULL, 0 }
	};
	WT_CONFIG_ITEM cval, sval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	const WT_NAME_FLAG *ft;
	uint32_t flags;

	conn = S2C(session);

	WT_RET(__wt_config_gets(session, cfg, "verbose", &cval));

	flags = 0;
	for (ft = verbtypes; ft->name != NULL; ft++) {
		if ((ret = __wt_config_subgets(
		    session, &cval, ft->name, &sval)) == 0 && sval.val != 0) {
#ifdef HAVE_VERBOSE
			LF_SET(ft->flag);
#else
			/*
			 * Without verbose support compiled in, any verbose
			 * setting is a hard configuration error.
			 */
			WT_RET_MSG(session, EINVAL,
			    "Verbose option specified when WiredTiger built "
			    "without verbose support. Add --enable-verbose to "
			    "configure command and rebuild to include support "
			    "for verbose messages");
#endif
		}
		WT_RET_NOTFOUND_OK(ret);
	}

	conn->verbose = flags;
	return (0);
}

/*
 * __conn_write_config --
 *	Save the configuration used to create a database.
 */
static int
__conn_write_config(
    WT_SESSION_IMPL *session, const char *filename, const char *cfg[])
{
	FILE *fp;
	WT_CONFIG parser;
	WT_CONFIG_ITEM k, v;
	WT_DECL_RET;
	char *path;

	/*
	 * We were passed an array of configuration strings where slot 0 is
	 * all possible values and the second and subsequent slots are changes
	 * specified by the application during open (using the wiredtiger_open
	 * configuration string, an environment variable, or user-configuration
	 * file). The base configuration file contains all changes to default
	 * settings made at create, and we include the user-configuration file
	 * in that list, even though we don't expect it to change. Of course,
	 * an application could leave that file as it is right now and not
	 * remove a configuration we need, but applications can also guarantee
	 * all database users specify consistent environment variables and
	 * wiredtiger_open configuration arguments, and if we protect against
	 * those problems, might as well include the application's configuration
	 * file as well.
	 *
	 * If there is no configuration, don't bother creating an empty file.
	 */
	if (cfg[1] == NULL)
		return (0);

	WT_RET(__wt_filename(session, filename, &path));
	if ((fp = fopen(path, "w")) == NULL)
		ret = __wt_errno();
	__wt_free(session, path);
	if (fp == NULL)
		return (ret);

	/*
	 * NOTE(review): fprintf results are unchecked throughout; the fclose
	 * below is checked, which catches flush-time write failures.
	 */
	fprintf(fp, "%s\n\n",
	    "# Do not modify this file.\n"
	    "#\n"
	    "# WiredTiger created this file when the database was created,\n"
	    "# to store persistent database settings.  Instead of changing\n"
	    "# these settings, set a WIREDTIGER_CONFIG environment variable\n"
	    "# or create a WiredTiger.config file to override them.");

	fprintf(fp, "version=(major=%d,minor=%d)\n\n",
	    WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR);

	/*
	 * We want the list of defaults that have been changed, that is, if the
	 * application didn't somehow configure a setting, we don't write out a
	 * default value, so future releases may silently migrate to new default
	 * values.
	 */
	while (*++cfg != NULL) {
		WT_ERR(__wt_config_init( session,
		    &parser, WT_CONFIG_BASE(session, wiredtiger_open_basecfg)));
		while ((ret = __wt_config_next(&parser, &k, &v)) == 0) {
			if ((ret =
			    __wt_config_getone(session, *cfg, &k, &v)) == 0) {
				/* Fix quoting for non-trivial settings. */
				if (v.type == WT_CONFIG_ITEM_STRING) {
					--v.str;
					v.len += 2;
				}
				fprintf(fp, "%.*s=%.*s\n",
				    (int)k.len, k.str, (int)v.len, v.str);
			}
			WT_ERR_NOTFOUND_OK(ret);
		}
		WT_ERR_NOTFOUND_OK(ret);
	}

	/* fp is always non-NULL here, open failures returned early above. */
err:	WT_TRET(fclose(fp));

	/* Don't leave a damaged file in place. */
	if (ret != 0)
		(void)__wt_remove(session, filename);

	return (ret);
}

/*
 * wiredtiger_open --
 *	Main library entry point: open a new connection to a WiredTiger
 * database.
 */
int
wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
    const char *config, WT_CONNECTION **wt_connp)
{
	/* Method table copied into every new connection handle. */
	static const WT_CONNECTION stdc = {
		__conn_async_flush,
		__conn_async_new_op,
		__conn_close,
		__conn_reconfigure,
		__conn_get_home,
		__conn_configure_method,
		__conn_is_new,
		__conn_open_session,
		__conn_load_extension,
		__conn_add_data_source,
		__conn_add_collator,
		__conn_add_compressor,
		__conn_add_extractor,
		__conn_get_extension_api
	};
	static const WT_NAME_FLAG file_types[] = {
		{ "checkpoint", WT_FILE_TYPE_CHECKPOINT },
		{ "data", WT_FILE_TYPE_DATA },
		{ "log", WT_FILE_TYPE_LOG },
		{ NULL, 0 }
	};

	WT_CONFIG_ITEM cval, sval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_ITEM i1, i2, i3;
	const WT_NAME_FLAG *ft;
	WT_SESSION_IMPL *session;

	/* Leave space for optional additional configuration. */
	const char *cfg[] = { NULL, NULL, NULL, NULL, NULL, NULL };

	*wt_connp = NULL;

	conn = NULL;
	session = NULL;

	/*
	 * We could use scratch buffers, but I'd rather the default session
	 * not tie down chunks of memory past the open call.
	 */
	WT_CLEAR(i1);
	WT_CLEAR(i2);
	WT_CLEAR(i3);

	WT_RET(__wt_library_init());

	WT_RET(__wt_calloc_def(NULL, 1, &conn));
	conn->iface = stdc;

	/*
	 * Immediately link the structure into the connection structure list:
	 * the only thing ever looked at on that list is the database name,
	 * and a NULL value is fine.
	 */
	__wt_spin_lock(NULL, &__wt_process.spinlock);
	TAILQ_INSERT_TAIL(&__wt_process.connqh, conn, q);
	__wt_spin_unlock(NULL, &__wt_process.spinlock);

	/* Until the real sessions exist, run on the embedded dummy session. */
	session = conn->default_session = &conn->dummy_session;
	session->iface.connection = &conn->iface;
	session->name = "wiredtiger_open";
	__wt_random_init(session->rnd);
	__wt_event_handler_set(session, event_handler);

	/* Remaining basic initialization of the connection structure. */
	WT_ERR(__wt_connection_init(conn));

	/* Check/set the application-specified configuration string. */
	WT_ERR(__wt_config_check(session,
	    WT_CONFIG_REF(session, wiredtiger_open), config, 0));
	cfg[0] = WT_CONFIG_BASE(session, wiredtiger_open);
	cfg[1] = config;

	/* Configure error messages so we get them right early. */
	WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval));
	if (cval.len != 0)
		WT_ERR(__wt_strndup(
		    session, cval.str, cval.len, &conn->error_prefix));

	/* Get the database home. */
	WT_ERR(__conn_home(session, home, cfg));

	/* Make sure no other thread of control already owns this database. */
	WT_ERR(__conn_single(session, cfg));

	/*
	 * Build the configuration stack, in the following order (where later
	 * entries override earlier entries):
	 *
	 * 1. all possible wiredtiger_open configurations
	 * 2. base configuration file, created with the database (optional)
	 * 3. the config passed in by the application.
	 * 4. user configuration file (optional)
	 * 5. environment variable settings (optional)
	 *
	 * Clear the entries we added to the stack, we're going to build it in
	 * order.
	 */
	cfg[0] = WT_CONFIG_BASE(session, wiredtiger_open_all);
	cfg[1] = NULL;
	WT_ERR(__conn_config_file(session, WT_BASECONFIG, 0, cfg, &i1));
	__conn_config_append(cfg, config);
	WT_ERR(__conn_config_file(session, WT_USERCONFIG, 1, cfg, &i2));
	WT_ERR(__conn_config_env(session, cfg, &i3));

	/*
	 * Configuration ...
	 *
	 * We can't open sessions yet, so any configurations that cause
	 * sessions to be opened must be handled inside __wt_connection_open.
	 *
	 * The error message configuration might have changed (if set in a
	 * configuration file, and not in the application's configuration
	 * string), get it again. Do it first, make error messages correct.
	 */
	WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval));
	if (cval.len != 0) {
		__wt_free(session, conn->error_prefix);
		WT_ERR(__wt_strndup(
		    session, cval.str, cval.len, &conn->error_prefix));
	}

	WT_ERR(__wt_config_gets(session, cfg, "hazard_max", &cval));
	conn->hazard_max = (uint32_t)cval.val;

	WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval));
	conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS;

	WT_ERR(__wt_config_gets(session, cfg, "checkpoint_sync", &cval));
	if (cval.val)
		F_SET(conn, WT_CONN_CKPT_SYNC);

	WT_ERR(__wt_config_gets(session, cfg, "buffer_alignment", &cval));
	if (cval.val == -1)
		conn->buffer_alignment = WT_BUFFER_ALIGNMENT_DEFAULT;
	else
		conn->buffer_alignment = (size_t)cval.val;
#ifndef HAVE_POSIX_MEMALIGN
	if (conn->buffer_alignment != 0)
		WT_ERR_MSG(session, EINVAL,
		    "buffer_alignment requires posix_memalign");
#endif

	WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval));
	for (ft = file_types; ft->name != NULL; ft++) {
		ret = __wt_config_subgets(session, &cval, ft->name, &sval);
		if (ret == 0) {
			if (sval.val)
				FLD_SET(conn->direct_io, ft->flag);
		} else if (ret != WT_NOTFOUND)
			goto err;
	}

	/* Note: only data and log files support extension lengths. */
	WT_ERR(__wt_config_gets(session, cfg, "file_extend", &cval));
	for (ft = file_types; ft->name != NULL; ft++) {
		ret = __wt_config_subgets(session, &cval, ft->name, &sval);
		if (ret == 0) {
			switch (ft->flag) {
			case WT_FILE_TYPE_DATA:
				conn->data_extend_len = sval.val;
				break;
			case WT_FILE_TYPE_LOG:
				conn->log_extend_len = sval.val;
				break;
			}
		} else if (ret != WT_NOTFOUND)
			goto err;
	}

	WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval));
	conn->mmap = cval.val == 0 ? 0 : 1;

	WT_ERR(__conn_statistics_config(session, cfg));
	WT_ERR(__wt_lsm_manager_config(session, cfg));
	WT_ERR(__wt_verbose_config(session, cfg));

	/* Now that we know if verbose is configured, output the version. */
	WT_ERR(__wt_verbose(
	    session, WT_VERB_VERSION, "%s", WIREDTIGER_VERSION_STRING));

	/*
	 * Open the connection, then reset the local session as the real one
	 * was allocated in __wt_connection_open.
	 */
	WT_ERR(__wt_connection_open(conn, cfg));
	session = conn->default_session;

	/*
	 * Check on the turtle and metadata files, creating them if necessary
	 * (which avoids application threads racing to create the metadata file
	 * later). Once the metadata file exists, get a reference to it in
	 * the connection's session.
	 */
	WT_ERR(__wt_turtle_init(session));
	WT_ERR(__wt_metadata_open(session));

	/*
	 * Load the extensions after initialization completes; extensions expect
	 * everything else to be in place, and the extensions call back into the
	 * library.
	 */
	WT_ERR(__conn_load_extensions(session, cfg));

	/*
	 * We've completed configuration, write the base configuration file if
	 * we're creating the database.
	 */
	if (conn->is_new) {
		WT_ERR(__wt_config_gets(session, cfg, "config_base", &cval));
		if (cval.val)
			WT_ERR(
			    __conn_write_config(session, WT_BASECONFIG, cfg));
	}

	/*
	 * Start the worker threads last.
	 */
	WT_ERR(__wt_connection_workers(session, cfg));

	/* Merge the final configuration for later reconfiguration. */
	WT_ERR(__wt_config_merge(session, cfg, &conn->cfg));

	/* The public interface must be the first field of the impl struct. */
	WT_STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
	*wt_connp = &conn->iface;

	/*
	 * NOTE(review): "session" may have been switched to the real default
	 * session above while i1/i2/i3 were allocated on the dummy session --
	 * presumably the allocator doesn't care which session frees; confirm.
	 */
err:	/* Discard the configuration strings. */
	__wt_buf_free(session, &i1);
	__wt_buf_free(session, &i2);
	__wt_buf_free(session, &i3);

	if (ret != 0 && conn != NULL)
		WT_TRET(__wt_connection_close(conn));

	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c
new file mode 100644
index 00000000000..079bd05ff1e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_cache.c
@@ -0,0 +1,174 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_cache_config --
 *	Configure the underlying cache.
 */
int
__wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CACHE *cache;
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);
	cache = conn->cache;

	/*
	 * If not using a shared cache configure the cache size, otherwise
	 * check for a reserved size.
	 */
	if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) {
		WT_RET(__wt_config_gets(session, cfg, "cache_size", &cval));
		conn->cache_size = (uint64_t)cval.val;
	} else {
		/* A zero reserve falls back to the shared-cache chunk size. */
		WT_RET(__wt_config_gets(
		    session, cfg, "shared_cache.reserve", &cval));
		if (cval.val == 0)
			WT_RET(__wt_config_gets(
			    session, cfg, "shared_cache.chunk", &cval));
		cache->cp_reserved = (uint64_t)cval.val;
	}

	WT_RET(__wt_config_gets(session, cfg, "eviction_target", &cval));
	cache->eviction_target = (u_int)cval.val;

	WT_RET(__wt_config_gets(session, cfg, "eviction_trigger", &cval));
	cache->eviction_trigger = (u_int)cval.val;

	WT_RET(__wt_config_gets(session, cfg, "eviction_dirty_target", &cval));
	cache->eviction_dirty_target = (u_int)cval.val;

	/*
	 * The eviction thread configuration options include the main eviction
	 * thread and workers. Our implementation splits them out. Adjust for
	 * the difference when parsing the configuration.
	 */
	WT_RET(__wt_config_gets(session, cfg, "eviction.threads_max", &cval));
	WT_ASSERT(session, cval.val > 0);
	conn->evict_workers_max = (u_int)cval.val - 1;

	WT_RET(__wt_config_gets(session, cfg, "eviction.threads_min", &cval));
	WT_ASSERT(session, cval.val > 0);
	conn->evict_workers_min = (u_int)cval.val - 1;

	if (conn->evict_workers_min > conn->evict_workers_max)
		WT_RET_MSG(session, EINVAL,
		    "eviction=(threads_min) cannot be greater than "
		    "eviction=(threads_max)");

	return (0);
}

/*
 * __wt_cache_create --
 *	Create the underlying cache.
 */
int
__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	conn = S2C(session);

	/* A cache may already exist only when joining a shared cache pool. */
	WT_ASSERT(session, conn->cache == NULL ||
	    (F_ISSET(conn, WT_CONN_CACHE_POOL) && conn->cache != NULL));

	WT_RET(__wt_calloc_def(session, 1, &conn->cache));

	cache = conn->cache;

	/* Use a common routine for run-time configuration options. */
	WT_RET(__wt_cache_config(session, cfg));

	/* Add the configured cache to the cache pool. */
	if (F_ISSET(conn, WT_CONN_CACHE_POOL))
		WT_RET(__wt_conn_cache_pool_open(session));

	/*
	 * The target size must be lower than the trigger size or we will never
	 * get any work done.
	 */
	if (cache->eviction_target >= cache->eviction_trigger)
		WT_ERR_MSG(session, EINVAL,
		    "eviction target must be lower than the eviction trigger");

	WT_ERR(__wt_cond_alloc(session,
	    "cache eviction server", 0, &cache->evict_cond));
	WT_ERR(__wt_cond_alloc(session,
	    "eviction waiters", 0, &cache->evict_waiter_cond));
	WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction"));
	WT_ERR(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk"));

	/* Allocate the LRU eviction queue.
*/ + cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR; + WT_ERR(__wt_calloc_def(session, cache->evict_slots, &cache->evict)); + + /* + * We get/set some values in the cache statistics (rather than have + * two copies), configure them. + */ + __wt_cache_stats_update(session); + return (0); + +err: WT_RET(__wt_cache_destroy(session)); + return (ret); +} + +/* + * __wt_cache_stats_update -- + * Update the cache statistics for return to the application. + */ +void +__wt_cache_stats_update(WT_SESSION_IMPL *session) +{ + WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; + WT_CONNECTION_STATS *stats; + + conn = S2C(session); + cache = conn->cache; + stats = &conn->stats; + + WT_STAT_SET(stats, cache_bytes_max, conn->cache_size); + WT_STAT_SET(stats, cache_bytes_inuse, __wt_cache_bytes_inuse(cache)); + WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache)); + WT_STAT_SET(stats, cache_bytes_dirty, cache->bytes_dirty); + WT_STAT_SET(stats, cache_pages_dirty, cache->pages_dirty); +} + +/* + * __wt_cache_destroy -- + * Discard the underlying cache. + */ +int +__wt_cache_destroy(WT_SESSION_IMPL *session) +{ + WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + conn = S2C(session); + cache = conn->cache; + + if (cache == NULL) + return (0); + + WT_TRET(__wt_cond_destroy(session, &cache->evict_cond)); + WT_TRET(__wt_cond_destroy(session, &cache->evict_waiter_cond)); + __wt_spin_destroy(session, &cache->evict_lock); + __wt_spin_destroy(session, &cache->evict_walk_lock); + + __wt_free(session, cache->evict); + __wt_free(session, conn->cache); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c new file mode 100644 index 00000000000..ba80ac15267 --- /dev/null +++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c @@ -0,0 +1,639 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. 
+ * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * Tuning constants. + */ +/* Threshold when a connection is allocated more cache */ +#define WT_CACHE_POOL_BUMP_THRESHOLD 6 +/* Threshold when a connection is allocated less cache */ +#define WT_CACHE_POOL_REDUCE_THRESHOLD 2 +/* Balancing passes after a bump before a connection is a candidate. */ +#define WT_CACHE_POOL_BUMP_SKIPS 10 +/* Balancing passes after a reduction before a connection is a candidate. */ +#define WT_CACHE_POOL_REDUCE_SKIPS 5 + +static int __cache_pool_adjust(WT_SESSION_IMPL *, uint64_t, uint64_t, int *); +static int __cache_pool_assess(WT_SESSION_IMPL *, uint64_t *); +static int __cache_pool_balance(WT_SESSION_IMPL *); + +/* + * __wt_cache_pool_config -- + * Parse and setup the cache pool options. + */ +int +__wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) +{ + WT_CACHE_POOL *cp; + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn, *entry; + WT_DECL_RET; + char *pool_name; + int created, reconfiguring; + uint64_t chunk, reserve, size, used_cache; + + conn = S2C(session); + created = reconfiguring = 0; + pool_name = NULL; + cp = NULL; + size = 0; + + if (F_ISSET(conn, WT_CONN_CACHE_POOL)) + reconfiguring = 1; + else { + WT_RET( + __wt_config_gets(session, cfg, "shared_cache.name", &cval)); + if (cval.len == 0) { + /* + * Tell the user if they configured some shared cache + * settings, but didn't enable it by naming it. 
+ */ + if (__wt_config_gets(session, + &cfg[1], "shared_cache", &cval) != WT_NOTFOUND) + WT_RET_MSG(session, EINVAL, + "Shared cache configuration requires a " + "pool name"); + return (0); + } + if (__wt_config_gets(session, + &cfg[1], "cache_size", &cval) != WT_NOTFOUND) + WT_RET_MSG(session, EINVAL, + "Only one of cache_size and shared_cache can be " + "in the configuration"); + + /* + * NOTE: The allocations made when configuring and opening a + * cache pool don't really belong to the connection that + * allocates them. If a memory allocator becomes connection + * specific in the future we will need a way to allocate memory + * outside of the connection here. + */ + WT_RET(__wt_strndup(session, cval.str, cval.len, &pool_name)); + } + + __wt_spin_lock(session, &__wt_process.spinlock); + if (__wt_process.cache_pool == NULL) { + WT_ASSERT(session, !reconfiguring); + /* Create a cache pool. */ + WT_ERR(__wt_calloc_def(session, 1, &cp)); + created = 1; + cp->name = pool_name; + pool_name = NULL; /* Belongs to the cache pool now. */ + TAILQ_INIT(&cp->cache_pool_qh); + WT_ERR(__wt_spin_init( + session, &cp->cache_pool_lock, "cache shared pool")); + WT_ERR(__wt_cond_alloc(session, + "cache pool server", 0, &cp->cache_pool_cond)); + + __wt_process.cache_pool = cp; + WT_ERR(__wt_verbose(session, + WT_VERB_SHARED_CACHE, "Created cache pool %s", cp->name)); + } else if (!reconfiguring && !WT_STRING_MATCH( + __wt_process.cache_pool->name, pool_name, strlen(pool_name))) + /* Only a single cache pool is supported. */ + WT_ERR_MSG(session, WT_ERROR, + "Attempting to join a cache pool that does not exist: %s", + pool_name); + + cp = __wt_process.cache_pool; + + /* + * The cache pool requires a reference count to avoid a race between + * configuration/open and destroy. + */ + if (!reconfiguring) + ++cp->refs; + + /* + * Cache pool configurations are optional when not creating. 
If + * values aren't being changed, retrieve the current value so that + * validation of settings works. + */ + if (!created) { + if (__wt_config_gets(session, &cfg[1], + "shared_cache.size", &cval) == 0 && cval.val != 0) + size = (uint64_t)cval.val; + else + size = cp->size; + if (__wt_config_gets(session, &cfg[1], + "shared_cache.chunk", &cval) == 0 && cval.val != 0) + chunk = (uint64_t)cval.val; + else + chunk = cp->chunk; + } else { + /* + * The only time shared cache configuration uses default + * values is when we are creating the pool. + */ + WT_ERR(__wt_config_gets( + session, cfg, "shared_cache.size", &cval)); + WT_ASSERT(session, cval.val != 0); + size = (uint64_t)cval.val; + WT_ERR(__wt_config_gets( + session, cfg, "shared_cache.chunk", &cval)); + WT_ASSERT(session, cval.val != 0); + chunk = (uint64_t)cval.val; + } + + /* + * Retrieve the reserve size here for validation of configuration. + * Don't save it yet since the connections cache is not created if + * we are opening. Cache configuration is responsible for saving the + * setting. + * The different conditions when reserved size are set are: + * - It's part of the users configuration - use that value. + * - We are reconfiguring - keep the previous value. + * - We are joining a cache pool for the first time (including + * creating the pool) - use the chunk size; that's the default. + */ + if (__wt_config_gets(session, &cfg[1], + "shared_cache.reserve", &cval) == 0 && cval.val != 0) + reserve = (uint64_t)cval.val; + else if (reconfiguring) + reserve = conn->cache->cp_reserved; + else + reserve = chunk; + + /* + * Validate that size and reserve values don't cause the cache + * pool to be over subscribed. + */ + used_cache = 0; + if (!created) { + TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) + used_cache += entry->cache->cp_reserved; + } + if (used_cache + reserve > size) + WT_ERR_MSG(session, EINVAL, + "Shared cache unable to accommodate this configuration. 
" + "Shared cache size: %" PRIu64 ", reserved: %" PRIu64, + size, used_cache + reserve); + + /* The configuration is verified - it's safe to update the pool. */ + cp->size = size; + cp->chunk = chunk; + + /* Wake up the cache pool server so any changes are noticed. */ + if (reconfiguring) + WT_ERR(__wt_cond_signal( + session, __wt_process.cache_pool->cache_pool_cond)); + + WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE, + "Configured cache pool %s. Size: %" PRIu64 + ", chunk size: %" PRIu64, cp->name, cp->size, cp->chunk)); + + F_SET(conn, WT_CONN_CACHE_POOL); +err: __wt_spin_unlock(session, &__wt_process.spinlock); + if (!reconfiguring) + __wt_free(session, pool_name); + if (ret != 0 && created) { + __wt_free(session, cp->name); + WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond)); + __wt_free(session, cp); + } + return (ret); +} + +/* + * __wt_conn_cache_pool_open -- + * Add a connection to the cache pool. + */ +int +__wt_conn_cache_pool_open(WT_SESSION_IMPL *session) +{ + WT_CACHE *cache; + WT_CACHE_POOL *cp; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + conn = S2C(session); + cache = conn->cache; + cp = __wt_process.cache_pool; + + /* + * Create a session that can be used by the cache pool thread, do + * it in the main thread to avoid shutdown races + */ + if ((ret = __wt_open_internal_session( + conn, "cache-pool", 0, 0, &cache->cp_session)) != 0) + WT_RET_MSG(NULL, ret, + "Failed to create session for cache pool"); + + /* + * Add this connection into the cache pool connection queue. Figure + * out if a manager thread is needed while holding the lock. Don't + * start the thread until we have released the lock. 
+ */ + __wt_spin_lock(session, &cp->cache_pool_lock); + TAILQ_INSERT_TAIL(&cp->cache_pool_qh, conn, cpq); + __wt_spin_unlock(session, &cp->cache_pool_lock); + + WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, + "Added %s to cache pool %s", conn->home, cp->name)); + + /* + * Each connection participating in the cache pool starts a manager + * thread. Only one manager is active at a time, but having a thread + * in each connection saves having a complex election process when + * the active connection shuts down. + */ + F_SET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE); + F_SET(cache, WT_CACHE_POOL_RUN); + WT_RET(__wt_thread_create(session, &cache->cp_tid, + __wt_cache_pool_server, cache->cp_session)); + + /* Wake up the cache pool server to get our initial chunk. */ + WT_RET(__wt_cond_signal(session, cp->cache_pool_cond)); + + return (0); +} + +/* + * __wt_conn_cache_pool_destroy -- + * Remove our resources from the shared cache pool. Remove the cache pool + * if we were the last connection. + */ +int +__wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session) +{ + WT_CACHE *cache; + WT_CACHE_POOL *cp; + WT_CONNECTION_IMPL *conn, *entry; + WT_DECL_RET; + WT_SESSION *wt_session; + int cp_locked, found; + + conn = S2C(session); + cache = conn->cache; + cp_locked = found = 0; + cp = __wt_process.cache_pool; + + if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) + return (0); + + __wt_spin_lock(session, &cp->cache_pool_lock); + cp_locked = 1; + TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) + if (entry == conn) { + found = 1; + break; + } + + /* + * If there was an error during open, we may not have made it onto the + * queue. We did increment the reference count, so proceed regardless. + */ + if (found) { + WT_TRET(__wt_verbose(session, WT_VERB_SHARED_CACHE, + "Removing %s from cache pool", entry->home)); + TAILQ_REMOVE(&cp->cache_pool_qh, entry, cpq); + + /* Give the connection's resources back to the pool. 
*/ + WT_ASSERT(session, cp->currently_used >= conn->cache_size); + cp->currently_used -= conn->cache_size; + + /* + * Stop our manager thread - release the cache pool lock while + * joining the thread to allow it to complete any balance + * operation. + */ + __wt_spin_unlock(session, &cp->cache_pool_lock); + cp_locked = 0; + + F_CLR(cache, WT_CACHE_POOL_RUN); + WT_TRET(__wt_cond_signal(session, cp->cache_pool_cond)); + WT_TRET(__wt_thread_join(session, cache->cp_tid)); + + wt_session = &cache->cp_session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + + /* + * Grab the lock again now to stop other threads joining the + * pool while we are figuring out whether we were the last + * participant. + */ + __wt_spin_lock(session, &cp->cache_pool_lock); + cp_locked = 1; + } + + /* + * If there are no references, we are cleaning up after a failed + * wiredtiger_open, there is nothing further to do. + */ + if (cp->refs < 1) { + if (cp_locked) + __wt_spin_unlock(session, &cp->cache_pool_lock); + return (0); + } + + if (--cp->refs == 0) { + WT_ASSERT(session, TAILQ_EMPTY(&cp->cache_pool_qh)); + F_CLR_ATOMIC(cp, WT_CACHE_POOL_ACTIVE); + } + + if (!F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE)) { + WT_TRET(__wt_verbose( + session, WT_VERB_SHARED_CACHE, "Destroying cache pool")); + __wt_spin_lock(session, &__wt_process.spinlock); + /* + * We have been holding the pool lock - no connections could + * have been added. + */ + WT_ASSERT(session, + cp == __wt_process.cache_pool && + TAILQ_EMPTY(&cp->cache_pool_qh)); + __wt_process.cache_pool = NULL; + __wt_spin_unlock(session, &__wt_process.spinlock); + __wt_spin_unlock(session, &cp->cache_pool_lock); + cp_locked = 0; + + /* Now free the pool. 
*/ + __wt_free(session, cp->name); + + __wt_spin_destroy(session, &cp->cache_pool_lock); + WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond)); + __wt_free(session, cp); + } + + if (cp_locked) { + __wt_spin_unlock(session, &cp->cache_pool_lock); + + /* Notify other participants if we were managing */ + if (F_ISSET(cache, WT_CACHE_POOL_MANAGER)) { + F_CLR_ATOMIC(cp, WT_CACHE_POOL_MANAGED); + WT_TRET(__wt_verbose(session, WT_VERB_SHARED_CACHE, + "Shutting down shared cache manager connection")); + } + } + + return (ret); +} + +/* + * __cache_pool_balance -- + * Do a pass over the cache pool members and ensure the pool is being + * effectively used. + */ +static int +__cache_pool_balance(WT_SESSION_IMPL *session) +{ + WT_CACHE_POOL *cp; + WT_DECL_RET; + int adjusted; + uint64_t bump_threshold, highest; + + cp = __wt_process.cache_pool; + adjusted = 0; + highest = 0; + + __wt_spin_lock(NULL, &cp->cache_pool_lock); + + /* If the queue is empty there is nothing to do. */ + if (TAILQ_FIRST(&cp->cache_pool_qh) == NULL) + goto err; + + WT_ERR(__cache_pool_assess(session, &highest)); + bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD; + /* + * Actively attempt to: + * - Reduce the amount allocated, if we are over the budget + * - Increase the amount used if there is capacity and any pressure. + */ + for (bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD; + F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) && + F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN);) { + WT_ERR(__cache_pool_adjust( + session, highest, bump_threshold, &adjusted)); + /* + * Stop if the amount of cache being used is stable, and we + * aren't over capacity. + */ + if (cp->currently_used <= cp->size && !adjusted) + break; + if (bump_threshold > 0) + --bump_threshold; + } + +err: __wt_spin_unlock(NULL, &cp->cache_pool_lock); + return (ret); +} + +/* + * __cache_pool_assess -- + * Assess the usage of the cache pool. 
+ */ +static int +__cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest) +{ + WT_CACHE_POOL *cp; + WT_CACHE *cache; + WT_CONNECTION_IMPL *entry; + uint64_t entries, highest, new; + + cp = __wt_process.cache_pool; + entries = highest = 0; + + /* Generate read pressure information. */ + TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) { + if (entry->cache_size == 0 || + entry->cache == NULL) + continue; + cache = entry->cache; + ++entries; + new = cache->bytes_evict; + /* Handle wrapping of eviction requests. */ + if (new >= cache->cp_saved_evict) + cache->cp_current_evict = new - cache->cp_saved_evict; + else + cache->cp_current_evict = new; + cache->cp_saved_evict = new; + if (cache->cp_current_evict > highest) + highest = cache->cp_current_evict; + } + WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, + "Highest eviction count: %" PRIu64 ", entries: %" PRIu64, + highest, entries)); + /* Normalize eviction information across connections. */ + highest = highest / (entries + 1); + ++highest; /* Avoid divide by zero. */ + + *phighest = highest; + return (0); +} + +/* + * __cache_pool_adjust -- + * Adjust the allocation of cache to each connection. If force is set + * ignore cache load information, and reduce the allocation for every + * connection allocated more than their reserved size. 
+ */ +static int +__cache_pool_adjust(WT_SESSION_IMPL *session, + uint64_t highest, uint64_t bump_threshold, int *adjustedp) +{ + WT_CACHE_POOL *cp; + WT_CACHE *cache; + WT_CONNECTION_IMPL *entry; + uint64_t adjusted, reserved, read_pressure; + int force, grew; + + *adjustedp = 0; + cp = __wt_process.cache_pool; + force = (cp->currently_used > cp->size); + grew = 0; + if (WT_VERBOSE_ISSET(session, WT_VERB_SHARED_CACHE)) { + WT_RET(__wt_verbose(session, + WT_VERB_SHARED_CACHE, "Cache pool distribution: ")); + WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, + "\t" "cache_size, read_pressure, skips: ")); + } + + TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) { + cache = entry->cache; + reserved = cache->cp_reserved; + adjusted = 0; + + read_pressure = cache->cp_current_evict / highest; + WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, + "\t%" PRIu64 ", %" PRIu64 ", %" PRIu32, + entry->cache_size, read_pressure, cache->cp_skip_count)); + + /* Allow to stabilize after changes. */ + if (cache->cp_skip_count > 0 && --cache->cp_skip_count > 0) + continue; + /* + * If the entry is currently allocated less than the reserved + * size, increase it's allocation. This should only happen if: + * - It's the first time we've seen this member + * - The reserved size has been adjusted + */ + if (entry->cache_size < reserved) { + grew = 1; + adjusted = reserved - entry->cache_size; + /* + * Conditions for reducing the amount of resources for an + * entry: + * - If we are forcing and this entry has more than the + * minimum amount of space in use. + * - If the read pressure in this entry is below the + * threshold, other entries need more cache, the entry has + * more than the minimum space and there is no available + * space in the pool. 
+ */ + } else if ((force && entry->cache_size > reserved) || + (read_pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && + highest > 1 && entry->cache_size > reserved && + cp->currently_used >= cp->size)) { + grew = 0; + /* + * Shrink by a chunk size if that doesn't drop us + * below the reserved size. + */ + if (entry->cache_size > cp->chunk + reserved) + adjusted = cp->chunk; + else + adjusted = entry->cache_size - reserved; + /* + * Conditions for increasing the amount of resources for an + * entry: + * - There was some activity across the pool + * - This entry is using less than the entire cache pool + * - The connection is using enough cache to require eviction + * - There is space available in the pool + * - Additional cache would benefit the connection + */ + } else if (highest > 1 && + entry->cache_size < cp->size && + cache->bytes_inmem >= + (entry->cache_size * cache->eviction_target) / 100 && + cp->currently_used < cp->size && + read_pressure > bump_threshold) { + grew = 1; + adjusted = WT_MIN(cp->chunk, + cp->size - cp->currently_used); + } + if (adjusted > 0) { + *adjustedp = 1; + if (grew > 0) { + cache->cp_skip_count = WT_CACHE_POOL_BUMP_SKIPS; + entry->cache_size += adjusted; + cp->currently_used += adjusted; + } else { + cache->cp_skip_count = + WT_CACHE_POOL_REDUCE_SKIPS; + WT_ASSERT(session, + entry->cache_size >= adjusted && + cp->currently_used >= adjusted); + entry->cache_size -= adjusted; + cp->currently_used -= adjusted; + } + WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, + "Allocated %s%" PRId64 " to %s", + grew ? "" : "-", adjusted, entry->home)); + /* + * TODO: Add a loop waiting for connection to give up + * cache. + */ + } + } + return (0); +} + +/* + * __wt_cache_pool_server -- + * Thread to manage cache pool among connections. 
 */
void *
__wt_cache_pool_server(void *arg)
{
	WT_CACHE *cache;
	WT_CACHE_POOL *cp;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)arg;

	cp = __wt_process.cache_pool;
	cache = S2C(session)->cache;

	while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
	    F_ISSET(cache, WT_CACHE_POOL_RUN)) {
		/* Only sleep if the pool isn't over budget (1 second). */
		if (cp->currently_used <= cp->size)
			WT_ERR(__wt_cond_wait(session,
			    cp->cache_pool_cond, 1000000));

		/*
		 * Re-check pool run flag - since we want to avoid getting the
		 * lock on shutdown.
		 */
		if (!F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
		    F_ISSET(cache, WT_CACHE_POOL_RUN))
			break;

		/*
		 * Try to become the managing thread.
		 * NOTE(review): assumes F_CAS_ATOMIC sets ret to 0 only when
		 * this thread atomically set the MANAGED flag -- confirm the
		 * macro's contract.
		 */
		F_CAS_ATOMIC(cp, WT_CACHE_POOL_MANAGED, ret);
		if (ret == 0) {
			F_SET(cache, WT_CACHE_POOL_MANAGER);
			WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE,
			    "Cache pool switched manager thread"));
		}

		/*
		 * Continue even if there was an error. Details of errors are
		 * reported in the balance function.
		 */
		if (F_ISSET(cache, WT_CACHE_POOL_MANAGER))
			(void)__cache_pool_balance(session);
	}

	if (0) {
err:		__wt_err(session, ret, "cache pool manager server error");
	}
	return (NULL);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
new file mode 100644
index 00000000000..ab97d4ead46
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
@@ -0,0 +1,228 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int __ckpt_server_start(WT_CONNECTION_IMPL *);

/*
 * __ckpt_server_config --
 *	Parse and setup the checkpoint server options.
 */
static int
__ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, int *startp)
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	char *p;

	conn = S2C(session);

	/*
	 * The checkpoint configuration requires a wait time and/or a log
	 * size -- if one is not set, we're not running at all.
	 * Checkpoints based on log size also require logging be enabled.
	 */
	WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval));
	/*
	 * NOTE(review): (long)cval.val * 1000000 can overflow a 32-bit long
	 * for waits over ~35 minutes -- confirm the width of ckpt_usecs.
	 */
	conn->ckpt_usecs = (long)cval.val * 1000000;
	WT_RET(__wt_config_gets(session, cfg, "checkpoint.log_size", &cval));
	conn->ckpt_logsize = (wt_off_t)cval.val;
	__wt_log_written_reset(session);
	if ((conn->ckpt_usecs == 0 && conn->ckpt_logsize == 0) ||
	    (conn->ckpt_logsize && !conn->logging && conn->ckpt_usecs == 0)) {
		*startp = 0;
		return (0);
	}
	*startp = 1;

	/*
	 * The application can specify a checkpoint name, which we ignore if
	 * it's our default.
	 */
	WT_RET(__wt_config_gets(session, cfg, "checkpoint.name", &cval));
	if (cval.len != 0 &&
	    !WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) {
		WT_RET(__wt_checkpoint_name_ok(session, cval.str, cval.len));

		WT_RET(__wt_scr_alloc(session, cval.len + 20, &tmp));
		WT_ERR(__wt_buf_fmt(
		    session, tmp, "name=%.*s", (int)cval.len, cval.str));
		WT_ERR(__wt_strdup(session, tmp->data, &p));

		/* Replace any previously configured checkpoint name. */
		__wt_free(session, conn->ckpt_config);
		conn->ckpt_config = p;
	}

err:	__wt_scr_free(&tmp);
	return (ret);
}

/*
 * __ckpt_server --
 *	The checkpoint server thread.
 */
static void *
__ckpt_server(void *arg)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION *wt_session;
	WT_SESSION_IMPL *session;

	session = arg;
	conn = S2C(session);
	wt_session = (WT_SESSION *)session;

	while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
	    F_ISSET(conn, WT_CONN_SERVER_CHECKPOINT)) {
		/* Checkpoint the database. */
		WT_ERR(wt_session->checkpoint(wt_session, conn->ckpt_config));

		/* Reset. */
		if (conn->ckpt_logsize) {
			__wt_log_written_reset(session);
			conn->ckpt_signalled = 0;
		}
		/*
		 * Wait...
		 * NOTE: If the user only configured logsize, then usecs
		 * will be 0 and this wait won't return until signalled.
		 */
		WT_ERR(
		    __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs));
	}

	if (0) {
err:		__wt_err(session, ret, "checkpoint server error");
	}
	return (NULL);
}

/*
 * __ckpt_server_start --
 *	Start the checkpoint server thread.
 */
static int
__ckpt_server_start(WT_CONNECTION_IMPL *conn)
{
	WT_SESSION_IMPL *session;

	/* Nothing to do if the server is already running. */
	if (conn->ckpt_session != NULL)
		return (0);

	F_SET(conn, WT_CONN_SERVER_CHECKPOINT);
	/* The checkpoint server gets its own session. */
	WT_RET(__wt_open_internal_session(
	    conn, "checkpoint-server", 1, 1, &conn->ckpt_session));
	session = conn->ckpt_session;

	/*
	 * Checkpoint does enough I/O it may be called upon to perform slow
	 * operations for the block manager.
	 */
	F_SET(session, WT_SESSION_CAN_WAIT);

	WT_RET(
	    __wt_cond_alloc(session, "checkpoint server", 0, &conn->ckpt_cond));

	/*
	 * Start the thread.
	 */
	WT_RET(__wt_thread_create(
	    session, &conn->ckpt_tid, __ckpt_server, session));
	conn->ckpt_tid_set = 1;

	return (0);
}

/*
 * __wt_checkpoint_server_create --
 *	Configure and start the checkpoint server.
 */
int
__wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONNECTION_IMPL *conn;
	int start;

	conn = S2C(session);
	start = 0;

	/* If there is already a server running, shut it down. */
	if (conn->ckpt_session != NULL)
		WT_RET(__wt_checkpoint_server_destroy(session));

	WT_RET(__ckpt_server_config(session, cfg, &start));
	if (start)
		WT_RET(__ckpt_server_start(conn));

	return (0);
}

/*
 * __wt_checkpoint_server_destroy --
 *	Destroy the checkpoint server thread.
 */
int
__wt_checkpoint_server_destroy(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION *wt_session;

	conn = S2C(session);

	F_CLR(conn, WT_CONN_SERVER_CHECKPOINT);
	/* Wake the server so it notices the cleared flag, then join it. */
	if (conn->ckpt_tid_set) {
		WT_TRET(__wt_cond_signal(session, conn->ckpt_cond));
		WT_TRET(__wt_thread_join(session, conn->ckpt_tid));
		conn->ckpt_tid_set = 0;
	}
	WT_TRET(__wt_cond_destroy(session, &conn->ckpt_cond));

	__wt_free(session, conn->ckpt_config);

	/* Close the server thread's session. */
	if (conn->ckpt_session != NULL) {
		wt_session = &conn->ckpt_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));
	}

	/*
	 * Ensure checkpoint settings are cleared - so that reconfigure doesn't
	 * get confused.
	 */
	conn->ckpt_session = NULL;
	conn->ckpt_tid_set = 0;
	conn->ckpt_cond = NULL;
	conn->ckpt_config = NULL;
	conn->ckpt_usecs = 0;

	return (ret);
}

/*
 * __wt_checkpoint_signal --
 *	Signal the checkpoint thread if sufficient log has been written.
 *	Returns 0 on success (or an error from the signal); it does not
 *	report whether the thread was actually signalled -- the historical
 *	"return 1" description did not match the code.
 */
int
__wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);
	WT_ASSERT(session, WT_CKPT_LOGSIZE(conn));
	/* Signal at most once until the server resets ckpt_signalled. */
	if (logsize >= conn->ckpt_logsize && !conn->ckpt_signalled) {
		WT_RET(__wt_cond_signal(session, conn->ckpt_cond));
		conn->ckpt_signalled = 1;
	}
	return (0);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
new file mode 100644
index 00000000000..f4f540e33c7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -0,0 +1,694 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __conn_dhandle_open_lock --
 *	Spin on the current data handle until either (a) it is open, read
 *	locked; or (b) it is closed, write locked.  If exclusive access is
 *	requested and cannot be granted immediately, fail with EBUSY.
 */
static int
__conn_dhandle_open_lock(
    WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;

	btree = dhandle->handle;

	/*
	 * Check that the handle is open.  We've already incremented
	 * the reference count, so once the handle is open it won't be
	 * closed by another thread.
	 *
	 * If we can see the WT_DHANDLE_OPEN flag set while holding a
	 * lock on the handle, then it's really open and we can start
	 * using it.  Alternatively, if we can get an exclusive lock
	 * and WT_DHANDLE_OPEN is still not set, we need to do the open.
	 */
	for (;;) {
		/* A handle in special (salvage/verify/...) use is off limits
		 * to non-exclusive callers. */
		if (!LF_ISSET(WT_DHANDLE_EXCLUSIVE) &&
		    F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
			return (EBUSY);

		if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
		    !LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
			WT_RET(__wt_readlock(session, dhandle->rwlock));
			/* Re-check under the lock: it may have closed. */
			if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
				return (0);
			WT_RET(__wt_readunlock(session, dhandle->rwlock));
		}

		/*
		 * It isn't open or we want it exclusive: try to get an
		 * exclusive lock.  There is some subtlety here: if we race
		 * with another thread that successfully opens the file, we
		 * don't want to block waiting to get exclusive access.
		 */
		if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) {
			/*
			 * If it was opened while we waited, drop the write
			 * lock and get a read lock instead.
			 */
			if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
			    !LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
				WT_RET(
				    __wt_writeunlock(session, dhandle->rwlock));
				continue;
			}

			/* We have an exclusive lock, we're done. */
			F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
			return (0);
		} else if (ret != EBUSY || LF_ISSET(WT_DHANDLE_EXCLUSIVE))
			return (EBUSY);

		/* Give other threads a chance to make progress. */
		__wt_yield();
	}
}

/*
 * __conn_dhandle_get --
 *	Find an open btree file handle, otherwise create a new one, lock it
 *	exclusively, and return it linked into the connection's list.
 */
static int
__conn_dhandle_get(WT_SESSION_IMPL *session,
    const char *name, const char *ckpt, uint32_t flags)
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	uint64_t hash;

	conn = S2C(session);

	/* We must be holding the schema lock at a higher level. */
	WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
	    !LF_ISSET(WT_DHANDLE_HAVE_REF));

	/* Increment the reference count if we already have the btree open. */
	hash = __wt_hash_city64(name, strlen(name));
	SLIST_FOREACH(dhandle, &conn->dhlh, l)
		/* Compare hashes first, names only on a hash match. */
		if ((hash == dhandle->name_hash &&
		     strcmp(name, dhandle->name) == 0) &&
		    ((ckpt == NULL && dhandle->checkpoint == NULL) ||
		    (ckpt != NULL && dhandle->checkpoint != NULL &&
		    strcmp(ckpt, dhandle->checkpoint) == 0))) {
			WT_RET(__conn_dhandle_open_lock(
			    session, dhandle, flags));
			(void)WT_ATOMIC_ADD4(dhandle->session_ref, 1);
			session->dhandle = dhandle;
			return (0);
		}

	/*
	 * Allocate the data source handle and underlying btree handle, then
	 * initialize the data source handle.  Exclusively lock the data
	 * source handle before inserting it in the list.
	 */
	WT_RET(__wt_calloc_def(session, 1, &dhandle));

	WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle"));
	dhandle->session_ref = 1;

	dhandle->name_hash = hash;
	WT_ERR(__wt_strdup(session, name, &dhandle->name));
	if (ckpt != NULL)
		WT_ERR(__wt_strdup(session, ckpt, &dhandle->checkpoint));

	WT_ERR(__wt_calloc_def(session, 1, &btree));
	dhandle->handle = btree;
	btree->dhandle = dhandle;

	WT_ERR(__wt_spin_init(
	    session, &dhandle->close_lock, "data handle close"));

	F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
	WT_ERR(__wt_writelock(session, dhandle->rwlock));

	/*
	 * Prepend the handle to the connection list, assuming we're likely to
	 * need new files again soon, until they are cached by all sessions.
	 *
	 * !!!
	 * We hold only the schema lock here, not the dhandle lock.  Eviction
	 * walks this list only holding the dhandle lock.  This works because
	 * we're inserting at the beginning of the list, and we're only
	 * publishing this one entry per lock acquisition.  Eviction either
	 * sees our newly added entry or the former head of the list, and it
	 * doesn't matter which (if eviction only sees a single element in the
	 * list because the insert races, it will return without finding enough
	 * candidates for eviction, and will then retry).
	 */
	SLIST_INSERT_HEAD(&conn->dhlh, dhandle, l);

	session->dhandle = dhandle;
	return (0);

err:	WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
	__wt_free(session, dhandle->name);
	__wt_free(session, dhandle->checkpoint);
	__wt_free(session, dhandle->handle);		/* btree free */
	__wt_spin_destroy(session, &dhandle->close_lock);
	__wt_overwrite_and_free(session, dhandle);

	return (ret);
}

/*
 * __wt_conn_btree_sync_and_close --
 *	Sync and close the underlying btree handle.
 */
int
__wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int force)
{
	WT_BTREE *btree;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	int no_schema_lock;

	dhandle = session->dhandle;
	btree = S2BT(session);

	if (!F_ISSET(dhandle, WT_DHANDLE_OPEN))
		return (0);

	/*
	 * If we don't already have the schema lock, make it an error to try
	 * to acquire it.  The problem is that we are holding an exclusive
	 * lock on the handle, and if we attempt to acquire the schema lock
	 * we might deadlock with a thread that has the schema lock and wants
	 * a handle lock (specifically, checkpoint).
	 */
	no_schema_lock = 0;
	if (!F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) {
		no_schema_lock = 1;
		F_SET(session, WT_SESSION_NO_SCHEMA_LOCK);
	}

	/*
	 * We may not be holding the schema lock, and threads may be walking
	 * the list of open handles (for example, checkpoint).  Acquire the
	 * handle's close lock.
	 */
	__wt_spin_lock(session, &dhandle->close_lock);

	/*
	 * The close can fail if an update cannot be written, return the EBUSY
	 * error to our caller for eventual retry.
	 */
	if (!F_ISSET(btree,
	    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
		WT_ERR(__wt_checkpoint_close(session, force));

	/* Checkpoint handles aren't counted in the open btree total. */
	if (dhandle->checkpoint == NULL)
		--S2C(session)->open_btree_count;

	WT_TRET(__wt_btree_close(session));
	F_CLR(dhandle, WT_DHANDLE_OPEN);
	F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);

err:	__wt_spin_unlock(session, &dhandle->close_lock);

	/* Restore the session's original schema-lock state. */
	if (no_schema_lock)
		F_CLR(session, WT_SESSION_NO_SCHEMA_LOCK);

	return (ret);
}

/*
 * __conn_btree_config_clear --
 *	Clear the underlying object's configuration information.
 */
static void
__conn_btree_config_clear(WT_SESSION_IMPL *session)
{
	WT_DATA_HANDLE *dhandle;
	const char **a;

	dhandle = session->dhandle;

	if (dhandle->cfg == NULL)
		return;
	/* Free each NULL-terminated entry, then the array itself. */
	for (a = dhandle->cfg; *a != NULL; ++a)
		__wt_free(session, *a);
	__wt_free(session, dhandle->cfg);
}

/*
 * __conn_btree_config_set --
 *	Set up a btree handle's configuration information.
 */
static int
__conn_btree_config_set(WT_SESSION_IMPL *session)
{
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	const char *metaconf;

	dhandle = session->dhandle;

	/*
	 * Read the object's entry from the metadata file, we're done if we
	 * don't find one.
	 */
	if ((ret =
	    __wt_metadata_search(session, dhandle->name, &metaconf)) != 0) {
		/* Map "not found" to the POSIX error callers expect. */
		if (ret == WT_NOTFOUND)
			ret = ENOENT;
		WT_RET(ret);
	}

	/*
	 * The defaults are included because underlying objects have persistent
	 * configuration information stored in the metadata file.  If defaults
	 * are included in the configuration, we can add new configuration
	 * strings without upgrading the metadata file or writing special code
	 * in case a configuration string isn't initialized, as long as the new
	 * configuration string has an appropriate default value.
	 *
	 * The error handling is a little odd, but be careful: we're holding a
	 * chunk of allocated memory in metaconf.  If we fail before we copy a
	 * reference to it into the object's configuration array, we must free
	 * it, after the copy, we don't want to free it.
	 */
	WT_ERR(__wt_calloc_def(session, 3, &dhandle->cfg));
	WT_ERR(__wt_strdup(
	    session, WT_CONFIG_BASE(session, file_meta), &dhandle->cfg[0]));
	dhandle->cfg[1] = metaconf;	/* cfg[] now owns metaconf */
	return (0);

err:	__wt_free(session, metaconf);
	return (ret);
}

/*
 * __conn_btree_open --
 *	Open the current btree handle.
 */
static int
__conn_btree_open(
    WT_SESSION_IMPL *session, const char *op_cfg[], uint32_t flags)
{
	WT_BTREE *btree;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;

	dhandle = session->dhandle;
	btree = S2BT(session);

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
	    F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) &&
	    !LF_ISSET(WT_DHANDLE_LOCK_ONLY));

	/*
	 * If the handle is already open, it has to be closed so it can be
	 * reopened with a new configuration.  We don't need to check again:
	 * this function isn't called if the handle is already open in the
	 * required mode.
	 *
	 * This call can return EBUSY if there's an update in the object that's
	 * not yet globally visible.  That's not a problem because it can only
	 * happen when we're switching from a normal handle to a "special" one,
	 * so we're returning EBUSY to an attempt to verify or do other special
	 * operations.  The reverse won't happen because when the handle from a
	 * verify or other special operation is closed, there won't be updates
	 * in the tree that can block the close.
	 */
	if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
		WT_RET(__wt_conn_btree_sync_and_close(session, 0));

	/* Discard any previous configuration, set up the new configuration. */
	__conn_btree_config_clear(session);
	WT_RET(__conn_btree_config_set(session));

	/* Set any special flags on the handle. */
	F_SET(btree, LF_ISSET(WT_BTREE_SPECIAL_FLAGS));

	/* Loop: dropping to a read lock below may race with a close. */
	do {
		WT_ERR(__wt_btree_open(session, op_cfg));
		F_SET(dhandle, WT_DHANDLE_OPEN);
		/*
		 * Checkpoint handles are read only, so eviction calculations
		 * based on the number of btrees are better to ignore them.
		 */
		if (dhandle->checkpoint == NULL)
			++S2C(session)->open_btree_count;

		/* Drop back to a readlock if that is all that was needed. */
		if (!LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
			F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
			WT_ERR(__wt_writeunlock(session, dhandle->rwlock));
			WT_ERR(
			    __conn_dhandle_open_lock(session, dhandle, flags));
		}
	} while (!F_ISSET(dhandle, WT_DHANDLE_OPEN));

	if (0) {
err:		F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
		/*
		 * If the open failed, close the handle.  If there was no
		 * reference to the handle in this session, we incremented the
		 * session reference count, so decrement it here.  Otherwise,
		 * just close the handle without decrementing.
		 */
		if (!LF_ISSET(WT_DHANDLE_HAVE_REF))
			__wt_conn_btree_close(session);
		else if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
			WT_TRET(__wt_conn_btree_sync_and_close(session, 0));
	}

	return (ret);
}

/*
 * __wt_conn_btree_get --
 *	Get an open btree file handle, otherwise open a new one.
 */
int
__wt_conn_btree_get(WT_SESSION_IMPL *session,
    const char *name, const char *ckpt, const char *op_cfg[], uint32_t flags)
{
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;

	/* Re-lock an already-referenced handle, or find/create one. */
	if (LF_ISSET(WT_DHANDLE_HAVE_REF))
		WT_RET(
		    __conn_dhandle_open_lock(session, session->dhandle, flags));
	else
		WT_RET(__conn_dhandle_get(session, name, ckpt, flags));
	dhandle = session->dhandle;

	if (!LF_ISSET(WT_DHANDLE_LOCK_ONLY) &&
	    (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
	    LF_ISSET(WT_BTREE_SPECIAL_FLAGS)))
		if ((ret = __conn_btree_open(session, op_cfg, flags)) != 0) {
			F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
			WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
		}

	WT_ASSERT(session, ret != 0 ||
	    LF_ISSET(WT_DHANDLE_EXCLUSIVE) ==
	    F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));

	return (ret);
}

/*
 * __wt_conn_btree_apply --
 *	Apply a function to all open btree handles apart from the metadata
 *	file.
 */
int
__wt_conn_btree_apply(WT_SESSION_IMPL *session,
    int apply_checkpoints,
    int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
{
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;

	conn = S2C(session);

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));

	SLIST_FOREACH(dhandle, &conn->dhlh, l)
		if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
		    WT_PREFIX_MATCH(dhandle->name, "file:") &&
		    (apply_checkpoints || dhandle->checkpoint == NULL) &&
		    !WT_IS_METADATA(dhandle)) {
			/*
			 * We need to pull the handle into the session handle
			 * cache and make sure it's referenced to stop other
			 * internal code dropping the handle (e.g in LSM when
			 * cleaning up obsolete chunks).  Holding the metadata
			 * lock isn't enough.
			 */
			ret = __wt_session_get_btree(session,
			    dhandle->name, dhandle->checkpoint, NULL, 0);
			if (ret == 0) {
				ret = func(session, cfg);
				if (WT_META_TRACKING(session))
					WT_TRET(__wt_meta_track_handle_lock(
					    session, 0));
				else
					WT_TRET(__wt_session_release_btree(
					    session));
			} else if (ret == EBUSY)
				/* Busy handle: apply under the close lock. */
				ret = __wt_conn_btree_apply_single(
				    session, dhandle->name,
				    dhandle->checkpoint, func, cfg);
			WT_RET(ret);
		}

	return (0);
}

/*
 * __wt_conn_btree_apply_single --
 *	Apply a function to a single btree handle that couldn't be locked
 *	(attempting to get the handle returned EBUSY).
 */
int
__wt_conn_btree_apply_single(WT_SESSION_IMPL *session,
    const char *uri, const char *checkpoint,
    int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
{
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle, *saved_dhandle;
	WT_DECL_RET;

	conn = S2C(session);
	saved_dhandle = session->dhandle;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));

	SLIST_FOREACH(dhandle, &conn->dhlh, l)
		/* Match both the URI and the checkpoint (NULL == live tree). */
		if (strcmp(dhandle->name, uri) == 0 &&
		    ((dhandle->checkpoint == NULL && checkpoint == NULL) ||
		    (dhandle->checkpoint != NULL && checkpoint != NULL &&
		    strcmp(dhandle->checkpoint, checkpoint) == 0))) {
			/*
			 * We're holding the schema lock which locks out handle
			 * open (which might change the state of the underlying
			 * object).  However, closing a handle doesn't require
			 * the schema lock, lock out closing the handle and then
			 * confirm the handle is still open.
			 */
			__wt_spin_lock(session, &dhandle->close_lock);
			if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
				session->dhandle = dhandle;
				ret = func(session, cfg);
			}
			__wt_spin_unlock(session, &dhandle->close_lock);
			WT_ERR(ret);
		}

err:	session->dhandle = saved_dhandle;
	return (ret);
}

/*
 * __wt_conn_btree_close --
 *	Discard a reference to an open btree file handle.
 */
void
__wt_conn_btree_close(WT_SESSION_IMPL *session)
{
	/* Drop this session's reference; the handle itself stays cached. */
	(void)WT_ATOMIC_SUB4(session->dhandle->session_ref, 1);
}

/*
 * __wt_conn_dhandle_close_all --
 *	Close all data handles handles with matching name (including all
 *	checkpoint handles).
+ */ +int +__wt_conn_dhandle_close_all( + WT_SESSION_IMPL *session, const char *name, int force) +{ + WT_CONNECTION_IMPL *conn; + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; + + conn = S2C(session); + + WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)); + WT_ASSERT(session, session->dhandle == NULL); + + SLIST_FOREACH(dhandle, &conn->dhlh, l) { + if (strcmp(dhandle->name, name) != 0) + continue; + + session->dhandle = dhandle; + + /* Lock the handle exclusively. */ + WT_ERR(__wt_session_get_btree(session, + dhandle->name, dhandle->checkpoint, + NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY)); + if (WT_META_TRACKING(session)) + WT_ERR(__wt_meta_track_handle_lock(session, 0)); + + /* + * We have an exclusive lock, which means there are no cursors + * open at this point. Close the handle, if necessary. + */ + if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) { + if ((ret = __wt_meta_track_sub_on(session)) == 0) + ret = __wt_conn_btree_sync_and_close( + session, force); + + /* + * If the close succeeded, drop any locks it acquired. + * If there was a failure, this function will fail and + * the whole transaction will be rolled back. + */ + if (ret == 0) + ret = __wt_meta_track_sub_off(session); + } + + if (!WT_META_TRACKING(session)) + WT_TRET(__wt_session_release_btree(session)); + + WT_ERR(ret); + } + +err: session->dhandle = NULL; + return (ret); +} + +/* + * __wt_conn_dhandle_discard_single -- + * Close/discard a single data handle. + */ +int +__wt_conn_dhandle_discard_single( + WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, int final) +{ + WT_CONNECTION_IMPL *conn; + WT_DATA_HANDLE *save_dhandle; + WT_DECL_RET; + WT_DECL_SPINLOCK_ID(id); /* Must appear last */ + + conn = S2C(session); + + save_dhandle = session->dhandle; + session->dhandle = dhandle; + + /* + * We're called from the periodic sweep function and the final close; + * the former wants to continue if the handle is suddenly found to be + * busy, the latter wants to shut things down. 
 */
	if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
		/* Sweep must not force-close a handle that is in use. */
		if (!final)
			WT_ERR(EBUSY);
		WT_ERR(__wt_conn_btree_sync_and_close(session, 0));
	}

	/*
	 * Get the schema lock (required to remove entries from the data handle
	 * list), get the dhandle lock to block the eviction server from
	 * walking the list.
	 */
	F_SET(session, WT_SESSION_SCHEMA_LOCKED);
	__wt_spin_lock(session, &conn->schema_lock);

	/*
	 * If the eviction server is running, don't block waiting for it while
	 * holding the schema lock.  The sweep server will try again.
	 */
	if (final)
		__wt_spin_lock(session, &conn->dhandle_lock);
	else if ((ret =
	    __wt_spin_trylock(session, &conn->dhandle_lock, &id)) != 0)
		goto unlock;

	/*
	 * Check if the handle was reacquired by a session while we waited;
	 * this should only happen when called from the periodic sweep code, of
	 * course.
	 */
	if (!final && dhandle->session_ref != 0)
		ret = EBUSY;
	else
		SLIST_REMOVE(&conn->dhlh, dhandle, __wt_data_handle, l);

	__wt_spin_unlock(session, &conn->dhandle_lock);

unlock:	__wt_spin_unlock(session, &conn->schema_lock);
	F_CLR(session, WT_SESSION_SCHEMA_LOCKED);

	/*
	 * After successfully removing the handle from the list, clean it up:
	 * free its resources and overwrite the memory before releasing it.
	 */
	if (ret == 0) {
		WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
		__wt_free(session, dhandle->name);
		__wt_free(session, dhandle->checkpoint);
		__conn_btree_config_clear(session);
		__wt_free(session, dhandle->handle);
		__wt_spin_destroy(session, &dhandle->close_lock);
		__wt_overwrite_and_free(session, dhandle);

		WT_CLEAR_BTREE_IN_SESSION(session);
	}

	/* The final close must succeed; sweep may see EBUSY and retry. */
err:	session->dhandle = save_dhandle;
	WT_ASSERT(session, !final || ret == 0);
	return (ret);
}

/*
 * __wt_conn_dhandle_discard --
 *	Close/discard all data handles.
 */
int
__wt_conn_dhandle_discard(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;

	conn = S2C(session);

	/*
	 * Close open data handles: first, everything but the metadata file
	 * (as closing a normal file may open and write the metadata file),
	 * then the metadata file.  This function isn't called often, and I
	 * don't want to "know" anything about the metadata file's position on
	 * the list, so we do it the hard way.
	 */
restart:
	SLIST_FOREACH(dhandle, &conn->dhlh, l) {
		if (WT_IS_METADATA(dhandle))
			continue;

		/*
		 * Discarding a handle removes it from the list, which
		 * invalidates the walk in progress: restart the walk after
		 * each removal.  Errors are accumulated, not fatal here.
		 */
		WT_TRET(__wt_conn_dhandle_discard_single(session, dhandle, 1));
		goto restart;
	}

	/*
	 * Closing the files may have resulted in entries on our default
	 * session's list of open data handles, specifically, we added the
	 * metadata file if any of the files were dirty.  Clean up that list
	 * before we shut down the metadata entry, for good.
	 */
	__wt_session_close_cache(session);
	F_SET(session, WT_SESSION_NO_DATA_HANDLES);

	/* Close the metadata file handle (the only entries left). */
	while ((dhandle = SLIST_FIRST(&conn->dhlh)) != NULL)
		WT_TRET(__wt_conn_dhandle_discard_single(session, dhandle, 1));

	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c
new file mode 100644
index 00000000000..e4f0a6ddd73
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_handle.c
@@ -0,0 +1,142 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_connection_init --
 *	Structure initialization for a just-created WT_CONNECTION_IMPL handle.
+ */ +int +__wt_connection_init(WT_CONNECTION_IMPL *conn) +{ + WT_SESSION_IMPL *session; + u_int i; + + session = conn->default_session; + + SLIST_INIT(&conn->dhlh); /* Data handle list */ + TAILQ_INIT(&conn->dlhqh); /* Library list */ + TAILQ_INIT(&conn->dsrcqh); /* Data source list */ + TAILQ_INIT(&conn->fhqh); /* File list */ + TAILQ_INIT(&conn->collqh); /* Collator list */ + TAILQ_INIT(&conn->compqh); /* Compressor list */ + + TAILQ_INIT(&conn->lsmqh); /* WT_LSM_TREE list */ + + /* Setup the LSM work queues. */ + TAILQ_INIT(&conn->lsm_manager.switchqh); + TAILQ_INIT(&conn->lsm_manager.appqh); + TAILQ_INIT(&conn->lsm_manager.managerqh); + + /* Configuration. */ + WT_RET(__wt_conn_config_init(session)); + + /* Statistics. */ + __wt_stat_init_connection_stats(&conn->stats); + + /* Locks. */ + WT_RET(__wt_spin_init(session, &conn->api_lock, "api")); + WT_RET(__wt_spin_init(session, &conn->checkpoint_lock, "checkpoint")); + WT_RET(__wt_spin_init(session, &conn->dhandle_lock, "data handle")); + WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); + WT_RET(__wt_spin_init(session, &conn->hot_backup_lock, "hot backup")); + WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); + WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema")); + WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS(conn), &conn->page_lock)); + for (i = 0; i < WT_PAGE_LOCKS(conn); ++i) + WT_RET( + __wt_spin_init(session, &conn->page_lock[i], "btree page")); + + /* Setup the spin locks for the LSM manager queues. */ + WT_RET(__wt_spin_init(session, + &conn->lsm_manager.app_lock, "LSM application queue lock")); + WT_RET(__wt_spin_init(session, + &conn->lsm_manager.manager_lock, "LSM manager queue lock")); + WT_RET(__wt_spin_init( + session, &conn->lsm_manager.switch_lock, "LSM switch queue lock")); + WT_RET(__wt_cond_alloc( + session, "LSM worker cond", 0, &conn->lsm_manager.work_cond)); + + /* + * Generation numbers. + * + * Start split generations at one. 
Threads publish this generation + * number before examining tree structures, and zero when they leave. + * We need to distinguish between threads that are in a tree before the + * first split has happened, and threads that are not in a tree. + */ + conn->split_gen = 1; + + /* + * Block manager. + * XXX + * If there's ever a second block manager, we'll want to make this + * more opaque, but for now this is simpler. + */ + WT_RET(__wt_spin_init(session, &conn->block_lock, "block manager")); + TAILQ_INIT(&conn->blockqh); /* Block manager list */ + + return (0); +} + +/* + * __wt_connection_destroy -- + * Destroy the connection's underlying WT_CONNECTION_IMPL structure. + */ +int +__wt_connection_destroy(WT_CONNECTION_IMPL *conn) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + u_int i; + + /* Check there's something to destroy. */ + if (conn == NULL) + return (0); + + session = conn->default_session; + + /* + * Close remaining open files (before discarding the mutex, the + * underlying file-close code uses the mutex to guard lists of + * open files. + */ + if (conn->lock_fh != NULL) + WT_TRET(__wt_close(session, conn->lock_fh)); + + /* Remove from the list of connections. 
*/ + __wt_spin_lock(session, &__wt_process.spinlock); + TAILQ_REMOVE(&__wt_process.connqh, conn, q); + __wt_spin_unlock(session, &__wt_process.spinlock); + + /* Configuration */ + __wt_conn_config_discard(session); /* configuration */ + + __wt_conn_foc_discard(session); /* free-on-close */ + + __wt_spin_destroy(session, &conn->api_lock); + __wt_spin_destroy(session, &conn->block_lock); + __wt_spin_destroy(session, &conn->checkpoint_lock); + __wt_spin_destroy(session, &conn->dhandle_lock); + __wt_spin_destroy(session, &conn->fh_lock); + __wt_spin_destroy(session, &conn->hot_backup_lock); + __wt_spin_destroy(session, &conn->reconfig_lock); + __wt_spin_destroy(session, &conn->schema_lock); + for (i = 0; i < WT_PAGE_LOCKS(conn); ++i) + __wt_spin_destroy(session, &conn->page_lock[i]); + __wt_free(session, conn->page_lock); + + /* Free allocated memory. */ + __wt_free(session, conn->cfg); + __wt_free(session, conn->home); + __wt_free(session, conn->error_prefix); + __wt_free(session, conn->sessions); + + __wt_free(NULL, conn); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c new file mode 100644 index 00000000000..e516fdc68d2 --- /dev/null +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -0,0 +1,284 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __logmgr_sync_cfg -- + * Interpret the transaction_sync config. 
+ */ +static int +__logmgr_sync_cfg(WT_SESSION_IMPL *session, const char **cfg) +{ + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + WT_RET( + __wt_config_gets(session, cfg, "transaction_sync.enabled", &cval)); + if (cval.val) + FLD_SET(conn->txn_logsync, WT_LOG_FLUSH); + else + FLD_CLR(conn->txn_logsync, WT_LOG_FLUSH); + + WT_RET( + __wt_config_gets(session, cfg, "transaction_sync.method", &cval)); + FLD_CLR(conn->txn_logsync, WT_LOG_DSYNC | WT_LOG_FSYNC); + if (WT_STRING_MATCH("dsync", cval.str, cval.len)) + FLD_SET(conn->txn_logsync, WT_LOG_DSYNC); + else if (WT_STRING_MATCH("fsync", cval.str, cval.len)) + FLD_SET(conn->txn_logsync, WT_LOG_FSYNC); + return (0); +} + +/* + * __logmgr_config -- + * Parse and setup the logging server options. + */ +static int +__logmgr_config(WT_SESSION_IMPL *session, const char **cfg, int *runp) +{ + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + /* + * The logging configuration is off by default. + */ + WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); + *runp = cval.val != 0; + if (*runp == 0) + return (0); + + WT_RET(__wt_config_gets(session, cfg, "log.archive", &cval)); + conn->archive = cval.val != 0; + + WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval)); + conn->log_file_max = (wt_off_t)cval.val; + WT_STAT_FAST_CONN_SET(session, log_max_filesize, conn->log_file_max); + + WT_RET(__wt_config_gets(session, cfg, "log.path", &cval)); + WT_RET(__wt_strndup(session, cval.str, cval.len, &conn->log_path)); + + WT_RET(__logmgr_sync_cfg(session, cfg)); + return (0); +} + +/* + * __log_archive_server -- + * The log archiving server thread. 
+ */ +static void * +__log_archive_server(void *arg) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_LSN lsn; + WT_SESSION_IMPL *session; + uint32_t lognum; + u_int i, logcount; + char **logfiles; + + session = arg; + conn = S2C(session); + log = conn->log; + logcount = 0; + logfiles = NULL; + + while (F_ISSET(conn, WT_CONN_SERVER_RUN)) { + /* + * If archiving is reconfigured and turned off, wait until it + * gets turned back on and check again. Don't wait forever: if + * a notification gets lost during close, we want to find out + * eventually. + */ + if (conn->archive == 0 || + __wt_try_writelock(session, log->log_archive_lock) != 0) { + if (conn->archive != 0) { + WT_ERR(__wt_verbose(session, WT_VERB_LOG, + "log_archive: Blocked due to open log " + "cursor holding archive lock")); + } + WT_ERR( + __wt_cond_wait(session, conn->arch_cond, 1000000)); + continue; + } + + lsn = log->ckpt_lsn; + lsn.offset = 0; + WT_ERR(__wt_verbose(session, WT_VERB_LOG, + "log_archive: ckpt LSN %" PRIu32 ",%" PRIu64, + lsn.file, lsn.offset)); + /* + * Main archive code. Get the list of all log files and + * remove any earlier than the checkpoint LSN. + */ + WT_ERR(__wt_dirlist(session, conn->log_path, + WT_LOG_FILENAME, WT_DIRLIST_INCLUDE, &logfiles, &logcount)); + + /* + * We can only archive files if a hot backup is not in progress. + */ + __wt_spin_lock(session, &conn->hot_backup_lock); + for (i = 0; i < logcount; i++) { + if (conn->hot_backup == 0) { + WT_ERR(__wt_log_extract_lognum( + session, logfiles[i], &lognum)); + if (lognum < lsn.file) + WT_ERR( + __wt_log_remove(session, lognum)); + } + } + __wt_spin_unlock(session, &conn->hot_backup_lock); + __wt_log_files_free(session, logfiles, logcount); + logfiles = NULL; + logcount = 0; + + /* + * Indicate what is our new earliest LSN. It is the start + * of the log file containing the last checkpoint. 
+ */ + log->first_lsn = lsn; + log->first_lsn.offset = 0; + WT_ERR(__wt_writeunlock(session, log->log_archive_lock)); + + /* Wait until the next event. */ + WT_ERR(__wt_cond_wait(session, conn->arch_cond, 1000000)); + } + + if (0) { +err: __wt_err(session, ret, "log archive server error"); + } + if (logfiles != NULL) + __wt_log_files_free(session, logfiles, logcount); + return (NULL); +} + +/* + * __wt_logmgr_create -- + * Start the log subsystem and archive server thread. + */ +int +__wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + int run; + + conn = S2C(session); + + /* Handle configuration. */ + WT_RET(__logmgr_config(session, cfg, &run)); + + /* If logging is not configured, we're done. */ + if (!run) + return (0); + + conn->logging = 1; + /* + * Logging is on, allocate the WT_LOG structure and open the log file. + */ + WT_RET(__wt_calloc(session, 1, sizeof(WT_LOG), &conn->log)); + log = conn->log; + WT_RET(__wt_spin_init(session, &log->log_lock, "log")); + WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot")); + WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync")); + WT_RET(__wt_rwlock_alloc(session, + &log->log_archive_lock, "log archive lock")); + if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG)) + log->allocsize = + WT_MAX((uint32_t)conn->buffer_alignment, LOG_ALIGN); + else + log->allocsize = LOG_ALIGN; + INIT_LSN(&log->alloc_lsn); + INIT_LSN(&log->ckpt_lsn); + INIT_LSN(&log->first_lsn); + INIT_LSN(&log->sync_lsn); + INIT_LSN(&log->trunc_lsn); + INIT_LSN(&log->write_lsn); + log->fileid = 0; + WT_RET(__wt_cond_alloc(session, "log sync", 0, &log->log_sync_cond)); + WT_RET(__wt_log_open(session)); + WT_RET(__wt_log_slot_init(session)); + + /* If archiving is not configured, we're done. */ + if (!conn->archive) + return (0); + + /* + * If an archive thread exists, the user may have reconfigured the + * archive thread. Signal the thread. 
Otherwise the user wants + * archiving and we need to start up the thread. + */ + if (conn->arch_session != NULL) { + WT_ASSERT(session, conn->arch_cond != NULL); + WT_ASSERT(session, conn->arch_tid_set != 0); + WT_RET(__wt_cond_signal(session, conn->arch_cond)); + } else { + /* The log archive server gets its own session. */ + WT_RET(__wt_open_internal_session( + conn, "archive-server", 0, 0, &conn->arch_session)); + WT_RET(__wt_cond_alloc(conn->arch_session, + "log archiving server", 0, &conn->arch_cond)); + + /* + * Start the thread. + */ + WT_RET(__wt_thread_create(conn->arch_session, + &conn->arch_tid, __log_archive_server, conn->arch_session)); + conn->arch_tid_set = 1; + } + + return (0); +} + +/* + * __wt_logmgr_destroy -- + * Destroy the log archiving server thread and logging subsystem. + */ +int +__wt_logmgr_destroy(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION *wt_session; + + conn = S2C(session); + + if (!conn->logging) + return (0); + if (conn->arch_tid_set) { + WT_TRET(__wt_cond_signal(session, conn->arch_cond)); + WT_TRET(__wt_thread_join(session, conn->arch_tid)); + conn->arch_tid_set = 0; + } + WT_TRET(__wt_cond_destroy(session, &conn->arch_cond)); + + WT_TRET(__wt_log_close(session)); + + __wt_free(session, conn->log_path); + + /* Close the server thread's session. 
*/ + if (conn->arch_session != NULL) { + wt_session = &conn->arch_session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + conn->arch_session = NULL; + } + + WT_TRET(__wt_log_slot_destroy(session)); + WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond)); + WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock)); + __wt_spin_destroy(session, &conn->log->log_lock); + __wt_spin_destroy(session, &conn->log->log_slot_lock); + __wt_spin_destroy(session, &conn->log->log_sync_lock); + __wt_free(session, conn->log); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c new file mode 100644 index 00000000000..41fc9809521 --- /dev/null +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -0,0 +1,244 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_connection_open -- + * Open a connection. + */ +int +__wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) +{ + WT_SESSION_IMPL *session; + + /* Default session. */ + session = conn->default_session; + WT_ASSERT(session, session->iface.connection == &conn->iface); + + /* + * Tell internal server threads to run: this must be set before opening + * any sessions. + */ + F_SET(conn, WT_CONN_SERVER_RUN); + + /* WT_SESSION_IMPL array. */ + WT_RET(__wt_calloc(session, + conn->session_size, sizeof(WT_SESSION_IMPL), &conn->sessions)); + + /* + * Open the default session. We open this before starting service + * threads because those may allocate and use session resources that + * need to get cleaned up on close. + */ + WT_RET(__wt_open_internal_session(conn, "connection", 1, 0, &session)); + + /* + * The connection's default session is originally a static structure, + * swap that out for a more fully-functional session. 
It's necessary + * to have this step: the session allocation code uses the connection's + * session, and if we pass a reference to the default session as the + * place to store the allocated session, things get confused and error + * handling can be corrupted. So, we allocate into a stack variable + * and then assign it on success. + */ + conn->default_session = session; + + /* + * Publish: there must be a barrier to ensure the connection structure + * fields are set before other threads read from the pointer. + */ + WT_WRITE_BARRIER(); + + /* Connect to a cache pool. */ + WT_RET(__wt_cache_pool_config(session, cfg)); + + /* Create the cache. */ + WT_RET(__wt_cache_create(session, cfg)); + + /* Initialize transaction support. */ + WT_RET(__wt_txn_global_init(session, cfg)); + + return (0); +} + +/* + * __wt_connection_close -- + * Close a connection handle. + */ +int +__wt_connection_close(WT_CONNECTION_IMPL *conn) +{ + WT_CONNECTION *wt_conn; + WT_DECL_RET; + WT_DLH *dlh; + WT_FH *fh; + WT_SESSION_IMPL *s, *session; + WT_TXN_GLOBAL *txn_global; + u_int i; + + wt_conn = &conn->iface; + txn_global = &conn->txn_global; + session = conn->default_session; + + /* + * We're shutting down. Make sure everything gets freed. + * + * It's possible that the eviction server is in the middle of a long + * operation, with a transaction ID pinned. In that case, we will loop + * here until the transaction ID is released, when the oldest + * transaction ID will catch up with the current ID. + */ + for (;;) { + __wt_txn_update_oldest(session); + if (txn_global->oldest_id == txn_global->current) + break; + __wt_yield(); + } + + /* Clear any pending async ops. */ + WT_TRET(__wt_async_flush(session)); + + /* + * Shut down server threads other than the eviction server, which is + * needed later to close btree handles. Some of these threads access + * btree handles, so take care in ordering shutdown to make sure they + * exit before files are closed. 
+ */ + F_CLR(conn, WT_CONN_SERVER_RUN); + WT_TRET(__wt_async_destroy(session)); + WT_TRET(__wt_lsm_manager_destroy(session)); + WT_TRET(__wt_checkpoint_server_destroy(session)); + WT_TRET(__wt_statlog_destroy(session, 1)); + WT_TRET(__wt_sweep_destroy(session)); + + /* Close open data handles. */ + WT_TRET(__wt_conn_dhandle_discard(session)); + + /* + * Now that all data handles are closed, tell logging that a checkpoint + * has completed then shut down the log manager (only after closing + * data handles). + */ + if (conn->logging) { + WT_TRET(__wt_txn_checkpoint_log( + session, 1, WT_TXN_LOG_CKPT_STOP, NULL)); + WT_TRET(__wt_logmgr_destroy(session)); + } + + /* Free memory for collators, compressors, data sources. */ + WT_TRET(__wt_conn_remove_collator(session)); + WT_TRET(__wt_conn_remove_compressor(session)); + WT_TRET(__wt_conn_remove_data_source(session)); + + /* + * Complain if files weren't closed, ignoring the lock file, we'll + * close it in a minute. + */ + TAILQ_FOREACH(fh, &conn->fhqh, q) { + if (fh == conn->lock_fh) + continue; + + __wt_errx(session, + "Connection has open file handles: %s", fh->name); + WT_TRET(__wt_close(session, fh)); + fh = TAILQ_FIRST(&conn->fhqh); + } + + /* Shut down the eviction server thread. */ + WT_TRET(__wt_evict_destroy(session)); + + /* Disconnect from shared cache - must be before cache destroy. */ + WT_TRET(__wt_conn_cache_pool_destroy(session)); + + /* Discard the cache. */ + WT_TRET(__wt_cache_destroy(session)); + + /* Discard transaction state. */ + __wt_txn_global_destroy(session); + + /* Close extensions, first calling any unload entry point. 
*/ + while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) { + TAILQ_REMOVE(&conn->dlhqh, dlh, q); + + if (dlh->terminate != NULL) + WT_TRET(dlh->terminate(wt_conn)); + WT_TRET(__wt_dlclose(session, dlh)); + } + + /* + * Close the internal (default) session, and switch back to the dummy + * session in case of any error messages from the remaining operations + * while destroying the connection handle. + */ + if (session != &conn->dummy_session) { + WT_TRET(session->iface.close(&session->iface, NULL)); + session = conn->default_session = &conn->dummy_session; + } + + /* + * The session's split stash isn't discarded during normal session close + * because it may persist past the life of the session. Discard it now. + */ + if ((s = conn->sessions) != NULL) + for (i = 0; i < conn->session_size; ++s, ++i) + __wt_split_stash_discard_all(session, s); + + /* + * The session's hazard pointer memory isn't discarded during normal + * session close because access to it isn't serialized. Discard it + * now. + */ + if ((s = conn->sessions) != NULL) + for (i = 0; i < conn->session_size; ++s, ++i) + if (s != session) + __wt_free(session, s->hazard); + + /* Destroy the handle. */ + WT_TRET(__wt_connection_destroy(conn)); + + return (ret); +} + +/* + * __wt_connection_workers -- + * Start the worker threads. + */ +int +__wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) +{ + /* + * Start the eviction thread. + */ + WT_RET(__wt_evict_create(session)); + + /* + * Start the handle sweep thread. + */ + WT_RET(__wt_sweep_create(session)); + + /* + * Start the optional statistics thread. Start statistics first so that + * other optional threads can know if statistics are enabled or not. + */ + WT_RET(__wt_statlog_create(session, cfg)); + + /* Start the optional async threads. */ + WT_RET(__wt_async_create(session, cfg)); + + /* + * Start the optional logging/archive thread. 
+ * NOTE: The log manager must be started before checkpoints so that the + * checkpoint server knows if logging is enabled. + */ + WT_RET(__wt_logmgr_create(session, cfg)); + + /* Start the optional checkpoint thread. */ + WT_RET(__wt_checkpoint_server_create(session, cfg)); + + return (0); +} diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c new file mode 100644 index 00000000000..f7229504898 --- /dev/null +++ b/src/third_party/wiredtiger/src/conn/conn_stat.c @@ -0,0 +1,540 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +#ifdef __GNUC__ +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1) +/* + * !!! + * GCC with -Wformat-nonliteral complains about calls to strftime in this file. + * There's nothing wrong, this makes the warning go away. + */ +#pragma GCC diagnostic ignored "-Wformat-nonliteral" +#endif +#endif + +/* + * __stat_sources_free -- + * Free the array of statistics sources. + */ +static void +__stat_sources_free(WT_SESSION_IMPL *session, char ***sources) +{ + char **p; + + if ((p = (*sources)) != NULL) { + for (; *p != NULL; ++p) + __wt_free(session, *p); + __wt_free(session, *sources); + } +} + +/* + * __wt_conn_stat_init -- + * Initialize the per-connection statistics. + */ +void +__wt_conn_stat_init(WT_SESSION_IMPL *session) +{ + __wt_async_stats_update(session); + __wt_cache_stats_update(session); + __wt_txn_stats_update(session); +} + +/* + * __statlog_config -- + * Parse and setup the statistics server options. 
+ */ +static int +__statlog_config(WT_SESSION_IMPL *session, const char **cfg, int *runp) +{ + WT_CONFIG objectconf; + WT_CONFIG_ITEM cval, k, v; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + int cnt; + char **sources; + + conn = S2C(session); + sources = NULL; + + WT_RET(__wt_config_gets(session, cfg, "statistics_log.wait", &cval)); + /* Only start the server if wait time is non-zero */ + *runp = (cval.val == 0) ? 0 : 1; + conn->stat_usecs = (long)cval.val * 1000000; + + WT_RET(__wt_config_gets( + session, cfg, "statistics_log.on_close", &cval)); + if (cval.val != 0) + FLD_SET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE); + + /* + * Statistics logging configuration requires either a wait time or an + * on-close setting. + */ + if (*runp == 0 && !FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE)) + return (0); + + WT_RET(__wt_config_gets(session, cfg, "statistics_log.sources", &cval)); + WT_RET(__wt_config_subinit(session, &objectconf, &cval)); + for (cnt = 0; (ret = __wt_config_next(&objectconf, &k, &v)) == 0; ++cnt) + ; + WT_RET_NOTFOUND_OK(ret); + if (cnt != 0) { + WT_RET(__wt_calloc_def(session, cnt + 1, &sources)); + WT_RET(__wt_config_subinit(session, &objectconf, &cval)); + for (cnt = 0; + (ret = __wt_config_next(&objectconf, &k, &v)) == 0; ++cnt) { + /* + * XXX + * Only allow "file:" and "lsm:" for now: "file:" works + * because it's been converted to data handles, "lsm:" + * works because we can easily walk the list of open LSM + * objects, even though it hasn't been converted. 
+ */ + if (!WT_PREFIX_MATCH(k.str, "file:") && + !WT_PREFIX_MATCH(k.str, "lsm:")) + WT_ERR_MSG(session, EINVAL, + "statistics_log sources configuration only " + "supports objects of type \"file\" or " + "\"lsm\""); + WT_ERR( + __wt_strndup(session, k.str, k.len, &sources[cnt])); + } + WT_ERR_NOTFOUND_OK(ret); + + conn->stat_sources = sources; + sources = NULL; + } + + WT_ERR(__wt_config_gets(session, cfg, "statistics_log.path", &cval)); + WT_ERR(__wt_nfilename(session, cval.str, cval.len, &conn->stat_path)); + + WT_ERR(__wt_config_gets( + session, cfg, "statistics_log.timestamp", &cval)); + WT_ERR(__wt_strndup(session, cval.str, cval.len, &conn->stat_format)); + +err: __stat_sources_free(session, &sources); + return (ret); +} + +/* + * __statlog_dump -- + * Dump out handle/connection statistics. + */ +static int +__statlog_dump(WT_SESSION_IMPL *session, const char *name, int conn_stats) +{ + WT_CONNECTION_IMPL *conn; + WT_CURSOR *cursor; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_STATS *stats; + u_int i; + uint64_t max; + const char *uri; + const char *cfg[] = { + WT_CONFIG_BASE(session, session_open_cursor), NULL }; + + conn = S2C(session); + + /* Build URI and configuration string. */ + if (conn_stats) + uri = "statistics:"; + else { + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__wt_buf_fmt(session, tmp, "statistics:%s", name)); + uri = tmp->data; + } + + /* + * Open the statistics cursor and dump the statistics. + * + * If we don't find an underlying object, silently ignore it, the object + * may exist only intermittently. + */ + switch (ret = __wt_curstat_open(session, uri, cfg, &cursor)) { + case 0: + max = conn_stats ? 
+ sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS) : + sizeof(WT_DSRC_STATS) / sizeof(WT_STATS); + for (i = 0, + stats = WT_CURSOR_STATS(cursor); i < max; ++i, ++stats) + WT_ERR_TEST((fprintf(conn->stat_fp, + "%s %" PRIu64 " %s %s\n", + conn->stat_stamp, + stats->v, name, stats->desc) < 0), __wt_errno()); + WT_ERR(cursor->close(cursor)); + break; + case EBUSY: + case ENOENT: + case WT_NOTFOUND: + ret = 0; + break; + default: + break; + } + +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * __statlog_apply -- + * Review a single open handle and dump statistics on demand. + */ +static int +__statlog_apply(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_DATA_HANDLE *dhandle; + char **p; + + WT_UNUSED(cfg); + + dhandle = session->dhandle; + + /* Check for a match on the set of sources. */ + for (p = S2C(session)->stat_sources; *p != NULL; ++p) + if (WT_PREFIX_MATCH(dhandle->name, *p)) + return (__statlog_dump(session, dhandle->name, 0)); + return (0); +} + +/* + * __statlog_lsm_apply -- + * Review the list open LSM trees, and dump statistics on demand. + * + * XXX + * This code should be removed when LSM objects are converted to data handles. + */ +static int +__statlog_lsm_apply(WT_SESSION_IMPL *session) +{ +#define WT_LSM_TREE_LIST_SLOTS 100 + WT_LSM_TREE *lsm_tree, *list[WT_LSM_TREE_LIST_SLOTS]; + WT_DECL_RET; + int cnt, locked; + char **p; + + cnt = locked = 0; + + /* + * Walk the list of LSM trees, checking for a match on the set of + * sources. + * + * XXX + * We can't hold the schema lock for the traversal because the LSM + * statistics code acquires the tree lock, and the LSM cursor code + * acquires the tree lock and then acquires the schema lock, it's a + * classic deadlock. This is temporary code so I'm not going to do + * anything fancy. + * It is OK to not keep holding the schema lock after populating + * the list of matching LSM trees, since the __wt_lsm_tree_get call + * will bump a reference count, so the tree won't go away. 
+ */ + __wt_spin_lock(session, &S2C(session)->schema_lock); + locked = 1; + TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) { + if (cnt == WT_LSM_TREE_LIST_SLOTS) + break; + for (p = S2C(session)->stat_sources; *p != NULL; ++p) + if (WT_PREFIX_MATCH(lsm_tree->name, *p)) { + WT_ERR(__wt_lsm_tree_get( + session, lsm_tree->name, 0, &list[cnt++])); + break; + } + } + __wt_spin_unlock(session, &S2C(session)->schema_lock); + locked = 0; + + while (cnt > 0) { + --cnt; + WT_TRET(__statlog_dump(session, list[cnt]->name, 0)); + __wt_lsm_tree_release(session, list[cnt]); + } + +err: if (locked) + __wt_spin_unlock(session, &S2C(session)->schema_lock); + /* Release any LSM trees on error. */ + while (cnt > 0) { + --cnt; + __wt_lsm_tree_release(session, list[cnt]); + } + return (ret); +} + +/* + * __statlog_log_one -- + * Output a set of statistics into the current log file. + */ +static int +__statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) +{ + FILE *log_file; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + struct timespec ts; + struct tm *tm, _tm; + + conn = S2C(session); + + /* Get the current local time of day. */ + WT_RET(__wt_epoch(session, &ts)); + tm = localtime_r(&ts.tv_sec, &_tm); + + /* Create the logging path name for this time of day. */ + if (strftime(tmp->mem, tmp->memsize, conn->stat_path, tm) == 0) + WT_RET_MSG(session, ENOMEM, "strftime path conversion"); + + /* If the path has changed, cycle the log file. */ + if ((log_file = conn->stat_fp) == NULL || + path == NULL || strcmp(tmp->mem, path->mem) != 0) { + conn->stat_fp = NULL; + if (log_file != NULL) + WT_RET(fclose(log_file) == 0 ? 0 : __wt_errno()); + + if (path != NULL) + (void)strcpy(path->mem, tmp->mem); + WT_RET_TEST((log_file = + fopen(tmp->mem, "a")) == NULL, __wt_errno()); + } + conn->stat_fp = log_file; + + /* Create the entry prefix for this time of day. 
*/ + if (strftime(tmp->mem, tmp->memsize, conn->stat_format, tm) == 0) + WT_RET_MSG(session, ENOMEM, "strftime timestamp conversion"); + conn->stat_stamp = tmp->mem; + + /* Dump the connection statistics. */ + WT_RET(__statlog_dump(session, conn->home, 1)); + +#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING + /* Dump the spinlock statistics. */ + WT_RET(__wt_statlog_dump_spinlock(conn, conn->home)); +#endif + + /* + * Lock the schema and walk the list of open handles, dumping + * any that match the list of object sources. + */ + if (conn->stat_sources != NULL) { + WT_WITH_SCHEMA_LOCK(session, ret = + __wt_conn_btree_apply(session, 0, __statlog_apply, NULL)); + WT_RET(ret); + } + + /* + * Walk the list of open LSM trees, dumping any that match the + * the list of object sources. + * + * XXX + * This code should be removed when LSM objects are converted to + * data handles. + */ + if (conn->stat_sources != NULL) + WT_RET(__statlog_lsm_apply(session)); + + /* Flush. */ + WT_RET(fflush(conn->stat_fp) == 0 ? 0 : __wt_errno()); + + return (0); +} + +/* + * __wt_statlog_log_one -- + * Log a set of statistics into the configured statistics log. Requires + * that the server is not currently running. + */ +int +__wt_statlog_log_one(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_DECL_ITEM(tmp); + + conn = S2C(session); + + if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE)) + return (0); + + if (F_ISSET(conn, WT_CONN_SERVER_RUN) && + F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) + WT_RET_MSG(session, EINVAL, + "Attempt to log statistics while a server is running"); + + WT_RET(__wt_scr_alloc(session, strlen(conn->stat_path) + 128, &tmp)); + WT_ERR(__statlog_log_one(session, NULL, tmp)); + +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * __statlog_server -- + * The statistics server thread. 
+ */ +static void * +__statlog_server(void *arg) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_ITEM path, tmp; + WT_SESSION_IMPL *session; + + session = arg; + conn = S2C(session); + + WT_CLEAR(path); + WT_CLEAR(tmp); + + /* + * We need a temporary place to build a path and an entry prefix. + * The length of the path plus 128 should be more than enough. + * + * We also need a place to store the current path, because that's + * how we know when to close/re-open the file. + */ + WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128)); + WT_ERR(__wt_buf_init(session, &tmp, strlen(conn->stat_path) + 128)); + + while (F_ISSET(conn, WT_CONN_SERVER_RUN) && + F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) { + if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_NONE)) + WT_ERR(__statlog_log_one(session, &path, &tmp)); + + /* Wait until the next event. */ + WT_ERR( + __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs)); + } + + if (0) { +err: __wt_err(session, ret, "statistics log server error"); + } + __wt_buf_free(session, &path); + __wt_buf_free(session, &tmp); + return (NULL); +} + +/* + * __statlog_start -- + * Start the statistics server thread. + */ +static int +__statlog_start(WT_CONNECTION_IMPL *conn) +{ + WT_SESSION_IMPL *session; + + /* Nothing to do if the server is already running. */ + if (conn->stat_session != NULL) + return (0); + + F_SET(conn, WT_CONN_SERVER_STATISTICS); + /* The statistics log server gets its own session. */ + WT_RET(__wt_open_internal_session( + conn, "statlog-server", 1, 1, &conn->stat_session)); + session = conn->stat_session; + + WT_RET(__wt_cond_alloc( + session, "statistics log server", 0, &conn->stat_cond)); + + /* + * Start the thread. + * + * Statistics logging creates a thread per database, rather than using + * a single thread to do logging for all of the databases. 
If we ever + * see lots of databases at a time, doing statistics logging, and we + * want to reduce the number of threads, there's no reason we have to + * have more than one thread, I just didn't feel like writing the code + * to figure out the scheduling. + */ + WT_RET(__wt_thread_create( + session, &conn->stat_tid, __statlog_server, session)); + conn->stat_tid_set = 1; + + return (0); +} + +/* + * __wt_statlog_create -- + * Start the statistics server thread. + */ +int +__wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONNECTION_IMPL *conn; + int start; + + conn = S2C(session); + start = 0; + + /* + * Stop any server that is already running. This means that each time + * reconfigure is called we'll bounce the server even if there are no + * configuration changes - but that makes our lives easier. + */ + if (conn->stat_session != NULL) + WT_RET(__wt_statlog_destroy(session, 0)); + + WT_RET(__statlog_config(session, cfg, &start)); + if (start) + WT_RET(__statlog_start(conn)); + + return (0); +} + +/* + * __wt_statlog_destroy -- + * Destroy the statistics server thread. + */ +int +__wt_statlog_destroy(WT_SESSION_IMPL *session, int is_close) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION *wt_session; + + conn = S2C(session); + + F_CLR(conn, WT_CONN_SERVER_STATISTICS); + if (conn->stat_tid_set) { + WT_TRET(__wt_cond_signal(session, conn->stat_cond)); + WT_TRET(__wt_thread_join(session, conn->stat_tid)); + conn->stat_tid_set = 0; + } + + /* Log a set of statistics on shutdown if configured. */ + if (is_close) + WT_TRET(__wt_statlog_log_one(session)); + + WT_TRET(__wt_cond_destroy(session, &conn->stat_cond)); + + __stat_sources_free(session, &conn->stat_sources); + __wt_free(session, conn->stat_path); + __wt_free(session, conn->stat_format); + + /* Close the server thread's session. 
 */
	if (conn->stat_session != NULL) {
		wt_session = &conn->stat_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));
	}

	/* Clear connection settings so reconfigure is reliable. */
	conn->stat_session = NULL;
	conn->stat_tid_set = 0;
	conn->stat_format = NULL;
	if (conn->stat_fp != NULL) {
		WT_TRET(fclose(conn->stat_fp) == 0 ? 0 : __wt_errno());
		conn->stat_fp = NULL;
	}
	conn->stat_path = NULL;
	conn->stat_sources = NULL;
	conn->stat_stamp = NULL;
	conn->stat_usecs = 0;

	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
new file mode 100644
index 00000000000..3bccc5814be
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -0,0 +1,187 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __sweep --
 *	Close unused dhandles on the connection dhandle list.
 */
static int
__sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle, *dhandle_next;
	WT_DECL_RET;
	time_t now;

	conn = S2C(session);

	/*
	 * Sessions cache handles unless the session itself is closed, at which
	 * time the handle reference counts are immediately decremented.  Don't
	 * discard handles that have been open recently.
	 */
	WT_RET(__wt_seconds(session, &now));

	dhandle = SLIST_FIRST(&conn->dhlh);
	for (; dhandle != NULL; dhandle = dhandle_next) {
		/* Save the next entry, this one may be removed below. */
		dhandle_next = SLIST_NEXT(dhandle, l);
		/* Skip handles in use or not dead for long enough. */
		if (dhandle->session_ref != 0 ||
		    now - dhandle->timeofdeath <= WT_DHANDLE_SWEEP_WAIT)
			continue;

		/*
		 * We have a candidate for closing; if it's open, flush dirty
		 * leaf pages, then acquire an exclusive lock on the handle
		 * and close it.  We might be blocking opens for a long time
		 * (over disk I/O), but the handle was quiescent for awhile.
		 *
		 * The close can fail if an update cannot be written (updates in
		 * a no-longer-referenced file might not yet be globally visible
		 * if sessions have disjoint sets of files open).  If the handle
		 * is busy, skip it, we'll retry the close the next time, after
		 * the transaction state has progressed.
		 */
		if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
			/* Flush dirty leaf pages before trying to close. */
			WT_WITH_DHANDLE(session, dhandle,
			    ret = __wt_cache_op(
			    session, NULL, WT_SYNC_WRITE_LEAVES));
			WT_RET(ret);

			/*
			 * We don't set WT_DHANDLE_EXCLUSIVE deliberately, we
			 * want opens to block on us rather than returning an
			 * EBUSY error to the application.
			 */
			ret = __wt_try_writelock(session, dhandle->rwlock);
			if (ret == EBUSY) {
				/* Handle is locked, retry on a later pass. */
				ret = 0;
				continue;
			}
			WT_RET(ret);

			WT_WITH_DHANDLE(session, dhandle,
			    ret = __wt_conn_btree_sync_and_close(session, 0));
			if (ret == EBUSY)
				ret = 0;

			WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
			WT_RET(ret);
		}

		/*
		 * Attempt to discard the handle (the called function checks
		 * the handle-open flag after acquiring appropriate locks,
		 * which is why we don't do any special handling of EBUSY
		 * returns above -- that path never cleared the handle-open
		 * flag).
		 */
		ret = __wt_conn_dhandle_discard_single(session, dhandle, 0);
		if (ret == EBUSY)
			ret = 0;
		WT_RET(ret);
	}
	return (0);
}

/*
 * __sweep_server --
 *	The handle sweep server thread.
 */
static void *
__sweep_server(void *arg)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = arg;
	conn = S2C(session);

	/*
	 * Sweep for dead handles.
	 */
	while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
	    F_ISSET(conn, WT_CONN_SERVER_SWEEP)) {

		/* Wait until the next event. */
		WT_ERR(
		    __wt_cond_wait(session, conn->sweep_cond, 30 * WT_MILLION));

		/* Sweep the handles.
 */
		WT_ERR(__sweep(session));
	}

	if (0) {
err:		__wt_err(session, ret, "handle sweep server error");
	}
	return (NULL);
}

/*
 * __wt_sweep_create --
 *	Start the handle sweep thread.
 */
int
__wt_sweep_create(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);

	/* Set first, the thread might run before we finish up. */
	F_SET(conn, WT_CONN_SERVER_SWEEP);

	/* The sweep server gets its own internal session. */
	WT_RET(__wt_open_internal_session(
	    conn, "sweep-server", 1, 1, &conn->sweep_session));
	session = conn->sweep_session;

	/*
	 * Handle sweep does enough I/O it may be called upon to perform slow
	 * operations for the block manager.
	 */
	F_SET(session, WT_SESSION_CAN_WAIT);

	WT_RET(__wt_cond_alloc(
	    session, "handle sweep server", 0, &conn->sweep_cond));

	WT_RET(__wt_thread_create(
	    session, &conn->sweep_tid, __sweep_server, session));
	conn->sweep_tid_set = 1;

	return (0);
}

/*
 * __wt_sweep_destroy --
 *	Destroy the handle-sweep thread.
 */
int
__wt_sweep_destroy(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION *wt_session;

	conn = S2C(session);

	/* Ask the server to quit, wake it, and wait for it to exit. */
	F_CLR(conn, WT_CONN_SERVER_SWEEP);
	if (conn->sweep_tid_set) {
		WT_TRET(__wt_cond_signal(session, conn->sweep_cond));
		WT_TRET(__wt_thread_join(session, conn->sweep_tid));
		conn->sweep_tid_set = 0;
	}
	WT_TRET(__wt_cond_destroy(session, &conn->sweep_cond));

	/* Close the server thread's session. */
	if (conn->sweep_session != NULL) {
		wt_session = &conn->sweep_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));

		conn->sweep_session = NULL;
	}
	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c
new file mode 100644
index 00000000000..85a85521213
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c
@@ -0,0 +1,540 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
+ */ + +#include "wt_internal.h" + +static int __backup_all(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *); +static int __backup_cleanup_handles(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *); +static int __backup_file_create(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *); +static int __backup_file_remove(WT_SESSION_IMPL *); +static int __backup_list_all_append(WT_SESSION_IMPL *, const char *[]); +static int __backup_list_append( + WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *); +static int __backup_start( + WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[]); +static int __backup_stop(WT_SESSION_IMPL *); +static int __backup_uri( + WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[], int *); + +/* + * __curbackup_next -- + * WT_CURSOR->next method for the backup cursor type. + */ +static int +__curbackup_next(WT_CURSOR *cursor) +{ + WT_CURSOR_BACKUP *cb; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cb = (WT_CURSOR_BACKUP *)cursor; + CURSOR_API_CALL(cursor, session, next, NULL); + + if (cb->list == NULL || cb->list[cb->next].name == NULL) { + F_CLR(cursor, WT_CURSTD_KEY_SET); + WT_ERR(WT_NOTFOUND); + } + + cb->iface.key.data = cb->list[cb->next].name; + cb->iface.key.size = strlen(cb->list[cb->next].name) + 1; + ++cb->next; + + F_SET(cursor, WT_CURSTD_KEY_INT); + +err: API_END_RET(session, ret); +} + +/* + * __curbackup_reset -- + * WT_CURSOR->reset method for the backup cursor type. + */ +static int +__curbackup_reset(WT_CURSOR *cursor) +{ + WT_CURSOR_BACKUP *cb; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cb = (WT_CURSOR_BACKUP *)cursor; + CURSOR_API_CALL(cursor, session, reset, NULL); + + cb->next = 0; + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + +err: API_END_RET(session, ret); +} + +/* + * __curbackup_close -- + * WT_CURSOR->close method for the backup cursor type. 
+ */ +static int +__curbackup_close(WT_CURSOR *cursor) +{ + WT_CURSOR_BACKUP *cb; + WT_DECL_RET; + WT_SESSION_IMPL *session; + int tret; + + cb = (WT_CURSOR_BACKUP *)cursor; + CURSOR_API_CALL(cursor, session, close, NULL); + + WT_TRET(__backup_cleanup_handles(session, cb)); + WT_TRET(__wt_cursor_close(cursor)); + session->bkp_cursor = NULL; + + WT_WITH_SCHEMA_LOCK(session, + tret = __backup_stop(session)); /* Stop the backup. */ + WT_TRET(tret); + +err: API_END_RET(session, ret); +} + +/* + * __wt_curbackup_open -- + * WT_SESSION->open_cursor method for the backup cursor type. + */ +int +__wt_curbackup_open(WT_SESSION_IMPL *session, + const char *uri, const char *cfg[], WT_CURSOR **cursorp) +{ + WT_CURSOR_STATIC_INIT(iface, + __wt_cursor_get_key, /* get-key */ + __wt_cursor_notsup, /* get-value */ + __wt_cursor_notsup, /* set-key */ + __wt_cursor_notsup, /* set-value */ + __wt_cursor_notsup, /* compare */ + __curbackup_next, /* next */ + __wt_cursor_notsup, /* prev */ + __curbackup_reset, /* reset */ + __wt_cursor_notsup, /* search */ + __wt_cursor_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __curbackup_close); /* close */ + WT_CURSOR *cursor; + WT_CURSOR_BACKUP *cb; + WT_DECL_RET; + + WT_STATIC_ASSERT(offsetof(WT_CURSOR_BACKUP, iface) == 0); + + cb = NULL; + + WT_RET(__wt_calloc_def(session, 1, &cb)); + cursor = &cb->iface; + *cursor = iface; + cursor->session = &session->iface; + session->bkp_cursor = cb; + + cursor->key_format = "S"; /* Return the file names as the key. */ + cursor->value_format = ""; /* No value. */ + + /* + * Start the backup and fill in the cursor's list. Acquire the schema + * lock, we need a consistent view when creating a copy. + */ + WT_WITH_SCHEMA_LOCK(session, ret = __backup_start(session, cb, cfg)); + WT_ERR(ret); + + /* __wt_cursor_init is last so we don't have to clean up on error. 
*/ + WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp)); + + if (0) { +err: __wt_free(session, cb); + } + + return (ret); +} + +/* + * __backup_start -- + * Start a backup. + */ +static int +__backup_start( + WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[]) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + u_int i, logcount; + int exist, target_list; + char **logfiles; + + conn = S2C(session); + + cb->next = 0; + cb->list = NULL; + logfiles = NULL; + logcount = 0; + + /* + * Single thread hot backups: we're holding the schema lock, so we + * know we'll serialize with other attempts to start a hot backup. + */ + if (conn->hot_backup) + WT_RET_MSG( + session, EINVAL, "there is already a backup cursor open"); + + /* + * The hot backup copy is done outside of WiredTiger, which means file + * blocks can't be freed and re-allocated until the backup completes. + * The checkpoint code checks the backup flag, and if a backup cursor + * is open checkpoints aren't discarded. We release the lock as soon + * as we've set the flag, we don't want to block checkpoints, we just + * want to make sure no checkpoints are deleted. The checkpoint code + * holds the lock until it's finished the checkpoint, otherwise we + * could start a hot backup that would race with an already-started + * checkpoint. + */ + __wt_spin_lock(session, &conn->hot_backup_lock); + conn->hot_backup = 1; + __wt_spin_unlock(session, &conn->hot_backup_lock); + + /* Create the hot backup file. */ + WT_ERR(__backup_file_create(session, cb)); + + /* Add log files if logging is enabled. */ + + /* + * If a list of targets was specified, work our way through them. + * Else, generate a list of all database objects. + * + * Include log files if doing a full backup, and copy them before + * copying data files to avoid rolling the metadata forward across + * a checkpoint that completes during the backup. 
+ */ + target_list = 0; + WT_ERR(__backup_uri(session, cb, cfg, &target_list)); + if (!target_list) { + if (conn->log) { + WT_ERR(__wt_log_get_active_files( + session, &logfiles, &logcount)); + for (i = 0; i < logcount; i++) + WT_ERR(__backup_list_append( + session, cb, logfiles[i])); + } + + WT_ERR(__backup_all(session, cb)); + } + + /* Add the hot backup and standard WiredTiger files to the list. */ + WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP)); + WT_ERR(__wt_exist(session, WT_BASECONFIG, &exist)); + if (exist) + WT_ERR(__backup_list_append(session, cb, WT_BASECONFIG)); + WT_ERR(__wt_exist(session, WT_USERCONFIG, &exist)); + if (exist) + WT_ERR(__backup_list_append(session, cb, WT_USERCONFIG)); + WT_ERR(__backup_list_append(session, cb, WT_WIREDTIGER)); + +err: /* Close the hot backup file. */ + if (cb->bfp != NULL) { + WT_TRET(fclose(cb->bfp) == 0 ? 0 : __wt_errno()); + cb->bfp = NULL; + } + if (logfiles != NULL) + __wt_log_files_free(session, logfiles, logcount); + + if (ret != 0) { + WT_TRET(__backup_cleanup_handles(session, cb)); + WT_TRET(__backup_stop(session)); + } + + return (ret); +} + +/* + * __backup_cleanup_handles -- + * Release and free all btree handles held by the backup. This is kept + * separate from __backup_stop because it can be called without the + * schema lock held. + */ +static int +__backup_cleanup_handles(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) +{ + WT_CURSOR_BACKUP_ENTRY *p; + WT_DECL_RET; + + if (cb->list == NULL) + return (0); + + /* Release the handles, free the file names, free the list itself. */ + for (p = cb->list; p->name != NULL; ++p) { + if (p->handle != NULL) + WT_WITH_DHANDLE(session, p->handle, + WT_TRET(__wt_session_release_btree(session))); + __wt_free(session, p->name); + } + + __wt_free(session, cb->list); + return (ret); +} + +/* + * __backup_stop -- + * Stop a backup. 
+ */ +static int +__backup_stop(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + conn = S2C(session); + + /* Remove any backup metadata file. */ + ret = __backup_file_remove(session); + + /* Checkpoint deletion can proceed, as can the next hot backup. */ + __wt_spin_lock(session, &conn->hot_backup_lock); + conn->hot_backup = 0; + __wt_spin_unlock(session, &conn->hot_backup_lock); + + return (ret); +} + +/* + * __backup_all -- + * Backup all objects in the database. + */ +static int +__backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) +{ + WT_CONFIG_ITEM cval; + WT_CURSOR *cursor; + WT_DECL_RET; + const char *key, *value; + + cursor = NULL; + + /* + * Open a cursor on the metadata file and copy all of the entries to + * the hot backup file. + */ + WT_ERR(__wt_metadata_cursor(session, NULL, &cursor)); + while ((ret = cursor->next(cursor)) == 0) { + WT_ERR(cursor->get_key(cursor, &key)); + WT_ERR(cursor->get_value(cursor, &value)); + WT_ERR_TEST((fprintf( + cb->bfp, "%s\n%s\n", key, value) < 0), __wt_errno()); + + /* + * While reading the metadata file, check there are no "sources" + * or "types" which can't support hot backup. This checks for + * a data source that's non-standard, which can't be backed up, + * but is also sanity checking: if there's an entry backed by + * anything other than a file or lsm entry, we're confused. 
+ */ + if ((ret = __wt_config_getones( + session, value, "type", &cval)) == 0 && + !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file") && + !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm")) + WT_ERR_MSG(session, ENOTSUP, + "hot backup is not supported for objects of " + "type %.*s", (int)cval.len, cval.str); + WT_ERR_NOTFOUND_OK(ret); + if ((ret =__wt_config_getones( + session, value, "source", &cval)) == 0 && + !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file:") && + !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm:")) + WT_ERR_MSG(session, ENOTSUP, + "hot backup is not supported for objects of " + "source %.*s", (int)cval.len, cval.str); + WT_ERR_NOTFOUND_OK(ret); + } + WT_ERR_NOTFOUND_OK(ret); + + /* Build a list of the file objects that need to be copied. */ + WT_ERR(__wt_meta_btree_apply(session, __backup_list_all_append, NULL)); + +err: if (cursor != NULL) + WT_TRET(cursor->close(cursor)); + return (ret); +} + +/* + * __backup_uri -- + * Backup a list of objects. + */ +static int +__backup_uri(WT_SESSION_IMPL *session, + WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp) +{ + WT_CONFIG targetconf; + WT_CONFIG_ITEM cval, k, v; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + int target_list; + const char *uri; + + *foundp = target_list = 0; + + /* + * If we find a non-empty target configuration string, we have a job, + * otherwise it's not our problem. 
+ */ + WT_RET(__wt_config_gets(session, cfg, "target", &cval)); + WT_RET(__wt_config_subinit(session, &targetconf, &cval)); + for (cb->list_next = 0; + (ret = __wt_config_next(&targetconf, &k, &v)) == 0;) { + if (!target_list) { + target_list = *foundp = 1; + + WT_ERR(__wt_scr_alloc(session, 512, &tmp)); + } + + WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str)); + uri = tmp->data; + if (v.len != 0) + WT_ERR_MSG(session, EINVAL, + "%s: invalid backup target: URIs may need quoting", + uri); + + WT_ERR(__wt_schema_worker( + session, uri, NULL, __wt_backup_list_uri_append, cfg, 0)); + } + WT_ERR_NOTFOUND_OK(ret); + +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * __backup_file_create -- + * Create the meta-data backup file. + */ +static int +__backup_file_create(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) +{ + WT_DECL_RET; + char *path; + + /* Open the hot backup file. */ + WT_RET(__wt_filename(session, WT_METADATA_BACKUP, &path)); + WT_ERR_TEST((cb->bfp = fopen(path, "w")) == NULL, __wt_errno()); + +err: __wt_free(session, path); + return (ret); +} + +/* + * __backup_file_remove -- + * Remove the meta-data backup file. + */ +static int +__backup_file_remove(WT_SESSION_IMPL *session) +{ + return (__wt_remove(session, WT_METADATA_BACKUP)); +} + +/* + * __wt_backup_list_uri_append -- + * Append a new file name to the list, allocate space as necessary. + * Called via the schema_worker function. + */ +int +__wt_backup_list_uri_append( + WT_SESSION_IMPL *session, const char *name, int *skip) +{ + WT_CURSOR_BACKUP *cb; + const char *value; + + cb = session->bkp_cursor; + WT_UNUSED(skip); + + /* Add the metadata entry to the backup file. */ + WT_RET(__wt_metadata_search(session, name, &value)); + WT_RET_TEST( + (fprintf(cb->bfp, "%s\n%s\n", name, value) < 0), __wt_errno()); + __wt_free(session, value); + + /* Add file type objects to the list of files to be copied. 
*/ + if (WT_PREFIX_MATCH(name, "file:")) + WT_RET(__backup_list_append(session, cb, name)); + + return (0); +} + +/* + * __backup_list_all_append -- + * Append a new file name to the list, allocate space as necessary. + * Called via the __wt_meta_btree_apply function. + */ +static int +__backup_list_all_append(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CURSOR_BACKUP *cb; + + WT_UNUSED(cfg); + + cb = session->bkp_cursor; + + /* Ignore files in the process of being bulk-loaded. */ + if (F_ISSET(S2BT(session), WT_BTREE_BULK)) + return (0); + + /* Add the file to the list of files to be copied. */ + return (__backup_list_append(session, cb, session->dhandle->name)); +} + +/* + * __backup_list_append -- + * Append a new file name to the list, allocate space as necessary. + */ +static int +__backup_list_append( + WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *uri) +{ + WT_CURSOR_BACKUP_ENTRY *p; + WT_DATA_HANDLE *old_dhandle; + WT_DECL_RET; + const char *name; + int need_handle; + + /* Leave a NULL at the end to mark the end of the list. */ + WT_RET(__wt_realloc_def(session, &cb->list_allocated, + cb->list_next + 2, &cb->list)); + p = &cb->list[cb->list_next]; + p[0].name = p[1].name = NULL; + p[0].handle = p[1].handle = NULL; + + need_handle = 0; + name = uri; + if (WT_PREFIX_MATCH(uri, "file:")) { + need_handle = 1; + name += strlen("file:"); + } + + /* + * !!! + * Assumes metadata file entries map one-to-one to physical files. + * To support a block manager where that's not the case, we'd need + * to call into the block manager and get a list of physical files + * that map to this logical "file". I'm not going to worry about + * that for now, that block manager might not even support physical + * copying of files by applications. 
+ */ + WT_RET(__wt_strdup(session, name, &p->name)); + + /* + * If it's a file in the database, get a handle for the underlying + * object (this handle blocks schema level operations, for example + * WT_SESSION.drop or an LSM file discard after level merging). + */ + if (need_handle) { + old_dhandle = session->dhandle; + if ((ret = + __wt_session_get_btree(session, uri, NULL, NULL, 0)) == 0) + p->handle = session->dhandle; + session->dhandle = old_dhandle; + WT_RET(ret); + } + + ++cb->list_next; + return (0); +} diff --git a/src/third_party/wiredtiger/src/cursor/cur_bulk.c b/src/third_party/wiredtiger/src/cursor/cur_bulk.c new file mode 100644 index 00000000000..96a45a7e629 --- /dev/null +++ b/src/third_party/wiredtiger/src/cursor/cur_bulk.c @@ -0,0 +1,287 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __curbulk_insert_fix -- + * Fixed-length column-store bulk cursor insert. + */ +static int +__curbulk_insert_fix(WT_CURSOR *cursor) +{ + WT_BTREE *btree; + WT_CURSOR_BULK *cbulk; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbulk = (WT_CURSOR_BULK *)cursor; + btree = cbulk->cbt.btree; + + /* + * Bulk cursor inserts are updates, but don't need auto-commit + * transactions because they are single-threaded and not visible + * until the bulk cursor is closed. + */ + CURSOR_API_CALL(cursor, session, insert, btree); + + WT_CURSOR_NEEDVALUE(cursor); + + WT_ERR(__wt_bulk_insert_fix(session, cbulk)); + + WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); + +err: API_END_RET(session, ret); +} + +/* + * __curbulk_insert_var -- + * Variable-length column-store bulk cursor insert. 
 */
static int
__curbulk_insert_var(WT_CURSOR *cursor)
{
	WT_BTREE *btree;
	WT_CURSOR_BULK *cbulk;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	int duplicate;		/* Value matched the previous insert. */

	cbulk = (WT_CURSOR_BULK *)cursor;
	btree = cbulk->cbt.btree;

	/*
	 * Bulk cursor inserts are updates, but don't need auto-commit
	 * transactions because they are single-threaded and not visible
	 * until the bulk cursor is closed.
	 */
	CURSOR_API_CALL(cursor, session, insert, btree);

	WT_CURSOR_NEEDVALUE(cursor);

	/*
	 * If this isn't the first value inserted, compare it against the last
	 * value and increment the RLE count.
	 *
	 * Instead of a "first time" variable, I'm using the RLE count, because
	 * it is only zero before the first row is inserted.
	 */
	duplicate = 0;
	if (cbulk->rle != 0) {
		if (cbulk->last.size == cursor->value.size &&
		    memcmp(cbulk->last.data, cursor->value.data,
		    cursor->value.size) == 0) {
			/* Same as the previous value: extend the run. */
			++cbulk->rle;
			duplicate = 1;
		} else
			/* Different value: flush the pending run. */
			WT_ERR(__wt_bulk_insert_var(session, cbulk));
	}

	/*
	 * Save a copy of the value for the next comparison and reset the RLE
	 * counter.
	 */
	if (!duplicate) {
		WT_ERR(__wt_buf_set(session,
		    &cbulk->last, cursor->value.data, cursor->value.size));
		cbulk->rle = 1;
	}

	WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);

err:	API_END_RET(session, ret);
}

/*
 * __bulk_row_keycmp_err --
 *	Error routine when keys inserted out-of-order.
+ */ +static int +__bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk) +{ + WT_CURSOR *cursor; + WT_DECL_ITEM(a); + WT_DECL_ITEM(b); + WT_DECL_RET; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session; + cursor = &cbulk->cbt.iface; + + WT_ERR(__wt_scr_alloc(session, 512, &a)); + WT_ERR(__wt_scr_alloc(session, 512, &b)); + + WT_ERR(__wt_buf_set_printable( + session, a, cursor->key.data, cursor->key.size)); + WT_ERR(__wt_buf_set_printable( + session, b, cbulk->last.data, cbulk->last.size)); + + WT_ERR_MSG(session, EINVAL, + "bulk-load presented with out-of-order keys: %.*s compares smaller " + "than previously inserted key %.*s", + (int)a->size, (const char *)a->data, + (int)b->size, (const char *)b->data); + +err: __wt_scr_free(&a); + __wt_scr_free(&b); + return (ret); +} + +/* + * __curbulk_insert_row -- + * Row-store bulk cursor insert, with key-sort checks. + */ +static int +__curbulk_insert_row(WT_CURSOR *cursor) +{ + WT_BTREE *btree; + WT_CURSOR_BULK *cbulk; + WT_DECL_RET; + WT_SESSION_IMPL *session; + int cmp; + + cbulk = (WT_CURSOR_BULK *)cursor; + btree = cbulk->cbt.btree; + + /* + * Bulk cursor inserts are updates, but don't need auto-commit + * transactions because they are single-threaded and not visible + * until the bulk cursor is closed. + */ + CURSOR_API_CALL(cursor, session, insert, btree); + + WT_CURSOR_CHECKKEY(cursor); + WT_CURSOR_CHECKVALUE(cursor); + + /* + * If this isn't the first key inserted, compare it against the last key + * to ensure the application doesn't accidentally corrupt the table. + * + * Instead of a "first time" variable, I'm using the RLE count, because + * it is only zero before the first row is inserted. + */ + if (cbulk->rle != 0) { + WT_ERR(__wt_compare(session, + btree->collator, &cursor->key, &cbulk->last, &cmp)); + if (cmp <= 0) + WT_ERR(__bulk_row_keycmp_err(cbulk)); + } + + /* + * Save a copy of the key for the next comparison and set the RLE + * counter. 
+ */ + WT_ERR(__wt_buf_set(session, + &cbulk->last, cursor->key.data, cursor->key.size)); + cbulk->rle = 1; + + WT_ERR(__wt_bulk_insert_row(session, cbulk)); + + WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); + +err: API_END_RET(session, ret); +} + +/* + * __curbulk_insert_row_skip_check -- + * Row-store bulk cursor insert, without key-sort checks. + */ +static int +__curbulk_insert_row_skip_check(WT_CURSOR *cursor) +{ + WT_BTREE *btree; + WT_CURSOR_BULK *cbulk; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbulk = (WT_CURSOR_BULK *)cursor; + btree = cbulk->cbt.btree; + + /* + * Bulk cursor inserts are updates, but don't need auto-commit + * transactions because they are single-threaded and not visible + * until the bulk cursor is closed. + */ + CURSOR_API_CALL(cursor, session, insert, btree); + + WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_NEEDVALUE(cursor); + + WT_ERR(__wt_bulk_insert_row(session, cbulk)); + + WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); + +err: API_END_RET(session, ret); +} + +/* + * __curbulk_close -- + * WT_CURSOR->close for the bulk cursor type. + */ +static int +__curbulk_close(WT_CURSOR *cursor) +{ + WT_BTREE *btree; + WT_CURSOR_BULK *cbulk; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbulk = (WT_CURSOR_BULK *)cursor; + btree = cbulk->cbt.btree; + + CURSOR_API_CALL(cursor, session, close, btree); + + WT_TRET(__wt_bulk_wrapup(session, cbulk)); + __wt_buf_free(session, &cbulk->last); + + WT_TRET(__wt_session_release_btree(session)); + + /* The URI is owned by the btree handle. */ + cursor->internal_uri = NULL; + + WT_TRET(__wt_cursor_close(cursor)); + +err: API_END_RET(session, ret); +} + +/* + * __wt_curbulk_init -- + * Initialize a bulk cursor. + */ +int +__wt_curbulk_init(WT_SESSION_IMPL *session, + WT_CURSOR_BULK *cbulk, int bitmap, int skip_sort_check) +{ + WT_CURSOR *c; + WT_CURSOR_BTREE *cbt; + + c = &cbulk->cbt.iface; + cbt = &cbulk->cbt; + + /* Bulk cursors only support insert and close (reset is a no-op). 
*/ + __wt_cursor_set_notsup(c); + switch (cbt->btree->type) { + case BTREE_COL_FIX: + c->insert = __curbulk_insert_fix; + break; + case BTREE_COL_VAR: + c->insert = __curbulk_insert_var; + break; + case BTREE_ROW: + c->insert = skip_sort_check ? + __curbulk_insert_row_skip_check : __curbulk_insert_row; + break; + WT_ILLEGAL_VALUE(session); + } + c->close = __curbulk_close; + + cbulk->bitmap = bitmap; + if (bitmap) + F_SET(c, WT_CURSTD_RAW); + + return (__wt_bulk_init(session, cbulk)); +} diff --git a/src/third_party/wiredtiger/src/cursor/cur_config.c b/src/third_party/wiredtiger/src/cursor/cur_config.c new file mode 100644 index 00000000000..868b144efc1 --- /dev/null +++ b/src/third_party/wiredtiger/src/cursor/cur_config.c @@ -0,0 +1,65 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __curconfig_close -- + * WT_CURSOR->close method for the config cursor type. + */ +static int +__curconfig_close(WT_CURSOR *cursor) +{ + return (__wt_cursor_close(cursor)); +} + +/* + * __wt_curconfig_open -- + * WT_SESSION->open_cursor method for config cursors. 
+ */ +int +__wt_curconfig_open(WT_SESSION_IMPL *session, + const char *uri, const char *cfg[], WT_CURSOR **cursorp) +{ + WT_CURSOR_STATIC_INIT(iface, + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __wt_cursor_notsup, /* compare */ + __wt_cursor_notsup, /* next */ + __wt_cursor_notsup, /* prev */ + __wt_cursor_noop, /* reset */ + __wt_cursor_notsup, /* search */ + __wt_cursor_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __curconfig_close); + WT_CURSOR_CONFIG *cconfig; + WT_CURSOR *cursor; + WT_DECL_RET; + + WT_STATIC_ASSERT(offsetof(WT_CURSOR_CONFIG, iface) == 0); + + WT_UNUSED(uri); + + WT_RET(__wt_calloc_def(session, 1, &cconfig)); + + cursor = &cconfig->iface; + *cursor = iface; + cursor->session = &session->iface; + cursor->key_format = cursor->value_format = "S"; + + /* __wt_cursor_init is last so we don't have to clean up on error. */ + WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp)); + + if (0) { +err: __wt_free(session, cconfig); + } + return (ret); +} diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c new file mode 100644 index 00000000000..33e89764617 --- /dev/null +++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c @@ -0,0 +1,524 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __curds_txn_enter -- + * Do transactional initialization when starting an operation. + */ +static int +__curds_txn_enter(WT_SESSION_IMPL *session) +{ + session->ncursors++; /* XXX */ + __wt_txn_cursor_op(session); + + return (0); +} + +/* + * __curds_txn_leave -- + * Do transactional cleanup when ending an operation. 
 */
static void
__curds_txn_leave(WT_SESSION_IMPL *session)
{
	/* Matches __curds_txn_enter; the last open cursor op ends the read. */
	if (--session->ncursors == 0)		/* XXX */
		__wt_txn_read_last(session);
}

/*
 * __curds_key_set --
 *	Set the key for the data-source.
 *
 *	Copies the application cursor's key (record number and raw item) by
 *	reference into the underlying data-source cursor.  Requires the key
 *	to have been set; WT_CURSOR_NEEDKEY jumps to err otherwise.
 */
static int
__curds_key_set(WT_CURSOR *cursor)
{
	WT_CURSOR *source;
	WT_DECL_RET;

	source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

	WT_CURSOR_NEEDKEY(cursor);

	/* Reference, not copy: the data-source must not retain the memory. */
	source->recno = cursor->recno;
	source->key.data = cursor->key.data;
	source->key.size = cursor->key.size;

err:	return (ret);
}

/*
 * __curds_value_set --
 *	Set the value for the data-source.
 *
 *	As __curds_key_set, but for the value item.
 */
static int
__curds_value_set(WT_CURSOR *cursor)
{
	WT_CURSOR *source;
	WT_DECL_RET;

	source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

	WT_CURSOR_NEEDVALUE(cursor);

	source->value.data = cursor->value.data;
	source->value.size = cursor->value.size;

err:	return (ret);
}

/*
 * __curds_cursor_resolve --
 *	Resolve cursor operation.
 */
static int
__curds_cursor_resolve(WT_CURSOR *cursor, int ret)
{
	WT_CURSOR *source;

	source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

	/*
	 * Update the cursor's key, value and flags.  (We use the _INT flags in
	 * the same way as file objects: there's some chance the underlying data
	 * source is passing us a reference to data only pinned per operation,
	 * might as well be safe.)
	 *
	 * There's also a requirement the underlying data-source never returns
	 * with the cursor/source key referencing application memory: it'd be
	 * great to do a copy as necessary here so the data-source doesn't have
	 * to worry about copying the key, but we don't have enough information
	 * to know if a cursor is pointing at application or data-source memory.
+ */ + if (ret == 0) { + cursor->key.data = source->key.data; + cursor->key.size = source->key.size; + cursor->value.data = source->value.data; + cursor->value.size = source->value.size; + cursor->recno = source->recno; + + F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + } else { + if (ret == WT_NOTFOUND) + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + else + F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + + /* + * Cursor operation failure implies a lost cursor position and + * a subsequent next/prev starting at the beginning/end of the + * table. We simplify underlying data source implementations + * by resetting the cursor explicitly here. + */ + WT_TRET(source->reset(source)); + } + + return (ret); +} + +/* + * __curds_compare -- + * WT_CURSOR.compare method for the data-source cursor type. + */ +static int +__curds_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) +{ + WT_COLLATOR *collator; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + CURSOR_API_CALL(a, session, compare, NULL); + + /* + * Confirm both cursors refer to the same source and have keys, then + * compare them. + */ + if (strcmp(a->internal_uri, b->internal_uri) != 0) + WT_ERR_MSG(session, EINVAL, + "Cursors must reference the same object"); + + WT_CURSOR_NEEDKEY(a); + WT_CURSOR_NEEDKEY(b); + + if (WT_CURSOR_RECNO(a)) { + if (a->recno < b->recno) + *cmpp = -1; + else if (a->recno == b->recno) + *cmpp = 0; + else + *cmpp = 1; + } else { + /* + * The assumption is data-sources don't provide WiredTiger with + * WT_CURSOR.compare methods, instead, we'll copy the key/value + * out of the underlying data-source cursor and any comparison + * to be done can be done at this level. 
+ */ + collator = ((WT_CURSOR_DATA_SOURCE *)a)->collator; + WT_ERR(__wt_compare( + session, collator, &a->key, &b->key, cmpp)); + } + +err: API_END_RET(session, ret); +} + +/* + * __curds_next -- + * WT_CURSOR.next method for the data-source cursor type. + */ +static int +__curds_next(WT_CURSOR *cursor) +{ + WT_CURSOR *source; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; + + CURSOR_API_CALL(cursor, session, next, NULL); + + WT_STAT_FAST_CONN_INCR(session, cursor_next); + WT_STAT_FAST_DATA_INCR(session, cursor_next); + + WT_ERR(__curds_txn_enter(session)); + + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + ret = __curds_cursor_resolve(cursor, source->next(source)); + +err: __curds_txn_leave(session); + + API_END_RET(session, ret); +} + +/* + * __curds_prev -- + * WT_CURSOR.prev method for the data-source cursor type. + */ +static int +__curds_prev(WT_CURSOR *cursor) +{ + WT_CURSOR *source; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; + + CURSOR_API_CALL(cursor, session, prev, NULL); + + WT_STAT_FAST_CONN_INCR(session, cursor_prev); + WT_STAT_FAST_DATA_INCR(session, cursor_prev); + + WT_ERR(__curds_txn_enter(session)); + + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + ret = __curds_cursor_resolve(cursor, source->prev(source)); + +err: __curds_txn_leave(session); + API_END_RET(session, ret); +} + +/* + * __curds_reset -- + * WT_CURSOR.reset method for the data-source cursor type. 
+ */ +static int +__curds_reset(WT_CURSOR *cursor) +{ + WT_CURSOR *source; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; + + CURSOR_API_CALL(cursor, session, reset, NULL); + + WT_STAT_FAST_CONN_INCR(session, cursor_reset); + WT_STAT_FAST_DATA_INCR(session, cursor_reset); + + WT_ERR(source->reset(source)); + + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + +err: API_END_RET(session, ret); +} + +/* + * __curds_search -- + * WT_CURSOR.search method for the data-source cursor type. + */ +static int +__curds_search(WT_CURSOR *cursor) +{ + WT_CURSOR *source; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; + + CURSOR_API_CALL(cursor, session, search, NULL); + + WT_STAT_FAST_CONN_INCR(session, cursor_search); + WT_STAT_FAST_DATA_INCR(session, cursor_search); + + WT_ERR(__curds_txn_enter(session)); + + WT_ERR(__curds_key_set(cursor)); + ret = __curds_cursor_resolve(cursor, source->search(source)); + +err: __curds_txn_leave(session); + + API_END_RET(session, ret); +} + +/* + * __curds_search_near -- + * WT_CURSOR.search_near method for the data-source cursor type. + */ +static int +__curds_search_near(WT_CURSOR *cursor, int *exact) +{ + WT_CURSOR *source; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; + + CURSOR_API_CALL(cursor, session, search_near, NULL); + + WT_STAT_FAST_CONN_INCR(session, cursor_search_near); + WT_STAT_FAST_DATA_INCR(session, cursor_search_near); + + WT_ERR(__curds_txn_enter(session)); + + WT_ERR(__curds_key_set(cursor)); + ret = + __curds_cursor_resolve(cursor, source->search_near(source, exact)); + +err: __curds_txn_leave(session); + + API_END_RET(session, ret); +} + +/* + * __curds_insert -- + * WT_CURSOR.insert method for the data-source cursor type. 
+ */ +static int +__curds_insert(WT_CURSOR *cursor) +{ + WT_CURSOR *source; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; + + CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL); + + WT_ERR(__curds_txn_enter(session)); + + WT_STAT_FAST_CONN_INCR(session, cursor_insert); + WT_STAT_FAST_DATA_INCR(session, cursor_insert); + WT_STAT_FAST_DATA_INCRV(session, + cursor_insert_bytes, cursor->key.size + cursor->value.size); + + if (!F_ISSET(cursor, WT_CURSTD_APPEND)) + WT_ERR(__curds_key_set(cursor)); + WT_ERR(__curds_value_set(cursor)); + ret = __curds_cursor_resolve(cursor, source->insert(source)); + +err: __curds_txn_leave(session); + + CURSOR_UPDATE_API_END(session, ret); + return (ret); +} + +/* + * __curds_update -- + * WT_CURSOR.update method for the data-source cursor type. + */ +static int +__curds_update(WT_CURSOR *cursor) +{ + WT_CURSOR *source; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; + + CURSOR_UPDATE_API_CALL(cursor, session, update, NULL); + + WT_STAT_FAST_CONN_INCR(session, cursor_update); + WT_STAT_FAST_DATA_INCR(session, cursor_update); + WT_STAT_FAST_DATA_INCRV( + session, cursor_update_bytes, cursor->value.size); + + WT_ERR(__curds_txn_enter(session)); + + WT_ERR(__curds_key_set(cursor)); + WT_ERR(__curds_value_set(cursor)); + ret = __curds_cursor_resolve(cursor, source->update(source)); + +err: __curds_txn_leave(session); + + CURSOR_UPDATE_API_END(session, ret); + return (ret); +} + +/* + * __curds_remove -- + * WT_CURSOR.remove method for the data-source cursor type. 
+ */ +static int +__curds_remove(WT_CURSOR *cursor) +{ + WT_CURSOR *source; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; + + CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL); + + WT_STAT_FAST_CONN_INCR(session, cursor_remove); + WT_STAT_FAST_DATA_INCR(session, cursor_remove); + WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size); + + WT_ERR(__curds_txn_enter(session)); + + WT_ERR(__curds_key_set(cursor)); + ret = __curds_cursor_resolve(cursor, source->remove(source)); + +err: __curds_txn_leave(session); + + CURSOR_UPDATE_API_END(session, ret); + return (ret); +} + +/* + * __curds_close -- + * WT_CURSOR.close method for the data-source cursor type. + */ +static int +__curds_close(WT_CURSOR *cursor) +{ + WT_CURSOR_DATA_SOURCE *cds; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cds = (WT_CURSOR_DATA_SOURCE *)cursor; + + CURSOR_API_CALL(cursor, session, close, NULL); + + if (cds->source != NULL) + ret = cds->source->close(cds->source); + + if (cds->collator_owned) { + if (cds->collator->terminate != NULL) + WT_TRET(cds->collator->terminate( + cds->collator, &session->iface)); + cds->collator_owned = 0; + } + cds->collator = NULL; + + /* + * The key/value formats are in allocated memory, which isn't standard + * behavior. + */ + __wt_free(session, cursor->key_format); + __wt_free(session, cursor->value_format); + + WT_TRET(__wt_cursor_close(cursor)); + +err: API_END_RET(session, ret); +} + +/* + * __wt_curds_open -- + * Initialize a data-source cursor. 
+ */ +int +__wt_curds_open( + WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, + const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp) +{ + WT_CURSOR_STATIC_INIT(iface, + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __curds_compare, /* compare */ + __curds_next, /* next */ + __curds_prev, /* prev */ + __curds_reset, /* reset */ + __curds_search, /* search */ + __curds_search_near, /* search-near */ + __curds_insert, /* insert */ + __curds_update, /* update */ + __curds_remove, /* remove */ + __curds_close); /* close */ + WT_CONFIG_ITEM cval; + WT_CURSOR *cursor, *source; + WT_CURSOR_DATA_SOURCE *data_source; + WT_DECL_RET; + const char *metaconf; + + WT_STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0); + + data_source = NULL; + metaconf = NULL; + + WT_RET(__wt_calloc_def(session, 1, &data_source)); + cursor = &data_source->iface; + *cursor = iface; + cursor->session = &session->iface; + F_SET(cursor, WT_CURSTD_DATA_SOURCE); + + /* + * XXX + * The underlying data-source may require the object's key and value + * formats. This isn't a particularly elegant way of getting that + * information to the data-source, this feels like a layering problem + * to me. + */ + WT_ERR(__wt_metadata_search(session, uri, &metaconf)); + WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval)); + WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format)); + WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval)); + WT_ERR( + __wt_strndup(session, cval.str, cval.len, &cursor->value_format)); + + WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp)); + + /* Data-source cursors have a collator reference. 
*/ + WT_ERR(__wt_collator_config(session, cfg, + &data_source->collator, &data_source->collator_owned)); + + WT_ERR(dsrc->open_cursor(dsrc, + &session->iface, uri, (WT_CONFIG_ARG *)cfg, &data_source->source)); + source = data_source->source; + source->session = (WT_SESSION *)session; + memset(&source->q, 0, sizeof(source->q)); + source->recno = 0; + memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf)); + memset(&source->key, 0, sizeof(source->key)); + memset(&source->value, 0, sizeof(source->value)); + source->saved_err = 0; + source->flags = 0; + + if (0) { +err: if (F_ISSET(cursor, WT_CURSTD_OPEN)) + WT_TRET(cursor->close(cursor)); + else + __wt_free(session, data_source); + *cursorp = NULL; + } + + __wt_free(session, metaconf); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/cursor/cur_dump.c b/src/third_party/wiredtiger/src/cursor/cur_dump.c new file mode 100644 index 00000000000..003b7e1f961 --- /dev/null +++ b/src/third_party/wiredtiger/src/cursor/cur_dump.c @@ -0,0 +1,400 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __raw_to_dump -- + * We have a buffer where the data item contains a raw value, + * convert it to a printable string. + */ +static int +__raw_to_dump( + WT_SESSION_IMPL *session, WT_ITEM *from, WT_ITEM *to, int hexonly) +{ + if (hexonly) + WT_RET(__wt_raw_to_hex(session, from->data, from->size, to)); + else + WT_RET( + __wt_raw_to_esc_hex(session, from->data, from->size, to)); + + return (0); +} + +/* + * __dump_to_raw -- + * We have a buffer containing a dump string, + * convert it to a raw value. 
+ */ +static int +__dump_to_raw( + WT_SESSION_IMPL *session, const char *src_arg, WT_ITEM *item, int hexonly) +{ + if (hexonly) + WT_RET(__wt_hex_to_raw(session, src_arg, item)); + else + WT_RET(__wt_esc_hex_to_raw(session, src_arg, item)); + + return (0); +} + +/* + * __curdump_get_key -- + * WT_CURSOR->get_key for dump cursors. + */ +static int +__curdump_get_key(WT_CURSOR *cursor, ...) +{ + WT_CURSOR *child; + WT_CURSOR_DUMP *cdump; + WT_CURSOR_JSON *json; + WT_DECL_RET; + WT_ITEM item, *itemp; + WT_SESSION_IMPL *session; + size_t size; + uint64_t recno; + const char *fmt; + const void *buffer; + va_list ap; + + cdump = (WT_CURSOR_DUMP *)cursor; + child = cdump->child; + + va_start(ap, cursor); + CURSOR_API_CALL(cursor, session, get_key, NULL); + + if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) { + json = (WT_CURSOR_JSON *)cursor->json_private; + WT_ASSERT(session, json != NULL); + if (WT_CURSOR_RECNO(cursor)) { + WT_ERR(child->get_key(child, &recno)); + buffer = &recno; + size = sizeof(recno); + fmt = "R"; + } else { + WT_ERR(__wt_cursor_get_raw_key(child, &item)); + buffer = item.data; + size = item.size; + if (F_ISSET(cursor, WT_CURSTD_RAW)) + fmt = "u"; + else + fmt = cursor->key_format; + } + ret = __wt_json_alloc_unpack(session, buffer, size, fmt, + json, 1, ap); + } else { + if (WT_CURSOR_RECNO(cursor) && + !F_ISSET(cursor, WT_CURSTD_RAW)) { + WT_ERR(child->get_key(child, &recno)); + + WT_ERR(__wt_buf_fmt(session, &cursor->key, "%" + PRIu64, recno)); + } else { + WT_ERR(child->get_key(child, &item)); + + WT_ERR(__raw_to_dump(session, &item, &cursor->key, + F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0)); + } + + if (F_ISSET(cursor, WT_CURSTD_RAW)) { + itemp = va_arg(ap, WT_ITEM *); + itemp->data = cursor->key.data; + itemp->size = cursor->key.size; + } else + *va_arg(ap, const char **) = cursor->key.data; + } + +err: va_end(ap); + API_END_RET(session, ret); +} + +/* + * str2recno -- + * Convert a string to a record number. 
+ */ +static int +str2recno(WT_SESSION_IMPL *session, const char *p, uint64_t *recnop) +{ + uint64_t recno; + char *endptr; + + /* + * strtouq takes lots of things like hex values, signs and so on and so + * forth -- none of them are OK with us. Check the string starts with + * digit, that turns off the special processing. + */ + if (!isdigit(p[0])) + goto format; + + errno = 0; + recno = __wt_strtouq(p, &endptr, 0); + if (recno == ULLONG_MAX && errno == ERANGE) + WT_RET_MSG(session, ERANGE, "%s: invalid record number", p); + if (endptr[0] != '\0') +format: WT_RET_MSG(session, EINVAL, "%s: invalid record number", p); + + *recnop = recno; + return (0); +} + +/* + * __curdump_set_key -- + * WT_CURSOR->set_key for dump cursors. + */ +static void +__curdump_set_key(WT_CURSOR *cursor, ...) +{ + WT_CURSOR_DUMP *cdump; + WT_CURSOR *child; + WT_DECL_RET; + WT_SESSION_IMPL *session; + uint64_t recno; + va_list ap; + const char *p; + + cdump = (WT_CURSOR_DUMP *)cursor; + child = cdump->child; + CURSOR_API_CALL(cursor, session, set_key, NULL); + + va_start(ap, cursor); + if (F_ISSET(cursor, WT_CURSTD_RAW)) + p = va_arg(ap, WT_ITEM *)->data; + else + p = va_arg(ap, const char *); + va_end(ap); + + if (WT_CURSOR_RECNO(cursor) && !F_ISSET(cursor, WT_CURSTD_RAW)) { + WT_ERR(str2recno(session, p, &recno)); + + child->set_key(child, recno); + } else { + if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) + WT_ERR(__wt_json_to_item(session, p, cursor->key_format, + (WT_CURSOR_JSON *)cursor->json_private, 1, + &cursor->key)); + else + WT_ERR(__dump_to_raw(session, p, &cursor->key, + F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0)); + + child->set_key(child, &cursor->key); + } + + if (0) { +err: cursor->saved_err = ret; + F_CLR(cursor, WT_CURSTD_KEY_SET); + } + API_END(session, ret); +} + +/* + * __curdump_get_value -- + * WT_CURSOR->get_value for dump cursors. + */ +static int +__curdump_get_value(WT_CURSOR *cursor, ...) 
+{ + WT_CURSOR_DUMP *cdump; + WT_CURSOR_JSON *json; + WT_CURSOR *child; + WT_DECL_RET; + WT_ITEM item, *itemp; + WT_SESSION_IMPL *session; + va_list ap; + const char *fmt; + + cdump = (WT_CURSOR_DUMP *)cursor; + child = cdump->child; + + va_start(ap, cursor); + CURSOR_API_CALL(cursor, session, get_value, NULL); + + if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) { + json = (WT_CURSOR_JSON *)cursor->json_private; + WT_ASSERT(session, json != NULL); + WT_ERR(__wt_cursor_get_raw_value(child, &item)); + fmt = F_ISSET(cursor, WT_CURSTD_RAW) ? + "u" : cursor->value_format; + ret = __wt_json_alloc_unpack(session, item.data, + item.size, fmt, json, 0, ap); + } else { + WT_ERR(child->get_value(child, &item)); + + WT_ERR(__raw_to_dump(session, &item, &cursor->value, + F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0)); + + if (F_ISSET(cursor, WT_CURSTD_RAW)) { + itemp = va_arg(ap, WT_ITEM *); + itemp->data = cursor->value.data; + itemp->size = cursor->value.size; + } else + *va_arg(ap, const char **) = cursor->value.data; + } + +err: va_end(ap); + API_END_RET(session, ret); +} + +/* + * __curdump_set_value -- + * WT_CURSOR->set_value for dump cursors. + */ +static void +__curdump_set_value(WT_CURSOR *cursor, ...) +{ + WT_CURSOR_DUMP *cdump; + WT_CURSOR *child; + WT_DECL_RET; + WT_SESSION_IMPL *session; + va_list ap; + const char *p; + + cdump = (WT_CURSOR_DUMP *)cursor; + child = cdump->child; + CURSOR_API_CALL(cursor, session, set_value, NULL); + + va_start(ap, cursor); + if (F_ISSET(cursor, WT_CURSTD_RAW)) + p = va_arg(ap, WT_ITEM *)->data; + else + p = va_arg(ap, const char *); + va_end(ap); + + if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) + WT_ERR(__wt_json_to_item(session, p, cursor->value_format, + (WT_CURSOR_JSON *)cursor->json_private, 0, &cursor->value)); + else + WT_ERR(__dump_to_raw(session, p, &cursor->value, + F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 
1 : 0)); + + child->set_value(child, &cursor->value); + + if (0) { +err: cursor->saved_err = ret; + F_CLR(cursor, WT_CURSTD_VALUE_SET); + } + API_END(session, ret); +} + +/* Pass through a call to the underlying cursor. */ +#define WT_CURDUMP_PASS(op) \ +static int \ +__curdump_##op(WT_CURSOR *cursor) \ +{ \ + WT_CURSOR *child; \ + \ + child = ((WT_CURSOR_DUMP *)cursor)->child; \ + return (child->op(child)); \ +} + +WT_CURDUMP_PASS(next) +WT_CURDUMP_PASS(prev) +WT_CURDUMP_PASS(reset) +WT_CURDUMP_PASS(search) + +/* + * __curdump_search_near -- + * WT_CURSOR::search_near for dump cursors. + */ +static int +__curdump_search_near(WT_CURSOR *cursor, int *exact) +{ + WT_CURSOR_DUMP *cdump; + + cdump = (WT_CURSOR_DUMP *)cursor; + return (cdump->child->search_near(cdump->child, exact)); +} + +WT_CURDUMP_PASS(insert) +WT_CURDUMP_PASS(update) +WT_CURDUMP_PASS(remove) + +/* + * __curdump_close -- + * WT_CURSOR::close for dump cursors. + */ +static int +__curdump_close(WT_CURSOR *cursor) +{ + WT_CURSOR_DUMP *cdump; + WT_CURSOR *child; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cdump = (WT_CURSOR_DUMP *)cursor; + child = cdump->child; + + CURSOR_API_CALL(cursor, session, get_key, NULL); + if (child != NULL) + WT_TRET(child->close(child)); + /* We shared the child's URI. */ + cursor->internal_uri = NULL; + __wt_json_close(session, cursor); + WT_TRET(__wt_cursor_close(cursor)); + +err: API_END_RET(session, ret); +} + +/* + * __wt_curdump_create -- + * initialize a dump cursor. 
+ */ +int +__wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp) +{ + WT_CURSOR_STATIC_INIT(iface, + __curdump_get_key, /* get-key */ + __curdump_get_value, /* get-value */ + __curdump_set_key, /* set-key */ + __curdump_set_value, /* set-value */ + __wt_cursor_notsup, /* compare */ + __curdump_next, /* next */ + __curdump_prev, /* prev */ + __curdump_reset, /* reset */ + __curdump_search, /* search */ + __curdump_search_near, /* search-near */ + __curdump_insert, /* insert */ + __curdump_update, /* update */ + __curdump_remove, /* remove */ + __curdump_close); /* close */ + WT_CURSOR *cursor; + WT_CURSOR_DUMP *cdump; + WT_CURSOR_JSON *json; + WT_DECL_RET; + WT_SESSION_IMPL *session; + const char *cfg[2]; + + WT_STATIC_ASSERT(offsetof(WT_CURSOR_DUMP, iface) == 0); + + session = (WT_SESSION_IMPL *)child->session; + + WT_RET(__wt_calloc_def(session, 1, &cdump)); + cursor = &cdump->iface; + *cursor = iface; + cursor->session = child->session; + cursor->internal_uri = child->internal_uri; + cursor->key_format = child->key_format; + cursor->value_format = child->value_format; + cdump->child = child; + + /* Copy the dump flags from the child cursor. */ + F_SET(cursor, F_ISSET(child, + WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_JSON | WT_CURSTD_DUMP_PRINT)); + if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) { + WT_ERR(__wt_calloc_def(session, 1, &json)); + cursor->json_private = child->json_private = json; + } + + /* __wt_cursor_init is last so we don't have to clean up on error. 
*/ + cfg[0] = WT_CONFIG_BASE(session, session_open_cursor); + cfg[1] = NULL; + WT_ERR(__wt_cursor_init(cursor, NULL, owner, cfg, cursorp)); + + if (0) { +err: __wt_free(session, cursor); + } + return (ret); +} diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c new file mode 100644 index 00000000000..e5aaa19d0cc --- /dev/null +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -0,0 +1,471 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * WT_BTREE_CURSOR_SAVE_AND_RESTORE + * Save the cursor's key/value data/size fields, call an underlying btree + * function, and then consistently handle failure and success. + */ +#define WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, f, ret) do { \ + WT_ITEM __key_copy = (cursor)->key; \ + uint64_t __recno = (cursor)->recno; \ + WT_ITEM __value_copy = (cursor)->value; \ + if (((ret) = (f)) == 0) { \ + F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); \ + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \ + } else { \ + if (F_ISSET(cursor, WT_CURSTD_KEY_EXT)) { \ + (cursor)->recno = __recno; \ + WT_ITEM_SET((cursor)->key, __key_copy); \ + } \ + if (F_ISSET(cursor, WT_CURSTD_VALUE_EXT)) \ + WT_ITEM_SET((cursor)->value, __value_copy); \ + F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \ + } \ +} while (0) + +/* + * __curfile_compare -- + * WT_CURSOR->compare method for the btree cursor type. + */ +static int +__curfile_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) +{ + WT_CURSOR_BTREE *cbt; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbt = (WT_CURSOR_BTREE *)a; + CURSOR_API_CALL(a, session, compare, cbt->btree); + + /* + * Confirm both cursors refer to the same source and have keys, then + * call the underlying object to compare them. 
+ */ + if (strcmp(a->internal_uri, b->internal_uri) != 0) + WT_ERR_MSG(session, EINVAL, + "Cursors must reference the same object"); + + WT_CURSOR_CHECKKEY(a); + WT_CURSOR_CHECKKEY(b); + + ret = __wt_btcur_compare( + (WT_CURSOR_BTREE *)a, (WT_CURSOR_BTREE *)b, cmpp); + +err: API_END_RET(session, ret); +} + +/* + * __curfile_next -- + * WT_CURSOR->next method for the btree cursor type. + */ +static int +__curfile_next(WT_CURSOR *cursor) +{ + WT_CURSOR_BTREE *cbt; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbt = (WT_CURSOR_BTREE *)cursor; + CURSOR_API_CALL(cursor, session, next, cbt->btree); + + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + if ((ret = __wt_btcur_next(cbt, 0)) == 0) + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + +err: API_END_RET(session, ret); +} + +/* + * __curfile_next_random -- + * WT_CURSOR->next method for the btree cursor type when configured with + * next_random. + */ +static int +__curfile_next_random(WT_CURSOR *cursor) +{ + WT_CURSOR_BTREE *cbt; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbt = (WT_CURSOR_BTREE *)cursor; + CURSOR_API_CALL(cursor, session, next, cbt->btree); + + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + if ((ret = __wt_btcur_next_random(cbt)) == 0) + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + +err: API_END_RET(session, ret); +} + +/* + * __curfile_prev -- + * WT_CURSOR->prev method for the btree cursor type. + */ +static int +__curfile_prev(WT_CURSOR *cursor) +{ + WT_CURSOR_BTREE *cbt; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbt = (WT_CURSOR_BTREE *)cursor; + CURSOR_API_CALL(cursor, session, prev, cbt->btree); + + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + if ((ret = __wt_btcur_prev(cbt, 0)) == 0) + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + +err: API_END_RET(session, ret); +} + +/* + * __curfile_reset -- + * WT_CURSOR->reset method for the btree cursor type. 
+ */ +static int +__curfile_reset(WT_CURSOR *cursor) +{ + WT_CURSOR_BTREE *cbt; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbt = (WT_CURSOR_BTREE *)cursor; + CURSOR_API_CALL(cursor, session, reset, cbt->btree); + + ret = __wt_btcur_reset(cbt); + + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + +err: API_END_RET(session, ret); +} + +/* + * __curfile_search -- + * WT_CURSOR->search method for the btree cursor type. + */ +static int +__curfile_search(WT_CURSOR *cursor) +{ + WT_CURSOR_BTREE *cbt; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbt = (WT_CURSOR_BTREE *)cursor; + CURSOR_API_CALL(cursor, session, search, cbt->btree); + + WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_NOVALUE(cursor); + + WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_search(cbt), ret); + +err: API_END_RET(session, ret); +} + +/* + * __curfile_search_near -- + * WT_CURSOR->search_near method for the btree cursor type. + */ +static int +__curfile_search_near(WT_CURSOR *cursor, int *exact) +{ + WT_CURSOR_BTREE *cbt; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbt = (WT_CURSOR_BTREE *)cursor; + CURSOR_API_CALL(cursor, session, search_near, cbt->btree); + + WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_NOVALUE(cursor); + + WT_BTREE_CURSOR_SAVE_AND_RESTORE( + cursor, __wt_btcur_search_near(cbt, exact), ret); + +err: API_END_RET(session, ret); +} + +/* + * __curfile_insert -- + * WT_CURSOR->insert method for the btree cursor type. + */ +static int +__curfile_insert(WT_CURSOR *cursor) +{ + WT_CURSOR_BTREE *cbt; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbt = (WT_CURSOR_BTREE *)cursor; + CURSOR_UPDATE_API_CALL(cursor, session, insert, cbt->btree); + if (!F_ISSET(cursor, WT_CURSTD_APPEND)) + WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_NEEDVALUE(cursor); + + WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_insert(cbt), ret); + + /* + * Insert is the one cursor operation that doesn't end with the cursor + * pointing to an on-page item. 
The standard macro handles errors + * correctly, but we need to leave the application cursor unchanged in + * the case of success, except for column-store appends, where we are + * returning a key. + */ + if (ret == 0) { + if (!F_ISSET(cursor, WT_CURSTD_APPEND)) { + F_SET(cursor, WT_CURSTD_KEY_EXT); + F_CLR(cursor, WT_CURSTD_KEY_INT); + } + F_SET(cursor, WT_CURSTD_VALUE_EXT); + F_CLR(cursor, WT_CURSTD_VALUE_INT); + } + +err: CURSOR_UPDATE_API_END(session, ret); + return (ret); +} + +/* + * __curfile_update -- + * WT_CURSOR->update method for the btree cursor type. + */ +static int +__curfile_update(WT_CURSOR *cursor) +{ + WT_CURSOR_BTREE *cbt; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbt = (WT_CURSOR_BTREE *)cursor; + CURSOR_UPDATE_API_CALL(cursor, session, update, cbt->btree); + + WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_NEEDVALUE(cursor); + + WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_update(cbt), ret); + +err: CURSOR_UPDATE_API_END(session, ret); + return (ret); +} + +/* + * __wt_curfile_update_check -- + * WT_CURSOR->update_check method for the btree cursor type. + */ +int +__wt_curfile_update_check(WT_CURSOR *cursor) +{ + WT_CURSOR_BTREE *cbt; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbt = (WT_CURSOR_BTREE *)cursor; + CURSOR_UPDATE_API_CALL(cursor, session, update, cbt->btree); + + WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_NOVALUE(cursor); + + WT_BTREE_CURSOR_SAVE_AND_RESTORE( + cursor, __wt_btcur_update_check(cbt), ret); + +err: CURSOR_UPDATE_API_END(session, ret); + return (ret); +} + +/* + * __curfile_remove -- + * WT_CURSOR->remove method for the btree cursor type. 
+ */ +static int +__curfile_remove(WT_CURSOR *cursor) +{ + WT_CURSOR_BTREE *cbt; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbt = (WT_CURSOR_BTREE *)cursor; + CURSOR_UPDATE_API_CALL(cursor, session, remove, cbt->btree); + + WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_NOVALUE(cursor); + + WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_remove(cbt), ret); + + /* + * After a successful remove, copy the key: the value is not available. + */ + if (ret == 0) { + if (F_ISSET(cursor, WT_CURSTD_KEY_INT) && + !WT_DATA_IN_ITEM(&(cursor)->key)) { + WT_ERR(__wt_buf_set(session, &cursor->key, + cursor->key.data, cursor->key.size)); + F_CLR(cursor, WT_CURSTD_KEY_INT); + F_SET(cursor, WT_CURSTD_KEY_EXT); + } + F_CLR(cursor, WT_CURSTD_VALUE_SET); + } + +err: CURSOR_UPDATE_API_END(session, ret); + return (ret); +} + +/* + * __curfile_close -- + * WT_CURSOR->close method for the btree cursor type. + */ +static int +__curfile_close(WT_CURSOR *cursor) +{ + WT_CURSOR_BTREE *cbt; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbt = (WT_CURSOR_BTREE *)cursor; + CURSOR_API_CALL(cursor, session, close, cbt->btree); + WT_TRET(__wt_btcur_close(cbt)); + if (cbt->btree != NULL) + WT_TRET(__wt_session_release_btree(session)); + /* The URI is owned by the btree handle. */ + cursor->internal_uri = NULL; + WT_TRET(__wt_cursor_close(cursor)); + +err: API_END_RET(session, ret); +} + +/* + * __wt_curfile_create -- + * Open a cursor for a given btree handle. 
+ */ +int +__wt_curfile_create(WT_SESSION_IMPL *session, + WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap, + WT_CURSOR **cursorp) +{ + WT_CURSOR_STATIC_INIT(iface, + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __curfile_compare, /* compare */ + __curfile_next, /* next */ + __curfile_prev, /* prev */ + __curfile_reset, /* reset */ + __curfile_search, /* search */ + __curfile_search_near, /* search-near */ + __curfile_insert, /* insert */ + __curfile_update, /* update */ + __curfile_remove, /* remove */ + __curfile_close); /* close */ + WT_BTREE *btree; + WT_CONFIG_ITEM cval; + WT_CURSOR *cursor; + WT_CURSOR_BTREE *cbt; + WT_CURSOR_BULK *cbulk; + WT_DECL_RET; + size_t csize; + + WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0); + + cbt = NULL; + + btree = S2BT(session); + WT_ASSERT(session, btree != NULL); + + csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE); + WT_RET(__wt_calloc(session, 1, csize, &cbt)); + + cursor = &cbt->iface; + *cursor = iface; + cursor->session = &session->iface; + cursor->internal_uri = btree->dhandle->name; + cursor->key_format = btree->key_format; + cursor->value_format = btree->value_format; + + cbt->btree = btree; + if (bulk) { + F_SET(cursor, WT_CURSTD_BULK); + + cbulk = (WT_CURSOR_BULK *)cbt; + + /* Optionally skip the validation of each bulk-loaded key. */ + WT_ERR(__wt_config_gets_def( + session, cfg, "skip_sort_check", 0, &cval)); + WT_ERR(__wt_curbulk_init( + session, cbulk, bitmap, cval.val == 0 ? 0 : 1)); + } + + /* + * random_retrieval + * Random retrieval cursors only support next, reset and close. + */ + WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval)); + if (cval.val != 0) { + __wt_cursor_set_notsup(cursor); + cursor->next = __curfile_next_random; + cursor->reset = __curfile_reset; + } + + /* __wt_cursor_init is last so we don't have to clean up on error. 
*/ + WT_ERR(__wt_cursor_init( + cursor, cursor->internal_uri, owner, cfg, cursorp)); + + WT_STAT_FAST_CONN_INCR(session, cursor_create); + WT_STAT_FAST_DATA_INCR(session, cursor_create); + + if (0) { +err: __wt_free(session, cbt); + } + + return (ret); +} + +/* + * __wt_curfile_open -- + * WT_SESSION->open_cursor method for the btree cursor type. + */ +int +__wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, + WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) +{ + WT_CONFIG_ITEM cval; + WT_DECL_RET; + int bitmap, bulk; + uint32_t flags; + + flags = 0; + + WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval)); + if (cval.type == WT_CONFIG_ITEM_BOOL || + (cval.type == WT_CONFIG_ITEM_NUM && + (cval.val == 0 || cval.val == 1))) { + bitmap = 0; + bulk = (cval.val != 0); + } else if (WT_STRING_MATCH("bitmap", cval.str, cval.len)) + bitmap = bulk = 1; + else + WT_RET_MSG(session, EINVAL, + "Value for 'bulk' must be a boolean or 'bitmap'"); + + /* Bulk handles require exclusive access. */ + if (bulk) + LF_SET(WT_BTREE_BULK | WT_DHANDLE_EXCLUSIVE); + + /* Get the handle and lock it while the cursor is using it. */ + if (WT_PREFIX_MATCH(uri, "file:")) + WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, flags)); + else + WT_RET(__wt_bad_object_type(session, uri)); + + WT_ERR(__wt_curfile_create(session, owner, cfg, bulk, bitmap, cursorp)); + + return (0); + +err: /* If the cursor could not be opened, release the handle. */ + WT_TRET(__wt_session_release_btree(session)); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c new file mode 100644 index 00000000000..936337047b8 --- /dev/null +++ b/src/third_party/wiredtiger/src/cursor/cur_index.c @@ -0,0 +1,447 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "wt_internal.h" + +/* + * __curindex_get_value -- + * WT_CURSOR->get_value implementation for index cursors. + */ +static int +__curindex_get_value(WT_CURSOR *cursor, ...) +{ + WT_CURSOR_INDEX *cindex; + WT_DECL_RET; + WT_ITEM *item; + WT_SESSION_IMPL *session; + va_list ap; + + cindex = (WT_CURSOR_INDEX *)cursor; + CURSOR_API_CALL(cursor, session, get_value, NULL); + WT_CURSOR_NEEDVALUE(cursor); + + va_start(ap, cursor); + if (F_ISSET(cursor, WT_CURSTD_RAW)) { + ret = __wt_schema_project_merge(session, + cindex->cg_cursors, cindex->value_plan, + cursor->value_format, &cursor->value); + if (ret == 0) { + item = va_arg(ap, WT_ITEM *); + item->data = cursor->value.data; + item->size = cursor->value.size; + } + } else + ret = __wt_schema_project_out(session, + cindex->cg_cursors, cindex->value_plan, ap); + va_end(ap); + +err: API_END_RET(session, ret); +} + +/* + * __curindex_set_value -- + * WT_CURSOR->set_value implementation for index cursors. + */ +static void +__curindex_set_value(WT_CURSOR *cursor, ...) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + + CURSOR_API_CALL(cursor, session, set_value, NULL); + ret = ENOTSUP; +err: cursor->saved_err = ret; + F_CLR(cursor, WT_CURSTD_VALUE_SET); + API_END(session, ret); +} + +/* + * __curindex_move -- + * When an index cursor changes position, set the primary key in the + * associated column groups and update their positions to match. + */ +static int +__curindex_move(WT_CURSOR_INDEX *cindex) +{ + WT_CURSOR **cp, *first; + WT_SESSION_IMPL *session; + u_int i; + + session = (WT_SESSION_IMPL *)cindex->iface.session; + first = NULL; + + /* Point the public cursor to the key in the child. 
*/ + __wt_cursor_set_raw_key(&cindex->iface, &cindex->child->key); + F_CLR(&cindex->iface, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + + for (i = 0, cp = cindex->cg_cursors; + i < WT_COLGROUPS(cindex->table); + i++, cp++) { + if (*cp == NULL) + continue; + if (first == NULL) { + /* + * Set the primary key -- note that we need the primary + * key columns, so we have to use the full key format, + * not just the public columns. + */ + WT_RET(__wt_schema_project_slice(session, + cp, cindex->index->key_plan, + 1, cindex->index->key_format, + &cindex->iface.key)); + first = *cp; + } else { + (*cp)->key.data = first->key.data; + (*cp)->key.size = first->key.size; + (*cp)->recno = first->recno; + } + F_SET(*cp, WT_CURSTD_KEY_EXT); + WT_RET((*cp)->search(*cp)); + } + + F_SET(&cindex->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + return (0); +} + +/* + * __curindex_next -- + * WT_CURSOR->next method for index cursors. + */ +static int +__curindex_next(WT_CURSOR *cursor) +{ + WT_CURSOR_INDEX *cindex; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cindex = (WT_CURSOR_INDEX *)cursor; + CURSOR_API_CALL(cursor, session, next, NULL); + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + + if ((ret = cindex->child->next(cindex->child)) == 0) + ret = __curindex_move(cindex); + +err: API_END_RET(session, ret); +} + +/* + * __curindex_prev -- + * WT_CURSOR->prev method for index cursors. + */ +static int +__curindex_prev(WT_CURSOR *cursor) +{ + WT_CURSOR_INDEX *cindex; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cindex = (WT_CURSOR_INDEX *)cursor; + CURSOR_API_CALL(cursor, session, prev, NULL); + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + + if ((ret = cindex->child->prev(cindex->child)) == 0) + ret = __curindex_move(cindex); + +err: API_END_RET(session, ret); +} + +/* + * __curindex_reset -- + * WT_CURSOR->reset method for index cursors. 
+ */ +static int +__curindex_reset(WT_CURSOR *cursor) +{ + WT_CURSOR **cp; + WT_CURSOR_INDEX *cindex; + WT_DECL_RET; + WT_SESSION_IMPL *session; + u_int i; + + cindex = (WT_CURSOR_INDEX *)cursor; + CURSOR_API_CALL(cursor, session, reset, NULL); + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + + WT_TRET(cindex->child->reset(cindex->child)); + for (i = 0, cp = cindex->cg_cursors; + i < WT_COLGROUPS(cindex->table); + i++, cp++) { + if (*cp == NULL) + continue; + WT_TRET((*cp)->reset(*cp)); + } + +err: API_END_RET(session, ret); +} + +/* + * __curindex_search -- + * WT_CURSOR->search method for index cursors. + */ +static int +__curindex_search(WT_CURSOR *cursor) +{ + WT_CURSOR *child; + WT_CURSOR_INDEX *cindex; + WT_DECL_RET; + WT_SESSION_IMPL *session; + int exact; + + cindex = (WT_CURSOR_INDEX *)cursor; + child = cindex->child; + CURSOR_API_CALL(cursor, session, search, NULL); + + /* + * We expect partial matches, but we want the smallest item that + * matches the prefix. Fail if there is no matching item. + */ + __wt_cursor_set_raw_key(child, &cursor->key); + WT_ERR(child->search_near(child, &exact)); + + /* + * We expect partial matches, and want the smallest record with a key + * greater than or equal to the search key. The only way for the key + * to be equal is if there is an index on the primary key, because + * otherwise the primary key columns will be appended to the index key, + * but we don't disallow that (odd) case. + */ + if (exact < 0) + WT_ERR(child->next(child)); + + if (child->key.size < cursor->key.size || + memcmp(child->key.data, cursor->key.data, cursor->key.size) != 0) { + ret = WT_NOTFOUND; + goto err; + } + + WT_ERR(__curindex_move(cindex)); + + if (0) { +err: F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + } + + API_END_RET(session, ret); +} + +/* + * __curindex_search_near -- + * WT_CURSOR->search_near method for index cursors. 
+ */ +static int +__curindex_search_near(WT_CURSOR *cursor, int *exact) +{ + WT_CURSOR_INDEX *cindex; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cindex = (WT_CURSOR_INDEX *)cursor; + CURSOR_API_CALL(cursor, session, search_near, NULL); + __wt_cursor_set_raw_key(cindex->child, &cursor->key); + if ((ret = cindex->child->search_near(cindex->child, exact)) == 0) + ret = __curindex_move(cindex); + else + F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + +err: API_END_RET(session, ret); +} + +/* + * __curindex_close -- + * WT_CURSOR->close method for index cursors. + */ +static int +__curindex_close(WT_CURSOR *cursor) +{ + WT_CURSOR_INDEX *cindex; + WT_CURSOR **cp; + WT_DECL_RET; + WT_INDEX *idx; + WT_SESSION_IMPL *session; + u_int i; + + cindex = (WT_CURSOR_INDEX *)cursor; + idx = cindex->index; + + CURSOR_API_CALL(cursor, session, close, NULL); + + if ((cp = cindex->cg_cursors) != NULL) + for (i = 0, cp = cindex->cg_cursors; + i < WT_COLGROUPS(cindex->table); i++, cp++) + if (*cp != NULL) { + WT_TRET((*cp)->close(*cp)); + *cp = NULL; + } + + __wt_free(session, cindex->cg_cursors); + if (cindex->key_plan != idx->key_plan) + __wt_free(session, cindex->key_plan); + if (cursor->value_format != cindex->table->value_format) + __wt_free(session, cursor->value_format); + if (cindex->value_plan != idx->value_plan) + __wt_free(session, cindex->value_plan); + + if (cindex->child != NULL) + WT_TRET(cindex->child->close(cindex->child)); + + __wt_schema_release_table(session, cindex->table); + /* The URI is owned by the index. */ + cursor->internal_uri = NULL; + WT_TRET(__wt_cursor_close(cursor)); + +err: API_END_RET(session, ret); +} + +/* + * __curindex_open_colgroups -- + * Open cursors on the column groups required for an index cursor. + */ +static int +__curindex_open_colgroups( + WT_SESSION_IMPL *session, WT_CURSOR_INDEX *cindex, const char *cfg_arg[]) +{ + WT_TABLE *table; + WT_CURSOR **cp; + u_long arg; + /* Child cursors are opened with dump disabled. 
*/ + const char *cfg[] = { cfg_arg[0], cfg_arg[1], "dump=\"\"", NULL }; + char *proj; + + table = cindex->table; + WT_RET(__wt_calloc_def(session, WT_COLGROUPS(table), &cp)); + cindex->cg_cursors = cp; + + /* Work out which column groups we need. */ + for (proj = (char *)cindex->value_plan; *proj != '\0'; proj++) { + arg = strtoul(proj, &proj, 10); + if ((*proj != WT_PROJ_KEY && *proj != WT_PROJ_VALUE) || + cp[arg] != NULL) + continue; + WT_RET(__wt_open_cursor(session, + table->cgroups[arg]->source, + &cindex->iface, cfg, &cp[arg])); + } + + return (0); +} + +/* + * __wt_curindex_open -- + * WT_SESSION->open_cursor method for index cursors. + */ +int +__wt_curindex_open(WT_SESSION_IMPL *session, + const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) +{ + WT_CURSOR_STATIC_INIT(iface, + __wt_cursor_get_key, /* get-key */ + __curindex_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __curindex_set_value, /* set-value */ + __wt_cursor_notsup, /* compare */ + __curindex_next, /* next */ + __curindex_prev, /* prev */ + __curindex_reset, /* reset */ + __curindex_search, /* search */ + __curindex_search_near, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __curindex_close); /* close */ + WT_CURSOR_INDEX *cindex; + WT_CURSOR *cursor; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_INDEX *idx; + WT_TABLE *table; + const char *columns, *idxname, *tablename; + size_t namesize; + + tablename = uri; + if (!WT_PREFIX_SKIP(tablename, "index:") || + (idxname = strchr(tablename, ':')) == NULL) + WT_RET_MSG(session, EINVAL, "Invalid cursor URI: '%s'", uri); + namesize = (size_t)(idxname - tablename); + ++idxname; + + if ((ret = __wt_schema_get_table(session, + tablename, namesize, 0, &table)) != 0) { + if (ret == WT_NOTFOUND) + WT_RET_MSG(session, EINVAL, + "Cannot open cursor '%s' on unknown table", uri); + return (ret); + } + + columns = strchr(idxname, '('); + if (columns 
== NULL) + namesize = strlen(idxname); + else + namesize = (size_t)(columns - idxname); + + WT_RET(__wt_schema_open_index(session, table, idxname, namesize, &idx)); + WT_RET(__wt_calloc_def(session, 1, &cindex)); + + cursor = &cindex->iface; + *cursor = iface; + cursor->session = &session->iface; + + cindex->table = table; + cindex->index = idx; + cindex->key_plan = idx->key_plan; + cindex->value_plan = idx->value_plan; + + cursor->internal_uri = idx->name; + cursor->key_format = idx->idxkey_format; + cursor->value_format = table->value_format; + + /* + * XXX + * A very odd corner case is an index with a recno key. + * The only way to get here is by creating an index on a column store + * using only the primary's recno as the index key. Disallow that for + * now. + */ + if (WT_CURSOR_RECNO(cursor)) + WT_ERR_MSG(session, WT_ERROR, + "Column store indexes based on a record number primary " + "key are not supported."); + + /* Handle projections. */ + if (columns != NULL) { + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__wt_struct_reformat(session, table, + columns, strlen(columns), NULL, 0, tmp)); + WT_ERR(__wt_strndup( + session, tmp->data, tmp->size, &cursor->value_format)); + + WT_ERR(__wt_buf_init(session, tmp, 0)); + WT_ERR(__wt_struct_plan(session, table, + columns, strlen(columns), 0, tmp)); + WT_ERR(__wt_strndup( + session, tmp->data, tmp->size, &cindex->value_plan)); + } + + WT_ERR(__wt_cursor_init( + cursor, cursor->internal_uri, owner, cfg, cursorp)); + + WT_ERR(__wt_open_cursor( + session, idx->source, cursor, cfg, &cindex->child)); + + /* Open the column groups needed for this index cursor. 
*/ + WT_ERR(__curindex_open_colgroups(session, cindex, cfg)); + + if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) + WT_ERR(__wt_json_column_init(cursor, table->key_format, + &idx->colconf, &table->colconf)); + + if (0) { +err: WT_TRET(__curindex_close(cursor)); + *cursorp = NULL; + } + + __wt_scr_free(&tmp); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/cursor/cur_json.c b/src/third_party/wiredtiger/src/cursor/cur_json.c new file mode 100644 index 00000000000..f4459819259 --- /dev/null +++ b/src/third_party/wiredtiger/src/cursor/cur_json.c @@ -0,0 +1,931 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +static size_t __json_unpack_put(WT_SESSION_IMPL *, void *, u_char *, size_t, + WT_CONFIG_ITEM *); +static inline int __json_struct_size(WT_SESSION_IMPL *, const void *, size_t, + const char *, WT_CONFIG_ITEM *, int, size_t *); +static inline int __json_struct_unpackv(WT_SESSION_IMPL *, const void *, size_t, + const char *, WT_CONFIG_ITEM *, u_char *, size_t, int, va_list); +static int json_string_arg(WT_SESSION_IMPL *, const char **, WT_ITEM *); +static int json_int_arg(WT_SESSION_IMPL *, const char **, int64_t *); +static int json_uint_arg(WT_SESSION_IMPL *, const char **, uint64_t *); +static int __json_pack_struct(WT_SESSION_IMPL *, void *, size_t, const char *, + const char *); +static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *, + int, const char *, size_t *); + +#define WT_PACK_JSON_GET(session, pv, jstr) do { \ + switch (pv.type) { \ + case 'x': \ + break; \ + case 's': \ + case 'S': \ + WT_RET(json_string_arg(session, &jstr, &pv.u.item)); \ + pv.type = pv.type == 's' ? 
'j' : 'J'; \ + break; \ + case 'b': \ + case 'h': \ + case 'i': \ + case 'l': \ + case 'q': \ + WT_RET(json_int_arg(session, &jstr, &pv.u.i)); \ + break; \ + case 'B': \ + case 'H': \ + case 'I': \ + case 'L': \ + case 'Q': \ + case 'r': \ + case 'R': \ + case 't': \ + WT_RET(json_uint_arg(session, &jstr, &pv.u.u)); \ + break; \ + /* User format strings have already been validated. */ \ + WT_ILLEGAL_VALUE(session); \ + } \ +} while (0) + +/* + * __json_unpack_put -- + * Calculate the size of a packed byte string as formatted for JSON. + */ +static size_t +__json_unpack_put(WT_SESSION_IMPL *session, void *voidpv, + u_char *buf, size_t bufsz, WT_CONFIG_ITEM *name) +{ + WT_PACK_VALUE *pv; + const char *p, *end; + size_t s, n; + + pv = (WT_PACK_VALUE *)voidpv; + s = (size_t)snprintf((char *)buf, bufsz, "\"%.*s\" : ", + (int)name->len, name->str); + if (s <= bufsz) { + bufsz -= s; + buf += s; + } + else + bufsz = 0; + + switch (pv->type) { + case 'x': + return (0); + case 's': + case 'S': + /* Account for '"' quote in front and back. 
*/ + s += 2; + p = (const char *)pv->u.s; + if (bufsz > 0) { + *buf++ = '"'; + bufsz--; + } + if (pv->type == 's' || pv->havesize) { + end = p + pv->size; + for (; p < end; p++) { + n = __wt_json_unpack_char(*p, buf, bufsz, 0); + if (n > bufsz) + bufsz = 0; + else { + bufsz -= n; + buf += n; + } + s += n; + } + } else + for (; *p; p++) { + n = __wt_json_unpack_char(*p, buf, bufsz, 0); + if (n > bufsz) + bufsz = 0; + else { + bufsz -= n; + buf += n; + } + s += n; + } + if (bufsz > 0) + *buf++ = '"'; + return (s); + case 'U': + case 'u': + s += 2; + p = (const char *)pv->u.item.data; + end = p + pv->u.item.size; + if (bufsz > 0) { + *buf++ = '"'; + bufsz--; + } + for (; p < end; p++) { + n = __wt_json_unpack_char(*p, buf, bufsz, 1); + if (n > bufsz) + bufsz = 0; + else { + bufsz -= n; + buf += n; + } + s += n; + } + if (bufsz > 0) + *buf++ = '"'; + return (s); + case 'b': + case 'h': + case 'i': + case 'l': + case 'q': + return (s + + (size_t)snprintf((char *)buf, bufsz, "%" PRId64, pv->u.i)); + case 'B': + case 't': + case 'H': + case 'I': + case 'L': + case 'Q': + case 'r': + case 'R': + return (s + + (size_t)snprintf((char *)buf, bufsz, "%" PRId64, pv->u.u)); + } + __wt_err(session, EINVAL, "unknown pack-value type: %c", (int)pv->type); + return ((size_t)-1); +} + +/* + * __json_struct_size -- + * Calculate the size of a packed byte string as formatted for JSON. 
+ */ +static inline int +__json_struct_size(WT_SESSION_IMPL *session, const void *buffer, + size_t size, const char *fmt, WT_CONFIG_ITEM *names, int iskey, + size_t *presult) +{ + WT_CONFIG_ITEM name; + WT_DECL_PACK_VALUE(pv); + WT_DECL_RET; + WT_PACK pack; + WT_PACK_NAME packname; + const uint8_t *p, *end; + size_t result; + int needcr; + + p = buffer; + end = p + size; + result = 0; + needcr = 0; + + WT_RET(__pack_name_init(session, names, iskey, &packname)); + WT_RET(__pack_init(session, &pack, fmt)); + while ((ret = __pack_next(&pack, &pv)) == 0) { + if (needcr) + result += 2; + needcr = 1; + WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p))); + WT_RET(__pack_name_next(&packname, &name)); + result += __json_unpack_put(session, &pv, NULL, 0, &name); + } + if (ret == WT_NOTFOUND) + ret = 0; + + /* Be paranoid - __pack_write should never overflow. */ + WT_ASSERT(session, p <= end); + + *presult = result; + return (ret); +} + +/* + * __json_struct_unpackv -- + * Unpack a byte string to JSON (va_list version). + */ +static inline int +__json_struct_unpackv(WT_SESSION_IMPL *session, + const void *buffer, size_t size, const char *fmt, WT_CONFIG_ITEM *names, + u_char *jbuf, size_t jbufsize, int iskey, va_list ap) +{ + WT_CONFIG_ITEM name; + WT_DECL_PACK_VALUE(pv); + WT_DECL_RET; + WT_PACK pack; + WT_PACK_NAME packname; + int needcr; + size_t jsize; + const uint8_t *p, *end; + + p = buffer; + end = p + size; + needcr = 0; + + /* Unpacking a cursor marked as json implies a single arg. 
*/ + *va_arg(ap, const char **) = (char *)jbuf; + + WT_RET(__pack_name_init(session, names, iskey, &packname)); + WT_RET(__pack_init(session, &pack, fmt)); + while ((ret = __pack_next(&pack, &pv)) == 0) { + if (needcr) { + WT_ASSERT(session, jbufsize >= 3); + strncat((char *)jbuf, ",\n", jbufsize); + jbuf += 2; + jbufsize -= 2; + } + needcr = 1; + WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p))); + WT_RET(__pack_name_next(&packname, &name)); + jsize = __json_unpack_put(session, + (u_char *)&pv, jbuf, jbufsize, &name); + WT_ASSERT(session, jsize <= jbufsize); + jbuf += jsize; + jbufsize -= jsize; + } + if (ret == WT_NOTFOUND) + ret = 0; + + /* Be paranoid - __unpack_read should never overflow. */ + WT_ASSERT(session, p <= end); + + WT_ASSERT(session, jbufsize == 1); + + return (ret); +} + +/* + * __wt_json_alloc_unpack -- + * Allocate space for, and unpack an entry into JSON format. + */ +int +__wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer, + size_t size, const char *fmt, WT_CURSOR_JSON *json, + int iskey, va_list ap) +{ + WT_CONFIG_ITEM *names; + WT_DECL_RET; + size_t needed; + char **json_bufp; + + if (iskey) { + names = &json->key_names; + json_bufp = &json->key_buf; + } else { + names = &json->value_names; + json_bufp = &json->value_buf; + } + needed = 0; + WT_RET(__json_struct_size(session, buffer, size, fmt, names, + iskey, &needed)); + WT_RET(__wt_realloc(session, NULL, needed + 1, json_bufp)); + WT_RET(__json_struct_unpackv(session, buffer, size, fmt, + names, (u_char *)*json_bufp, needed + 1, iskey, ap)); + + return (ret); +} + +/* + * __wt_json_close -- + * Release any json related resources. 
+ */
+void
+__wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
+{
+	WT_CURSOR_JSON *json;
+
+	/*
+	 * Free the cached key/value JSON buffers and the state itself.
+	 * NOTE(review): cursor->json_private is left pointing at freed
+	 * memory; callers appear to discard the cursor next -- confirm.
+	 */
+	if ((json = (WT_CURSOR_JSON *)cursor->json_private) != NULL) {
+		__wt_free(session, json->key_buf);
+		__wt_free(session, json->value_buf);
+		__wt_free(session, json);
+	}
+	return;
+}
+
+/*
+ * __wt_json_unpack_char --
+ *	Unpack a single character into JSON escaped format.
+ *	Can be called with null buf for sizing.
+ */
+size_t
+__wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, int force_unicode)
+{
+	char abbrev;
+	u_char h;
+
+	if (!force_unicode) {
+		/*
+		 * Cast to unsigned char: passing a possibly-negative char to
+		 * isprint() is undefined behavior (CERT STR37-C).
+		 */
+		if (isprint((u_char)ch) && ch != '\\' && ch != '"') {
+			if (bufsz >= 1)
+				*buf = (u_char)ch;
+			return (1);
+		} else {
+			abbrev = '\0';
+			switch (ch) {
+			case '\\':
+			case '"':
+				abbrev = ch;
+				break;
+			case '\f':
+				abbrev = 'f';
+				break;
+			case '\n':
+				abbrev = 'n';
+				break;
+			case '\r':
+				abbrev = 'r';
+				break;
+			case '\t':
+				abbrev = 't';
+				break;
+			}
+			if (abbrev != '\0') {
+				if (bufsz >= 2) {
+					*buf++ = '\\';
+					*buf = (u_char)abbrev;
+				}
+				return (2);
+			}
+		}
+	}
+	/* Fall back to the six-character \u00XX escape. */
+	if (bufsz >= 6) {
+		*buf++ = '\\';
+		*buf++ = 'u';
+		*buf++ = '0';
+		*buf++ = '0';
+		h = (((u_char)ch) >> 4) & 0xF;
+		if (h >= 10)
+			*buf++ = 'A' + (h - 10);
+		else
+			*buf++ = '0' + h;
+		h = ((u_char)ch) & 0xF;
+		if (h >= 10)
+			*buf++ = 'A' + (h - 10);
+		else
+			*buf++ = '0' + h;
+	}
+	return (6);
+}
+
+/*
+ * __wt_json_column_init --
+ *	set json_key_names, json_value_names to comma separated lists
+ *	of column names.
+ */ +int +__wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, + const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf) +{ + WT_CURSOR_JSON *json; + const char *p, *end, *beginkey; + uint32_t keycnt, nkeys; + + json = (WT_CURSOR_JSON *)cursor->json_private; + beginkey = colconf->str; + end = beginkey + colconf->len; + + if (idxconf != NULL) { + json->key_names.str = idxconf->str; + json->key_names.len = idxconf->len; + } else if (colconf->len > 0 && *beginkey == '(') { + beginkey++; + if (end[-1] == ')') + end--; + } + + for (nkeys = 0; *keyformat; keyformat++) + if (!isdigit(*keyformat)) + nkeys++; + + p = beginkey; + keycnt = 0; + while (p < end && keycnt < nkeys) { + if (*p == ',') + keycnt++; + p++; + } + json->value_names.str = p; + json->value_names.len = WT_PTRDIFF(end, p); + if (idxconf == NULL) { + if (p > beginkey) + p--; + json->key_names.str = beginkey; + json->key_names.len = WT_PTRDIFF(p, beginkey); + } + return (0); +} + +#define MATCH_KEYWORD(session, in, result, keyword, matchval) do { \ + size_t _kwlen = strlen(keyword); \ + if (strncmp(in, keyword, _kwlen) == 0 && !isalnum(in[_kwlen])) { \ + in += _kwlen; \ + result = matchval; \ + } else { \ + const char *_bad = in; \ + while (isalnum(*in)) \ + in++; \ + __wt_errx(session, "unknown keyword \"%.*s\" in JSON", \ + (int)(in - _bad), _bad); \ + } \ +} while (0) + +/* + * __wt_json_token -- + * Return the type, start position and length of the next JSON + * token in the input. String tokens include the quotes. JSON + * can be entirely parsed using calls to this tokenizer, each + * call using a src pointer that is the previously returned + * tokstart + toklen. 
+ * + * The token type returned is one of: + * 0 : EOF + * 's' : string + * 'i' : intnum + * 'f' : floatnum + * ':' : colon + * ',' : comma + * '{' : lbrace + * '}' : rbrace + * '[' : lbracket + * ']' : rbracket + * 'N' : null + * 'T' : true + * 'F' : false + */ +int +__wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype, + const char **tokstart, size_t *toklen) +{ + WT_SESSION_IMPL *session; + char ch; + const char *bad; + int backslash, isalph, isfloat, result; + + result = -1; + session = (WT_SESSION_IMPL *)wt_session; + while (isspace(*src)) + src++; + *tokstart = src; + + if (*src == '\0') { + *toktype = 0; + *toklen = 0; + return (0); + } + + /* JSON is specified in RFC 4627. */ + switch (*src) { + case '"': + backslash = 0; + src++; + while ((ch = *src) != '\0') { + if (!backslash) { + if (ch == '"') { + src++; + result = 's'; + break; + } + if (ch == '\\') + backslash = 1; + } else { + /* We validate Unicode on this pass. */ + if (ch == 'u') { + u_char ignored; + const u_char *uc; + + uc = (const u_char *)src; + if (__wt_hex2byte(&uc[1], &ignored) || + __wt_hex2byte(&uc[3], &ignored)) { + __wt_errx(session, + "invalid Unicode within JSON string"); + return (-1); + } + src += 5; + } + backslash = 0; + } + src++; + } + if (result != 's') + __wt_errx(session, "unterminated string in JSON"); + break; + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + isfloat = 0; + if (*src == '-') + src++; + while ((ch = *src) != '\0' && isdigit(ch)) + src++; + if (*src == '.') { + isfloat = 1; + src++; + while ((ch = *src) != '\0' && + isdigit(ch)) + src++; + } + if (*src == 'e' || *src == 'E') { + isfloat = 1; + src++; + if (*src == '+' || *src == '-') + src++; + while ((ch = *src) != '\0' && + isdigit(ch)) + src++; + } + result = isfloat ? 
'f' : 'i'; + break; + case ':': + case ',': + case '{': + case '}': + case '[': + case ']': + result = *src++; + break; + case 'n': + MATCH_KEYWORD(session, src, result, "null", 'N'); + break; + case 't': + MATCH_KEYWORD(session, src, result, "true", 'T'); + break; + case 'f': + MATCH_KEYWORD(session, src, result, "false", 'F'); + break; + default: + /* An illegal token, move past it anyway */ + bad = src; + isalph = isalnum(*src); + src++; + if (isalph) + while (*src != '\0' && isalnum(*src)) + src++; + __wt_errx(session, "unknown token \"%.*s\" in JSON", + (int)(src - bad), bad); + break; + } + *toklen = (size_t)(src - *tokstart); + *toktype = result; + return (result < 0 ? EINVAL : 0); +} + +/* + * __wt_json_tokname + * Return a descriptive name from the token type returned by + * __wt_json_token + */ +const char * +__wt_json_tokname(int toktype) +{ + switch (toktype) { + case 0: return ("<EOF>"); + case 's': return ("<string>"); + case 'i': return ("<integer>"); + case 'f': return ("<float>"); + case ':': return ("':'"); + case ',': return ("','"); + case '{': return ("'{'"); + case '}': return ("'}'"); + case '[': return ("'['"); + case ']': return ("']'"); + case 'N': return ("'null'"); + case 'T': return ("'true'"); + case 'F': return ("'false'"); + default: return ("<UNKNOWN>"); + } +} + +/* + * json_string_arg -- + * Returns a first cut of the needed string in item. + * The result has not been stripped of escapes. 
+ */
+static int
+json_string_arg(WT_SESSION_IMPL *session, const char **jstr, WT_ITEM *item)
+{
+	const char *tokstart;
+	int tok;
+	WT_DECL_RET;
+
+	/* The tokenizer writes the token's length directly into item->size. */
+	WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart,
+	    &item->size));
+	if (tok == 's') {
+		/* Advance the caller's scan position past the token. */
+		*jstr = tokstart + item->size;
+		/* The tokenizer includes the '"' chars */
+		item->data = tokstart + 1;
+		item->size -= 2;
+		ret = 0;
+	} else {
+		__wt_errx(session, "expected JSON <string>, got %s",
+		    __wt_json_tokname(tok));
+		ret = EINVAL;
+	}
+	return (ret);
+}
+
+/*
+ * json_int_arg --
+ *	Returns a signed integral value from the current position
+ *	in the JSON string.
+ */
+static int
+json_int_arg(WT_SESSION_IMPL *session, const char **jstr, int64_t *ip)
+{
+	char *end;
+	const char *tokstart;
+	int tok;
+	size_t toksize;
+
+	WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart,
+	    &toksize));
+	if (tok == 'i') {
+		/* JSON only allows decimal */
+		*ip = strtoll(tokstart, &end, 10);
+		/* The conversion must consume the entire token. */
+		if (end != tokstart + toksize)
+			WT_RET_MSG(session, EINVAL,
+			    "JSON <int> extraneous input");
+		/* NOTE(review): strtoll overflow (ERANGE) is not checked. */
+		*jstr = tokstart + toksize;
+	} else {
+		__wt_errx(session, "expected JSON <int>, got %s",
+		    __wt_json_tokname(tok));
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * json_uint_arg --
+ *	Returns an unsigned integral value from the current position
+ *	in the JSON string.
+ */ +static int +json_uint_arg(WT_SESSION_IMPL *session, const char **jstr, uint64_t *up) +{ + char *end; + const char *tokstart; + int tok; + size_t toksize; + + WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart, + &toksize)); + if (tok == 'i' && *tokstart != '-') { + /* JSON only allows decimal */ + *up = strtoull(tokstart, &end, 10); + if (end != tokstart + toksize) + WT_RET_MSG(session, EINVAL, + "JSON <int> extraneous input"); + *jstr = tokstart + toksize; + } else { + __wt_errx(session, "expected unsigned JSON <int>, got %s", + __wt_json_tokname(tok)); + return (EINVAL); + } + return (0); +} + +#define JSON_EXPECT_TOKEN_GET(session, jstr, tokval, start, sz) do { \ + int __tok; \ + WT_RET(__wt_json_token((WT_SESSION *)session, jstr, &__tok, &start, &sz));\ + if (__tok != tokval) { \ + __wt_errx(session, "expected JSON %s, got %s", \ + __wt_json_tokname(tokval), __wt_json_tokname(__tok)); \ + return (EINVAL); \ + } \ + jstr = start + sz; \ +} while (0) + +#define JSON_EXPECT_TOKEN(session, jstr, tokval) do { \ + const char *__start; \ + size_t __sz; \ + JSON_EXPECT_TOKEN_GET(session, jstr, tokval, __start, __sz); \ +} while (0) + +/* + * __json_pack_struct -- + * Pack a byte string from a JSON string. 
+ */ +static int +__json_pack_struct(WT_SESSION_IMPL *session, void *buffer, size_t size, + const char *fmt, const char *jstr) +{ + WT_DECL_PACK_VALUE(pv); + WT_DECL_RET; + WT_PACK pack; + const char *tokstart; + int multi; + size_t toksize; + uint8_t *p, *end; + + p = buffer; + end = p + size; + multi = 0; + + if (fmt[0] != '\0' && fmt[1] == '\0') { + JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize); + /* the key name was verified in __json_pack_size */ + JSON_EXPECT_TOKEN(session, jstr, ':'); + pv.type = fmt[0]; + WT_PACK_JSON_GET(session, pv, jstr); + return (__pack_write(session, &pv, &p, size)); + } + + WT_RET(__pack_init(session, &pack, fmt)); + while ((ret = __pack_next(&pack, &pv)) == 0) { + if (multi) + JSON_EXPECT_TOKEN(session, jstr, ','); + JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize); + /* the key name was verified in __json_pack_size */ + JSON_EXPECT_TOKEN(session, jstr, ':'); + WT_PACK_JSON_GET(session, pv, jstr); + WT_RET(__pack_write(session, &pv, &p, (size_t)(end - p))); + multi = 1; + } + + /* Be paranoid - __pack_write should never overflow. */ + WT_ASSERT(session, p <= end); + + if (ret != WT_NOTFOUND) + return (ret); + + return (0); +} + +/* + * __json_pack_size -- + * Calculate the size of a packed byte string from a JSON string. + * We verify that the names and value types provided in JSON match + * the column names and type from the schema format, returning error + * if not. 
+ */ +static int +__json_pack_size( + WT_SESSION_IMPL *session, const char *fmt, WT_CONFIG_ITEM *names, + int iskey, const char *jstr, size_t *sizep) +{ + WT_CONFIG_ITEM name; + WT_DECL_PACK_VALUE(pv); + WT_PACK pack; + WT_PACK_NAME packname; + const char *tokstart; + int multi; + size_t toksize, total; + + WT_RET(__pack_name_init(session, names, iskey, &packname)); + multi = 0; + WT_RET(__pack_init(session, &pack, fmt)); + for (total = 0; __pack_next(&pack, &pv) == 0;) { + if (multi) + JSON_EXPECT_TOKEN(session, jstr, ','); + JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize); + WT_RET(__pack_name_next(&packname, &name)); + if (toksize - 2 != name.len || + strncmp(tokstart + 1, name.str, toksize - 2) != 0) { + __wt_errx(session, "JSON expected %s name: \"%.*s\"", + iskey ? "key" : "value", (int)name.len, name.str); + return (EINVAL); + } + JSON_EXPECT_TOKEN(session, jstr, ':'); + WT_PACK_JSON_GET(session, pv, jstr); + total += __pack_size(session, &pv); + multi = 1; + } + /* check end of string */ + JSON_EXPECT_TOKEN(session, jstr, 0); + + *sizep = total; + return (0); +} + +/* + * __wt_json_to_item -- + * Convert a JSON input string for either key/value to a raw WT_ITEM. + * Checks that the input matches the expected format. + */ +int +__wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, + const char *format, WT_CURSOR_JSON *json, int iskey, WT_ITEM *item) +{ + size_t sz; + sz = 0; /* Initialize because GCC 4.1 is paranoid */ + + WT_RET(__json_pack_size(session, format, + iskey ? &json->key_names : &json->value_names, iskey, jstr, &sz)); + WT_RET(__wt_buf_initsize(session, item, sz)); + WT_RET(__json_pack_struct(session, item->mem, sz, format, jstr)); + return (0); +} + +/* + * __wt_json_strlen -- + * Return the number of bytes represented by a string in JSON format, + * or -1 if the format is incorrect. 
 */
ssize_t
__wt_json_strlen(const char *src, size_t srclen)
{
	const char *srcend;
	size_t dstlen;
	u_char hi, lo;

	dstlen = 0;
	srcend = src + srclen;
	while (src < srcend) {
		/* JSON can include any UTF-8 expressed in 4 hex chars. */
		if (*src == '\\') {
			if (*++src == 'u') {
				if (__wt_hex2byte((const u_char *)++src, &hi))
					return (-1);
				src += 2;
				if (__wt_hex2byte((const u_char *)src, &lo))
					return (-1);
				src += 2;
				/* RFC 3629 */
				if (hi >= 0x8) {
					/* 3 bytes total */
					dstlen += 2;
				}
				else if (hi != 0 || lo >= 0x80) {
					/* 2 bytes total */
					dstlen++;
				}
				/* else 1 byte total */
			}
		}
		/* Count the (first) output byte for this input position. */
		dstlen++;
		src++;
	}
	if (src != srcend)
		return (-1);	/* invalid input, e.g. final char is '\\' */
	return ((ssize_t)dstlen);
}

/*
 * __wt_json_strncpy --
 *	Copy bytes of string in JSON format to a destination,
 *	up to dstlen bytes.  If dstlen is greater than the needed size,
 *	the result is zero padded.
 */
int
__wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen)
{
	char *dst;
	const char *dstend, *srcend;
	u_char hi, lo;

	dst = *pdst;
	dstend = dst + dstlen;
	srcend = src + srclen;
	while (src < srcend && dst < dstend) {
		/* JSON can include any UTF-8 expressed in 4 hex chars. */
		if (*src == '\\') {
			if (*++src == 'u') {
				if (__wt_hex2byte((const u_char *)++src, &hi))
					return (EINVAL);
				src += 2;
				if (__wt_hex2byte((const u_char *)src, &lo))
					return (EINVAL);
				src += 2;
				/* RFC 3629 */
				if (hi >= 0x8) {
					/* 3 bytes total */
					/* byte 0: 1110HHHH */
					/* byte 1: 10HHHHLL */
					/* byte 2: 10LLLLLL */
					*dst++ = (char)(0xe0 |
					    ((hi >> 4) & 0x0f));
					*dst++ = (char)(0x80 |
					    ((hi << 2) & 0x3c) |
					    ((lo >> 6) & 0x03));
					*dst++ = (char)(0x80 | (lo & 0x3f));
				} else if (hi != 0 || lo >= 0x80) {
					/* 2 bytes total */
					/* byte 0: 110HHHLL */
					/* byte 1: 10LLLLLL */
					*dst++ = (char)(0xc0 |
					    (hi << 2) |
					    ((lo >> 6) & 0x03));
					*dst++ = (char)(0x80 | (lo & 0x3f));
				} else
					/* else 1 byte total */
					/* byte 0: 0LLLLLLL */
					*dst++ = (char)lo;
			}
			else
				*dst++ = *src;
		} else
			*dst++ = *src;
		src++;
	}
	/* Ran out of destination space before consuming the input. */
	if (src != srcend)
		return (ENOMEM);
	*pdst = dst;
	/* Zero-pad any remaining destination space. */
	while (dst < dstend)
		*dst++ = '\0';
	return (0);
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c
new file mode 100644
index 00000000000..803d68e890c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_log.c
@@ -0,0 +1,380 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __curlog_logrec --
 *	Callback function from log_scan to get a log record.
 */
static int
__curlog_logrec(
    WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie)
{
	WT_CURSOR_LOG *cl;

	cl = cookie;

	/* Set up the LSNs and take a copy of the log record for the cursor. */
	*cl->cur_lsn = *lsnp;
	*cl->next_lsn = *lsnp;
	cl->next_lsn->offset += (wt_off_t)logrec->size;
	WT_RET(__wt_buf_set(session, cl->logrec, logrec->data, logrec->size));

	/*
	 * Read the log header.  Set up the step pointers to walk the
	 * operations inside the record.  Get the record type.
	 */
	cl->stepp = LOG_SKIP_HEADER(cl->logrec->data);
	cl->stepp_end = (uint8_t *)cl->logrec->data + logrec->size;
	WT_RET(__wt_logrec_read(session, &cl->stepp, cl->stepp_end,
	    &cl->rectype));

	/* A step count of 0 means the entire record. */
	cl->step_count = 0;

	/*
	 * Unpack the txnid so that we can return each
	 * individual operation for this txnid.
	 */
	if (cl->rectype == WT_LOGREC_COMMIT)
		WT_RET(__wt_vunpack_uint(&cl->stepp,
		    WT_PTRDIFF(cl->stepp_end, cl->stepp), &cl->txnid));
	else {
		/*
		 * Step over anything else.
		 * Setting stepp to NULL causes the next()
		 * method to read a new record on the next call.
		 */
		cl->stepp = NULL;
		cl->txnid = 0;
	}
	return (0);
}

/*
 * __curlog_compare --
 *	WT_CURSOR.compare method for the log cursor type.
 */
static int
__curlog_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
	WT_CURSOR_LOG *acl, *bcl;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	CURSOR_API_CALL(a, session, compare, NULL);

	acl = (WT_CURSOR_LOG *)a;
	bcl = (WT_CURSOR_LOG *)b;
	WT_ASSERT(session, cmpp != NULL);
	*cmpp = LOG_CMP(acl->cur_lsn, bcl->cur_lsn);
	/*
	 * If both are on the same LSN, compare step counter.
	 */
	if (*cmpp == 0)
		*cmpp = (acl->step_count != bcl->step_count ?
		    (acl->step_count < bcl->step_count ? -1 : 1) : 0);
err:	API_END_RET(session, ret);

}

/*
 * __curlog_op_read --
 *	Read out any key/value from an individual operation record
 *	in the log.  We're only interested in put and remove operations
 *	since truncate is not a cursor operation.  All successful
 *	returns from this function will have set up the cursor copy of
 *	key and value to give the user.
 */
static int
__curlog_op_read(WT_SESSION_IMPL *session,
    WT_CURSOR_LOG *cl, uint32_t optype, uint32_t opsize, uint32_t *fileid)
{
	WT_ITEM key, value;
	uint64_t recno;
	const uint8_t *end, *pp;

	pp = cl->stepp;
	end = pp + opsize;
	switch (optype) {
	case WT_LOGOP_COL_PUT:
		/* Column-store key is the record number. */
		WT_RET(__wt_logop_col_put_unpack(session, &pp, end,
		    fileid, &recno, &value));
		WT_RET(__wt_buf_set(session, cl->opkey, &recno, sizeof(recno)));
		WT_RET(__wt_buf_set(session,
		    cl->opvalue, value.data, value.size));
		break;
	case WT_LOGOP_COL_REMOVE:
		/* Removes have a key but no value. */
		WT_RET(__wt_logop_col_remove_unpack(session, &pp, end,
		    fileid, &recno));
		WT_RET(__wt_buf_set(session, cl->opkey, &recno, sizeof(recno)));
		WT_RET(__wt_buf_set(session, cl->opvalue, NULL, 0));
		break;
	case WT_LOGOP_ROW_PUT:
		WT_RET(__wt_logop_row_put_unpack(session, &pp, end,
		    fileid, &key, &value));
		WT_RET(__wt_buf_set(session, cl->opkey, key.data, key.size));
		WT_RET(__wt_buf_set(session,
		    cl->opvalue, value.data, value.size));
		break;
	case WT_LOGOP_ROW_REMOVE:
		WT_RET(__wt_logop_row_remove_unpack(session, &pp, end,
		    fileid, &key));
		WT_RET(__wt_buf_set(session, cl->opkey, key.data, key.size));
		WT_RET(__wt_buf_set(session, cl->opvalue, NULL, 0));
		break;
	default:
		/*
		 * Any other operations return the record in the value
		 * and an empty key.
		 */
		*fileid = 0;
		WT_RET(__wt_buf_set(session, cl->opkey, NULL, 0));
		WT_RET(__wt_buf_set(session, cl->opvalue, cl->stepp, opsize));
	}
	return (0);
}

/*
 * __curlog_kv --
 *	Set the key and value of the log cursor to return to the user.
 */
static int
__curlog_kv(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
{
	WT_CURSOR_LOG *cl;
	uint32_t fileid, key_count, opsize, optype;

	cl = (WT_CURSOR_LOG *)cursor;
	/*
	 * If it is a commit and we have stepped over the header, peek to get
	 * the size and optype and read out any key/value from this operation.
	 */
	if ((key_count = cl->step_count++) > 0) {
		WT_RET(__wt_logop_read(session,
		    &cl->stepp, cl->stepp_end, &optype, &opsize));
		WT_RET(__curlog_op_read(session, cl, optype, opsize, &fileid));
		/* Position on the beginning of the next record part. */
		cl->stepp += opsize;
	} else {
		optype = WT_LOGOP_INVALID;
		fileid = 0;
		cl->opkey->data = NULL;
		cl->opkey->size = 0;
		/*
		 * Non-commit records we want to return the record without the
		 * header and the adjusted size.  Add one to skip over the type
		 * which is normally consumed by __wt_logrec_read.
		 */
		cl->opvalue->data = LOG_SKIP_HEADER(cl->logrec->data) + 1;
		cl->opvalue->size = LOG_REC_SIZE(cl->logrec->size) - 1;
	}
	/*
	 * The log cursor sets the LSN and step count as the cursor key and
	 * log record related data in the value.  The data in the value
	 * contains any operation key/value that was in the log record.
	 */
	__wt_cursor_set_key(cursor, cl->cur_lsn->file, cl->cur_lsn->offset,
	    key_count);
	__wt_cursor_set_value(cursor, cl->txnid, cl->rectype, optype,
	    fileid, cl->opkey, cl->opvalue);
	return (0);
}

/*
 * __curlog_next --
 *	WT_CURSOR.next method for the step log cursor type.
 */
static int
__curlog_next(WT_CURSOR *cursor)
{
	WT_CURSOR_LOG *cl;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cl = (WT_CURSOR_LOG *)cursor;

	CURSOR_API_CALL(cursor, session, next, NULL);

	/*
	 * If we don't have a record, or went to the end of the record we
	 * have, or we are in the zero-fill portion of the record, get a
	 * new one.
	 */
	if (cl->stepp == NULL || cl->stepp >= cl->stepp_end || !*cl->stepp) {
		cl->txnid = 0;
		WT_ERR(__wt_log_scan(session, cl->next_lsn, WT_LOGSCAN_ONE,
		    __curlog_logrec, cl));
	}
	WT_ASSERT(session, cl->logrec->data != NULL);
	WT_ERR(__curlog_kv(session, cursor));
	WT_STAT_FAST_CONN_INCR(session, cursor_next);
	WT_STAT_FAST_DATA_INCR(session, cursor_next);

err:	API_END_RET(session, ret);

}

/*
 * __curlog_search --
 *	WT_CURSOR.search method for the log cursor type.
 */
static int
__curlog_search(WT_CURSOR *cursor)
{
	WT_CURSOR_LOG *cl;
	WT_DECL_RET;
	WT_LSN key;
	WT_SESSION_IMPL *session;
	uint32_t counter;

	cl = (WT_CURSOR_LOG *)cursor;

	CURSOR_API_CALL(cursor, session, search, NULL);

	/*
	 * !!! We are ignoring the counter and only searching based on the LSN.
	 */
	WT_ERR(__wt_cursor_get_key((WT_CURSOR *)cl,
	    &key.file, &key.offset, &counter));
	WT_ERR(__wt_log_scan(session, &key, WT_LOGSCAN_ONE,
	    __curlog_logrec, cl));
	WT_ERR(__curlog_kv(session, cursor));
	WT_STAT_FAST_CONN_INCR(session, cursor_search);
	WT_STAT_FAST_DATA_INCR(session, cursor_search);

err:	API_END_RET(session, ret);
}

/*
 * __curlog_reset --
 *	WT_CURSOR.reset method for the log cursor type.
 */
static int
__curlog_reset(WT_CURSOR *cursor)
{
	WT_CURSOR_LOG *cl;

	cl = (WT_CURSOR_LOG *)cursor;
	/* Clear the stepping state and both LSNs; no underlying resources. */
	cl->stepp = cl->stepp_end = NULL;
	cl->step_count = 0;
	INIT_LSN(cl->cur_lsn);
	INIT_LSN(cl->next_lsn);
	return (0);
}

/*
 * __curlog_close --
 *	WT_CURSOR.close method for the log cursor type.
 */
static int
__curlog_close(WT_CURSOR *cursor)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR_LOG *cl;
	WT_DECL_RET;
	WT_LOG *log;
	WT_SESSION_IMPL *session;

	CURSOR_API_CALL(cursor, session, close, NULL);
	cl = (WT_CURSOR_LOG *)cursor;
	conn = S2C(session);
	WT_ASSERT(session, conn->logging);
	log = conn->log;
	/* Release the archive lock taken in __wt_curlog_open. */
	WT_TRET(__wt_readunlock(session, log->log_archive_lock));
	WT_TRET(__curlog_reset(cursor));
	__wt_free(session, cl->cur_lsn);
	__wt_free(session, cl->next_lsn);
	__wt_scr_free(&cl->logrec);
	__wt_scr_free(&cl->opkey);
	__wt_scr_free(&cl->opvalue);
	WT_TRET(__wt_cursor_close(cursor));

err:	API_END_RET(session, ret);
}

/*
 * __wt_curlog_open --
 *	Initialize a log cursor.
 */
int
__wt_curlog_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,	/* get-key */
	    __wt_cursor_get_value,	/* get-value */
	    __wt_cursor_set_key,	/* set-key */
	    __wt_cursor_set_value,	/* set-value */
	    __curlog_compare,		/* compare */
	    __curlog_next,		/* next */
	    __wt_cursor_notsup,		/* prev */
	    __curlog_reset,		/* reset */
	    __curlog_search,		/* search */
	    __wt_cursor_notsup,		/* search-near */
	    __wt_cursor_notsup,		/* insert */
	    __wt_cursor_notsup,		/* update */
	    __wt_cursor_notsup,		/* remove */
	    __curlog_close);		/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_LOG *cl;
	WT_DECL_RET;
	WT_LOG *log;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0);
	conn = S2C(session);
	if (!conn->logging)
		WT_RET_MSG(session, EINVAL,
		    "Cannot open a log cursor without logging enabled");

	log = conn->log;
	cl = NULL;
	WT_RET(__wt_calloc_def(session, 1, &cl));
	cursor = &cl->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	WT_ERR(__wt_calloc_def(session, 1, &cl->cur_lsn));
	WT_ERR(__wt_calloc_def(session, 1, &cl->next_lsn));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->logrec));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->opkey));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->opvalue));
	cursor->key_format = LOGC_KEY_FORMAT;
	cursor->value_format = LOGC_VALUE_FORMAT;

	INIT_LSN(cl->cur_lsn);
	INIT_LSN(cl->next_lsn);

	WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

	/* Log cursors are read only. */
	WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 1));
	/* Log cursors block archiving. */
	WT_ERR(__wt_readlock(session, log->log_archive_lock));

	if (0) {
err:		if (F_ISSET(cursor, WT_CURSTD_OPEN))
			WT_TRET(cursor->close(cursor));
		else {
			__wt_free(session, cl->cur_lsn);
			__wt_free(session, cl->next_lsn);
			__wt_scr_free(&cl->logrec);
			__wt_scr_free(&cl->opkey);
			__wt_scr_free(&cl->opvalue);
			/*
			 * NOTE: We cannot get on the error path with the
			 * readlock held.  No need to unlock it unless that
			 * changes above.
			 */
			__wt_free(session, cl);
		}
		*cursorp = NULL;
	}

	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_metadata.c b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
new file mode 100644
index 00000000000..30fe3b28625
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
@@ -0,0 +1,444 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * Custom NEED macros for metadata cursors - that copy the values into the
 * backing metadata table cursor.
+ */ +#define WT_MD_CURSOR_NEEDKEY(cursor) do { \ + WT_CURSOR_NEEDKEY(cursor); \ + WT_ERR(__wt_buf_set(session, \ + &((WT_CURSOR_METADATA *)(cursor))->file_cursor->key, \ + cursor->key.data, cursor->key.size)); \ + F_SET(((WT_CURSOR_METADATA *)(cursor))->file_cursor, \ + WT_CURSTD_KEY_EXT); \ +} while (0) + +#define WT_MD_CURSOR_NEEDVALUE(cursor) do { \ + WT_CURSOR_NEEDVALUE(cursor); \ + WT_ERR(__wt_buf_set(session, \ + &((WT_CURSOR_METADATA *)(cursor))->file_cursor->value, \ + cursor->value.data, cursor->value.size)); \ + F_SET(((WT_CURSOR_METADATA *)(cursor))->file_cursor, \ + WT_CURSTD_VALUE_EXT); \ +} while (0) + +#define WT_MD_SET_KEY_VALUE(c, mc, fc) do { \ + (c)->key.data = (fc)->key.data; \ + (c)->key.size = (fc)->key.size; \ + (c)->value.data = (fc)->value.data; \ + (c)->value.size = (fc)->value.size; \ + F_SET((c), WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); \ + F_CLR((mc), WT_MDC_ONMETADATA); \ + F_SET((mc), WT_MDC_POSITIONED); \ +} while (0) + +/* + * Check if a key matches the metadata. The public value is "metadata:", + * but also check for the internal version of the URI. + */ +#define WT_KEY_IS_METADATA(key) \ + (WT_STRING_MATCH(WT_METADATA_URI, (key)->data, (key)->size - 1) ||\ + WT_STRING_MATCH(WT_METAFILE_URI, (key)->data, (key)->size - 1)) + +/* + * __curmetadata_metadata_search -- + * Retrieve the metadata for the metadata table + */ +static int +__curmetadata_metadata_search(WT_SESSION_IMPL *session, WT_CURSOR *cursor) +{ + WT_CURSOR_METADATA *mdc; + WT_DECL_RET; + const char *value; + + mdc = (WT_CURSOR_METADATA *)cursor; + + /* The metadata search interface allocates a new string in value. */ + WT_RET(__wt_metadata_search(session, WT_METAFILE_URI, &value)); + + /* + * Copy the value to the underlying btree cursor's tmp item which will + * be freed when the cursor is closed. 
+ */ + ret = __wt_buf_setstr(session, &cursor->value, value); + __wt_free(session, value); + WT_RET(ret); + + WT_RET(__wt_buf_setstr(session, &cursor->key, WT_METADATA_URI)); + + F_SET(mdc, WT_MDC_ONMETADATA | WT_MDC_POSITIONED); + F_SET(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); + return (0); +} + +/* + * __curmetadata_compare -- + * WT_CURSOR->compare method for the metadata cursor type. + */ +static int +__curmetadata_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) +{ + WT_CURSOR *a_file_cursor, *b_file_cursor; + WT_CURSOR_METADATA *a_mdc, *b_mdc; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + a_mdc = ((WT_CURSOR_METADATA *)a); + b_mdc = ((WT_CURSOR_METADATA *)b); + a_file_cursor = a_mdc->file_cursor; + b_file_cursor = b_mdc->file_cursor; + + CURSOR_API_CALL(a, session, + compare, ((WT_CURSOR_BTREE *)a_file_cursor)->btree); + + if (b->compare != __curmetadata_compare) + WT_ERR_MSG(session, EINVAL, + "Can only compare cursors of the same type"); + + WT_MD_CURSOR_NEEDKEY(a); + WT_MD_CURSOR_NEEDKEY(b); + + if (F_ISSET(a_mdc, WT_MDC_ONMETADATA)) { + if (F_ISSET(b_mdc, WT_MDC_ONMETADATA)) + *cmpp = 0; + else + *cmpp = 1; + } else if (F_ISSET(b_mdc, WT_MDC_ONMETADATA)) + *cmpp = -1; + else + ret = a_file_cursor->compare( + a_file_cursor, b_file_cursor, cmpp); + +err: API_END_RET(session, ret); +} + +/* + * __curmetadata_next -- + * WT_CURSOR->next method for the metadata cursor type. 
+ */ +static int +__curmetadata_next(WT_CURSOR *cursor) +{ + WT_CURSOR *file_cursor; + WT_CURSOR_METADATA *mdc; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + mdc = (WT_CURSOR_METADATA *)cursor; + file_cursor = mdc->file_cursor; + CURSOR_API_CALL(cursor, session, + next, ((WT_CURSOR_BTREE *)file_cursor)->btree); + + if (!F_ISSET(mdc, WT_MDC_POSITIONED)) + WT_ERR(__curmetadata_metadata_search(session, cursor)); + else { + WT_ERR(file_cursor->next(mdc->file_cursor)); + WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor); + } + +err: if (ret != 0) { + F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA); + F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); + } + API_END_RET(session, ret); +} + +/* + * __curmetadata_prev -- + * WT_CURSOR->prev method for the metadata cursor type. + */ +static int +__curmetadata_prev(WT_CURSOR *cursor) +{ + WT_CURSOR *file_cursor; + WT_CURSOR_METADATA *mdc; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + mdc = (WT_CURSOR_METADATA *)cursor; + file_cursor = mdc->file_cursor; + CURSOR_API_CALL(cursor, session, + prev, ((WT_CURSOR_BTREE *)file_cursor)->btree); + + if (F_ISSET(mdc, WT_MDC_ONMETADATA)) { + ret = WT_NOTFOUND; + goto err; + } + + ret = file_cursor->prev(file_cursor); + if (ret == 0) { + WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor); + } else if (ret == WT_NOTFOUND) + WT_ERR(__curmetadata_metadata_search(session, cursor)); + +err: if (ret != 0) { + F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA); + F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); + } + API_END_RET(session, ret); +} + +/* + * __curmetadata_reset -- + * WT_CURSOR->reset method for the metadata cursor type. 
 */
static int
__curmetadata_reset(WT_CURSOR *cursor)
{
	WT_CURSOR_METADATA *mdc;
	WT_CURSOR *file_cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	mdc = (WT_CURSOR_METADATA *)cursor;
	file_cursor = mdc->file_cursor;
	CURSOR_API_CALL(cursor, session,
	    reset, ((WT_CURSOR_BTREE *)file_cursor)->btree);

	/*
	 * Only reset the underlying file cursor if it is actually positioned
	 * (not if we are sitting on the metadata-for-the-metadata entry).
	 */
	if (F_ISSET(mdc, WT_MDC_POSITIONED) && !F_ISSET(mdc, WT_MDC_ONMETADATA))
		ret = file_cursor->reset(file_cursor);
	F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);

err:	API_END_RET(session, ret);
}

/*
 * __curmetadata_search --
 *	WT_CURSOR->search method for the metadata cursor type.
 */
static int
__curmetadata_search(WT_CURSOR *cursor)
{
	WT_CURSOR *file_cursor;
	WT_CURSOR_METADATA *mdc;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	mdc = (WT_CURSOR_METADATA *)cursor;
	file_cursor = mdc->file_cursor;
	CURSOR_API_CALL(cursor, session,
	    search, ((WT_CURSOR_BTREE *)file_cursor)->btree);

	WT_MD_CURSOR_NEEDKEY(cursor);

	/* The metadata URI itself is handled specially, not via the file. */
	if (WT_KEY_IS_METADATA(&cursor->key))
		WT_ERR(__curmetadata_metadata_search(session, cursor));
	else {
		WT_ERR(file_cursor->search(file_cursor));
		WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
	}

err:	if (ret != 0) {
		/* On any error, the cursor is no longer positioned. */
		F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
		F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
	}
	API_END_RET(session, ret);
}

/*
 * __curmetadata_search_near --
 *	WT_CURSOR->search_near method for the metadata cursor type.
 */
static int
__curmetadata_search_near(WT_CURSOR *cursor, int *exact)
{
	WT_CURSOR *file_cursor;
	WT_CURSOR_METADATA *mdc;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	mdc = (WT_CURSOR_METADATA *)cursor;
	file_cursor = mdc->file_cursor;
	CURSOR_API_CALL(cursor, session,
	    search_near, ((WT_CURSOR_BTREE *)file_cursor)->btree);

	WT_MD_CURSOR_NEEDKEY(cursor);

	/* A search for the metadata URI is always an exact match. */
	if (WT_KEY_IS_METADATA(&cursor->key)) {
		WT_ERR(__curmetadata_metadata_search(session, cursor));
		*exact = 1;
	} else {
		WT_ERR(file_cursor->search_near(file_cursor, exact));
		WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
	}

err:	if (ret != 0) {
		/* On any error, the cursor is no longer positioned. */
		F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
		F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
	}
	API_END_RET(session, ret);
}

/*
 * __curmetadata_insert --
 *	WT_CURSOR->insert method for the metadata cursor type.
 */
static int
__curmetadata_insert(WT_CURSOR *cursor)
{
	WT_CURSOR *file_cursor;
	WT_CURSOR_METADATA *mdc;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	mdc = (WT_CURSOR_METADATA *)cursor;
	file_cursor = mdc->file_cursor;
	CURSOR_API_CALL(cursor, session,
	    insert, ((WT_CURSOR_BTREE *)file_cursor)->btree);

	WT_MD_CURSOR_NEEDKEY(cursor);
	WT_MD_CURSOR_NEEDVALUE(cursor);

	/*
	 * Since the key/value formats are 's' the WT_ITEMs must contain a
	 * NULL terminated string.
	 */
	ret =
	    __wt_metadata_insert(session, cursor->key.data, cursor->value.data);

err:	API_END_RET(session, ret);
}

/*
 * __curmetadata_update --
 *	WT_CURSOR->update method for the metadata cursor type.
 */
static int
__curmetadata_update(WT_CURSOR *cursor)
{
	WT_CURSOR *file_cursor;
	WT_CURSOR_METADATA *mdc;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	mdc = (WT_CURSOR_METADATA *)cursor;
	file_cursor = mdc->file_cursor;
	CURSOR_API_CALL(cursor, session,
	    update, ((WT_CURSOR_BTREE *)file_cursor)->btree);

	WT_MD_CURSOR_NEEDKEY(cursor);
	WT_MD_CURSOR_NEEDVALUE(cursor);

	/*
	 * Since the key/value formats are 's' the WT_ITEMs must contain a
	 * NULL terminated string.
	 */
	ret =
	    __wt_metadata_update(session, cursor->key.data, cursor->value.data);

err:	API_END_RET(session, ret);
}

/*
 * __curmetadata_remove --
 *	WT_CURSOR->remove method for the metadata cursor type.
 */
static int
__curmetadata_remove(WT_CURSOR *cursor)
{
	WT_CURSOR *file_cursor;
	WT_CURSOR_METADATA *mdc;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	mdc = (WT_CURSOR_METADATA *)cursor;
	file_cursor = mdc->file_cursor;
	CURSOR_API_CALL(cursor, session,
	    remove, ((WT_CURSOR_BTREE *)file_cursor)->btree);

	WT_MD_CURSOR_NEEDKEY(cursor);

	/*
	 * Since the key format is 's' the WT_ITEM must contain a NULL
	 * terminated string.
	 */
	ret = __wt_metadata_remove(session, cursor->key.data);

err:	API_END_RET(session, ret);
}

/*
 * __curmetadata_close --
 *	WT_CURSOR->close method for the metadata cursor type.
 */
static int
__curmetadata_close(WT_CURSOR *cursor)
{
	WT_CURSOR *file_cursor;
	WT_CURSOR_METADATA *mdc;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	mdc = (WT_CURSOR_METADATA *)cursor;
	file_cursor = mdc->file_cursor;
	CURSOR_API_CALL(cursor, session,
	    close, ((WT_CURSOR_BTREE *)file_cursor)->btree);

	/* Close the backing file cursor first, then the cursor itself. */
	ret = file_cursor->close(file_cursor);
	WT_TRET(__wt_cursor_close(cursor));

err:	API_END_RET(session, ret);
}

/*
 * __wt_curmetadata_open --
 *	WT_SESSION->open_cursor method for metadata cursors.
 *
 * Metadata cursors are similar to a file cursor on the special metadata
 * table, except that the metadata for the metadata table (which is stored
 * in the turtle file) can also be queried.
 *
 * Metadata cursors are read-only by default.
 */
int
__wt_curmetadata_open(WT_SESSION_IMPL *session,
    const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,	/* get-key */
	    __wt_cursor_get_value,	/* get-value */
	    __wt_cursor_set_key,	/* set-key */
	    __wt_cursor_set_value,	/* set-value */
	    __curmetadata_compare,	/* compare */
	    __curmetadata_next,		/* next */
	    __curmetadata_prev,		/* prev */
	    __curmetadata_reset,	/* reset */
	    __curmetadata_search,	/* search */
	    __curmetadata_search_near,	/* search-near */
	    __curmetadata_insert,	/* insert */
	    __curmetadata_update,	/* update */
	    __curmetadata_remove,	/* remove */
	    __curmetadata_close);	/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_METADATA *mdc;
	WT_DECL_RET;

	WT_RET(__wt_calloc(session, 1, sizeof(WT_CURSOR_METADATA), &mdc));

	cursor = &mdc->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->key_format = "S";
	cursor->value_format = "S";

	/* Open the file cursor for operations on the regular metadata */
	WT_ERR(__wt_metadata_cursor(session, cfg[1], &mdc->file_cursor));

	WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));

	/* Metadata cursors default to read only. */
	WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 1));

	if (0) {
err:		__wt_free(session, mdc);
	}
	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c
new file mode 100644
index 00000000000..c06efced369
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c
@@ -0,0 +1,574 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int __curstat_next(WT_CURSOR *cursor);
static int __curstat_prev(WT_CURSOR *cursor);

/*
 * The statistics identifier is an offset from a base to ensure the integer ID
 * values don't overlap (the idea is if they overlap it's easy for application
 * writers to confuse them).
 */
#define	WT_STAT_KEY_MAX(cst)	(((cst)->stats_base + (cst)->stats_count) - 1)
#define	WT_STAT_KEY_MIN(cst)	((cst)->stats_base)
#define	WT_STAT_KEY_OFFSET(cst)	((cst)->key - (cst)->stats_base)

/*
 * __curstat_print_value --
 *	Convert statistics cursor value to printable format.
 */
static int
__curstat_print_value(WT_SESSION_IMPL *session, uint64_t v, WT_ITEM *buf)
{
	/* Large values are abbreviated with a B/M suffix plus the raw value. */
	if (v >= WT_BILLION)
		WT_RET(__wt_buf_fmt(session, buf,
		    "%" PRIu64 "B (%" PRIu64 ")", v / WT_BILLION, v));
	else if (v >= WT_MILLION)
		WT_RET(__wt_buf_fmt(session, buf,
		    "%" PRIu64 "M (%" PRIu64 ")", v / WT_MILLION, v));
	else
		WT_RET(__wt_buf_fmt(session, buf, "%" PRIu64, v));

	return (0);
}

/*
 * __curstat_get_key --
 *	WT_CURSOR->get_key for statistics cursors.
 */
static int
__curstat_get_key(WT_CURSOR *cursor, ...)
{
	WT_CURSOR_STAT *cst;
	WT_DECL_RET;
	WT_ITEM *item;
	WT_SESSION_IMPL *session;
	size_t size;
	va_list ap;

	cst = (WT_CURSOR_STAT *)cursor;
	va_start(ap, cursor);
	CURSOR_API_CALL(cursor, session, get_key, NULL);

	WT_CURSOR_NEEDKEY(cursor);

	if (F_ISSET(cursor, WT_CURSTD_RAW)) {
		/* Raw mode: pack the key into a WT_ITEM for the caller. */
		WT_ERR(__wt_struct_size(
		    session, &size, cursor->key_format, cst->key));
		WT_ERR(__wt_buf_initsize(session, &cursor->key, size));
		WT_ERR(__wt_struct_pack(session, cursor->key.mem, size,
		    cursor->key_format, cst->key));

		item = va_arg(ap, WT_ITEM *);
		item->data = cursor->key.data;
		item->size = cursor->key.size;
	} else
		*va_arg(ap, int *) = cst->key;

err:	va_end(ap);
	API_END_RET(session, ret);
}

/*
 * __curstat_get_value --
 *	WT_CURSOR->get_value for statistics cursors.
+ */ +static int +__curstat_get_value(WT_CURSOR *cursor, ...) +{ + WT_CURSOR_STAT *cst; + WT_DECL_RET; + WT_ITEM *item; + WT_SESSION_IMPL *session; + va_list ap; + size_t size; + uint64_t *v; + const char **p; + + cst = (WT_CURSOR_STAT *)cursor; + va_start(ap, cursor); + CURSOR_API_CALL(cursor, session, get_value, NULL); + + WT_CURSOR_NEEDVALUE(cursor); + + if (F_ISSET(cursor, WT_CURSTD_RAW)) { + WT_ERR(__wt_struct_size(session, &size, cursor->value_format, + cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc, + cst->pv.data, cst->v)); + WT_ERR(__wt_buf_initsize(session, &cursor->value, size)); + WT_ERR(__wt_struct_pack(session, cursor->value.mem, size, + cursor->value_format, + cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc, + cst->pv.data, cst->v)); + + item = va_arg(ap, WT_ITEM *); + item->data = cursor->value.data; + item->size = cursor->value.size; + } else { + /* + * Don't drop core if the statistics value isn't requested; NULL + * pointer support isn't documented, but it's a cheap test. + */ + if ((p = va_arg(ap, const char **)) != NULL) + *p = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc; + if ((p = va_arg(ap, const char **)) != NULL) + *p = cst->pv.data; + if ((v = va_arg(ap, uint64_t *)) != NULL) + *v = cst->v; + } + +err: va_end(ap); + API_END_RET(session, ret); +} + +/* + * __curstat_set_key -- + * WT_CURSOR->set_key for statistics cursors. + */ +static void +__curstat_set_key(WT_CURSOR *cursor, ...) 
+{ + WT_CURSOR_STAT *cst; + WT_DECL_RET; + WT_ITEM *item; + WT_SESSION_IMPL *session; + va_list ap; + + cst = (WT_CURSOR_STAT *)cursor; + CURSOR_API_CALL(cursor, session, set_key, NULL); + F_CLR(cursor, WT_CURSTD_KEY_SET); + + va_start(ap, cursor); + if (F_ISSET(cursor, WT_CURSTD_RAW)) { + item = va_arg(ap, WT_ITEM *); + ret = __wt_struct_unpack(session, item->data, item->size, + cursor->key_format, &cst->key); + } else + cst->key = va_arg(ap, int); + va_end(ap); + + if ((cursor->saved_err = ret) == 0) + F_SET(cursor, WT_CURSTD_KEY_EXT); + +err: API_END(session, ret); +} + +/* + * __curstat_set_value -- + * WT_CURSOR->set_value for statistics cursors. + */ +static void +__curstat_set_value(WT_CURSOR *cursor, ...) +{ + WT_UNUSED(cursor); + return; +} + +/* + * __curstat_next -- + * WT_CURSOR->next method for the statistics cursor type. + */ +static int +__curstat_next(WT_CURSOR *cursor) +{ + WT_CURSOR_STAT *cst; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cst = (WT_CURSOR_STAT *)cursor; + CURSOR_API_CALL(cursor, session, next, NULL); + + /* Move to the next item. */ + if (cst->notpositioned) { + cst->notpositioned = 0; + cst->key = WT_STAT_KEY_MIN(cst); + } else if (cst->key < WT_STAT_KEY_MAX(cst)) + ++cst->key; + else { + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + WT_ERR(WT_NOTFOUND); + } + cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v; + WT_ERR(__curstat_print_value(session, cst->v, &cst->pv)); + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + +err: API_END_RET(session, ret); +} + +/* + * __curstat_prev -- + * WT_CURSOR->prev method for the statistics cursor type. + */ +static int +__curstat_prev(WT_CURSOR *cursor) +{ + WT_CURSOR_STAT *cst; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cst = (WT_CURSOR_STAT *)cursor; + CURSOR_API_CALL(cursor, session, prev, NULL); + + /* Move to the previous item. 
*/ + if (cst->notpositioned) { + cst->notpositioned = 0; + cst->key = WT_STAT_KEY_MAX(cst); + } else if (cst->key > WT_STAT_KEY_MIN(cst)) + --cst->key; + else { + F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + WT_ERR(WT_NOTFOUND); + } + + cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v; + WT_ERR(__curstat_print_value(session, cst->v, &cst->pv)); + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + +err: API_END_RET(session, ret); +} + +/* + * __curstat_reset -- + * WT_CURSOR->reset method for the statistics cursor type. + */ +static int +__curstat_reset(WT_CURSOR *cursor) +{ + WT_CURSOR_STAT *cst; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cst = (WT_CURSOR_STAT *)cursor; + CURSOR_API_CALL(cursor, session, reset, NULL); + + cst->notpositioned = 1; + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + +err: API_END_RET(session, ret); +} + +/* + * __curstat_search -- + * WT_CURSOR->search method for the statistics cursor type. + */ +static int +__curstat_search(WT_CURSOR *cursor) +{ + WT_CURSOR_STAT *cst; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cst = (WT_CURSOR_STAT *)cursor; + CURSOR_API_CALL(cursor, session, search, NULL); + + WT_CURSOR_NEEDKEY(cursor); + F_CLR(cursor, WT_CURSTD_VALUE_SET | WT_CURSTD_VALUE_SET); + + if (cst->key < WT_STAT_KEY_MIN(cst) || cst->key > WT_STAT_KEY_MAX(cst)) + WT_ERR(WT_NOTFOUND); + + cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v; + WT_ERR(__curstat_print_value(session, cst->v, &cst->pv)); + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + +err: API_END_RET(session, ret); +} + +/* + * __curstat_close -- + * WT_CURSOR->close method for the statistics cursor type. 
+ */ +static int +__curstat_close(WT_CURSOR *cursor) +{ + WT_CURSOR_STAT *cst; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cst = (WT_CURSOR_STAT *)cursor; + CURSOR_API_CALL(cursor, session, close, NULL); + + __wt_buf_free(session, &cst->pv); + + WT_ERR(__wt_cursor_close(cursor)); + +err: API_END_RET(session, ret); +} + +/* + * __curstat_conn_init -- + * Initialize the statistics for a connection. + */ +static void +__curstat_conn_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) +{ + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + /* + * Fill in the connection statistics, and copy them to the cursor. + * Optionally clear the connection statistics. + */ + __wt_conn_stat_init(session); + cst->u.conn_stats = conn->stats; + if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + __wt_stat_refresh_connection_stats(&conn->stats); + + cst->stats_first = cst->stats = (WT_STATS *)&cst->u.conn_stats; + cst->stats_base = WT_CONNECTION_STATS_BASE; + cst->stats_count = sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS); +} + +/* + * When returning the statistics for a file URI, we review open handles, and + * aggregate checkpoint handle statistics with the file URI statistics. To + * make that work, we have to pass information to the function reviewing the + * handles, this structure is what we pass. + */ +struct __checkpoint_args { + const char *name; /* Data source handle name */ + WT_DSRC_STATS *stats; /* Stat structure being filled */ + int clear; /* WT_STATISTICS_CLEAR */ +}; + +/* + * __curstat_checkpoint -- + * Aggregate statistics from checkpoint handles. + */ +static int +__curstat_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) +{ + struct __checkpoint_args *args; + WT_DATA_HANDLE *dhandle; + + dhandle = session->dhandle; + args = (struct __checkpoint_args *)cfg[0]; + + /* Aggregate the flagged file's checkpoint handles. 
 */
+	if (dhandle->checkpoint != NULL &&
+	    strcmp(dhandle->name, args->name) == 0) {
+		__wt_stat_aggregate_dsrc_stats(&dhandle->stats, args->stats);
+		if (args->clear)
+			__wt_stat_refresh_dsrc_stats(&dhandle->stats);
+	}
+
+	return (0);
+}
+
+/*
+ * __curstat_file_init --
+ *	Initialize the statistics for a file.
+ */
+static int
+__curstat_file_init(WT_SESSION_IMPL *session,
+    const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+	struct __checkpoint_args args;
+	WT_DATA_HANDLE *dhandle, *saved_dhandle;
+	WT_DECL_RET;
+	/*
+	 * cfg_arg smuggles a pointer to the __checkpoint_args structure
+	 * through the config-string array passed to __wt_conn_btree_apply;
+	 * __curstat_checkpoint casts cfg[0] back to the structure.
+	 */
+	const char *cfg_arg[] = { NULL, NULL };
+
+	WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, 0));
+	dhandle = session->dhandle;
+
+	/*
+	 * Fill in the data source statistics, and copy them to the cursor.
+	 * Optionally clear the data source statistics.
+	 */
+	if ((ret = __wt_btree_stat_init(session, cst)) == 0) {
+		cst->u.dsrc_stats = dhandle->stats;
+		if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+			__wt_stat_refresh_dsrc_stats(&dhandle->stats);
+		__wt_curstat_dsrc_final(cst);
+	}
+
+	/* Release the handle, we're done with it. */
+	WT_TRET(__wt_session_release_btree(session));
+	WT_RET(ret);
+
+	/*
+	 * If no checkpoint was specified, review the open handles and aggregate
+	 * the statistics from any checkpoint handles matching this file.
+	 *
+	 * NOTE(review): dhandle->checkpoint and dhandle->name are read after
+	 * the handle was released above -- presumably the handle structure
+	 * remains valid until connection close; confirm against the data
+	 * handle life cycle.
+	 */
+	if (dhandle->checkpoint == NULL) {
+		args.name = dhandle->name;
+		args.stats = &cst->u.dsrc_stats;
+		args.clear = F_ISSET(cst, WT_CONN_STAT_CLEAR);
+		cfg_arg[0] = (char *)&args;
+
+		/*
+		 * We're likely holding the schema lock inside the statistics
+		 * logging thread, not to mention calling __wt_conn_btree_apply
+		 * from there as well. Save/restore the handle.
+		 */
+		saved_dhandle = dhandle;
+		WT_WITH_SCHEMA_LOCK(session,
+		    ret = __wt_conn_btree_apply(
+			session, 1, __curstat_checkpoint, cfg_arg));
+		session->dhandle = saved_dhandle;
+	}
+
+	return (ret);
+}
+
+/*
+ * __wt_curstat_dsrc_final --
+ *	Finalize a data-source statistics cursor.
 */
+void
+__wt_curstat_dsrc_final(WT_CURSOR_STAT *cst)
+{
+
+	/*
+	 * Point the cursor's statistics walk at the data-source statistics
+	 * union member and record the base offset and slot count used by
+	 * the next/prev/search methods.
+	 */
+	cst->stats_first = cst->stats = (WT_STATS *)&cst->u.dsrc_stats;
+	cst->stats_base = WT_DSRC_STATS_BASE;
+	cst->stats_count = sizeof(WT_DSRC_STATS) / sizeof(WT_STATS);
+}
+
+/*
+ * __wt_curstat_init --
+ *	Initialize a statistics cursor.
+ */
+int
+__wt_curstat_init(WT_SESSION_IMPL *session,
+    const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+	const char *dsrc_uri;
+
+	/* New statistics cursors start out unpositioned. */
+	cst->notpositioned = 1;
+
+	/* A bare "statistics:" URI means connection-wide statistics. */
+	if (strcmp(uri, "statistics:") == 0) {
+		__curstat_conn_init(session, cst);
+		return (0);
+	}
+
+	/*
+	 * Strip the "statistics:" scheme, then dispatch on the underlying
+	 * data-source type named by the remainder of the URI.
+	 */
+	dsrc_uri = uri + strlen("statistics:");
+
+	if (WT_PREFIX_MATCH(dsrc_uri, "colgroup:"))
+		return (
+		    __wt_curstat_colgroup_init(session, dsrc_uri, cfg, cst));
+
+	if (WT_PREFIX_MATCH(dsrc_uri, "file:"))
+		return (__curstat_file_init(session, dsrc_uri, cfg, cst));
+
+	if (WT_PREFIX_MATCH(dsrc_uri, "index:"))
+		return (__wt_curstat_index_init(session, dsrc_uri, cfg, cst));
+
+	if (WT_PREFIX_MATCH(dsrc_uri, "lsm:"))
+		return (__wt_curstat_lsm_init(session, dsrc_uri, cst));
+
+	if (WT_PREFIX_MATCH(dsrc_uri, "table:"))
+		return (__wt_curstat_table_init(session, dsrc_uri, cfg, cst));
+
+	/* Unrecognized data-source prefix. */
+	return (__wt_bad_object_type(session, uri));
+}
+
+/*
+ * __wt_curstat_open --
+ *	WT_SESSION->open_cursor method for the statistics cursor type.
+ */ +int +__wt_curstat_open(WT_SESSION_IMPL *session, + const char *uri, const char *cfg[], WT_CURSOR **cursorp) +{ + WT_CONNECTION_IMPL *conn; + WT_CURSOR_STATIC_INIT(iface, + __curstat_get_key, /* get-key */ + __curstat_get_value, /* get-value */ + __curstat_set_key, /* set-key */ + __curstat_set_value, /* set-value */ + __wt_cursor_notsup, /* compare */ + __curstat_next, /* next */ + __curstat_prev, /* prev */ + __curstat_reset, /* reset */ + __curstat_search, /* search */ + __wt_cursor_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __curstat_close); /* close */ + WT_CONFIG_ITEM cval, sval; + WT_CURSOR *cursor; + WT_CURSOR_STAT *cst; + WT_DECL_RET; + + WT_STATIC_ASSERT(offsetof(WT_CURSOR_STAT, iface) == 0); + + conn = S2C(session); + + WT_ERR(__wt_calloc_def(session, 1, &cst)); + cursor = &cst->iface; + *cursor = iface; + cursor->session = &session->iface; + + /* + * Statistics cursor configuration: must match (and defaults to), the + * database configuration. + */ + if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_NONE)) + goto config_err; + if ((ret = __wt_config_gets(session, cfg, "statistics", &cval)) == 0) { + if ((ret = __wt_config_subgets( + session, &cval, "all", &sval)) == 0 && sval.val != 0) { + if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ALL)) + goto config_err; + F_SET(cst, WT_CONN_STAT_ALL | WT_CONN_STAT_FAST); + } + WT_ERR_NOTFOUND_OK(ret); + if ((ret = __wt_config_subgets( + session, &cval, "fast", &sval)) == 0 && sval.val != 0) { + if (F_ISSET(cst, WT_CONN_STAT_ALL)) + WT_ERR_MSG(session, EINVAL, + "only one statistics configuration value " + "may be specified"); + F_SET(cst, WT_CONN_STAT_FAST); + } + WT_ERR_NOTFOUND_OK(ret); + if ((ret = __wt_config_subgets( + session, &cval, "clear", &sval)) == 0 && sval.val != 0) + F_SET(cst, WT_CONN_STAT_CLEAR); + WT_ERR_NOTFOUND_OK(ret); + + /* If no configuration, use the connection's configuration. 
*/ + if (cst->flags == 0) { + if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ALL)) + F_SET(cst, WT_CONN_STAT_ALL); + if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_FAST)) + F_SET(cst, WT_CONN_STAT_FAST); + } + + /* If the connection configures clear, so do we. */ + if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR)) + F_SET(cst, WT_CONN_STAT_CLEAR); + } + + /* + * We return the statistics field's offset as the key, and a string + * description, a string value, and a uint64_t value as the value + * columns. + */ + cursor->key_format = "i"; + cursor->value_format = "SSq"; + WT_ERR(__wt_curstat_init(session, uri, cfg, cst)); + + /* __wt_cursor_init is last so we don't have to clean up on error. */ + WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp)); + + if (0) { +config_err: WT_ERR_MSG(session, EINVAL, + "cursor's statistics configuration doesn't match the " + "database statistics configuration"); + } + + if (0) { +err: __wt_free(session, cst); + } + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c new file mode 100644 index 00000000000..21d676d943a --- /dev/null +++ b/src/third_party/wiredtiger/src/cursor/cur_std.c @@ -0,0 +1,625 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_cursor_notsup -- + * Unsupported cursor actions. + */ +int +__wt_cursor_notsup(WT_CURSOR *cursor) +{ + WT_UNUSED(cursor); + + return (ENOTSUP); +} + +/* + * __wt_cursor_noop -- + * Cursor noop. + */ +int +__wt_cursor_noop(WT_CURSOR *cursor) +{ + WT_UNUSED(cursor); + + return (0); +} + +/* + * __wt_cursor_set_notsup -- + * Reset the cursor methods to not-supported. + */ +void +__wt_cursor_set_notsup(WT_CURSOR *cursor) +{ + /* + * Set all of the cursor methods (except for close and reset), to fail. 
+ * Close is unchanged so the cursor can be discarded, reset defaults to + * a no-op because session transactional operations reset all of the + * cursors in a session, and random cursors shouldn't block transactions + * or checkpoints. + */ + cursor->compare = + (int (*)(WT_CURSOR *, WT_CURSOR *, int *))__wt_cursor_notsup; + cursor->next = __wt_cursor_notsup; + cursor->prev = __wt_cursor_notsup; + cursor->reset = __wt_cursor_noop; + cursor->search = __wt_cursor_notsup; + cursor->search_near = (int (*)(WT_CURSOR *, int *))__wt_cursor_notsup; + cursor->insert = __wt_cursor_notsup; + cursor->update = __wt_cursor_notsup; + cursor->remove = __wt_cursor_notsup; +} + +/* + * __wt_cursor_config_readonly -- + * Parse read only configuration and setup cursor appropriately. + */ +int +__wt_cursor_config_readonly(WT_CURSOR *cursor, const char *cfg[], int def) +{ + WT_CONFIG_ITEM cval; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cursor->session; + + WT_RET(__wt_config_gets_def(session, cfg, "readonly", def, &cval)); + if (cval.val != 0) { + /* Reset all cursor methods that could modify data. */ + cursor->insert = __wt_cursor_notsup; + cursor->update = __wt_cursor_notsup; + cursor->remove = __wt_cursor_notsup; + } + return (0); +} + +/* + * __wt_cursor_kv_not_set -- + * Standard error message for key/values not set. + */ +int +__wt_cursor_kv_not_set(WT_CURSOR *cursor, int key) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cursor->session; + + WT_RET_MSG(session, + cursor->saved_err == 0 ? EINVAL : cursor->saved_err, + "requires %s be set", key ? "key" : "value"); +} + +/* + * __wt_cursor_get_key -- + * WT_CURSOR->get_key default implementation. + */ +int +__wt_cursor_get_key(WT_CURSOR *cursor, ...) +{ + WT_DECL_RET; + va_list ap; + + va_start(ap, cursor); + ret = __wt_cursor_get_keyv(cursor, cursor->flags, ap); + va_end(ap); + return (ret); +} + +/* + * __wt_cursor_set_key -- + * WT_CURSOR->set_key default implementation. 
+ */ +void +__wt_cursor_set_key(WT_CURSOR *cursor, ...) +{ + va_list ap; + + va_start(ap, cursor); + __wt_cursor_set_keyv(cursor, cursor->flags, ap); + va_end(ap); +} + +/* + * __wt_cursor_get_raw_key -- + * Temporarily force raw mode in a cursor to get a canonical copy of + * the key. + */ +int +__wt_cursor_get_raw_key(WT_CURSOR *cursor, WT_ITEM *key) +{ + WT_DECL_RET; + int raw_set; + + raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0; + if (!raw_set) + F_SET(cursor, WT_CURSTD_RAW); + ret = cursor->get_key(cursor, key); + if (!raw_set) + F_CLR(cursor, WT_CURSTD_RAW); + return (ret); +} + +/* + * __wt_cursor_set_raw_key -- + * Temporarily force raw mode in a cursor to set a canonical copy of + * the key. + */ +void +__wt_cursor_set_raw_key(WT_CURSOR *cursor, WT_ITEM *key) +{ + int raw_set; + + raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0; + if (!raw_set) + F_SET(cursor, WT_CURSTD_RAW); + cursor->set_key(cursor, key); + if (!raw_set) + F_CLR(cursor, WT_CURSTD_RAW); +} + +/* + * __wt_cursor_get_raw_value -- + * Temporarily force raw mode in a cursor to get a canonical copy of + * the value. + */ +int +__wt_cursor_get_raw_value(WT_CURSOR *cursor, WT_ITEM *value) +{ + WT_DECL_RET; + int raw_set; + + raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0; + if (!raw_set) + F_SET(cursor, WT_CURSTD_RAW); + ret = cursor->get_value(cursor, value); + if (!raw_set) + F_CLR(cursor, WT_CURSTD_RAW); + return (ret); +} + +/* + * __wt_cursor_set_raw_value -- + * Temporarily force raw mode in a cursor to set a canonical copy of + * the value. + */ +void +__wt_cursor_set_raw_value(WT_CURSOR *cursor, WT_ITEM *value) +{ + int raw_set; + + raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0; + if (!raw_set) + F_SET(cursor, WT_CURSTD_RAW); + cursor->set_value(cursor, value); + if (!raw_set) + F_CLR(cursor, WT_CURSTD_RAW); +} + +/* + * __wt_cursor_get_keyv -- + * WT_CURSOR->get_key worker function. 
 */
+int
+__wt_cursor_get_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
+{
+	WT_DECL_RET;
+	WT_ITEM *key;
+	WT_SESSION_IMPL *session;
+	size_t size;
+	const char *fmt;
+
+	/* A key must have been set (by the application or by a search). */
+	CURSOR_API_CALL(cursor, session, get_key, NULL);
+	if (!F_ISSET(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT))
+		WT_ERR(__wt_cursor_kv_not_set(cursor, 1));
+
+	if (WT_CURSOR_RECNO(cursor)) {
+		if (LF_ISSET(WT_CURSTD_RAW)) {
+			/*
+			 * Raw mode: pack the record number as a "q" into the
+			 * cursor's scratch buffer and hand that out.
+			 * NOTE(review): assumes raw_recno_buf is large enough
+			 * for any packed uint64_t -- confirm its declaration.
+			 */
+			key = va_arg(ap, WT_ITEM *);
+			key->data = cursor->raw_recno_buf;
+			WT_ERR(__wt_struct_size(
+			    session, &size, "q", cursor->recno));
+			key->size = size;
+			ret = __wt_struct_pack(session, cursor->raw_recno_buf,
+			    sizeof(cursor->raw_recno_buf), "q", cursor->recno);
+		} else
+			*va_arg(ap, uint64_t *) = cursor->recno;
+	} else {
+		/* Fast path some common cases. */
+		fmt = cursor->key_format;
+		if (LF_ISSET(WT_CURSOR_RAW_OK) || WT_STREQ(fmt, "u")) {
+			/* Raw or "u" format: return the key bytes directly. */
+			key = va_arg(ap, WT_ITEM *);
+			key->data = cursor->key.data;
+			key->size = cursor->key.size;
+		} else if (WT_STREQ(fmt, "S"))
+			/* NUL-terminated string key: return the pointer. */
+			*va_arg(ap, const char **) = cursor->key.data;
+		else
+			/* General case: unpack per the key format. */
+			ret = __wt_struct_unpackv(session,
+			    cursor->key.data, cursor->key.size, fmt, ap);
+	}
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __wt_cursor_set_keyv --
+ *	WT_CURSOR->set_key default implementation.
+ */ +void +__wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + WT_ITEM *buf, *item; + size_t sz; + va_list ap_copy; + const char *fmt, *str; + + CURSOR_API_CALL(cursor, session, set_key, NULL); + F_CLR(cursor, WT_CURSTD_KEY_SET); + + if (WT_CURSOR_RECNO(cursor)) { + if (LF_ISSET(WT_CURSTD_RAW)) { + item = va_arg(ap, WT_ITEM *); + WT_ERR(__wt_struct_unpack(session, + item->data, item->size, "q", &cursor->recno)); + } else + cursor->recno = va_arg(ap, uint64_t); + if (cursor->recno == 0) + WT_ERR_MSG(session, EINVAL, + "Record numbers must be greater than zero"); + cursor->key.data = &cursor->recno; + sz = sizeof(cursor->recno); + } else { + /* Fast path some common cases and special case WT_ITEMs. */ + fmt = cursor->key_format; + if (LF_ISSET(WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON) || + WT_STREQ(fmt, "u")) { + item = va_arg(ap, WT_ITEM *); + sz = item->size; + cursor->key.data = item->data; + } else if (WT_STREQ(fmt, "S")) { + str = va_arg(ap, const char *); + sz = strlen(str) + 1; + cursor->key.data = (void *)str; + } else { + buf = &cursor->key; + + va_copy(ap_copy, ap); + ret = __wt_struct_sizev( + session, &sz, cursor->key_format, ap_copy); + va_end(ap_copy); + WT_ERR(ret); + + WT_ERR(__wt_buf_initsize(session, buf, sz)); + WT_ERR(__wt_struct_packv( + session, buf->mem, sz, cursor->key_format, ap)); + } + } + if (sz == 0) + WT_ERR_MSG(session, EINVAL, "Empty keys not permitted"); + else if ((uint32_t)sz != sz) + WT_ERR_MSG(session, EINVAL, + "Key size (%" PRIu64 ") out of range", (uint64_t)sz); + cursor->saved_err = 0; + cursor->key.size = sz; + F_SET(cursor, WT_CURSTD_KEY_EXT); + if (0) { +err: cursor->saved_err = ret; + } + + API_END(session, ret); +} + +/* + * __wt_cursor_get_value -- + * WT_CURSOR->get_value default implementation. + */ +int +__wt_cursor_get_value(WT_CURSOR *cursor, ...) 
+{ + WT_DECL_RET; + va_list ap; + + va_start(ap, cursor); + ret = __wt_cursor_get_valuev(cursor, ap); + va_end(ap); + return (ret); +} + +/* + * __wt_cursor_get_valuev -- + * WT_CURSOR->get_value worker implementation. + */ +int +__wt_cursor_get_valuev(WT_CURSOR *cursor, va_list ap) +{ + WT_DECL_RET; + WT_ITEM *value; + WT_SESSION_IMPL *session; + const char *fmt; + + CURSOR_API_CALL(cursor, session, get_value, NULL); + + if (!F_ISSET(cursor, WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT)) + WT_ERR(__wt_cursor_kv_not_set(cursor, 0)); + + /* Fast path some common cases. */ + fmt = cursor->value_format; + if (F_ISSET(cursor, WT_CURSOR_RAW_OK) || WT_STREQ(fmt, "u")) { + value = va_arg(ap, WT_ITEM *); + value->data = cursor->value.data; + value->size = cursor->value.size; + } else if (WT_STREQ(fmt, "S")) + *va_arg(ap, const char **) = cursor->value.data; + else if (WT_STREQ(fmt, "t") || + (isdigit(fmt[0]) && WT_STREQ(fmt + 1, "t"))) + *va_arg(ap, uint8_t *) = *(uint8_t *)cursor->value.data; + else + ret = __wt_struct_unpackv(session, + cursor->value.data, cursor->value.size, fmt, ap); + +err: API_END_RET(session, ret); +} + +/* + * __wt_cursor_set_value -- + * WT_CURSOR->set_value default implementation. + */ +void +__wt_cursor_set_value(WT_CURSOR *cursor, ...) +{ + va_list ap; + + va_start(ap, cursor); + __wt_cursor_set_valuev(cursor, ap); + va_end(ap); +} + +/* + * __wt_cursor_set_valuev -- + * WT_CURSOR->set_value worker implementation. + */ +void +__wt_cursor_set_valuev(WT_CURSOR *cursor, va_list ap) +{ + WT_DECL_RET; + WT_ITEM *buf, *item; + WT_SESSION_IMPL *session; + const char *fmt, *str; + va_list ap_copy; + size_t sz; + + CURSOR_API_CALL(cursor, session, set_value, NULL); + F_CLR(cursor, WT_CURSTD_VALUE_SET); + + /* Fast path some common cases. 
*/ + fmt = cursor->value_format; + if (F_ISSET(cursor, WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON) || + WT_STREQ(fmt, "u")) { + item = va_arg(ap, WT_ITEM *); + sz = item->size; + cursor->value.data = item->data; + } else if (WT_STREQ(fmt, "S")) { + str = va_arg(ap, const char *); + sz = strlen(str) + 1; + cursor->value.data = str; + } else if (WT_STREQ(fmt, "t") || + (isdigit(fmt[0]) && WT_STREQ(fmt + 1, "t"))) { + sz = 1; + buf = &cursor->value; + WT_ERR(__wt_buf_initsize(session, buf, sz)); + *(uint8_t *)buf->mem = (uint8_t)va_arg(ap, int); + } else { + va_copy(ap_copy, ap); + ret = __wt_struct_sizev(session, + &sz, cursor->value_format, ap_copy); + va_end(ap_copy); + WT_ERR(ret); + buf = &cursor->value; + WT_ERR(__wt_buf_initsize(session, buf, sz)); + WT_ERR(__wt_struct_packv(session, buf->mem, sz, + cursor->value_format, ap)); + } + F_SET(cursor, WT_CURSTD_VALUE_EXT); + cursor->value.size = sz; + + if (0) { +err: cursor->saved_err = ret; + } + API_END(session, ret); +} + +/* + * __wt_cursor_close -- + * WT_CURSOR->close default implementation. + */ +int +__wt_cursor_close(WT_CURSOR *cursor) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cursor->session; + __wt_buf_free(session, &cursor->key); + __wt_buf_free(session, &cursor->value); + + if (F_ISSET(cursor, WT_CURSTD_OPEN)) { + TAILQ_REMOVE(&session->cursors, cursor, q); + + WT_STAT_FAST_DATA_DECR(session, session_cursor_open); + WT_STAT_FAST_CONN_ATOMIC_DECR(session, session_cursor_open); + } + + __wt_free(session, cursor->internal_uri); + __wt_free(session, cursor->uri); + __wt_overwrite_and_free(session, cursor); + return (ret); +} + +/* + * __cursor_runtime_config -- + * Set runtime-configurable settings. + */ +static int +__cursor_runtime_config(WT_CURSOR *cursor, const char *cfg[]) +{ + WT_CONFIG_ITEM cval; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cursor->session; + + /* + * !!! 
+ * There's no way yet to reconfigure cursor flags at runtime; if, in + * the future there is a way to do that, similar support needs to be + * added for data-source cursors, or, this call needs to return an + * error in the case of a data-source cursor. + */ + WT_RET(__wt_config_gets_def(session, cfg, "overwrite", 1, &cval)); + if (cval.val) + F_SET(cursor, WT_CURSTD_OVERWRITE); + else + F_CLR(cursor, WT_CURSTD_OVERWRITE); + + return (0); +} + +/* + * __wt_cursor_dup_position -- + * Set a cursor to another cursor's position. + */ +int +__wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor) +{ + WT_ITEM key; + + /* + * Get a copy of the cursor's raw key, and set it in the new cursor, + * then search for that key to position the cursor. + * + * We don't clear the WT_ITEM structure: all that happens when getting + * and setting the key is the data/size fields are reset to reference + * the original cursor's key. + * + * That said, we're playing games with the cursor flags: setting the key + * sets the key/value application-set flags in the new cursor, which may + * or may not be correct, but there's nothing simple that fixes it. We + * depend on the subsequent cursor search to clean things up, as search + * is required to copy and/or reference private memory after success. + */ + WT_RET(__wt_cursor_get_raw_key(to_dup, &key)); + __wt_cursor_set_raw_key(cursor, &key); + + /* + * We now have a reference to the raw key, but we don't know anything + * about the memory in which it's stored, it could be btree/file page + * memory in the cache, application memory or the original cursor's + * key/value WT_ITEMs. Memory allocated in support of another cursor + * could be discarded when that cursor is closed, so it's a problem. + * However, doing a search to position the cursor will fix the problem: + * cursors cannot reference application memory after cursor operations + * and that requirement will save the day. 
+ */ + WT_RET(cursor->search(cursor)); + + return (0); +} + +/* + * __wt_cursor_init -- + * Default cursor initialization. + */ +int +__wt_cursor_init(WT_CURSOR *cursor, + const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) +{ + WT_CONFIG_ITEM cval; + WT_CURSOR *cdump; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cursor->session; + + if (cursor->internal_uri == NULL) + WT_RET(__wt_strdup(session, uri, &cursor->internal_uri)); + + /* Set runtime-configurable settings. */ + WT_RET(__cursor_runtime_config(cursor, cfg)); + + /* + * append + * The append flag is only relevant to column stores. + */ + if (WT_CURSOR_RECNO(cursor)) { + WT_RET(__wt_config_gets_def(session, cfg, "append", 0, &cval)); + if (cval.val != 0) + F_SET(cursor, WT_CURSTD_APPEND); + } + + /* + * checkpoint + * Checkpoint cursors are read-only. + */ + WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval)); + if (cval.len != 0) { + cursor->insert = __wt_cursor_notsup; + cursor->update = __wt_cursor_notsup; + cursor->remove = __wt_cursor_notsup; + } + + /* + * dump + * If an index cursor is opened with dump, then this + * function is called on the index files, with the dump + * config string, and with the index cursor as an owner. + * We don't want to create a dump cursor in that case, because + * we'll create the dump cursor on the index cursor itself. + */ + WT_RET(__wt_config_gets_def(session, cfg, "dump", 0, &cval)); + if (cval.len != 0 && owner == NULL) { + F_SET(cursor, + WT_STRING_MATCH("json", cval.str, cval.len) ? + WT_CURSTD_DUMP_JSON : + (WT_STRING_MATCH("print", cval.str, cval.len) ? + WT_CURSTD_DUMP_PRINT : WT_CURSTD_DUMP_HEX)); + /* + * Dump cursors should not have owners: only the + * top-level cursor should be wrapped in a dump cursor. 
+ */ + WT_RET(__wt_curdump_create(cursor, owner, &cdump)); + owner = cdump; + } else + cdump = NULL; + + /* raw */ + WT_RET(__wt_config_gets_def(session, cfg, "raw", 0, &cval)); + if (cval.val != 0) + F_SET(cursor, WT_CURSTD_RAW); + + /* readonly */ + WT_RET(__wt_cursor_config_readonly(cursor, cfg, 0)); + + /* + * Cursors that are internal to some other cursor (such as file cursors + * inside a table cursor) should be closed after the containing cursor. + * Arrange for that to happen by putting internal cursors after their + * owners on the queue. + */ + if (owner != NULL) { + WT_ASSERT(session, F_ISSET(owner, WT_CURSTD_OPEN)); + TAILQ_INSERT_AFTER(&session->cursors, owner, cursor, q); + } else + TAILQ_INSERT_HEAD(&session->cursors, cursor, q); + + F_SET(cursor, WT_CURSTD_OPEN); + WT_STAT_FAST_DATA_INCR(session, session_cursor_open); + WT_STAT_FAST_CONN_ATOMIC_INCR(session, session_cursor_open); + + *cursorp = (cdump != NULL) ? cdump : cursor; + return (0); +} diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c new file mode 100644 index 00000000000..ea267f96f9c --- /dev/null +++ b/src/third_party/wiredtiger/src/cursor/cur_table.c @@ -0,0 +1,808 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "wt_internal.h" + +static int __curtable_open_indices(WT_CURSOR_TABLE *ctable); +static int __curtable_update(WT_CURSOR *cursor); + +#define APPLY_CG(ctable, f) do { \ + WT_CURSOR **__cp; \ + u_int __i; \ + for (__i = 0, __cp = ctable->cg_cursors; \ + __i < WT_COLGROUPS(ctable->table); \ + __i++, __cp++) \ + WT_TRET((*__cp)->f(*__cp)); \ +} while (0) + +#define APPLY_IDX(ctable, f) do { \ + WT_INDEX *idx; \ + WT_CURSOR **__cp; \ + u_int __i; \ + __cp = (ctable)->idx_cursors; \ + for (__i = 0; __i < ctable->table->nindices; __i++, __cp++) { \ + idx = ctable->table->indices[__i]; \ + WT_ERR(__wt_schema_project_merge(session, \ + ctable->cg_cursors, \ + idx->key_plan, idx->key_format, &(*__cp)->key)); \ + F_SET(*__cp, WT_CURSTD_KEY_EXT | \ + WT_CURSTD_VALUE_EXT); \ + WT_ERR((*__cp)->f(*__cp)); \ + WT_ERR((*__cp)->reset(*__cp)); \ + } \ +} while (0) + +/* + * __wt_curtable_get_key -- + * WT_CURSOR->get_key implementation for tables. + */ +int +__wt_curtable_get_key(WT_CURSOR *cursor, ...) +{ + WT_CURSOR *primary; + WT_CURSOR_TABLE *ctable; + WT_DECL_RET; + va_list ap; + + ctable = (WT_CURSOR_TABLE *)cursor; + primary = *ctable->cg_cursors; + + va_start(ap, cursor); + ret = __wt_cursor_get_keyv(primary, cursor->flags, ap); + va_end(ap); + + return (ret); +} + +/* + * __wt_curtable_get_value -- + * WT_CURSOR->get_value implementation for tables. + */ +int +__wt_curtable_get_value(WT_CURSOR *cursor, ...) 
+{ + WT_CURSOR *primary; + WT_CURSOR_TABLE *ctable; + WT_DECL_RET; + WT_ITEM *item; + WT_SESSION_IMPL *session; + va_list ap; + + ctable = (WT_CURSOR_TABLE *)cursor; + primary = *ctable->cg_cursors; + CURSOR_API_CALL(cursor, session, get_value, NULL); + WT_CURSOR_NEEDVALUE(primary); + + va_start(ap, cursor); + if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) { + ret = __wt_schema_project_merge(session, + ctable->cg_cursors, ctable->plan, + cursor->value_format, &cursor->value); + if (ret == 0) { + item = va_arg(ap, WT_ITEM *); + item->data = cursor->value.data; + item->size = cursor->value.size; + } + } else + ret = __wt_schema_project_out(session, + ctable->cg_cursors, ctable->plan, ap); + va_end(ap); + +err: API_END_RET(session, ret); +} + +/* + * __wt_curtable_set_key -- + * WT_CURSOR->set_key implementation for tables. + */ +void +__wt_curtable_set_key(WT_CURSOR *cursor, ...) +{ + WT_CURSOR **cp, *primary; + WT_CURSOR_TABLE *ctable; + va_list ap; + u_int i; + + ctable = (WT_CURSOR_TABLE *)cursor; + cp = ctable->cg_cursors; + primary = *cp++; + + va_start(ap, cursor); + __wt_cursor_set_keyv(primary, cursor->flags, ap); + va_end(ap); + + if (!F_ISSET(primary, WT_CURSTD_KEY_SET)) + return; + + /* Copy the primary key to the other cursors. */ + for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) { + (*cp)->recno = primary->recno; + (*cp)->key.data = primary->key.data; + (*cp)->key.size = primary->key.size; + F_SET(*cp, WT_CURSTD_KEY_EXT); + } +} + +/* + * __wt_curtable_set_value -- + * WT_CURSOR->set_value implementation for tables. + */ +void +__wt_curtable_set_value(WT_CURSOR *cursor, ...) 
+{ + WT_CURSOR **cp; + WT_CURSOR_TABLE *ctable; + WT_DECL_RET; + WT_ITEM *item; + WT_SESSION_IMPL *session; + va_list ap; + u_int i; + + ctable = (WT_CURSOR_TABLE *)cursor; + CURSOR_API_CALL(cursor, session, set_value, NULL); + + va_start(ap, cursor); + if (F_ISSET(cursor, WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON)) { + item = va_arg(ap, WT_ITEM *); + cursor->value.data = item->data; + cursor->value.size = item->size; + ret = __wt_schema_project_slice(session, + ctable->cg_cursors, ctable->plan, 0, + cursor->value_format, &cursor->value); + } else + ret = __wt_schema_project_in(session, + ctable->cg_cursors, ctable->plan, ap); + va_end(ap); + + for (i = 0, cp = ctable->cg_cursors; + i < WT_COLGROUPS(ctable->table); i++, cp++) + if (ret == 0) + F_SET(*cp, WT_CURSTD_VALUE_EXT); + else { + (*cp)->saved_err = ret; + F_CLR(*cp, WT_CURSTD_VALUE_SET); + } + +err: API_END(session, ret); +} + +/* + * __curtable_compare -- + * WT_CURSOR->compare implementation for tables. + */ +static int +__curtable_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + + CURSOR_API_CALL(a, session, compare, NULL); + + /* + * Confirm both cursors refer to the same source and have keys, then + * call the underlying object's comparison routine. + */ + if (strcmp(a->internal_uri, b->internal_uri) != 0) + WT_ERR_MSG(session, EINVAL, + "comparison method cursors must reference the same object"); + WT_CURSOR_CHECKKEY(WT_CURSOR_PRIMARY(a)); + WT_CURSOR_CHECKKEY(WT_CURSOR_PRIMARY(b)); + + ret = WT_CURSOR_PRIMARY(a)->compare( + WT_CURSOR_PRIMARY(a), WT_CURSOR_PRIMARY(b), cmpp); + +err: API_END_RET(session, ret); +} + +/* + * __curtable_next -- + * WT_CURSOR->next method for the table cursor type. 
+ */
+static int
+__curtable_next(WT_CURSOR *cursor)
+{
+	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ctable = (WT_CURSOR_TABLE *)cursor;
+	CURSOR_API_CALL(cursor, session, next, NULL);
+	/* APPLY_CG runs the operation on every column-group cursor, leaving
+	 * the result in ret (see the macro definition earlier in this file).
+	 */
+	APPLY_CG(ctable, next);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_next_random --
+ *	WT_CURSOR->next method for the table cursor type when configured with
+ * next_random.
+ */
+static int
+__curtable_next_random(WT_CURSOR *cursor)
+{
+	WT_CURSOR *primary, **cp;
+	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	u_int i;
+
+	ctable = (WT_CURSOR_TABLE *)cursor;
+	CURSOR_API_CALL(cursor, session, next, NULL);
+	cp = ctable->cg_cursors;
+
+	/* Split out the first next, it retrieves the random record. */
+	primary = *cp++;
+	WT_ERR(primary->next(primary));
+
+	/*
+	 * Fill in the rest of the columns: position each remaining
+	 * column-group cursor by copying the primary's key and searching.
+	 */
+	for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+		(*cp)->key.data = primary->key.data;
+		(*cp)->key.size = primary->key.size;
+		(*cp)->recno = primary->recno;
+		F_SET(*cp, WT_CURSTD_KEY_EXT);
+		WT_ERR((*cp)->search(*cp));
+	}
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_prev --
+ *	WT_CURSOR->prev method for the table cursor type.
+ */
+static int
+__curtable_prev(WT_CURSOR *cursor)
+{
+	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ctable = (WT_CURSOR_TABLE *)cursor;
+	CURSOR_API_CALL(cursor, session, prev, NULL);
+	APPLY_CG(ctable, prev);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_reset --
+ *	WT_CURSOR->reset method for the table cursor type.
+ */
+static int
+__curtable_reset(WT_CURSOR *cursor)
+{
+	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ctable = (WT_CURSOR_TABLE *)cursor;
+	CURSOR_API_CALL(cursor, session, reset, NULL);
+	APPLY_CG(ctable, reset);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_search --
+ *	WT_CURSOR->search method for the table cursor type.
+ */
+static int
+__curtable_search(WT_CURSOR *cursor)
+{
+	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ctable = (WT_CURSOR_TABLE *)cursor;
+	CURSOR_API_CALL(cursor, session, search, NULL);
+	APPLY_CG(ctable, search);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_search_near --
+ *	WT_CURSOR->search_near method for the table cursor type.
+ */
+static int
+__curtable_search_near(WT_CURSOR *cursor, int *exact)
+{
+	WT_CURSOR_TABLE *ctable;
+	WT_CURSOR *primary, **cp;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	u_int i;
+
+	ctable = (WT_CURSOR_TABLE *)cursor;
+	CURSOR_API_CALL(cursor, session, search_near, NULL);
+	cp = ctable->cg_cursors;
+	/* Only the primary does the near-search; the record it lands on
+	 * determines the exact key used for the other column groups.
+	 */
+	primary = *cp;
+	WT_ERR(primary->search_near(primary, exact));
+
+	for (i = 1, ++cp; i < WT_COLGROUPS(ctable->table); i++) {
+		(*cp)->key.data = primary->key.data;
+		(*cp)->key.size = primary->key.size;
+		(*cp)->recno = primary->recno;
+		F_SET(*cp, WT_CURSTD_KEY_EXT);
+		WT_ERR((*cp)->search(*cp));
+	}
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_insert --
+ *	WT_CURSOR->insert method for the table cursor type.
+ */
+static int
+__curtable_insert(WT_CURSOR *cursor)
+{
+	WT_CURSOR *primary, **cp;
+	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	uint32_t flag_orig;
+	u_int i;
+
+	ctable = (WT_CURSOR_TABLE *)cursor;
+	CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);
+	WT_ERR(__curtable_open_indices(ctable));
+
+	/*
+	 * Split out the first insert, it may be allocating a recno.
+	 *
+	 * If the table has indices, we also need to know whether this record
+	 * is replacing an existing record so that the existing index entries
+	 * can be removed.  We discover if this is an overwrite by configuring
+	 * the primary cursor for no-overwrite, and checking if the insert
+	 * detects a duplicate key.
+	 */
+	cp = ctable->cg_cursors;
+	primary = *cp++;
+
+	/* Save the overwrite flag so it can be restored after the probe. */
+	flag_orig = F_ISSET(primary, WT_CURSTD_OVERWRITE);
+	if (ctable->table->nindices > 0)
+		F_CLR(primary, WT_CURSTD_OVERWRITE);
+	ret = primary->insert(primary);
+	F_SET(primary, flag_orig);
+
+	if (ret == WT_DUPLICATE_KEY && F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
+		/*
+		 * !!!
+		 * The insert failure clears these flags, but does not touch the
+		 * items.  We could make a copy each time for overwrite cursors,
+		 * but for now we just reset the flags.
+		 */
+		F_SET(primary, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+		/* Overwrite of an existing record: delegate to update, which
+		 * also maintains the index entries.
+		 */
+		ret = __curtable_update(cursor);
+		goto err;
+	}
+	WT_ERR(ret);
+
+	for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+		(*cp)->recno = primary->recno;
+		WT_ERR((*cp)->insert(*cp));
+	}
+
+	APPLY_IDX(ctable, insert);
+
+err:	CURSOR_UPDATE_API_END(session, ret);
+	return (ret);
+}
+
+/*
+ * __curtable_update --
+ *	WT_CURSOR->update method for the table cursor type.
+ */
+static int
+__curtable_update(WT_CURSOR *cursor)
+{
+	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ctable = (WT_CURSOR_TABLE *)cursor;
+	CURSOR_UPDATE_API_CALL(cursor, session, update, NULL);
+	WT_ERR(__curtable_open_indices(ctable));
+
+	/*
+	 * If the table has indices, first delete any old index keys, then
+	 * update the primary, then insert the new index keys.  This is
+	 * complicated by the fact that we need the old value to generate the
+	 * old index keys, so we make a temporary copy of the new value.
+	 */
+	if (ctable->table->nindices > 0) {
+		WT_ERR(__wt_schema_project_merge(session,
+		    ctable->cg_cursors, ctable->plan,
+		    cursor->value_format, &cursor->value));
+		APPLY_CG(ctable, search);
+		/*
+		 * Remove only if the key exists.
+		 */
+ */ + if (ret == 0) { + APPLY_IDX(ctable, remove); + WT_ERR(__wt_schema_project_slice(session, + ctable->cg_cursors, ctable->plan, 0, + cursor->value_format, &cursor->value)); + } else if (ret == WT_NOTFOUND) + ret = 0; + else + WT_ERR(ret); + } + APPLY_CG(ctable, update); + WT_ERR(ret); + if (ctable->idx_cursors != NULL) + APPLY_IDX(ctable, insert); + +err: CURSOR_UPDATE_API_END(session, ret); + return (ret); +} + +/* + * __curtable_remove -- + * WT_CURSOR->remove method for the table cursor type. + */ +static int +__curtable_remove(WT_CURSOR *cursor) +{ + WT_CURSOR_TABLE *ctable; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + ctable = (WT_CURSOR_TABLE *)cursor; + CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL); + WT_ERR(__curtable_open_indices(ctable)); + + /* Find the old record so it can be removed from indices */ + if (ctable->table->nindices > 0) { + APPLY_CG(ctable, search); + WT_ERR(ret); + APPLY_IDX(ctable, remove); + } + + APPLY_CG(ctable, remove); + +err: CURSOR_UPDATE_API_END(session, ret); + return (ret); +} + +/* + * __wt_table_range_truncate -- + * Truncate of a cursor range, table implementation. + */ +int +__wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop) +{ + WT_CURSOR *wt_start, *wt_stop; + WT_CURSOR_TABLE *ctable; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_ITEM raw; + WT_SESSION_IMPL *session; + u_int i; + int cmp; + + ctable = (start != NULL) ? start : stop; + session = (WT_SESSION_IMPL *)ctable->iface.session; + wt_start = &start->iface; + wt_stop = &stop->iface; + + /* Open any indices. */ + WT_RET(__curtable_open_indices(ctable)); + WT_RET(__wt_scr_alloc(session, 128, &key)); + + /* + * Step through the cursor range, removing the index entries. + * + * If there are indices, copy the key we're using to step through the + * cursor range (so we can reset the cursor to its original position), + * then remove all of the index records in the truncated range. 
Copy + * the raw key because the memory is only valid until the cursor moves. + */ + if (ctable->table->nindices > 0) { + if (start == NULL) { + WT_ERR(__wt_cursor_get_raw_key(wt_stop, &raw)); + WT_ERR(__wt_buf_set(session, key, raw.data, raw.size)); + + do { + APPLY_CG(stop, search); + WT_ERR(ret); + APPLY_IDX(stop, remove); + } while ((ret = wt_stop->prev(wt_stop)) == 0); + WT_ERR_NOTFOUND_OK(ret); + + __wt_cursor_set_raw_key(wt_stop, key); + APPLY_CG(stop, search); + } else { + WT_ERR(__wt_cursor_get_raw_key(wt_start, &raw)); + WT_ERR(__wt_buf_set(session, key, raw.data, raw.size)); + + cmp = -1; + do { + APPLY_CG(start, search); + WT_ERR(ret); + APPLY_IDX(start, remove); + if (stop != NULL) + WT_ERR(wt_start->compare( + wt_start, wt_stop, + &cmp)); + } while (cmp < 0 && + (ret = wt_start->next(wt_start)) == 0); + WT_ERR_NOTFOUND_OK(ret); + + __wt_cursor_set_raw_key(wt_start, key); + APPLY_CG(start, search); + } + } + + /* Truncate the column groups. */ + for (i = 0; i < WT_COLGROUPS(ctable->table); i++) + WT_ERR(__wt_range_truncate( + (start == NULL) ? NULL : start->cg_cursors[i], + (stop == NULL) ? NULL : stop->cg_cursors[i])); + +err: __wt_scr_free(&key); + return (ret); +} + +/* + * __curtable_close -- + * WT_CURSOR->close method for the table cursor type. 
+ */
+static int
+__curtable_close(WT_CURSOR *cursor)
+{
+	WT_CURSOR_TABLE *ctable;
+	WT_CURSOR **cp;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	u_int i;
+
+	ctable = (WT_CURSOR_TABLE *)cursor;
+	CURSOR_API_CALL(cursor, session, close, NULL);
+
+	/* Close the column-group cursors; keep going on error (WT_TRET). */
+	for (i = 0, cp = ctable->cg_cursors;
+	    i < WT_COLGROUPS(ctable->table); i++, cp++)
+		if (*cp != NULL) {
+			WT_TRET((*cp)->close(*cp));
+			*cp = NULL;
+		}
+
+	/* Index cursors are opened lazily and may never have been created. */
+	if (ctable->idx_cursors != NULL)
+		for (i = 0, cp = ctable->idx_cursors;
+		    i < ctable->table->nindices; i++, cp++)
+			if (*cp != NULL) {
+				WT_TRET((*cp)->close(*cp));
+				*cp = NULL;
+			}
+
+	/* Free the plan/value format only if they were projection copies. */
+	if (ctable->plan != ctable->table->plan)
+		__wt_free(session, ctable->plan);
+	for (i = 0; ctable->cfg[i] != NULL; ++i)
+		__wt_free(session, ctable->cfg[i]);
+	__wt_free(session, ctable->cfg);
+	if (cursor->value_format != ctable->table->value_format)
+		__wt_free(session, cursor->value_format);
+	__wt_free(session, ctable->cg_cursors);
+	__wt_free(session, ctable->idx_cursors);
+	__wt_schema_release_table(session, ctable->table);
+	/* The URI is owned by the table. */
+	cursor->internal_uri = NULL;
+	WT_TRET(__wt_cursor_close(cursor));
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_open_colgroups --
+ *	Open cursors on column groups for a table cursor.
+ */
+static int
+__curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[])
+{
+	WT_SESSION_IMPL *session;
+	WT_TABLE *table;
+	WT_CURSOR **cp;
+	/*
+	 * Underlying column groups are always opened without dump, and only
+	 * the primary is opened with next_random.
+	 */
+	const char *cfg[] = {
+	    cfg_arg[0], cfg_arg[1], "dump=\"\"", NULL, NULL
+	};
+	u_int i;
+
+	session = (WT_SESSION_IMPL *)ctable->iface.session;
+	table = ctable->table;
+
+	if (!table->cg_complete)
+		WT_RET_MSG(session, EINVAL,
+		    "Can't use '%s' until all column groups are created",
+		    table->name);
+
+	WT_RET(__wt_calloc_def(session,
+	    WT_COLGROUPS(table), &ctable->cg_cursors));
+
+	for (i = 0, cp = ctable->cg_cursors;
+	    i < WT_COLGROUPS(table);
+	    i++, cp++) {
+		WT_RET(__wt_open_cursor(session, table->cgroups[i]->source,
+		    &ctable->iface, cfg, cp));
+		/* After the first (primary) open, disable next_random for
+		 * the remaining column groups.
+		 */
+		cfg[3] = "next_random=false";
+	}
+	return (0);
+}
+
+/*
+ * __curtable_open_indices --
+ *	Open cursors on indices for a table cursor.
+ */
+static int
+__curtable_open_indices(WT_CURSOR_TABLE *ctable)
+{
+	WT_CURSOR **cp, *primary;
+	WT_SESSION_IMPL *session;
+	WT_TABLE *table;
+	u_int i;
+
+	session = (WT_SESSION_IMPL *)ctable->iface.session;
+	table = ctable->table;
+
+	WT_RET(__wt_schema_open_indices(session, table));
+	/* Nothing to do without indices, or if they are already open. */
+	if (table->nindices == 0 || ctable->idx_cursors != NULL)
+		return (0);
+
+	/* Check for bulk cursors. */
+	primary = *ctable->cg_cursors;
+	if (F_ISSET(primary, WT_CURSTD_BULK))
+		WT_RET_MSG(session, ENOTSUP,
+		    "Bulk load is not supported for tables with indices");
+
+	WT_RET(__wt_calloc_def(session, table->nindices, &ctable->idx_cursors));
+	for (i = 0, cp = ctable->idx_cursors; i < table->nindices; i++, cp++)
+		WT_RET(__wt_open_cursor(session, table->indices[i]->source,
+		    &ctable->iface, ctable->cfg, cp));
+	return (0);
+}
+
+/*
+ * __wt_curtable_open --
+ *	WT_SESSION->open_cursor method for table cursors.
+ */
+int
+__wt_curtable_open(WT_SESSION_IMPL *session,
+    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+	WT_CURSOR_STATIC_INIT(iface,
+	    __wt_curtable_get_key,	/* get-key */
+	    __wt_curtable_get_value,	/* get-value */
+	    __wt_curtable_set_key,	/* set-key */
+	    __wt_curtable_set_value,	/* set-value */
+	    __curtable_compare,		/* compare */
+	    __curtable_next,		/* next */
+	    __curtable_prev,		/* prev */
+	    __curtable_reset,		/* reset */
+	    __curtable_search,		/* search */
+	    __curtable_search_near,	/* search-near */
+	    __curtable_insert,		/* insert */
+	    __curtable_update,		/* update */
+	    __curtable_remove,		/* remove */
+	    __curtable_close);		/* close */
+	WT_CONFIG_ITEM cval;
+	WT_CURSOR *cursor;
+	WT_CURSOR_TABLE *ctable;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	WT_TABLE *table;
+	size_t size;
+	int cfg_cnt;
+	const char *tablename, *columns;
+
+	WT_STATIC_ASSERT(offsetof(WT_CURSOR_TABLE, iface) == 0);
+
+	ctable = NULL;
+
+	/* The URI must be "table:name" with an optional "(col,...)" suffix. */
+	tablename = uri;
+	if (!WT_PREFIX_SKIP(tablename, "table:"))
+		return (EINVAL);
+	columns = strchr(tablename, '(');
+	if (columns == NULL)
+		size = strlen(tablename);
+	else
+		size = WT_PTRDIFF(columns, tablename);
+	WT_RET(__wt_schema_get_table(session, tablename, size, 0, &table));
+
+	if (table->is_simple) {
+		/* Just return a cursor on the underlying data source. */
+		ret = __wt_open_cursor(session,
+		    table->cgroups[0]->source, NULL, cfg, cursorp);
+
+		__wt_schema_release_table(session, table);
+		return (ret);
+	}
+
+	WT_RET(__wt_calloc_def(session, 1, &ctable));
+
+	cursor = &ctable->iface;
+	*cursor = iface;
+	cursor->session = &session->iface;
+	cursor->internal_uri = table->name;
+	cursor->key_format = table->key_format;
+	cursor->value_format = table->value_format;
+
+	ctable->table = table;
+	ctable->plan = table->plan;
+
+	/* Handle projections.
+ */
+	if (columns != NULL) {
+		WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+		WT_ERR(__wt_struct_reformat(session, table,
+		    columns, strlen(columns), NULL, 1, tmp));
+		WT_ERR(__wt_strndup(
+		    session, tmp->data, tmp->size, &cursor->value_format));
+
+		WT_ERR(__wt_buf_init(session, tmp, 0));
+		WT_ERR(__wt_struct_plan(session, table,
+		    columns, strlen(columns), 0, tmp));
+		WT_ERR(__wt_strndup(
+		    session, tmp->data, tmp->size, &ctable->plan));
+	}
+
+	/*
+	 * random_retrieval
+	 * Random retrieval cursors only support next, reset and close.
+	 */
+	WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
+	if (cval.val != 0) {
+		__wt_cursor_set_notsup(cursor);
+		cursor->next = __curtable_next_random;
+		cursor->reset = __curtable_reset;
+	}
+
+	WT_ERR(__wt_cursor_init(
+	    cursor, cursor->internal_uri, NULL, cfg, cursorp));
+
+	if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
+		WT_ERR(__wt_json_column_init(cursor, table->key_format,
+		    NULL, &table->colconf));
+
+	/*
+	 * Open the colgroup cursors immediately: we're going to need them for
+	 * any operation.  We defer opening index cursors until we need them
+	 * for an update.  Note that this must come after the call to
+	 * __wt_cursor_init: the table cursor must already be on the list of
+	 * session cursors or we can't work out where to put the colgroup
+	 * cursor(s).
+	 */
+	WT_ERR(__curtable_open_colgroups(ctable, cfg));
+
+	/*
+	 * We'll need to squirrel away a copy of the cursor configuration
+	 * for if/when we open indices.
+	 *
+	 * cfg[0] is the baseline configuration for the cursor open and we can
+	 * acquire another copy from the configuration structures, so it would
+	 * be reasonable not to copy it here: but I'd rather be safe than sorry.
+	 *
+	 * Underlying indices are always opened without dump.
+ */
+	for (cfg_cnt = 0; cfg[cfg_cnt] != NULL; ++cfg_cnt)
+		;
+	/* Allocate one extra slot for "dump" plus the NULL terminator. */
+	WT_ERR(__wt_calloc_def(session, cfg_cnt + 2, &ctable->cfg));
+	for (cfg_cnt = 0; cfg[cfg_cnt] != NULL; ++cfg_cnt)
+		WT_ERR(
+		    __wt_strdup(session, cfg[cfg_cnt], &ctable->cfg[cfg_cnt]));
+	WT_ERR(__wt_strdup(session, "dump=\"\"", &ctable->cfg[cfg_cnt]));
+
+	if (0) {
+err:		WT_TRET(__curtable_close(cursor));
+		*cursorp = NULL;
+	}
+
+	__wt_scr_free(&tmp);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h
new file mode 100644
index 00000000000..e358d22b278
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/api.h
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/* Standard entry points to the API: declares/initializes local variables. */
+#define	API_SESSION_INIT(s, h, n, cur, dh)				\
+	WT_DATA_HANDLE *__olddh = (s)->dhandle;				\
+	const char *__oldname = (s)->name;				\
+	(s)->cursor = (cur);						\
+	(s)->dhandle = (dh);						\
+	(s)->name = (s)->lastop = #h "." #n;				\
+
+#define	API_CALL_NOCONF(s, h, n, cur, dh) do {				\
+	API_SESSION_INIT(s, h, n, cur, dh);				\
+	WT_ERR(F_ISSET(S2C(s), WT_CONN_PANIC) ? __wt_panic(s) : 0);	\
+	WT_ERR(__wt_verbose((s), WT_VERB_API, "CALL: " #h ":" #n))
+
+#define	API_CALL(s, h, n, cur, dh, config, cfg) do {			\
+	const char *cfg[] =						\
+	    { WT_CONFIG_BASE(s, h##_##n), config, NULL };		\
+	API_SESSION_INIT(s, h, n, cur, dh);				\
+	WT_ERR(F_ISSET(S2C(s), WT_CONN_PANIC) ? __wt_panic(s) : 0);	\
+	WT_ERR(((config) != NULL) ?
\ + __wt_config_check((s), \ + WT_CONFIG_REF(session, h##_##n), (config), 0) : 0); \ + WT_ERR(__wt_verbose((s), WT_VERB_API, "CALL: " #h ":" #n)) + +#define API_END(s, ret) \ + if ((s) != NULL) { \ + (s)->dhandle = __olddh; \ + (s)->name = __oldname; \ + if (F_ISSET(&(s)->txn, TXN_RUNNING) && \ + (ret) != 0 && \ + (ret) != WT_NOTFOUND && \ + (ret) != WT_DUPLICATE_KEY) \ + F_SET(&(s)->txn, TXN_ERROR); \ + } \ +} while (0) + +/* An API call wrapped in a transaction if necessary. */ +#define TXN_API_CALL(s, h, n, cur, bt, config, cfg) do { \ + int __autotxn = 0; \ + API_CALL(s, h, n, bt, cur, config, cfg); \ + __autotxn = !F_ISSET(&(s)->txn, TXN_AUTOCOMMIT | TXN_RUNNING); \ + if (__autotxn) \ + F_SET(&(s)->txn, TXN_AUTOCOMMIT) + +/* An API call wrapped in a transaction if necessary. */ +#define TXN_API_CALL_NOCONF(s, h, n, cur, bt) do { \ + int __autotxn = 0; \ + API_CALL_NOCONF(s, h, n, cur, bt); \ + __autotxn = !F_ISSET(&(s)->txn, TXN_AUTOCOMMIT | TXN_RUNNING); \ + if (__autotxn) \ + F_SET(&(s)->txn, TXN_AUTOCOMMIT) + +/* End a transactional API call, optional retry on deadlock. */ +#define TXN_API_END_RETRY(s, ret, retry) \ + API_END(s, ret); \ + if (__autotxn) { \ + if (F_ISSET(&(s)->txn, TXN_AUTOCOMMIT)) \ + F_CLR(&(s)->txn, TXN_AUTOCOMMIT); \ + else if (ret == 0 && !F_ISSET(&(s)->txn, TXN_ERROR)) \ + ret = __wt_txn_commit((s), NULL); \ + else { \ + WT_TRET(__wt_txn_rollback((s), NULL)); \ + if ((ret == 0 || ret == WT_ROLLBACK) && \ + (retry)) { \ + ret = 0; \ + continue; \ + } \ + WT_TRET(__wt_session_reset_cursors(s)); \ + } \ + } \ + break; \ +} while (ret == 0) + +/* End a transactional API call, retry on deadlock. */ +#define TXN_API_END(s, ret) TXN_API_END_RETRY(s, ret, 1) + +/* + * In almost all cases, API_END is returning immediately, make it simple. + * If a session or connection method is about to return WT_NOTFOUND (some + * underlying object was not found), map it to ENOENT, only cursor methods + * return WT_NOTFOUND. 
+ */
+#define	API_END_RET(s, ret)						\
+	API_END(s, ret);						\
+	return (ret)
+#define	API_END_RET_NOTFOUND_MAP(s, ret)				\
+	API_END(s, ret);						\
+	return ((ret) == WT_NOTFOUND ? ENOENT : (ret))
+
+/* Connection-level calls run on the connection's default session. */
+#define	CONNECTION_API_CALL(conn, s, n, config, cfg)			\
+	s = (conn)->default_session;					\
+	API_CALL(s, connection, n, NULL, NULL, config, cfg)
+
+#define	CONNECTION_API_CALL_NOCONF(conn, s, n)				\
+	s = (conn)->default_session;					\
+	API_CALL_NOCONF(s, connection, n, NULL, NULL)
+
+#define	SESSION_API_CALL(s, n, config, cfg)				\
+	API_CALL(s, session, n, NULL, NULL, config, cfg)
+
+#define	SESSION_API_CALL_NOCONF(s, n)					\
+	API_CALL_NOCONF(s, session, n, NULL, NULL)
+
+#define	SESSION_TXN_API_CALL(s, n, config, cfg)				\
+	TXN_API_CALL(s, session, n, NULL, NULL, config, cfg)
+
+/* Cursor calls pass the backing btree's data handle, when there is one. */
+#define	CURSOR_API_CALL(cur, s, n, bt)					\
+	(s) = (WT_SESSION_IMPL *)(cur)->session;			\
+	API_CALL_NOCONF(s, cursor, n, cur,				\
+	    ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle)
+
+#define	CURSOR_UPDATE_API_CALL(cur, s, n, bt)				\
+	(s) = (WT_SESSION_IMPL *)(cur)->session;			\
+	TXN_API_CALL_NOCONF(s, cursor, n, cur,				\
+	    ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle)
+
+#define	CURSOR_UPDATE_API_END(s, ret)					\
+	TXN_API_END(s, ret)
+
+#define	ASYNCOP_API_CALL(conn, s, n)					\
+	s = (conn)->default_session;					\
+	API_CALL_NOCONF(s, asyncop, n, NULL, NULL)
diff --git a/src/third_party/wiredtiger/src/include/async.h b/src/third_party/wiredtiger/src/include/async.h
new file mode 100644
index 00000000000..8565874c2f3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/async.h
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/* Lifecycle states of an async operation slot. */
+typedef enum {
+	WT_ASYNCOP_ENQUEUED,		/* Placed on the work queue */
+	WT_ASYNCOP_FREE,		/* Able to be allocated to user */
+	WT_ASYNCOP_READY,		/* Allocated and ready for user to use */
+	WT_ASYNCOP_WORKING		/* Operation in progress by worker */
+} WT_ASYNC_STATE;
+
+typedef enum {
+	WT_ASYNC_FLUSH_NONE=0,		/* No flush in progress */
+	WT_ASYNC_FLUSH_COMPLETE,	/* Notify flush caller it's done */
+	WT_ASYNC_FLUSH_IN_PROGRESS,	/* Prevent other callers */
+	WT_ASYNC_FLUSHING		/* Notify workers */
+} WT_ASYNC_FLUSH_STATE;
+
+#define	MAX_ASYNC_SLEEP_USECS	100000	/* Maximum sleep waiting for work */
+#define	MAX_ASYNC_YIELD		200	/* Maximum number of yields for work */
+
+/* Map an async op to its connection / default session. */
+#define	O2C(op)	((WT_CONNECTION_IMPL *)(op)->iface.connection)
+#define	O2S(op)								\
+    (((WT_CONNECTION_IMPL *)(op)->iface.connection)->default_session)
+/*
+ * WT_ASYNC_FORMAT --
+ *	The URI/config/format cache.
+ */
+struct __wt_async_format {
+	STAILQ_ENTRY(__wt_async_format) q;
+	const char	*config;
+	uint64_t	cfg_hash;	/* Config hash */
+	const char	*uri;
+	uint64_t	uri_hash;	/* URI hash */
+	const char	*key_format;
+	const char	*value_format;
+};
+
+/*
+ * WT_ASYNC_OP_IMPL --
+ *	Implementation of the WT_ASYNC_OP.
+ */
+struct __wt_async_op_impl {
+	WT_ASYNC_OP iface;
+
+	WT_ASYNC_CALLBACK	*cb;
+
+	uint32_t	internal_id;	/* Array position id. */
+	uint64_t	unique_id;	/* Unique identifier. */
+
+	WT_ASYNC_FORMAT	*format;	/* Format structure */
+	WT_ASYNC_STATE	state;		/* Op state */
+	WT_ASYNC_OPTYPE	optype;		/* Operation type */
+};
+
+/*
+ * Definition of the async subsystem.
+ */
+struct __wt_async {
+	/*
+	 * Ops array protected by the ops_lock.
+	 */
+	WT_SPINLOCK	 ops_lock;	/* Locked: ops array */
+	WT_ASYNC_OP_IMPL *async_ops;	/* Async ops */
+#define	OPS_INVALID_INDEX	0xffffffff
+	uint32_t	 ops_index;	/* Active slot index */
+	uint64_t	 op_id;		/* Unique ID counter */
+	WT_ASYNC_OP_IMPL **async_queue;	/* Async ops work queue */
+	uint32_t	 async_qsize;	/* Async work queue size */
+	/*
+	 * We need to have two head and tail values.  All but one is
+	 * maintained as an ever increasing value to ease wrap around.
+	 *
+	 * alloc_head: the next one to allocate for producers.
+	 * head: the current head visible to consumers.
+	 *	head is always <= alloc_head.
+	 * alloc_tail: the next slot for consumers to dequeue.
+	 *	alloc_tail is always <= head.
+	 * tail_slot: the last slot consumed.
+	 *	A producer may need wait for tail_slot to advance.
+	 */
+	uint64_t	 alloc_head;	/* Next slot to enqueue */
+	uint64_t	 head;		/* Head visible to worker */
+	uint64_t	 alloc_tail;	/* Next slot to dequeue */
+	uint64_t	 tail_slot;	/* Worker slot consumed */
+
+	STAILQ_HEAD(__wt_async_format_qh, __wt_async_format) formatqh;
+	int		 cur_queue;	/* Currently enqueued */
+	int		 max_queue;	/* Maximum enqueued */
+	WT_ASYNC_FLUSH_STATE flush_state;	/* Queue flush state */
+	/* Notify any waiting threads when flushing is done. */
+	WT_CONDVAR	*flush_cond;
+	WT_ASYNC_OP_IMPL flush_op;	/* Special flush op */
+	uint32_t	 flush_count;	/* Worker count */
+	uint64_t	 flush_gen;	/* Flush generation number */
+
+#define	WT_ASYNC_MAX_WORKERS	20
+	WT_SESSION_IMPL	*worker_sessions[WT_ASYNC_MAX_WORKERS];
+					/* Async worker threads */
+	wt_thread_t	 worker_tids[WT_ASYNC_MAX_WORKERS];
+
+	uint32_t	 flags;		/* Currently unused. */
+};
+
+/*
+ * WT_ASYNC_CURSOR --
+ *	Async container for a cursor.  Each async worker thread
+ *	has a cache of async cursors to reuse for operations.
+ */
+struct __wt_async_cursor {
+	STAILQ_ENTRY(__wt_async_cursor) q;	/* Worker cache */
+	uint64_t	cfg_hash;		/* Config hash */
+	uint64_t	uri_hash;		/* URI hash */
+	WT_CURSOR	*c;			/* WT cursor */
+};
+
+/*
+ * WT_ASYNC_WORKER_STATE --
+ *	State for an async worker thread.
+ */
+struct __wt_async_worker_state {
+	uint32_t	id;
+	STAILQ_HEAD(__wt_cursor_qh, __wt_async_cursor)	cursorqh;
+	uint32_t	num_cursors;
+};
diff --git a/src/third_party/wiredtiger/src/include/bitstring.i b/src/third_party/wiredtiger/src/include/bitstring.i
new file mode 100644
index 00000000000..95af6731bf9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/bitstring.i
@@ -0,0 +1,316 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain.  We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors.  We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */ + +/*- + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Paul Vixie. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD: src/sys/sys/bitstring.h,v 1.5 2005/01/07 02:29:23 imp Exp $
+ */
+
+				/* byte of the bitstring bit is in */
+#define	__bit_byte(bit)		((bit) >> 3)
+
+				/* mask for the bit within its byte */
+#define	__bit_mask(bit)		(1 << ((bit) & 0x7))
+
+				/* Bytes in a bitstring of nbits */
+#define	__bitstr_size(nbits)	(((nbits) + 7) >> 3)
+
+/*
+ * __bit_alloc --
+ *	Allocate a bitstring.
+ */
+static inline int
+__bit_alloc(WT_SESSION_IMPL *session, uint64_t nbits, void *retp)
+{
+	/* __wt_calloc zeroes the allocation, so all bits start clear. */
+	return (__wt_calloc(
+	    session, (size_t)__bitstr_size(nbits), sizeof(uint8_t), retp));
+}
+
+/*
+ * __bit_test --
+ *	Test one bit in name.
+ */
+static inline int
+__bit_test(uint8_t *bitf, uint64_t bit)
+{
+	return (bitf[__bit_byte(bit)] & __bit_mask(bit) ? 1 : 0);
+}
+
+/*
+ * __bit_set --
+ *	Set one bit in name.
+ */
+static inline void
+__bit_set(uint8_t *bitf, uint64_t bit)
+{
+	bitf[__bit_byte(bit)] |= __bit_mask(bit);
+}
+
+/*
+ * __bit_clear --
+ *	Clear one bit in name.
+ */
+static inline void
+__bit_clear(uint8_t *bitf, uint64_t bit)
+{
+	bitf[__bit_byte(bit)] &= ~__bit_mask(bit);
+}
+
+/*
+ * __bit_nclr --
+ *	Clear bits start-to-stop in name.
+ */
+static inline void
+__bit_nclr(uint8_t *bitf, uint64_t start, uint64_t stop)
+{
+	uint64_t startbyte, stopbyte;
+
+	startbyte = __bit_byte(start);
+	stopbyte = __bit_byte(stop);
+
+	/* Mask partial first/last bytes, zero whole bytes in between. */
+	if (startbyte == stopbyte)
+		bitf[startbyte] &=
+		    ((0xff >> (8 - (start & 0x7))) |
+		    (0xff << ((stop & 0x7) + 1)));
+	else {
+		bitf[startbyte] &= 0xff >> (8 - (start & 0x7));
+		while (++startbyte < stopbyte)
+			bitf[startbyte] = 0;
+		bitf[stopbyte] &= 0xff << ((stop & 0x7) + 1);
+	}
+}
+
+/*
+ * __bit_nset --
+ *	Set bits start-to-stop in name.
+ */
+static inline void
+__bit_nset(uint8_t *bitf, uint64_t start, uint64_t stop)
+{
+	uint64_t startbyte, stopbyte;
+
+	startbyte = __bit_byte(start);
+	stopbyte = __bit_byte(stop);
+	if (startbyte == stopbyte)
+		bitf[startbyte] |=
+		    ((0xff << (start & 0x7)) & (0xff >> (7 - (stop & 0x7))));
+	else {
+		bitf[startbyte] |= 0xff << (start & 0x7);
+		while (++startbyte < stopbyte)
+			bitf[startbyte] = 0xff;
+		bitf[stopbyte] |= 0xff >> (7 - (stop & 0x7));
+	}
+}
+
+/*
+ * __bit_ffc --
+ *	Find first clear bit in name, return 0 on success, -1 on no bit clear.
+ */
+static inline int
+__bit_ffc(uint8_t *bitf, uint64_t nbits, uint64_t *retp)
+{
+	uint8_t lb;
+	uint64_t byte, stopbyte, value;
+
+	value = 0;	/* -Wuninitialized */
+
+	if (nbits == 0)
+		return (-1);
+
+	/* Skip all-set bytes, then scan the first non-0xff byte bit by bit. */
+	for (byte = 0,
+	    stopbyte = __bit_byte(nbits - 1); byte <= stopbyte; ++byte)
+		if (bitf[byte] != 0xff) {
+			value = byte << 3;
+			for (lb = bitf[byte]; lb & 0x01; ++value, lb >>= 1)
+				;
+			break;
+		}
+
+	/* A clear bit past nbits in the final byte doesn't count. */
+	if (byte > stopbyte || value >= nbits)
+		return (-1);
+
+	*retp = value;
+	return (0);
+}
+
+/*
+ * __bit_ffs --
+ *	Find first set bit in name, return 0 on success, -1 on no bit set.
+ */
+static inline int
+__bit_ffs(uint8_t *bitf, uint64_t nbits, uint64_t *retp)
+{
+	uint8_t lb;
+	uint64_t byte, stopbyte, value;
+
+	value = 0;
+	if (nbits == 0)
+		return (-1);
+
+	for (byte = 0,
+	    stopbyte = __bit_byte(nbits - 1); byte <= stopbyte; ++byte)
+		if (bitf[byte] != 0) {
+			value = byte << 3;
+			for (lb = bitf[byte]; !(lb & 0x01); ++value, lb >>= 1)
+				;
+			break;
+		}
+
+	if (byte > stopbyte || value >= nbits)
+		return (-1);
+
+	*retp = value;
+	return (0);
+}
+
+/*
+ * __bit_getv --
+ *	Return a fixed-length column store bit-field value.
+ */
+static inline uint8_t
+__bit_getv(uint8_t *bitf, uint64_t entry, uint8_t width)
+{
+	uint8_t value;
+	uint64_t bit;
+
+#define	__BIT_GET(len, mask)						\
+	case len:							\
+		if (__bit_test(bitf, bit))				\
+			value |= mask;					\
+		++bit							\
+		/* FALLTHROUGH */
+
+	value = 0;
+	bit = entry * width;
+
+	/*
+	 * Fast-path single bytes, do repeated tests for the rest: we could
+	 * slice-and-dice instead, but the compiler is probably going to do
+	 * a better job than I will.
+	 */
+	switch (width) {
+	case 8:
+		return (bitf[__bit_byte(bit)]);
+	__BIT_GET(7, 0x40);
+	__BIT_GET(6, 0x20);
+	__BIT_GET(5, 0x10);
+	__BIT_GET(4, 0x08);
+	__BIT_GET(3, 0x04);
+	__BIT_GET(2, 0x02);
+	__BIT_GET(1, 0x01);
+	}
+	return (value);
+}
+
+/*
+ * __bit_getv_recno --
+ *	Return a record number's bit-field value.
+ */
+static inline uint8_t
+__bit_getv_recno(WT_PAGE *page, uint64_t recno, uint8_t width)
+{
+	return (__bit_getv(
+	    page->pg_fix_bitf, recno - page->pg_fix_recno, width));
+}
+
+/*
+ * __bit_setv --
+ *	Set a fixed-length column store bit-field value.
+ */
+static inline void
+__bit_setv(uint8_t *bitf, uint64_t entry, uint8_t width, uint8_t value)
+{
+	uint64_t bit;
+
+#define	__BIT_SET(len, mask)						\
+	case len:							\
+		if (value & (mask))					\
+			__bit_set(bitf, bit);				\
+		else							\
+			__bit_clear(bitf, bit);				\
+		++bit							\
+		/* FALLTHROUGH */
+
+	bit = entry * width;
+
+	/*
+	 * Fast-path single bytes, do repeated tests for the rest: we could
+	 * slice-and-dice instead, but the compiler is probably going to do
+	 * a better job than I will.
+	 */
+	switch (width) {
+	case 8:
+		bitf[__bit_byte(bit)] = value;
+		return;
+	__BIT_SET(7, 0x40);
+	__BIT_SET(6, 0x20);
+	__BIT_SET(5, 0x10);
+	__BIT_SET(4, 0x08);
+	__BIT_SET(3, 0x04);
+	__BIT_SET(2, 0x02);
+	__BIT_SET(1, 0x01);
+	}
+}
+
+/*
+ * __bit_setv_recno --
+ *	Set a record number's bit-field value.
+ */
+static inline void
+__bit_setv_recno(WT_PAGE *page, uint64_t recno, uint8_t width, uint8_t value)
+{
+	__bit_setv(page->pg_fix_bitf, recno - page->pg_fix_recno, width, value);
+}
diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h
new file mode 100644
index 00000000000..10fa51243ac
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/block.h
@@ -0,0 +1,337 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WiredTiger's block manager interface.
+ */
+
+/*
+ * The file's description is written into the first block of the file, which
+ * means we can use an offset of 0 as an invalid offset.
+ */
+#define	WT_BLOCK_INVALID_OFFSET		0
+
+/*
+ * The block manager maintains three per-checkpoint extent lists:
+ *	alloc:	 the extents allocated in this checkpoint
+ *	avail:	 the extents available for allocation
+ *	discard: the extents freed in this checkpoint
+ *
+ * An extent list is based on two skiplists: first, a by-offset list linking
+ * WT_EXT elements and sorted by file offset (low-to-high), second, a by-size
+ * list linking WT_SIZE elements and sorted by chunk size (low-to-high).
+ *
+ * Additionally, each WT_SIZE element on the by-size has a skiplist of its own,
+ * linking WT_EXT elements and sorted by file offset (low-to-high).  This list
+ * has an entry for extents of a particular size.
+ *
+ * The trickiness is each individual WT_EXT element appears on two skiplists.
+ * In order to minimize allocation calls, we allocate a single array of WT_EXT
+ * pointers at the end of the WT_EXT structure, for both skiplists, and store
+ * the depth of the skiplist in the WT_EXT structure.  The skiplist entries for
+ * the offset skiplist start at WT_EXT.next[0] and the entries for the size
+ * skiplist start at WT_EXT.next[WT_EXT.depth].
+ *
+ * One final complication: we only maintain the per-size skiplist for the avail
+ * list, the alloc and discard extent lists are not searched based on size.
+ */
+
+/*
+ * WT_EXTLIST --
+ *	An extent list.
+ */
+struct __wt_extlist {
+	char *name;			/* Name */
+
+	uint64_t bytes;			/* Byte count */
+	uint32_t entries;		/* Entry count */
+
+	wt_off_t offset;		/* Written extent offset */
+	uint32_t cksum, size;		/* Written extent cksum, size */
+
+	int	 track_size;		/* Maintain per-size skiplist */
+
+	WT_EXT	*last;			/* Cached last element */
+
+	WT_EXT	*off[WT_SKIP_MAXDEPTH];	/* Size/offset skiplists */
+	WT_SIZE *sz[WT_SKIP_MAXDEPTH];
+};
+
+/*
+ * WT_EXT --
+ *	Encapsulation of an extent, either allocated or freed within the
+ * checkpoint.
+ */
+struct __wt_ext {
+	wt_off_t  off;			/* Extent's file offset */
+	wt_off_t  size;			/* Extent's Size */
+
+	uint8_t	 depth;			/* Skip list depth */
+
+	/*
+	 * Variable-length array, sized by the number of skiplist elements.
+	 * The first depth array entries are the address skiplist elements,
+	 * the second depth array entries are the size skiplist.
+	 */
+	WT_EXT	*next[0];		/* Offset, size skiplists */
+};
+
+/*
+ * WT_SIZE --
+ *	Encapsulation of a block size skiplist entry.
+ */
+struct __wt_size {
+	wt_off_t size;			/* Size */
+
+	uint8_t	 depth;			/* Skip list depth */
+
+	WT_EXT	*off[WT_SKIP_MAXDEPTH];	/* Per-size offset skiplist */
+
+	/*
+	 * We don't use a variable-length array for the size skiplist, we want
+	 * to be able to use any cached WT_SIZE structure as the head of a list,
+	 * and we don't know the related WT_EXT structure's depth.
+	 */
+	WT_SIZE *next[WT_SKIP_MAXDEPTH];	/* Size skiplist */
+};
+
+/*
+ * WT_EXT_FOREACH --
+ *	Walk a block manager skiplist.
+ * WT_EXT_FOREACH_OFF --
+ *	Walk a block manager skiplist where the WT_EXT.next entries are offset
+ * by the depth.
+ */
+#define	WT_EXT_FOREACH(skip, head)					\
+	for ((skip) = (head)[0];					\
+	    (skip) != NULL; (skip) = (skip)->next[0])
+#define	WT_EXT_FOREACH_OFF(skip, head)					\
+	for ((skip) = (head)[0];					\
+	    (skip) != NULL; (skip) = (skip)->next[(skip)->depth])
+
+/*
+ * Checkpoint cookie: carries a version number as I don't want to rev the schema
+ * file version should the default block manager checkpoint format change.
+ *
+ * Version #1 checkpoint cookie format:
+ *	[1] [root addr] [alloc addr] [avail addr] [discard addr]
+ *	    [file size] [checkpoint size] [write generation]
+ */
+#define	WT_BM_CHECKPOINT_VERSION	1	/* Checkpoint format version */
+#define	WT_BLOCK_EXTLIST_MAGIC		71002	/* Identify a list */
+struct __wt_block_ckpt {
+	uint8_t	 version;			/* Version */
+
+	wt_off_t root_offset;			/* The root */
+	uint32_t root_cksum, root_size;
+
+	WT_EXTLIST alloc;			/* Extents allocated */
+	WT_EXTLIST avail;			/* Extents available */
+	WT_EXTLIST discard;			/* Extents discarded */
+
+	wt_off_t file_size;			/* Checkpoint file size */
+	uint64_t ckpt_size;			/* Checkpoint byte count */
+
+	WT_EXTLIST ckpt_avail;			/* Checkpoint free'd extents */
+
+	/*
+	 * Checkpoint archive: the block manager may potentially free a lot of
+	 * memory from the allocation and discard extent lists when checkpoint
+	 * completes.  Put it off until the checkpoint resolves, that lets the
+	 * upper btree layer continue eviction sooner.
+	 */
+	WT_EXTLIST ckpt_alloc;			/* Checkpoint archive */
+	WT_EXTLIST ckpt_discard;		/* Checkpoint archive */
+};
+
+/*
+ * WT_BM --
+ *	Block manager handle, references a single checkpoint in a file.
+ */ +struct __wt_bm { + /* Methods */ + int (*addr_string) + (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t); + int (*addr_valid)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); + u_int (*block_header)(WT_BM *); + int (*checkpoint) + (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, int); + int (*checkpoint_load)(WT_BM *, WT_SESSION_IMPL *, + const uint8_t *, size_t, uint8_t *, size_t *, int); + int (*checkpoint_resolve)(WT_BM *, WT_SESSION_IMPL *); + int (*checkpoint_unload)(WT_BM *, WT_SESSION_IMPL *); + int (*close)(WT_BM *, WT_SESSION_IMPL *); + int (*compact_end)(WT_BM *, WT_SESSION_IMPL *); + int (*compact_page_skip) + (WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t, int *); + int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, int *); + int (*compact_start)(WT_BM *, WT_SESSION_IMPL *); + int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); + int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); + int (*read) + (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t); + int (*salvage_end)(WT_BM *, WT_SESSION_IMPL *); + int (*salvage_next) + (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t *, int *); + int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *); + int (*salvage_valid) + (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, int); + int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats); + int (*sync)(WT_BM *, WT_SESSION_IMPL *, int); + int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); + int (*verify_end)(WT_BM *, WT_SESSION_IMPL *); + int (*verify_start)(WT_BM *, WT_SESSION_IMPL *, WT_CKPT *); + int (*write) (WT_BM *, + WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, int); + int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *); + + WT_BLOCK *block; /* Underlying file */ + + void *map; /* Mapped region */ + size_t maplen; + void *mappingcookie; + + /* + * There's only a single block manager handle that can be written, all + * others are checkpoints. 
+ */ + int is_live; /* The live system */ +}; + +/* + * WT_BLOCK -- + * Block manager handle, references a single file. + */ +struct __wt_block { + const char *name; /* Name */ + + /* A list of block manager handles, sharing a file descriptor. */ + uint32_t ref; /* References */ + WT_FH *fh; /* Backing file handle */ + TAILQ_ENTRY(__wt_block) q; /* Linked list of handles */ + + /* Configuration information, set when the file is opened. */ + int allocfirst; /* Allocation is first-fit */ + int allocfirst_save; /* Allocation is first-fit, saved */ + uint32_t allocsize; /* Allocation size */ + size_t os_cache; /* System buffer cache flush max */ + size_t os_cache_max; + size_t os_cache_dirty; /* System buffer cache write max */ + size_t os_cache_dirty_max; + + u_int block_header; /* Header length */ + + /* + * There is only a single checkpoint in a file that can be written. The + * information could logically live in the WT_BM structure, but then we + * would be re-creating it every time we opened a new checkpoint and I'd + * rather not do that. So, it's stored here, only accessed by one WT_BM + * handle. + */ + WT_SPINLOCK live_lock; /* Live checkpoint lock */ + WT_BLOCK_CKPT live; /* Live checkpoint */ + int ckpt_inprogress;/* Live checkpoint in progress */ + + /* Salvage support */ + wt_off_t slvg_off; /* Salvage file offset */ + + /* Verification support */ + int verify; /* If performing verification */ + wt_off_t verify_size; /* Checkpoint's file size */ + WT_EXTLIST verify_alloc; /* Verification allocation list */ + uint64_t frags; /* Maximum frags in the file */ + uint8_t *fragfile; /* Per-file frag tracking list */ + uint8_t *fragckpt; /* Per-checkpoint frag tracking list */ +}; + +/* + * WT_BLOCK_DESC -- + * The file's description. 
+ */ +struct __wt_block_desc { +#define WT_BLOCK_MAGIC 120897 + uint32_t magic; /* 00-03: Magic number */ +#define WT_BLOCK_MAJOR_VERSION 1 + uint16_t majorv; /* 04-05: Major version */ +#define WT_BLOCK_MINOR_VERSION 0 + uint16_t minorv; /* 06-07: Minor version */ + + uint32_t cksum; /* 08-11: Description block checksum */ + + uint32_t unused; /* 12-15: Padding */ +}; +/* + * WT_BLOCK_DESC_SIZE is the expected structure size -- we verify the build to + * ensure the compiler hasn't inserted padding (padding won't cause failure, + * we reserve the first allocation-size block of the file for this information, + * but it would be worth investigation, regardless). + */ +#define WT_BLOCK_DESC_SIZE 16 + +/* + * WT_BLOCK_HEADER -- + * Blocks have a common header, a WT_PAGE_HEADER structure followed by a + * block-manager specific structure: WT_BLOCK_HEADER is WiredTiger's default. + */ +struct __wt_block_header { + /* + * We write the page size in the on-disk page header because it makes + * salvage easier. (If we don't know the expected page length, we'd + * have to read increasingly larger chunks from the file until we find + * one that checksums, and that's going to be harsh given WiredTiger's + * potentially large page sizes.) + */ + uint32_t disk_size; /* 00-03: on-disk page size */ + + /* + * Page checksums are stored in two places. First, the page checksum + * is written within the internal page that references it as part of + * the address cookie. This is done to improve the chances of detecting + * not only disk corruption but other bugs (for example, overwriting a + * page with another valid page image). Second, a page's checksum is + * stored in the disk header. This is for salvage, so salvage knows it + * has found a page that may be useful. 
+ */ + uint32_t cksum; /* 04-07: checksum */ + +#define WT_BLOCK_DATA_CKSUM 0x01 /* Block data is part of the checksum */ + uint8_t flags; /* 08: flags */ + + /* + * End the structure with 3 bytes of padding: it wastes space, but it + * leaves the structure 32-bit aligned and having a few bytes to play + * with in the future can't hurt. + */ + uint8_t unused[3]; /* 09-11: unused padding */ +}; +/* + * WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure: if + * the compiler inserts padding it will break the world. + */ +#define WT_BLOCK_HEADER_SIZE 12 + +/* + * WT_BLOCK_HEADER_BYTE + * WT_BLOCK_HEADER_BYTE_SIZE -- + * The first usable data byte on the block (past the combined headers). + */ +#define WT_BLOCK_HEADER_BYTE_SIZE \ + (WT_PAGE_HEADER_SIZE + WT_BLOCK_HEADER_SIZE) +#define WT_BLOCK_HEADER_BYTE(dsk) \ + ((void *)((uint8_t *)(dsk) + WT_BLOCK_HEADER_BYTE_SIZE)) + +/* + * Don't compress the block's WT_PAGE_HEADER and WT_BLOCK_HEADER structures. + * We need the WT_PAGE_HEADER in-memory size, and the WT_BLOCK_HEADER checksum + * and on-disk size to be immediately available without decompression. We use + * the on-disk size and checksum during salvage to figure out where the blocks + * are, and the in-memory size tells us how large a buffer we need to decompress + * the block. We could skip less than 64B, but a 64B boundary may offer better + * alignment for the underlying compression engine, and skipping 64B won't make + * a difference in terms of compression efficiency. + */ +#define WT_BLOCK_COMPRESS_SKIP 64 diff --git a/src/third_party/wiredtiger/src/include/bloom.h b/src/third_party/wiredtiger/src/include/bloom.h new file mode 100644 index 00000000000..4ae6d96b935 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/bloom.h @@ -0,0 +1,28 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ +/* + * REFERENCES: + * http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf + * http://code.google.com/p/cityhash-c/ + */ + +struct __wt_bloom { + const char *uri; + char *config; + uint8_t *bitstring; /* For in memory representation. */ + WT_SESSION_IMPL *session; + WT_CURSOR *c; + + uint32_t k; /* The number of hash functions used. */ + uint32_t factor; /* The number of bits per item inserted. */ + uint64_t m; /* The number of slots in the bit string. */ + uint64_t n; /* The number of items to be inserted. */ +}; + +struct __wt_bloom_hash { + uint64_t h1, h2; /* The two hashes used to calculate bits. */ +}; diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h new file mode 100644 index 00000000000..0c4fe876e5e --- /dev/null +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -0,0 +1,1015 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * WT_PAGE_HEADER -- + * Blocks have a common header, a WT_PAGE_HEADER structure followed by a + * block-manager specific structure. + */ +struct __wt_page_header { + /* + * The record number of the first record of the page is stored on disk + * so we can figure out where the column-store leaf page fits into the + * key space during salvage. + */ + uint64_t recno; /* 00-07: column-store starting recno */ + + /* + * We maintain page write-generations in the non-transactional case + * as that's how salvage can determine the most recent page between + * pages overlapping the same key range. + */ + uint64_t write_gen; /* 08-15: write generation */ + + /* + * The page's in-memory size isn't rounded or aligned, it's the actual + * number of bytes the disk-image consumes when instantiated in memory. 
+ */ + uint32_t mem_size; /* 16-19: in-memory page size */ + + union { + uint32_t entries; /* 20-23: number of cells on page */ + uint32_t datalen; /* 20-23: overflow data length */ + } u; + + uint8_t type; /* 24: page type */ + +#define WT_PAGE_COMPRESSED 0x01 /* Page is compressed on disk */ +#define WT_PAGE_EMPTY_V_ALL 0x02 /* Page has all zero-length values */ +#define WT_PAGE_EMPTY_V_NONE 0x04 /* Page has no zero-length values */ + uint8_t flags; /* 25: flags */ + + /* + * End the structure with 2 bytes of padding: it wastes space, but it + * leaves the structure 32-bit aligned and having a few bytes to play + * with in the future can't hurt. + */ + uint8_t unused[2]; /* 26-27: unused padding */ +}; +/* + * WT_PAGE_HEADER_SIZE is the number of bytes we allocate for the structure: if + * the compiler inserts padding it will break the world. + */ +#define WT_PAGE_HEADER_SIZE 28 + +/* + * The block-manager specific information immediately follows the WT_PAGE_HEADER + * structure. + */ +#define WT_BLOCK_HEADER_REF(dsk) \ + ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_SIZE)) + +/* + * WT_PAGE_HEADER_BYTE -- + * WT_PAGE_HEADER_BYTE_SIZE -- + * The first usable data byte on the block (past the combined headers). + */ +#define WT_PAGE_HEADER_BYTE_SIZE(btree) \ + ((u_int)(WT_PAGE_HEADER_SIZE + (btree)->block_header)) +#define WT_PAGE_HEADER_BYTE(btree, dsk) \ + ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_BYTE_SIZE(btree))) + +/* + * WT_ADDR -- + * An in-memory structure to hold a block's location. 
+ */
+struct __wt_addr {
+	uint8_t *addr;			/* Block-manager's cookie */
+	uint8_t  size;			/* Block-manager's cookie length */
+
+#define	WT_ADDR_INT	1		/* Internal page */
+#define	WT_ADDR_LEAF	2		/* Leaf page */
+#define	WT_ADDR_LEAF_NO	3		/* Leaf page, no overflow */
+	uint8_t  type;
+
+	/*
+	 * If an address is used as an address in both the previous and the
+	 * current multi-block reconciliations, that is, a block we're writing
+	 * matches the block written the last time, it will appear in both the
+	 * current boundary points as well as the page modification's list of
+	 * previous blocks.  The reuse flag is how we know that's happening so
+	 * the block is treated correctly (not free'd on error, for example).
+	 */
+	uint8_t	 reuse;
+};
+
+/*
+ * Overflow tracking for reuse: When a page is reconciled, we write new K/V
+ * overflow items.  If pages are reconciled multiple times, we need to know
+ * if we've already written a particular overflow record (so we don't write
+ * it again), as well as if we've modified an overflow record previously
+ * written (in which case we want to write a new record and discard blocks
+ * used by the previously written record).  Track overflow records written
+ * for the page, storing the values in a skiplist with the record's value as
+ * the "key".
+ */
+struct __wt_ovfl_reuse {
+	uint32_t value_offset;		/* Overflow value offset */
+	uint32_t value_size;		/* Overflow value size */
+	uint8_t  addr_offset;		/* Overflow addr offset */
+	uint8_t  addr_size;		/* Overflow addr size */
+
+	/*
+	 * On each page reconciliation, we clear the entry's in-use flag, and
+	 * reset it as the overflow record is re-used.  After reconciliation
+	 * completes, unused skiplist entries are discarded, along with their
+	 * underlying blocks.
+	 *
+	 * On each page reconciliation, set the just-added flag for each new
+	 * skiplist entry; if reconciliation fails for any reason, discard the
+	 * newly added skiplist entries, along with their underlying blocks.
+ */ +#define WT_OVFL_REUSE_INUSE 0x01 +#define WT_OVFL_REUSE_JUST_ADDED 0x02 + uint8_t flags; + + /* + * The untyped address immediately follows the WT_OVFL_REUSE structure, + * the untyped value immediately follows the address. + */ +#define WT_OVFL_REUSE_ADDR(p) \ + ((void *)((uint8_t *)(p) + (p)->addr_offset)) +#define WT_OVFL_REUSE_VALUE(p) \ + ((void *)((uint8_t *)(p) + (p)->value_offset)) + + WT_OVFL_REUSE *next[0]; /* Forward-linked skip list */ +}; + +/* + * Overflow tracking for cached values: When a page is reconciled, we write new + * K/V overflow items, and discard previous underlying blocks. If there's a + * transaction in the system that needs to read the previous value, we have to + * cache the old value until no running transaction needs it. + */ +struct __wt_ovfl_txnc { + uint64_t current; /* Maximum transaction ID at store */ + + uint32_t value_offset; /* Overflow value offset */ + uint32_t value_size; /* Overflow value size */ + uint8_t addr_offset; /* Overflow addr offset */ + uint8_t addr_size; /* Overflow addr size */ + + /* + * The untyped address immediately follows the WT_OVFL_TXNC + * structure, the untyped value immediately follows the address. + */ +#define WT_OVFL_TXNC_ADDR(p) \ + ((void *)((uint8_t *)(p) + (p)->addr_offset)) +#define WT_OVFL_TXNC_VALUE(p) \ + ((void *)((uint8_t *)(p) + (p)->value_offset)) + + WT_OVFL_TXNC *next[0]; /* Forward-linked skip list */ +}; + +/* + * WT_PAGE_MODIFY -- + * When a page is modified, there's additional information to maintain. + */ +struct __wt_page_modify { + /* + * Track the highest transaction ID at which the page was written to + * disk. This can be used to avoid trying to write the page multiple + * times if a snapshot is keeping old versions pinned (e.g., in a + * checkpoint). + */ + uint64_t disk_snap_min; + + /* The largest transaction ID seen on the page by reconciliation. */ + uint64_t rec_max_txn; + + /* The first unwritten transaction ID (approximate). 
*/ + uint64_t first_dirty_txn; + + /* The largest update transaction ID (approximate). */ + uint64_t update_txn; + + /* Dirty bytes added to the cache. */ + uint64_t bytes_dirty; + + /* + * When pages are reconciled, the result is one or more replacement + * blocks. A replacement block can be in one of two states: it was + * written to disk, and so we have a block address, or it contained + * unresolved modifications and we have a disk image for it with a + * list of those unresolved modifications. The former is the common + * case: we only build lists of unresolved modifications when we're + * evicting a page, and we only expect to see unresolved modifications + * on a page being evicted in the case of a hot page that's too large + * to keep in memory as it is. In other words, checkpoints will skip + * unresolved modifications, and will write the blocks rather than + * build lists of unresolved modifications. + * + * Ugly union/struct layout to conserve memory, we never have both + * a replace address and multiple replacement blocks. + */ + union { + WT_ADDR replace; /* Single, written replacement block */ +#define mod_replace u1.replace + + struct { /* Multiple replacement blocks */ + struct __wt_multi { + /* + * Block's key: either a column-store record number or a + * row-store variable length byte string. + */ + union { + uint64_t recno; + WT_IKEY *ikey; + } key; + + /* + * Eviction, but block wasn't written: unresolved updates and + * associated disk image. + * + * Skipped updates are either a WT_INSERT, or a row-store leaf + * page entry. + */ + struct __wt_upd_skipped { + WT_INSERT *ins; + WT_ROW *rip; + } *skip; + uint32_t skip_entries; + void *skip_dsk; + + /* + * Block was written: address, size and checksum. + * On subsequent reconciliations of this page, we avoid writing + * the block if it's unchanged by comparing size and checksum; + * the reuse flag is set when the block is unchanged and we're + * reusing a previous address. 
+ */ + WT_ADDR addr; + uint32_t size; + uint32_t cksum; + } *multi; + uint32_t multi_entries; /* Multiple blocks element count */ + } m; +#define mod_multi u1.m.multi +#define mod_multi_entries u1.m.multi_entries + } u1; + + /* + * Internal pages need to be able to chain root-page splits and have a + * special transactional eviction requirement. Column-store leaf pages + * need update and append lists. + * + * Ugly union/struct layout to conserve memory, a page is either a leaf + * page or an internal page. + */ + union { + struct { + /* + * When a root page splits, we create a new page and write it; + * the new page can also split and so on, and we continue this + * process until we write a single replacement root page. We + * use the root split field to track the list of created pages + * so they can be discarded when no longer needed. + */ + WT_PAGE *root_split; /* Linked list of root split pages */ + + /* + * When we deepen the tree, newly created internal pages cannot + * be evicted until all threads have exited the original page + * index structure. We set a transaction value during the split + * that's checked during eviction. + */ + uint64_t split_txn; /* Split eviction transaction value */ + } intl; +#define mod_root_split u2.intl.root_split +#define mod_split_txn u2.intl.split_txn + struct { + /* + * Appended items to column-stores: there is only a single one + * of these per column-store tree. + */ + WT_INSERT_HEAD **append; + + /* + * Updated items in column-stores: variable-length RLE entries + * can expand to multiple entries which requires some kind of + * list we can expand on demand. Updated items in fixed-length + * files could be done based on an WT_UPDATE array as in + * row-stores, but there can be a very large number of bits on + * a single page, and the cost of the WT_UPDATE array would be + * huge. 
+ */ + WT_INSERT_HEAD **update; + } leaf; +#define mod_append u2.leaf.append +#define mod_update u2.leaf.update + } u2; + + /* + * Overflow record tracking for reconciliation. We assume overflow + * records are relatively rare, so we don't allocate the structures + * to track them until we actually see them in the data. + */ + struct __wt_ovfl_track { + /* + * Overflow key/value address/byte-string pairs we potentially + * reuse each time we reconcile the page. + */ + WT_OVFL_REUSE *ovfl_reuse[WT_SKIP_MAXDEPTH]; + + /* + * Overflow value address/byte-string pairs cached until no + * running transaction will possibly read them. + */ + WT_OVFL_TXNC *ovfl_txnc[WT_SKIP_MAXDEPTH]; + + /* + * Overflow key/value addresses to be discarded from the block + * manager after reconciliation completes successfully. + */ + WT_CELL **discard; + size_t discard_entries; + size_t discard_allocated; + } *ovfl_track; + + /* + * The write generation is incremented when a page is modified, a page + * is clean if the write generation is 0. + * + * !!! + * 4B values are probably larger than required, but I'm more confident + * 4B types will always be backed by atomic writes to memory. + */ + uint32_t write_gen; + +#define WT_PAGE_LOCK(s, p) \ + __wt_spin_lock((s), &S2C(s)->page_lock[(p)->modify->page_lock]) +#define WT_PAGE_UNLOCK(s, p) \ + __wt_spin_unlock((s), &S2C(s)->page_lock[(p)->modify->page_lock]) + uint8_t page_lock; /* Page's spinlock */ + +#define WT_PM_REC_EMPTY 0x01 /* Reconciliation: no replacement */ +#define WT_PM_REC_MULTIBLOCK 0x02 /* Reconciliation: multiple blocks */ +#define WT_PM_REC_REPLACE 0x04 /* Reconciliation: single block */ +#define WT_PM_REC_MASK \ + (WT_PM_REC_EMPTY | WT_PM_REC_MULTIBLOCK | WT_PM_REC_REPLACE) + uint8_t flags; /* Page flags */ +}; + +/* + * WT_PAGE -- + * The WT_PAGE structure describes the in-memory page information. + */ +struct __wt_page { + /* Per page-type information. */ + union { + /* + * Internal pages (both column- and row-store). 
+ * + * The page record number is only used by column-store, but it + * makes some things simpler and it doesn't cost us any memory, + * other structures in this union are still as large. + * + * In-memory internal pages have an array of pointers to child + * structures, maintained in collated order. When a page is + * read into memory, the initial list of children is stored in + * the "orig_index" field, and it and the collated order are + * the same. After a page splits, the collated order and the + * original order will differ. + * + * Multiple threads of control may be searching the in-memory + * internal page and a child page of the internal page may + * cause a split at any time. When a page splits, a new array + * is allocated and atomically swapped into place. Threads in + * the old array continue without interruption (the old array is + * still valid), but have to avoid racing. No barrier is needed + * because the array reference is updated atomically, but code + * reading the fields multiple times would be a very bad idea. + * Specifically, do not do this: + * WT_REF **refp = page->u.intl__index->index; + * uint32_t entries = page->u.intl__index->entries; + * + * The field is declared volatile (so the compiler knows not to + * read it multiple times), and we obscure the field name and + * use a copy macro in all references to the field (so the code + * doesn't read it multiple times). + */ + struct { + uint64_t recno; /* Starting recno */ + WT_REF *parent_ref; /* Parent reference */ + + struct __wt_page_index { + uint32_t entries; + WT_REF **index; + } * volatile __index; /* Collated children */ + } intl; +#undef pg_intl_recno +#define pg_intl_recno u.intl.recno +#define pg_intl_parent_ref u.intl.parent_ref + + /* + * Macros to copy/set the index because the name is obscured to ensure + * the field isn't read multiple times. 
+ */ +#define WT_INTL_INDEX_COPY(page) ((page)->u.intl.__index) +#define WT_INTL_INDEX_SET(page, v) do { \ + WT_WRITE_BARRIER(); \ + ((page)->u.intl.__index) = (v); \ +} while (0) + + /* + * Macro to walk the list of references in an internal page. + */ +#define WT_INTL_FOREACH_BEGIN(session, page, ref) do { \ + WT_PAGE_INDEX *__pindex; \ + WT_REF **__refp; \ + WT_SESSION_IMPL *__session = (session); \ + uint32_t __entries; \ + WT_ENTER_PAGE_INDEX(session); \ + for (__pindex = WT_INTL_INDEX_COPY(page), \ + __refp = __pindex->index, \ + __entries = __pindex->entries; __entries > 0; --__entries) {\ + (ref) = *__refp++; +#define WT_INTL_FOREACH_END \ + } \ + WT_LEAVE_PAGE_INDEX(__session); \ + } while (0) + + /* Row-store leaf page. */ + struct { + WT_ROW *d; /* Key/value pairs */ + + /* + * The column-store leaf page modification structures + * live in the WT_PAGE_MODIFY structure to keep the + * WT_PAGE structure as small as possible for read-only + * pages. For consistency, we could move the row-store + * modification structures into WT_PAGE_MODIFY too, but + * that doesn't shrink WT_PAGE any further and it would + * require really ugly naming inside of WT_PAGE_MODIFY + * to avoid growing that structure. + */ + WT_INSERT_HEAD **ins; /* Inserts */ + WT_UPDATE **upd; /* Updates */ + + uint32_t entries; /* Entries */ + } row; +#undef pg_row_d +#define pg_row_d u.row.d +#undef pg_row_ins +#define pg_row_ins u.row.ins +#undef pg_row_upd +#define pg_row_upd u.row.upd +#define pg_row_entries u.row.entries +#define pg_row_entries u.row.entries + + /* Fixed-length column-store leaf page. */ + struct { + uint64_t recno; /* Starting recno */ + + uint8_t *bitf; /* Values */ + uint32_t entries; /* Entries */ + } col_fix; +#undef pg_fix_recno +#define pg_fix_recno u.col_fix.recno +#undef pg_fix_bitf +#define pg_fix_bitf u.col_fix.bitf +#undef pg_fix_entries +#define pg_fix_entries u.col_fix.entries + + /* Variable-length column-store leaf page. 
*/ + struct { + uint64_t recno; /* Starting recno */ + + WT_COL *d; /* Values */ + + /* + * Variable-length column-store files maintain a list of + * RLE entries on the page so it's unnecessary to walk + * the page counting records to find a specific entry. + */ + WT_COL_RLE *repeats; /* RLE array for lookups */ + uint32_t nrepeats; /* Number of repeat slots */ + + uint32_t entries; /* Entries */ + } col_var; +#undef pg_var_recno +#define pg_var_recno u.col_var.recno +#undef pg_var_d +#define pg_var_d u.col_var.d +#undef pg_var_repeats +#define pg_var_repeats u.col_var.repeats +#undef pg_var_nrepeats +#define pg_var_nrepeats u.col_var.nrepeats +#undef pg_var_entries +#define pg_var_entries u.col_var.entries + } u; + + /* Page's on-disk representation: NULL for pages created in memory. */ + const WT_PAGE_HEADER *dsk; + + /* If/when the page is modified, we need lots more information. */ + WT_PAGE_MODIFY *modify; + + /* + * The page's read generation acts as an LRU value for each page in the + * tree; it is used by the eviction server thread to select pages to be + * discarded from the in-memory tree. + * + * The read generation is a 64-bit value, if incremented frequently, a + * 32-bit value could overflow. + * + * The read generation is a piece of shared memory potentially read + * by many threads. We don't want to update page read generations for + * in-cache workloads and suffer the cache misses, so we don't simply + * increment the read generation value on every access. Instead, the + * read generation is incremented by the eviction server each time it + * becomes active. To avoid incrementing a page's read generation too + * frequently, it is set to a future point. 
+ */ +#define WT_READGEN_NOTSET 0 +#define WT_READGEN_OLDEST 1 +#define WT_READGEN_STEP 100 + uint64_t read_gen; + + uint64_t memory_footprint; /* Memory attached to the page */ + +#define WT_PAGE_IS_INTERNAL(page) \ + ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT) +#define WT_PAGE_INVALID 0 /* Invalid page */ +#define WT_PAGE_BLOCK_MANAGER 1 /* Block-manager page */ +#define WT_PAGE_COL_FIX 2 /* Col-store fixed-len leaf */ +#define WT_PAGE_COL_INT 3 /* Col-store internal page */ +#define WT_PAGE_COL_VAR 4 /* Col-store var-length leaf page */ +#define WT_PAGE_OVFL 5 /* Overflow page */ +#define WT_PAGE_ROW_INT 6 /* Row-store internal page */ +#define WT_PAGE_ROW_LEAF 7 /* Row-store leaf page */ + uint8_t type; /* Page type */ + +#define WT_PAGE_BUILD_KEYS 0x01 /* Keys have been built in memory */ +#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ +#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ +#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ +#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */ +#define WT_PAGE_SPLITTING 0x20 /* An internal page is growing. */ + uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ +}; + +/* + * WT_PAGE_DISK_OFFSET, WT_PAGE_REF_OFFSET -- + * Return the offset/pointer of a pointer/offset in a page disk image. + */ +#define WT_PAGE_DISK_OFFSET(page, p) \ + WT_PTRDIFF32(p, (page)->dsk) +#define WT_PAGE_REF_OFFSET(page, o) \ + ((void *)((uint8_t *)((page)->dsk) + (o))) + +/* + * Page state. + * + * Synchronization is based on the WT_REF->state field, which has a number of + * possible states: + * + * WT_REF_DISK: + * The initial setting before a page is brought into memory, and set as a + * result of page eviction; the page is on disk, and must be read into + * memory before use. WT_REF_DISK has a value of 0 (the default state + * after allocating cleared memory). 
+ * + * WT_REF_DELETED: + * The page is on disk, but has been deleted from the tree; we can delete + * row-store leaf pages without reading them if they don't reference + * overflow items. + * + * WT_REF_LOCKED: + * Locked for exclusive access. In eviction, this page or a parent has + * been selected for eviction; once hazard pointers are checked, the page + * will be evicted. When reading a page that was previously deleted, it + * is locked until the page is in memory with records marked deleted. The + * thread that set the page to WT_REF_LOCKED has exclusive access, no + * other thread may use the WT_REF until the state is changed. + * + * WT_REF_MEM: + * Set by a reading thread once the page has been read from disk; the page + * is in the cache and the page reference is OK. + * + * WT_REF_READING: + * Set by a reading thread before reading an ordinary page from disk; + * other readers of the page wait until the read completes. Sync can + * safely skip over such pages: they are clean by definition. + * + * WT_REF_SPLIT: + * Set when the page is split; the WT_REF is dead and can no longer be + * used. + * + * The life cycle of a typical page goes like this: pages are read into memory + * from disk and their state set to WT_REF_MEM. When the page is selected for + * eviction, the page state is set to WT_REF_LOCKED. In all cases, evicting + * threads reset the page's state when finished with the page: if eviction was + * successful (a clean page was discarded, and a dirty page was written to disk + * and then discarded), the page state is set to WT_REF_DISK; if eviction failed + * because the page was busy, page state is reset to WT_REF_MEM. + * + * Readers check the state field and if it's WT_REF_MEM, they set a hazard + * pointer to the page, flush memory and re-confirm the page state. If the + * page state is unchanged, the reader has a valid reference and can proceed. 
+ * + * When an evicting thread wants to discard a page from the tree, it sets the + * WT_REF_LOCKED state, flushes memory, then checks hazard pointers. If a + * hazard pointer is found, state is reset to WT_REF_MEM, restoring the page + * to the readers. If the evicting thread does not find a hazard pointer, + * the page is evicted. + */ +typedef enum __wt_page_state { + WT_REF_DISK=0, /* Page is on disk */ + WT_REF_DELETED, /* Page is on disk, but deleted */ + WT_REF_LOCKED, /* Page locked for exclusive access */ + WT_REF_MEM, /* Page is in cache and valid */ + WT_REF_READING, /* Page being read */ + WT_REF_SPLIT /* Page was split */ +} WT_PAGE_STATE; + +/* + * WT_PAGE_DELETED -- + * Related information for fast-delete, on-disk pages. + */ +struct __wt_page_deleted { + uint64_t txnid; /* Transaction ID */ + + WT_UPDATE **update_list; /* List of updates for abort */ +}; + +/* + * WT_REF -- + * A single in-memory page and the state information used to determine if + * it's OK to dereference the pointer to the page. + */ +struct __wt_ref { + WT_PAGE *page; /* Page */ + + /* + * When the tree deepens as a result of a split, the home page value + * changes. Don't cache it, we need to see that change when looking + * up our slot in the page's index structure. + */ + WT_PAGE * volatile home; /* Reference page */ + uint32_t ref_hint; /* Reference page index hint */ + + volatile WT_PAGE_STATE state; /* Page state */ + + /* + * Address: on-page cell if read from backing block, off-page WT_ADDR + * if instantiated in-memory, or NULL if page created in-memory. + */ + void *addr; + + /* + * The child page's key. Do NOT change this union without reviewing + * __wt_ref_key. 
 + */
+ union {
+ uint64_t recno; /* Column-store: starting recno */
+ void *ikey; /* Row-store: key */
+ } key;
+
+ WT_PAGE_DELETED *page_del; /* Deleted on-disk page information */
+};
+/*
+ * WT_REF_SIZE is the expected structure size -- we verify the build to ensure
+ * the compiler hasn't inserted padding which would break the world.
+ */
+#define WT_REF_SIZE 48
+
+/*
+ * WT_ROW --
+ * Each in-memory row-store leaf page has an array of WT_ROW structures:
+ * this is created from on-page data when a page is read from the file. It's
+ * sorted by key, fixed in size, and starts with a reference to on-page data.
+ *
+ * Multiple threads of control may be searching the in-memory row-store pages,
+ * and the key may be instantiated at any time. Code must be able to handle
+ * both when the key has not been instantiated (the key field points into the
+ * page's disk image), and when the key has been instantiated (the key field
+ * points outside the page's disk image). We don't need barriers because the
+ * key is updated atomically, but code that reads the key field multiple times
+ * is a very, very bad idea. Specifically, do not do this:
+ *
+ * key = rip->key;
+ * if (key_is_on_page(key)) {
+ * cell = rip->key;
+ * }
+ *
+ * The field is declared volatile (so the compiler knows it shouldn't read it
+ * multiple times), and we obscure the field name and use a copy macro in all
+ * references to the field (so the code doesn't read it multiple times), all
+ * to make sure we don't introduce this bug (again).
+ */
+struct __wt_row { /* On-page key, on-page cell, or off-page WT_IKEY */
+ void * volatile __key;
+};
+#define WT_ROW_KEY_COPY(rip) ((rip)->__key)
+#define WT_ROW_KEY_SET(rip, v) ((rip)->__key) = (void *)(v)
+
+/*
+ * WT_ROW_FOREACH --
+ * Walk the entries of an in-memory row-store leaf page. 
+ */ +#define WT_ROW_FOREACH(page, rip, i) \ + for ((i) = (page)->pg_row_entries, \ + (rip) = (page)->pg_row_d; (i) > 0; ++(rip), --(i)) +#define WT_ROW_FOREACH_REVERSE(page, rip, i) \ + for ((i) = (page)->pg_row_entries, \ + (rip) = (page)->pg_row_d + ((page)->pg_row_entries - 1); \ + (i) > 0; --(rip), --(i)) + +/* + * WT_ROW_SLOT -- + * Return the 0-based array offset based on a WT_ROW reference. + */ +#define WT_ROW_SLOT(page, rip) \ + ((uint32_t)(((WT_ROW *)(rip)) - (page)->pg_row_d)) + +/* + * WT_COL -- + * Each in-memory variable-length column-store leaf page has an array of WT_COL + * structures: this is created from on-page data when a page is read from the + * file. It's fixed in size, and references data on the page. + */ +struct __wt_col { + /* + * Variable-length column-store data references are page offsets, not + * pointers (we boldly re-invent short pointers). The trade-off is 4B + * per K/V pair on a 64-bit machine vs. a single cycle for the addition + * of a base pointer. The on-page data is a WT_CELL (same as row-store + * pages). + * + * If the value is 0, it's a single, deleted record. + * + * Obscure the field name, code shouldn't use WT_COL->__col_value, the + * public interface is WT_COL_PTR and WT_COL_PTR_SET. + */ + uint32_t __col_value; +}; + +/* + * WT_COL_RLE -- + * In variable-length column store leaf pages, we build an array of entries + * with RLE counts greater than 1 when reading the page. We can do a binary + * search in this array, then an offset calculation to find the cell. + */ +struct __wt_col_rle { + uint64_t recno; /* Record number of first repeat. */ + uint64_t rle; /* Repeat count. */ + uint32_t indx; /* Slot of entry in col_var.d */ +} WT_GCC_ATTRIBUTE((packed)); + +/* + * WT_COL_PTR, WT_COL_PTR_SET -- + * Return/Set a pointer corresponding to the data offset. (If the item does + * not exist on the page, return a NULL.) + */ +#define WT_COL_PTR(page, cip) \ + ((cip)->__col_value == 0 ? 
\ + NULL : WT_PAGE_REF_OFFSET(page, (cip)->__col_value)) +#define WT_COL_PTR_SET(cip, value) \ + (cip)->__col_value = (value) + +/* + * WT_COL_FOREACH -- + * Walk the entries of variable-length column-store leaf page. + */ +#define WT_COL_FOREACH(page, cip, i) \ + for ((i) = (page)->pg_var_entries, \ + (cip) = (page)->pg_var_d; (i) > 0; ++(cip), --(i)) + +/* + * WT_COL_SLOT -- + * Return the 0-based array offset based on a WT_COL reference. + */ +#define WT_COL_SLOT(page, cip) \ + ((uint32_t)(((WT_COL *)cip) - (page)->pg_var_d)) + +/* + * WT_IKEY -- + * Instantiated key: row-store keys are usually prefix compressed and sometimes + * Huffman encoded or overflow objects. Normally, a row-store page in-memory + * key points to the on-page WT_CELL, but in some cases, we instantiate the key + * in memory, in which case the row-store page in-memory key points to a WT_IKEY + * structure. + */ +struct __wt_ikey { + uint32_t size; /* Key length */ + + /* + * If we no longer point to the key's on-page WT_CELL, we can't find its + * related value. Save the offset of the key cell in the page. + * + * Row-store cell references are page offsets, not pointers (we boldly + * re-invent short pointers). The trade-off is 4B per K/V pair on a + * 64-bit machine vs. a single cycle for the addition of a base pointer. + */ + uint32_t cell_offset; + + /* The key bytes immediately follow the WT_IKEY structure. */ +#define WT_IKEY_DATA(ikey) \ + ((void *)((uint8_t *)(ikey) + sizeof(WT_IKEY))) +}; + +/* + * WT_UPDATE -- + * Entries on leaf pages can be updated, either modified or deleted. Updates + * to entries referenced from the WT_ROW and WT_COL arrays are stored in the + * page's WT_UPDATE array. When the first element on a page is updated, the + * WT_UPDATE array is allocated, with one slot for every existing element in + * the page. A slot points to a WT_UPDATE structure; if more than one update + * is done for an entry, WT_UPDATE structures are formed into a forward-linked + * list. 
+ */ +struct __wt_update { + uint64_t txnid; /* update transaction */ + + WT_UPDATE *next; /* forward-linked list */ + + /* + * We use the maximum size as an is-deleted flag, which means we can't + * store 4GB objects; I'd rather do that than increase the size of this + * structure for a flag bit. + */ +#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == UINT32_MAX) +#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = UINT32_MAX) + uint32_t size; /* update length */ + + /* The untyped value immediately follows the WT_UPDATE structure. */ +#define WT_UPDATE_DATA(upd) \ + ((void *)((uint8_t *)(upd) + sizeof(WT_UPDATE))) +} WT_GCC_ATTRIBUTE((packed)); + +/* + * WT_INSERT -- + * + * Row-store leaf pages support inserts of new K/V pairs. When the first K/V + * pair is inserted, the WT_INSERT_HEAD array is allocated, with one slot for + * every existing element in the page, plus one additional slot. A slot points + * to a WT_INSERT_HEAD structure for the items which sort after the WT_ROW + * element that references it and before the subsequent WT_ROW element; the + * skiplist structure has a randomly chosen depth of next pointers in each + * inserted node. + * + * The additional slot is because it's possible to insert items smaller than any + * existing key on the page: for that reason, the first slot of the insert array + * holds keys smaller than any other key on the page. + * + * In column-store variable-length run-length encoded pages, a single indx + * entry may reference a large number of records, because there's a single + * on-page entry representing many identical records. (We don't expand those + * entries when the page comes into memory, as that would require resources as + * pages are moved to/from the cache, including read-only files.) Instead, a + * single indx entry represents all of the identical records originally found + * on the page. 
+ * + * Modifying (or deleting) run-length encoded column-store records is hard + * because the page's entry no longer references a set of identical items. We + * handle this by "inserting" a new entry into the insert array, with its own + * record number. (This is the only case where it's possible to insert into a + * column-store: only appends are allowed, as insert requires re-numbering + * subsequent records. Berkeley DB did support mutable records, but it won't + * scale and it isn't useful enough to re-implement, IMNSHO.) + */ +struct __wt_insert { + WT_UPDATE *upd; /* value */ + + union { + uint64_t recno; /* column-store record number */ + struct { + uint32_t offset; /* row-store key data start */ + uint32_t size; /* row-store key data size */ + } key; + } u; + +#define WT_INSERT_KEY_SIZE(ins) (((WT_INSERT *)ins)->u.key.size) +#define WT_INSERT_KEY(ins) \ + ((void *)((uint8_t *)(ins) + ((WT_INSERT *)ins)->u.key.offset)) +#define WT_INSERT_RECNO(ins) (((WT_INSERT *)ins)->u.recno) + + WT_INSERT *next[0]; /* forward-linked skip list */ +}; + +/* + * Skiplist helper macros. + */ +#define WT_SKIP_FIRST(ins_head) \ + (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)ins_head)->head[0]) +#define WT_SKIP_LAST(ins_head) \ + (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)ins_head)->tail[0]) +#define WT_SKIP_NEXT(ins) ((ins)->next[0]) +#define WT_SKIP_FOREACH(ins, ins_head) \ + for ((ins) = WT_SKIP_FIRST(ins_head); \ + (ins) != NULL; \ + (ins) = WT_SKIP_NEXT(ins)) + +/* + * Atomically allocate and swap a structure or array into place. + */ +#define WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count) do { \ + if (((v) = (dest)) == NULL) { \ + WT_ERR(__wt_calloc_def(s, count, &(v))); \ + if (WT_ATOMIC_CAS8(dest, NULL, v)) \ + __wt_cache_page_inmem_incr( \ + s, page, (count) * sizeof(*(v))); \ + else \ + __wt_free(s, v); \ + } \ +} while (0) + +/* + * WT_INSERT_HEAD -- + * The head of a skiplist of WT_INSERT items. 
 + */
+struct __wt_insert_head {
+ WT_INSERT *head[WT_SKIP_MAXDEPTH]; /* first item on skiplists */
+ WT_INSERT *tail[WT_SKIP_MAXDEPTH]; /* last item on skiplists */
+};
+
+/*
+ * The row-store leaf page insert lists are arrays of pointers to structures,
+ * and may not exist. The following macros return an array entry if the array
+ * of pointers and the specific structure exist, else NULL.
+ */
+#define WT_ROW_INSERT_SLOT(page, slot) \
+ ((page)->pg_row_ins == NULL ? NULL : (page)->pg_row_ins[slot])
+#define WT_ROW_INSERT(page, ip) \
+ WT_ROW_INSERT_SLOT(page, WT_ROW_SLOT(page, ip))
+#define WT_ROW_UPDATE(page, ip) \
+ ((page)->pg_row_upd == NULL ? \
+ NULL : (page)->pg_row_upd[WT_ROW_SLOT(page, ip)])
+/*
+ * WT_ROW_INSERT_SMALLEST references an additional slot past the end of
+ * the "one per WT_ROW slot" insert array. That's because the insert array
+ * requires an extra slot to hold keys that sort before any key found on the
+ * original page.
+ */
+#define WT_ROW_INSERT_SMALLEST(page) \
+ ((page)->pg_row_ins == NULL ? \
+ NULL : (page)->pg_row_ins[(page)->pg_row_entries])
+
+/*
+ * The column-store leaf page update lists are arrays of pointers to structures,
+ * and may not exist. The following macros return an array entry if the array
+ * of pointers and the specific structure exist, else NULL.
+ */
+#define WT_COL_UPDATE_SLOT(page, slot) \
+ ((page)->modify == NULL || (page)->modify->mod_update == NULL ? \
+ NULL : (page)->modify->mod_update[slot])
+#define WT_COL_UPDATE(page, ip) \
+ WT_COL_UPDATE_SLOT(page, WT_COL_SLOT(page, ip))
+
+/*
+ * WT_COL_UPDATE_SINGLE is a single WT_INSERT list, used for any fixed-length
+ * column-store updates for a page.
+ */
+#define WT_COL_UPDATE_SINGLE(page) \
+ WT_COL_UPDATE_SLOT(page, 0)
+
+/*
+ * WT_COL_APPEND is a WT_INSERT list, used for fixed- and variable-length
+ * appends.
+ */
+#define WT_COL_APPEND(page) \
+ ((page)->modify != NULL && (page)->modify->mod_append != NULL ? 
\ + (page)->modify->mod_append[0] : NULL) + +/* WT_FIX_FOREACH walks fixed-length bit-fields on a disk page. */ +#define WT_FIX_FOREACH(btree, dsk, v, i) \ + for ((i) = 0, \ + (v) = (i) < (dsk)->u.entries ? \ + __bit_getv( \ + WT_PAGE_HEADER_BYTE(btree, dsk), 0, (btree)->bitcnt) : 0; \ + (i) < (dsk)->u.entries; ++(i), \ + (v) = __bit_getv( \ + WT_PAGE_HEADER_BYTE(btree, dsk), i, (btree)->bitcnt)) + +/* + * Manage split generation numbers. Splits walk the list of sessions to check + * when it is safe to free structures that have been replaced. We also check + * that list periodically (e.g., when wrapping up a transaction) to free any + * memory we can. + * + * Before a thread enters code that will examine page indexes (which are + * swapped out by splits), it publishes a copy of the current split generation + * into its session. Don't assume that threads never re-enter this code: if we + * already have a split generation, leave it alone. If our caller is examining + * an index, we don't want the oldest split generation to move forward and + * potentially free it. + */ +#define WT_ENTER_PAGE_INDEX(session) do { \ + uint64_t __prev_split_gen = (session)->split_gen; \ + if (__prev_split_gen == 0) \ + WT_PUBLISH((session)->split_gen, S2C(session)->split_gen) + +#define WT_LEAVE_PAGE_INDEX(session) \ + if (__prev_split_gen == 0) \ + (session)->split_gen = 0; \ + } while (0) + +#define WT_WITH_PAGE_INDEX(session, e) \ + WT_ENTER_PAGE_INDEX(session); \ + (e); \ + WT_LEAVE_PAGE_INDEX(session) diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h new file mode 100644 index 00000000000..05250951a65 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -0,0 +1,155 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +/* + * Supported btree formats: the "current" version is the maximum supported + * major/minor versions. + */ +#define WT_BTREE_MAJOR_VERSION_MIN 1 /* Oldest version supported */ +#define WT_BTREE_MINOR_VERSION_MIN 1 + +#define WT_BTREE_MAJOR_VERSION_MAX 1 /* Newest version supported */ +#define WT_BTREE_MINOR_VERSION_MAX 1 + +/* + * The maximum btree leaf and internal page size is 512MB (2^29). The limit + * is enforced in software, it could be larger, specifically, the underlying + * default block manager can support 4GB (2^32). Currently, the maximum page + * size must accommodate our dependence on the maximum page size fitting into + * a number of bits less than 32; see the row-store page key-lookup functions + * for the magic. + */ +#define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE) + +/* + * The length of variable-length column-store values and row-store keys/values + * are stored in a 4B type, so the largest theoretical key/value item is 4GB. + * However, in the WT_UPDATE structure we use the UINT32_MAX size as a "deleted" + * flag, and second, the size of an overflow object is constrained by what an + * underlying block manager can actually write. (For example, in the default + * block manager, writing an overflow item includes the underlying block's page + * header and block manager specific structure, aligned to an allocation-sized + * unit). The btree engine limits the size of a single object to (4GB - 1KB); + * that gives us additional bytes if we ever want to store a structure length + * plus the object size in 4B, or if we need additional flag values. Attempts + * to store large key/value items in the tree trigger an immediate check to the + * block manager, to make sure it can write the item. Storing 4GB objects in a + * btree borders on clinical insanity, anyway. + * + * Record numbers are stored in 64-bit unsigned integers, meaning the largest + * record number is "really, really big". 
+ */ +#define WT_BTREE_MAX_OBJECT_SIZE (UINT32_MAX - 1024) + +/* + * A location in a file is a variable-length cookie, but it has a maximum size + * so it's easy to create temporary space in which to store them. (Locations + * can't be much larger than this anyway, they must fit onto the minimum size + * page because a reference to an overflow page is itself a location.) + */ +#define WT_BTREE_MAX_ADDR_COOKIE 255 /* Maximum address cookie */ + +/* + * WT_BTREE -- + * A btree handle. + */ +struct __wt_btree { + WT_DATA_HANDLE *dhandle; + + WT_CKPT *ckpt; /* Checkpoint information */ + + enum { BTREE_COL_FIX=1, /* Fixed-length column store */ + BTREE_COL_VAR=2, /* Variable-length column store */ + BTREE_ROW=3 /* Row-store */ + } type; /* Type */ + + const char *key_format; /* Key format */ + const char *value_format; /* Value format */ + uint8_t bitcnt; /* Fixed-length field size in bits */ + + WT_COLLATOR *collator; /* Row-store comparator */ + int collator_owned; /* The collator needs to be freed */ + + uint32_t id; /* File ID, for logging */ + + uint32_t key_gap; /* Row-store prefix key gap */ + + uint32_t allocsize; /* Allocation size */ + uint32_t maxintlpage; /* Internal page max size */ + uint32_t maxintlitem; /* Internal page max item size */ + uint32_t maxleafpage; /* Leaf page max size */ + uint32_t maxleafitem; /* Leaf page max item size */ + uint64_t maxmempage; /* In memory page max size */ + + void *huffman_key; /* Key huffman encoding */ + void *huffman_value; /* Value huffman encoding */ + + enum { CKSUM_ON=1, /* On */ + CKSUM_OFF=2, /* Off */ + CKSUM_UNCOMPRESSED=3 /* Uncompressed blocks only */ + } checksum; /* Checksum configuration */ + + u_int dictionary; /* Reconcile: dictionary slots */ + int internal_key_truncate; /* Reconcile: internal key truncate */ + int maximum_depth; /* Reconcile: maximum tree depth */ + int prefix_compression; /* Reconcile: prefix compression */ + u_int prefix_compression_min; /* Reconcile: prefix compression min */ + 
int split_pct; /* Reconcile: split page percent */ + WT_COMPRESSOR *compressor; /* Reconcile: page compressor */ + WT_RWLOCK *ovfl_lock; /* Reconcile: overflow lock */ + + uint64_t last_recno; /* Column-store last record number */ + + WT_REF root; /* Root page reference */ + int modified; /* If the tree ever modified */ + int bulk_load_ok; /* Bulk-load is a possibility */ + + WT_BM *bm; /* Block manager reference */ + u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */ + + uint64_t write_gen; /* Write generation */ + + WT_REF *evict_ref; /* Eviction thread's location */ + uint64_t evict_priority; /* Relative priority of cached pages */ + u_int evict_walk_period; /* Skip this many LRU walks */ + u_int evict_walk_skips; /* Number of walks skipped */ + volatile uint32_t evict_busy; /* Count of threads in eviction */ + + int checkpointing; /* Checkpoint in progress */ + + /* + * We flush pages from the tree (in order to make checkpoint faster), + * without a high-level lock. To avoid multiple threads flushing at + * the same time, lock the tree. + */ + WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ + + /* Flags values up to 0xff are reserved for WT_DHANDLE_* */ +#define WT_BTREE_BULK 0x00100 /* Bulk-load handle */ +#define WT_BTREE_NO_EVICTION 0x00200 /* Disable eviction */ +#define WT_BTREE_NO_HAZARD 0x00400 /* Disable hazard pointers */ +#define WT_BTREE_SALVAGE 0x00800 /* Handle is for salvage */ +#define WT_BTREE_UPGRADE 0x01000 /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x02000 /* Handle is for verify */ + uint32_t flags; +}; + +/* Flags that make a btree handle special (not for normal use). */ +#define WT_BTREE_SPECIAL_FLAGS \ + (WT_BTREE_BULK | WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY) + +/* + * WT_SALVAGE_COOKIE -- + * Encapsulation of salvage information for reconciliation. 
+ */ +struct __wt_salvage_cookie { + uint64_t missing; /* Initial items to create */ + uint64_t skip; /* Initial items to skip */ + uint64_t take; /* Items to take */ + + int done; /* Ignore the rest */ +}; diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i new file mode 100644 index 00000000000..b7957e6647f --- /dev/null +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -0,0 +1,1216 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * __wt_ref_is_root -- + * Return if the page reference is for the root page. + */ +static inline int +__wt_ref_is_root(WT_REF *ref) +{ + return (ref->home == NULL ? 1 : 0); +} + +/* + * __wt_page_is_modified -- + * Return if the page is dirty. + */ +static inline int +__wt_page_is_modified(WT_PAGE *page) +{ + return (page->modify != NULL && page->modify->write_gen != 0 ? 1 : 0); +} + +/* + * Estimate the per-allocation overhead. All implementations of malloc / free + * have some kind of header and pad for alignment. We can't know for sure what + * that adds up to, but this is an estimate based on some measurements of heap + * size versus bytes in use. + */ +#define WT_ALLOC_OVERHEAD 32U + +/* + * __wt_cache_page_inmem_incr -- + * Increment a page's memory footprint in the cache. + */ +static inline void +__wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) +{ + WT_CACHE *cache; + + size += WT_ALLOC_OVERHEAD; + + cache = S2C(session)->cache; + (void)WT_ATOMIC_ADD8(cache->bytes_inmem, size); + (void)WT_ATOMIC_ADD8(page->memory_footprint, size); + if (__wt_page_is_modified(page)) { + (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size); + (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size); + } +} + +/* + * __wt_cache_page_inmem_decr -- + * Decrement a page's memory footprint in the cache. 
+ */ +static inline void +__wt_cache_page_inmem_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) +{ + WT_CACHE *cache; + + size += WT_ALLOC_OVERHEAD; + + cache = S2C(session)->cache; + (void)WT_ATOMIC_SUB8(cache->bytes_inmem, size); + (void)WT_ATOMIC_SUB8(page->memory_footprint, size); + if (__wt_page_is_modified(page)) { + (void)WT_ATOMIC_SUB8(cache->bytes_dirty, size); + (void)WT_ATOMIC_SUB8(page->modify->bytes_dirty, size); + } +} + +/* + * __wt_cache_dirty_incr -- + * Increment the cache dirty page/byte counts. + */ +static inline void +__wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_CACHE *cache; + size_t size; + + cache = S2C(session)->cache; + (void)WT_ATOMIC_ADD8(cache->pages_dirty, 1); + + /* + * Take care to read the memory_footprint once in case we are racing + * with updates. + */ + size = page->memory_footprint; + (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size); + (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size); +} + +/* + * __wt_cache_dirty_decr -- + * Decrement the cache dirty page/byte counts. + */ +static inline void +__wt_cache_dirty_decr(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_CACHE *cache; + size_t size; + + cache = S2C(session)->cache; + + if (cache->pages_dirty < 1) { + (void)__wt_errx(session, + "cache dirty decrement failed: cache dirty page count went " + "negative"); + cache->pages_dirty = 0; + } else + (void)WT_ATOMIC_SUB8(cache->pages_dirty, 1); + + /* + * It is possible to decrement the footprint of the page without making + * the page dirty (for example when freeing an obsolete update list), + * so the footprint could change between read and decrement, and we + * might attempt to decrement by a different amount than the bytes held + * by the page. + * + * We catch that by maintaining a per-page dirty size, and fixing the + * cache stats if that is non-zero when the page is discarded. + * + * Also take care that the global size doesn't go negative. 
This may + * lead to small accounting errors (particularly on the last page of the + * last file in a checkpoint), but that will come out in the wash when + * the page is evicted. + */ + size = WT_MIN(page->memory_footprint, cache->bytes_dirty); + (void)WT_ATOMIC_SUB8(cache->bytes_dirty, size); + (void)WT_ATOMIC_SUB8(page->modify->bytes_dirty, size); +} + +/* + * __wt_cache_page_evict -- + * Evict pages from the cache. + */ +static inline void +__wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_CACHE *cache; + WT_PAGE_MODIFY *mod; + + cache = S2C(session)->cache; + mod = page->modify; + + /* + * In rare cases, we may race tracking a page's dirty footprint. + * If so, we will get here with a non-zero dirty_size in the page, and + * we can fix the global stats. + */ + if (mod != NULL && mod->bytes_dirty != 0) + (void)WT_ATOMIC_SUB8(cache->bytes_dirty, mod->bytes_dirty); + + WT_ASSERT(session, page->memory_footprint != 0); + (void)WT_ATOMIC_ADD8(cache->bytes_evict, page->memory_footprint); + page->memory_footprint = 0; + + (void)WT_ATOMIC_ADD8(cache->pages_evict, 1); +} + +/* + * __wt_cache_read_gen -- + * Get the current read generation number. + */ +static inline uint64_t +__wt_cache_read_gen(WT_SESSION_IMPL *session) +{ + return (S2C(session)->cache->read_gen); +} + +/* + * __wt_cache_read_gen_incr -- + * Increment the current read generation number. + */ +static inline void +__wt_cache_read_gen_incr(WT_SESSION_IMPL *session) +{ + ++S2C(session)->cache->read_gen; +} + +/* + * __wt_cache_read_gen_set -- + * Get the read generation to store in a page. + */ +static inline uint64_t +__wt_cache_read_gen_set(WT_SESSION_IMPL *session) +{ + /* + * We return read-generations from the future (where "the future" is + * measured by increments of the global read generation). 
 + * The reason
+ * is because when acquiring a new hazard pointer for a page, we can
+ * check its read generation, and if the read generation isn't less
+ * than the current global generation, we don't bother updating the
+ * page. In other words, the goal is to avoid some number of updates
+ * immediately after each update we have to make.
+ */
+ return (__wt_cache_read_gen(session) + WT_READGEN_STEP);
+}
+
+/*
+ * __wt_cache_pages_inuse --
+ * Return the number of pages in use.
+ */
+static inline uint64_t
+__wt_cache_pages_inuse(WT_CACHE *cache)
+{
+ return (cache->pages_inmem - cache->pages_evict);
+}
+
+/*
+ * __wt_cache_bytes_inuse --
+ * Return the number of bytes in use.
+ */
+static inline uint64_t
+__wt_cache_bytes_inuse(WT_CACHE *cache)
+{
+ return (cache->bytes_inmem - cache->bytes_evict);
+}
+
+/*
+ * __wt_page_refp --
+ * Return the page's index and slot for a reference.
+ */
+static inline void
+__wt_page_refp(WT_SESSION_IMPL *session,
+ WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
+{
+ WT_PAGE_INDEX *pindex;
+ uint32_t i;
+
+ WT_ASSERT(session,
+ WT_SESSION_TXN_STATE(session)->snap_min != WT_TXN_NONE);
+
+ /*
+ * Copy the parent page's index value: the page can split at any time,
+ * but the index's value is always valid, even if it's not up-to-date.
+ */
+retry: pindex = WT_INTL_INDEX_COPY(ref->home);
+
+ /*
+ * Use the page's reference hint: it should be correct unless the page
+ * split before our slot. If the page splits after our slot, the hint
+ * will point earlier in the array than our actual slot, so the first
+ * loop is from the hint to the end of the list, and the second loop
+ * is from the start of the list to the end of the list. (The second
+ * loop overlaps the first, but that only happens in cases where we've
+ * deepened the tree and aren't going to find our slot at all, that's
+ * not worth optimizing.) 
 + *
+ * It's not an error for the reference hint to be wrong, it just means
+ * the first retrieval (which sets the hint for subsequent retrievals)
+ * is slower.
+ */
+ for (i = ref->ref_hint; i < pindex->entries; ++i)
+ if (pindex->index[i]->page == ref->page) {
+ *pindexp = pindex;
+ *slotp = ref->ref_hint = i;
+ return;
+ }
+ for (i = 0; i < pindex->entries; ++i)
+ if (pindex->index[i]->page == ref->page) {
+ *pindexp = pindex;
+ *slotp = ref->ref_hint = i;
+ return;
+ }
+
+ /*
+ * If we don't find our reference, the page split into a new level and
+ * our home pointer references the wrong page. After internal pages
+ * deepen, their reference structure home values are updated; yield and
+ * wait for that to happen.
+ */
+ __wt_yield();
+ goto retry;
+}
+
+/*
+ * __wt_page_modify_init --
+ * A page is about to be modified, allocate the modification structure.
+ */
+static inline int
+__wt_page_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ return (page->modify == NULL ?
+ __wt_page_modify_alloc(session, page) : 0);
+}
+
+/*
+ * __wt_page_only_modify_set --
+ * Mark the page (but only the page) dirty.
+ */
+static inline void
+__wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ uint64_t last_running;
+
+ last_running = 0;
+ if (page->modify->write_gen == 0)
+ last_running = S2C(session)->txn_global.last_running;
+
+ /*
+ * We depend on atomic-add being a write barrier, that is, a barrier to
+ * ensure all changes to the page are flushed before updating the page
+ * write generation and/or marking the tree dirty, otherwise checkpoints
+ * and/or page reconciliation might be looking at a clean page/tree.
+ *
+ * Every time the page transitions from clean to dirty, update the cache
+ * and transactional information.
+ */
+ if (WT_ATOMIC_ADD4(page->modify->write_gen, 1) == 1) {
+ __wt_cache_dirty_incr(session, page);
+
+ /*
+ * The page can never end up with changes older than the oldest
+ * running transaction. 
+ */ + if (F_ISSET(&session->txn, TXN_HAS_SNAPSHOT)) + page->modify->disk_snap_min = session->txn.snap_min; + + /* + * We won the race to dirty the page, but another thread could + * have committed in the meantime, and the last_running field + * been updated past it. That is all very unlikely, but not + * impossible, so we take care to read the global state before + * the atomic increment. If we raced with reconciliation, just + * leave the previous value here: at worst, we will write a + * page in a checkpoint when not absolutely necessary. + */ + if (last_running != 0) + page->modify->first_dirty_txn = last_running; + } + + /* Check if this is the largest transaction ID to update the page. */ + if (TXNID_LT(page->modify->update_txn, session->txn.id)) + page->modify->update_txn = session->txn.id; +} + +/* + * __wt_page_modify_set -- + * Mark the page and tree dirty. + */ +static inline void +__wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + /* + * Mark the tree dirty (even if the page is already marked dirty), newly + * created pages to support "empty" files are dirty, but the file isn't + * marked dirty until there's a real change needing to be written. Test + * before setting the dirty flag, it's a hot cache line. + * + * The tree's modified flag is cleared by the checkpoint thread: set it + * and insert a barrier before dirtying the page. (I don't think it's + * a problem if the tree is marked dirty with all the pages clean, it + * might result in an extra checkpoint that doesn't do any work but it + * shouldn't cause problems; regardless, let's play it safe.) + */ + if (S2BT(session)->modified == 0) { + S2BT(session)->modified = 1; + WT_FULL_BARRIER(); + } + + __wt_page_only_modify_set(session, page); +} + +/* + * __wt_page_parent_modify_set -- + * Mark the parent page and tree dirty. 
+ */ +static inline int +__wt_page_parent_modify_set( + WT_SESSION_IMPL *session, WT_REF *ref, int page_only) +{ + WT_PAGE *parent; + + /* + * This function exists as a place to stash this comment. There are a + * few places where we need to dirty a page's parent. The trick is the + * page's parent might split at any point, and the page parent might be + * the wrong parent at any particular time. We ignore this and dirty + * whatever page the page's reference structure points to. This is safe + * because if we're pointing to the wrong parent, that parent must have + * split, deepening the tree, which implies marking the original parent + * and all of the newly-created children as dirty. In other words, if + * we have the wrong parent page, everything was marked dirty already. + */ + parent = ref->home; + WT_RET(__wt_page_modify_init(session, parent)); + if (page_only) + __wt_page_only_modify_set(session, parent); + else + __wt_page_modify_set(session, parent); + return (0); +} + +/* + * __wt_off_page -- + * Return if a pointer references off-page data. + */ +static inline int +__wt_off_page(WT_PAGE *page, const void *p) +{ + /* + * There may be no underlying page, in which case the reference is + * off-page by definition. + */ + return (page->dsk == NULL || + p < (void *)page->dsk || + p >= (void *)((uint8_t *)page->dsk + page->dsk->mem_size)); +} + +/* + * __wt_ref_key -- + * Return a reference to a row-store internal page key as cheaply as + * possible. + */ +static inline void +__wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep) +{ + uintptr_t v; + + /* + * An internal page key is in one of two places: if we instantiated the + * key (for example, when reading the page), WT_REF.key.ikey references + * a WT_IKEY structure, otherwise WT_REF.key.ikey references an on-page + * key offset/length pair. 
+ * + * Now the magic: allocated memory must be aligned to store any standard + * type, and we expect some standard type to require at least quad-byte + * alignment, so allocated memory should have some clear low-order bits. + * On-page objects consist of an offset/length pair: the maximum page + * size currently fits into 29 bits, so we use the low-order bits of the + * pointer to mark the other bits of the pointer as encoding the key's + * location and length. This breaks if allocated memory isn't aligned, + * of course. + * + * In this specific case, we use bit 0x01 to mark an on-page key, else + * it's a WT_IKEY reference. The bit pattern for internal row-store + * on-page keys is: + * 32 bits key length + * 31 bits page offset of the key's bytes, + * 1 bits flags + */ +#define WT_IK_FLAG 0x01 +#define WT_IK_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 32) +#define WT_IK_DECODE_KEY_LEN(v) ((v) >> 32) +#define WT_IK_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 1) +#define WT_IK_DECODE_KEY_OFFSET(v) (((v) & 0xFFFFFFFF) >> 1) + v = (uintptr_t)ref->key.ikey; + if (v & WT_IK_FLAG) { + *(void **)keyp = + WT_PAGE_REF_OFFSET(page, WT_IK_DECODE_KEY_OFFSET(v)); + *sizep = WT_IK_DECODE_KEY_LEN(v); + } else { + *(void **)keyp = WT_IKEY_DATA(ref->key.ikey); + *sizep = ((WT_IKEY *)ref->key.ikey)->size; + } +} + +/* + * __wt_ref_key_onpage_set -- + * Set a WT_REF to reference an on-page key. + */ +static inline void +__wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK *unpack) +{ + uintptr_t v; + + /* + * See the comment in __wt_ref_key for an explanation of the magic. + */ + v = WT_IK_ENCODE_KEY_LEN(unpack->size) | + WT_IK_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) | + WT_IK_FLAG; + ref->key.ikey = (void *)v; +} + +/* + * __wt_ref_key_instantiated -- + * Return if a WT_REF key is instantiated. + */ +static inline WT_IKEY * +__wt_ref_key_instantiated(WT_REF *ref) +{ + uintptr_t v; + + /* + * See the comment in __wt_ref_key for an explanation of the magic. 
+ */ + v = (uintptr_t)ref->key.ikey; + return (v & WT_IK_FLAG ? NULL : ref->key.ikey); +} + +/* + * __wt_ref_key_clear -- + * Clear a WT_REF key. + */ +static inline void +__wt_ref_key_clear(WT_REF *ref) +{ + /* The key union has 2 fields, both of which are 8B. */ + ref->key.recno = 0; +} + +/* + * __wt_row_leaf_key_info -- + * Return a row-store leaf page key referenced by a WT_ROW if it can be + * had without unpacking a cell, and information about the cell, if the key + * isn't cheaply available. + */ +static inline int +__wt_row_leaf_key_info(WT_PAGE *page, void *copy, + WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep) +{ + WT_IKEY *ikey; + uintptr_t v; + + v = (uintptr_t)copy; + + /* + * A row-store leaf page key is in one of two places: if instantiated, + * the WT_ROW pointer references a WT_IKEY structure, otherwise, it + * references an on-page offset. Further, on-page keys are in one of + * two states: if the key is a simple key (not an overflow key, prefix + * compressed or Huffman encoded, all of which are likely), the key's + * offset/size is encoded in the pointer. Otherwise, the offset is to + * the key's on-page cell. + * + * Now the magic: allocated memory must be aligned to store any standard + * type, and we expect some standard type to require at least quad-byte + * alignment, so allocated memory should have some clear low-order bits. + * On-page objects consist of an offset/length pair: the maximum page + * size currently fits into 29 bits, so we use the low-order bits of the + * pointer to mark the other bits of the pointer as encoding the key's + * location and length. This breaks if allocated memory isn't aligned, + * of course. + * + * In this specific case, we use bit 0x01 to mark an on-page cell, bit + * 0x02 to mark an on-page key, 0x03 to mark an on-page key/value pair, + * otherwise it's a WT_IKEY reference. 
The bit pattern for on-page cells + * is: + * 29 bits page offset of the key's cell, + * 2 bits flags + * + * The bit pattern for on-page keys is: + * 32 bits key length, + * 29 bits page offset of the key's bytes, + * 2 bits flags + * + * But, while that allows us to skip decoding simple key cells, we also + * want to skip decoding the value cell in the case where the value cell + * is also simple/short. We use bit 0x03 to mark an encoded on-page key + * and value pair. The bit pattern for on-page key/value pairs is: + * 9 bits key length, + * 13 bits value length, + * 20 bits page offset of the key's bytes, + * 20 bits page offset of the value's bytes, + * 2 bits flags + * + * These bit patterns are in-memory only, of course, so can be modified + * (we could even tune for specific workloads). Generally, the fields + * are larger than the anticipated values being stored (512B keys, 8KB + * values, 1MB pages), hopefully that won't be necessary. + * + * This function returns a list of things about the key (instantiation + * reference, cell reference and key/length pair). Our callers know + * the order in which we look things up and the information returned; + * for example, the cell will never be returned if we are working with + * an on-page key. 
+ */ +#define WT_CELL_FLAG 0x01 +#define WT_CELL_ENCODE_OFFSET(v) ((uintptr_t)(v) << 2) +#define WT_CELL_DECODE_OFFSET(v) (((v) & 0xFFFFFFFF) >> 2) + +#define WT_K_FLAG 0x02 +#define WT_K_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 32) +#define WT_K_DECODE_KEY_LEN(v) ((v) >> 32) +#define WT_K_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 2) +#define WT_K_DECODE_KEY_OFFSET(v) (((v) & 0xFFFFFFFF) >> 2) + +#define WT_KV_FLAG 0x03 +#define WT_KV_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 55) +#define WT_KV_DECODE_KEY_LEN(v) ((v) >> 55) +#define WT_KV_MAX_KEY_LEN (0x200 - 1) +#define WT_KV_ENCODE_VALUE_LEN(v) ((uintptr_t)(v) << 42) +#define WT_KV_DECODE_VALUE_LEN(v) (((v) & 0x007FFC0000000000) >> 42) +#define WT_KV_MAX_VALUE_LEN (0x2000 - 1) +#define WT_KV_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 22) +#define WT_KV_DECODE_KEY_OFFSET(v) (((v) & 0x000003FFFFC00000) >> 22) +#define WT_KV_MAX_KEY_OFFSET (0x100000 - 1) +#define WT_KV_ENCODE_VALUE_OFFSET(v) ((uintptr_t)(v) << 2) +#define WT_KV_DECODE_VALUE_OFFSET(v) (((v) & 0x00000000003FFFFC) >> 2) +#define WT_KV_MAX_VALUE_OFFSET (0x100000 - 1) + switch (v & 0x03) { + case WT_CELL_FLAG: + /* On-page cell: no instantiated key. */ + if (ikeyp != NULL) + *ikeyp = NULL; + if (cellp != NULL) + *cellp = + WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v)); + return (0); + case WT_K_FLAG: + /* Encoded key: no instantiated key, no cell. */ + if (cellp != NULL) + *cellp = NULL; + if (ikeyp != NULL) + *ikeyp = NULL; + if (datap != NULL) { + *(void **)datap = + WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v)); + *sizep = WT_K_DECODE_KEY_LEN(v); + return (1); + } + return (0); + case WT_KV_FLAG: + /* Encoded key/value pair: no instantiated key, no cell. */ + if (cellp != NULL) + *cellp = NULL; + if (ikeyp != NULL) + *ikeyp = NULL; + if (datap != NULL) { + *(void **)datap = WT_PAGE_REF_OFFSET( + page, WT_KV_DECODE_KEY_OFFSET(v)); + *sizep = WT_KV_DECODE_KEY_LEN(v); + return (1); + } + return (0); + + } + + /* Instantiated key. 
*/ + ikey = copy; + if (ikeyp != NULL) + *ikeyp = copy; + if (cellp != NULL) + *cellp = WT_PAGE_REF_OFFSET(page, ikey->cell_offset); + if (datap != NULL) { + *(void **)datap = WT_IKEY_DATA(ikey); + *sizep = ikey->size; + return (1); + } + return (0); +} + +/* + * __wt_row_leaf_key_set_cell -- + * Set a WT_ROW to reference an on-page row-store leaf cell. + */ +static inline void +__wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell) +{ + uintptr_t v; + + /* + * See the comment in __wt_row_leaf_key_info for an explanation of the + * magic. + */ + v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, cell)) | + WT_CELL_FLAG; + WT_ROW_KEY_SET(rip, v); +} + +/* + * __wt_row_leaf_key_set -- + * Set a WT_ROW to reference an on-page row-store leaf key. + */ +static inline void +__wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack) +{ + uintptr_t v; + + /* + * See the comment in __wt_row_leaf_key_info for an explanation of the + * magic. + */ + v = WT_K_ENCODE_KEY_LEN(unpack->size) | + WT_K_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) | + WT_K_FLAG; + WT_ROW_KEY_SET(rip, v); +} + +/* + * __wt_row_leaf_value_set -- + * Set a WT_ROW to reference an on-page row-store leaf value. + */ +static inline void +__wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack) +{ + uintptr_t key_len, key_offset, value_offset, v; + + v = (uintptr_t)WT_ROW_KEY_COPY(rip); + + /* + * See the comment in __wt_row_leaf_key_info for an explanation of the + * magic. 
+ */
+	/*
+	 * Value encoding is only possible when the key is stored as a simple
+	 * on-page key (WT_K_FLAG); cells, instantiated keys and pairs that
+	 * have already been combined are left alone.
+	 */
+	if (!(v & WT_K_FLAG))			/* Not a simple on-page key */
+		return;
+
+	/*
+	 * The combined key/value encoding has narrower fields than the
+	 * key-only encoding: bail out (leaving the key encoding in place)
+	 * whenever a length or offset doesn't fit.
+	 */
+	key_len = WT_K_DECODE_KEY_LEN(v);	/* Key length */
+	if (key_len > WT_KV_MAX_KEY_LEN)
+		return;
+	if (unpack->size > WT_KV_MAX_VALUE_LEN)	/* Value length */
+		return;
+
+	key_offset = WT_K_DECODE_KEY_OFFSET(v);	/* Page offsets */
+	if (key_offset > WT_KV_MAX_KEY_OFFSET)
+		return;
+	value_offset = WT_PAGE_DISK_OFFSET(page, unpack->data);
+	if (value_offset > WT_KV_MAX_VALUE_OFFSET)
+		return;
+
+	v = WT_KV_ENCODE_KEY_LEN(key_len) |
+	    WT_KV_ENCODE_VALUE_LEN(unpack->size) |
+	    WT_KV_ENCODE_KEY_OFFSET(key_offset) |
+	    WT_KV_ENCODE_VALUE_OFFSET(value_offset) | WT_KV_FLAG;
+	WT_ROW_KEY_SET(rip, v);
+}
+
+/*
+ * __wt_row_leaf_key --
+ *	Set a buffer to reference a row-store leaf page key as cheaply as
+ * possible.
+ *
+ * The "instantiate" argument is passed through to __wt_row_leaf_key_work;
+ * presumably non-zero forces the key to be instantiated in memory -- confirm
+ * against that function's definition.
+ */
+static inline int
+__wt_row_leaf_key(WT_SESSION_IMPL *session,
+    WT_PAGE *page, WT_ROW *rip, WT_ITEM *key, int instantiate)
+{
+	void *copy;
+
+	/*
+	 * A front-end for __wt_row_leaf_key_work, here to inline fast paths.
+	 *
+	 * The row-store key can change underfoot; explicitly take a copy.
+	 */
+	copy = WT_ROW_KEY_COPY(rip);
+
+	/*
+	 * All we handle here are on-page keys (which should be a common case),
+	 * and instantiated keys (which start out rare, but become more common
+	 * as a leaf page is searched, instantiating prefix-compressed keys).
+	 */
+	if (__wt_row_leaf_key_info(
+	    page, copy, NULL, NULL, &key->data, &key->size))
+		return (0);
+
+	/*
+	 * The alternative is an on-page cell with some kind of compressed or
+	 * overflow key that's never been instantiated.  Call the underlying
+	 * worker function to figure it out.
+	 */
+	return (__wt_row_leaf_key_work(session, page, rip, key, instantiate));
+}
+
+/*
+ * __wt_cursor_row_leaf_key --
+ *	Set a buffer to reference a cursor-referenced row-store leaf page key.
+ */ +static inline int +__wt_cursor_row_leaf_key(WT_CURSOR_BTREE *cbt, WT_ITEM *key) +{ + WT_PAGE *page; + WT_ROW *rip; + WT_SESSION_IMPL *session; + + /* + * If the cursor references a WT_INSERT item, take the key from there, + * else take the key from the original page. + */ + if (cbt->ins == NULL) { + session = (WT_SESSION_IMPL *)cbt->iface.session; + page = cbt->ref->page; + rip = &page->u.row.d[cbt->slot]; + WT_RET(__wt_row_leaf_key(session, page, rip, key, 0)); + } else { + key->data = WT_INSERT_KEY(cbt->ins); + key->size = WT_INSERT_KEY_SIZE(cbt->ins); + } + return (0); +} + +/* + * __wt_row_leaf_value_cell -- + * Return a pointer to the value cell for a row-store leaf page key, or + * NULL if there isn't one. + */ +static inline WT_CELL * +__wt_row_leaf_value_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack) +{ + WT_CELL *kcell, *vcell; + WT_CELL_UNPACK unpack; + void *copy, *key; + size_t size; + + /* If we already have an unpacked key cell, use it. */ + if (kpack != NULL) + vcell = (WT_CELL *) + ((uint8_t *)kpack->cell + __wt_cell_total_len(kpack)); + else { + /* + * The row-store key can change underfoot; explicitly take a + * copy. + */ + copy = WT_ROW_KEY_COPY(rip); + + /* + * Figure out where the key is, step past it to the value cell. + * The test for a cell not being set tells us that we have an + * on-page key, otherwise we're looking at an instantiated key + * or on-page cell, both of which require an unpack of the key's + * cell to find the value cell that follows. + */ + if (__wt_row_leaf_key_info( + page, copy, NULL, &kcell, &key, &size) && kcell == NULL) + vcell = (WT_CELL *)((uint8_t *)key + size); + else { + __wt_cell_unpack(kcell, &unpack); + vcell = (WT_CELL *)((uint8_t *) + unpack.cell + __wt_cell_total_len(&unpack)); + } + } + + return (__wt_cell_leaf_value_parse(page, vcell)); +} + +/* + * __wt_row_leaf_value -- + * Return the value for a row-store leaf page encoded key/value pair. 
+ */ +static inline int +__wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value) +{ + uintptr_t v; + + /* The row-store key can change underfoot; explicitly take a copy. */ + v = (uintptr_t)WT_ROW_KEY_COPY(rip); + + /* + * See the comment in __wt_row_leaf_key_info for an explanation of the + * magic. + */ + if ((v & 0x03) == WT_KV_FLAG) { + value->data = + WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v)); + value->size = WT_KV_DECODE_VALUE_LEN(v); + return (1); + } + return (0); +} + +/* + * __wt_ref_info -- + * Return the addr/size and type triplet for a reference. + */ +static inline int +__wt_ref_info(WT_SESSION_IMPL *session, + WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep) +{ + WT_ADDR *addr; + WT_CELL_UNPACK *unpack, _unpack; + + addr = ref->addr; + unpack = &_unpack; + + /* + * If NULL, there is no location. + * If off-page, the pointer references a WT_ADDR structure. + * If on-page, the pointer references a cell. + * + * The type is of a limited set: internal, leaf or no-overflow leaf. + */ + if (addr == NULL) { + *addrp = NULL; + *sizep = 0; + if (typep != NULL) + *typep = 0; + } else if (__wt_off_page(ref->home, addr)) { + *addrp = addr->addr; + *sizep = addr->size; + if (typep != NULL) + switch (addr->type) { + case WT_ADDR_INT: + *typep = WT_CELL_ADDR_INT; + break; + case WT_ADDR_LEAF: + *typep = WT_CELL_ADDR_LEAF; + break; + case WT_ADDR_LEAF_NO: + *typep = WT_CELL_ADDR_LEAF_NO; + break; + WT_ILLEGAL_VALUE(session); + } + } else { + __wt_cell_unpack((WT_CELL *)addr, unpack); + *addrp = unpack->data; + *sizep = unpack->size; + if (typep != NULL) + *typep = unpack->type; + } + return (0); +} + +/* + * __wt_page_release -- + * Release a reference to a page. + */ +static inline int +__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + int locked; + + btree = S2BT(session); + + /* + * Discard our hazard pointer. 
Ignore pages we don't have and the root + * page, which sticks in memory, regardless. + */ + if (ref == NULL || __wt_ref_is_root(ref)) + return (0); + page = ref->page; + + /* + * Attempt to evict pages with the special "oldest" read generation. + * + * This is set for pages that grow larger than the configured + * memory_page_max setting, and when we are attempting to scan without + * trashing the cache. + * + * Skip this if eviction is disabled for this operation or this tree, + * or if there is no chance of eviction succeeding for dirty pages due + * to a checkpoint or because we've already tried writing this page and + * it contains an update that isn't stable. + */ + if (LF_ISSET(WT_READ_NO_EVICT) || + page->read_gen != WT_READGEN_OLDEST || + F_ISSET(btree, WT_BTREE_NO_EVICTION) || + (__wt_page_is_modified(page) && (btree->checkpointing || + !__wt_txn_visible_all(session, page->modify->first_dirty_txn)))) + return (__wt_hazard_clear(session, page)); + + /* + * Take some care with order of operations: if we release the hazard + * reference without first locking the page, it could be evicted in + * between. + */ + locked = WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED); + WT_TRET(__wt_hazard_clear(session, page)); + if (!locked) + return (ret); + + (void)WT_ATOMIC_ADD4(btree->evict_busy, 1); + if ((ret = __wt_evict_page(session, ref)) == 0) + WT_STAT_FAST_CONN_INCR(session, cache_eviction_force); + else { + WT_STAT_FAST_CONN_INCR(session, cache_eviction_force_fail); + if (ret == EBUSY) + ret = 0; + } + (void)WT_ATOMIC_SUB4(btree->evict_busy, 1); + + return (ret); +} + +/* + * __wt_page_swap_func -- + * Swap one page's hazard pointer for another one when hazard pointer + * coupling up/down the tree. 
+ */ +static inline int +__wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held, + WT_REF *want, uint32_t flags +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ) +{ + WT_DECL_RET; + int acquired; + + /* + * This function is here to simplify the error handling during hazard + * pointer coupling so we never leave a hazard pointer dangling. The + * assumption is we're holding a hazard pointer on "held", and want to + * acquire a hazard pointer on "want", releasing the hazard pointer on + * "held" when we're done. + */ + ret = __wt_page_in_func(session, want, flags +#ifdef HAVE_DIAGNOSTIC + , file, line +#endif + ); + + /* An expected failure: WT_NOTFOUND when doing a cache-only read. */ + if (LF_ISSET(WT_READ_CACHE) && ret == WT_NOTFOUND) + return (WT_NOTFOUND); + + /* An expected failure: WT_RESTART */ + if (ret == WT_RESTART) + return (WT_RESTART); + + /* Discard the original held page. */ + acquired = ret == 0; + WT_TRET(__wt_page_release(session, held, flags)); + + /* + * If there was an error discarding the original held page, discard + * the acquired page too, keeping it is never useful. + */ + if (acquired && ret != 0) + WT_TRET(__wt_page_release(session, want, flags)); + return (ret); +} + +/* + * __wt_page_hazard_check -- + * Return if there's a hazard pointer to the page in the system. + */ +static inline WT_HAZARD * +__wt_page_hazard_check(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_CONNECTION_IMPL *conn; + WT_HAZARD *hp; + WT_SESSION_IMPL *s; + uint32_t i, hazard_size, session_cnt; + + conn = S2C(session); + + /* + * No lock is required because the session array is fixed size, but it + * may contain inactive entries. We must review any active session + * that might contain a hazard pointer, so insert a barrier before + * reading the active session count. That way, no matter what sessions + * come or go, we'll check the slots for all of the sessions that could + * have been active when we started our check. 
+ */ + WT_ORDERED_READ(session_cnt, conn->session_cnt); + for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) { + if (!s->active) + continue; + WT_ORDERED_READ(hazard_size, s->hazard_size); + for (hp = s->hazard; hp < s->hazard + hazard_size; ++hp) + if (hp->page == page) + return (hp); + } + return (NULL); +} + +/* + * __wt_skip_choose_depth -- + * Randomly choose a depth for a skiplist insert. + */ +static inline u_int +__wt_skip_choose_depth(WT_SESSION_IMPL *session) +{ + u_int d; + + for (d = 1; d < WT_SKIP_MAXDEPTH && + __wt_random(session->rnd) < WT_SKIP_PROBABILITY; d++) + ; + return (d); +} + +/* + * __wt_btree_size_overflow -- + * Check if the size of an in-memory tree with a single leaf page is over + * a specified maximum. If called on anything other than a simple tree with a + * single leaf page, returns true so the calling code will switch to a new tree. + */ +static inline int +__wt_btree_size_overflow(WT_SESSION_IMPL *session, uint64_t maxsize) +{ + WT_BTREE *btree; + WT_PAGE *child, *root; + WT_PAGE_INDEX *pindex; + WT_REF *first; + + btree = S2BT(session); + root = btree->root.page; + + /* Check for a non-existent tree. */ + if (root == NULL) + return (0); + + /* A tree that can be evicted always requires a switch. */ + if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) + return (1); + + /* Check for a tree with a single leaf page. */ + pindex = WT_INTL_INDEX_COPY(root); + if (pindex->entries != 1) /* > 1 child page, switch */ + return (1); + + first = pindex->index[0]; + if (first->state != WT_REF_MEM) /* no child page, ignore */ + return (0); + + /* + * We're reaching down into the page without a hazard pointer, but + * that's OK because we know that no-eviction is set and so the page + * cannot disappear. + */ + child = first->page; + if (child->type != WT_PAGE_ROW_LEAF) /* not a single leaf page */ + return (1); + + return (child->memory_footprint > maxsize); +} + +/* + * __wt_lex_compare -- + * Lexicographic comparison routine. 
+ * + * Returns: + * < 0 if user_item is lexicographically < tree_item + * = 0 if user_item is lexicographically = tree_item + * > 0 if user_item is lexicographically > tree_item + * + * We use the names "user" and "tree" so it's clear in the btree code which + * the application is looking at when we call its comparison func. + */ +static inline int +__wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item) +{ + const uint8_t *userp, *treep; + size_t len, usz, tsz; + + usz = user_item->size; + tsz = tree_item->size; + len = WT_MIN(usz, tsz); + + for (userp = user_item->data, treep = tree_item->data; + len > 0; + --len, ++userp, ++treep) + if (*userp != *treep) + return (*userp < *treep ? -1 : 1); + + /* Contents are equal up to the smallest length. */ + return ((usz == tsz) ? 0 : (usz < tsz) ? -1 : 1); +} + +/* + * __wt_compare -- + * The same as __wt_lex_compare, but using the application's collator + * function when configured. + */ +static inline int +__wt_compare(WT_SESSION_IMPL *session, WT_COLLATOR *collator, + const WT_ITEM *user_item, const WT_ITEM *tree_item, int *cmpp) +{ + if (collator == NULL) { + *cmpp = __wt_lex_compare(user_item, tree_item); + return (0); + } + return (collator->compare( + collator, &session->iface, user_item, tree_item, cmpp)); +} + +/* + * __wt_lex_compare_skip -- + * Lexicographic comparison routine, skipping leading bytes. + * + * Returns: + * < 0 if user_item is lexicographically < tree_item + * = 0 if user_item is lexicographically = tree_item + * > 0 if user_item is lexicographically > tree_item + * + * We use the names "user" and "tree" so it's clear in the btree code which + * the application is looking at when we call its comparison func. 
+ */ +static inline int +__wt_lex_compare_skip( + const WT_ITEM *user_item, const WT_ITEM *tree_item, size_t *matchp) +{ + const uint8_t *userp, *treep; + size_t len, usz, tsz; + + usz = user_item->size; + tsz = tree_item->size; + len = WT_MIN(usz, tsz) - *matchp; + + for (userp = (uint8_t *)user_item->data + *matchp, + treep = (uint8_t *)tree_item->data + *matchp; + len > 0; + --len, ++userp, ++treep, ++*matchp) + if (*userp != *treep) + return (*userp < *treep ? -1 : 1); + + /* Contents are equal up to the smallest length. */ + return ((usz == tsz) ? 0 : (usz < tsz) ? -1 : 1); +} + +/* + * __wt_compare_skip -- + * The same as __wt_lex_compare_skip, but using the application's collator + * function when configured. + */ +static inline int +__wt_compare_skip(WT_SESSION_IMPL *session, WT_COLLATOR *collator, + const WT_ITEM *user_item, const WT_ITEM *tree_item, int *cmpp, + size_t *matchp) +{ + if (collator == NULL) { + *cmpp = __wt_lex_compare_skip(user_item, tree_item, matchp); + return (0); + } + return (collator->compare( + collator, &session->iface, user_item, tree_item, cmpp)); +} diff --git a/src/third_party/wiredtiger/src/include/buf.i b/src/third_party/wiredtiger/src/include/buf.i new file mode 100644 index 00000000000..09bee9ff831 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/buf.i @@ -0,0 +1,133 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * __wt_buf_grow -- + * Grow a buffer that may be in-use, and ensure that all data is local to + * the buffer. + */ +static inline int +__wt_buf_grow(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) +{ + return (size > buf->memsize || !WT_DATA_IN_ITEM(buf) ? + __wt_buf_grow_worker(session, buf, size) : 0); +} + +/* + * __wt_buf_extend -- + * Grow a buffer that's currently in-use. 
+ */
+static inline int
+__wt_buf_extend(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+	/*
+	 * The difference between __wt_buf_grow and __wt_buf_extend is that the
+	 * latter is expected to be called repeatedly for the same buffer, and
+	 * so grows the buffer exponentially to avoid repeated costly calls to
+	 * realloc.
+	 */
+	return (size > buf->memsize ?
+	    __wt_buf_grow(session, buf, WT_MAX(size, 2 * buf->memsize)) : 0);
+}
+
+/*
+ * __wt_buf_init --
+ *	Initialize a buffer at a specific size.
+ *
+ * Ensures at least "size" bytes of backing memory, resets the data pointer
+ * to the start of that memory and leaves the data length zero.
+ */
+static inline int
+__wt_buf_init(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+	buf->data = buf->mem;
+	buf->size = 0;				/* Clear existing data length */
+	WT_RET(__wt_buf_grow(session, buf, size));
+
+	return (0);
+}
+
+/*
+ * __wt_buf_initsize --
+ *	Initialize a buffer at a specific size, and set the data length.
+ *
+ * Identical to __wt_buf_init except the data length is set to "size" on
+ * success; the caller is expected to fill in the bytes.
+ */
+static inline int
+__wt_buf_initsize(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+	buf->data = buf->mem;
+	buf->size = 0;				/* Clear existing data length */
+	WT_RET(__wt_buf_grow(session, buf, size));
+	buf->size = size;			/* Set the data length. */
+
+	return (0);
+}
+
+/*
+ * __wt_buf_set --
+ *	Set the contents of the buffer.
+ *
+ * NOTE(review): "data" may point into the buffer's own memory; memmove
+ * handles the overlapping copy, and this presumably relies on the grow
+ * path preserving existing buffer contents across a realloc -- confirm
+ * against __wt_buf_grow_worker.
+ */
+static inline int
+__wt_buf_set(
+    WT_SESSION_IMPL *session, WT_ITEM *buf, const void *data, size_t size)
+{
+	/* Ensure the buffer is large enough. */
+	WT_RET(__wt_buf_initsize(session, buf, size));
+
+	/* Copy the data, allowing for overlapping strings. */
+	memmove(buf->mem, data, size);
+
+	return (0);
+}
+
+/*
+ * __wt_buf_setstr --
+ *	Set the contents of the buffer to a NUL-terminated string.
+ *
+ * The trailing NUL byte is included in the copy and in the buffer's data
+ * length (hence strlen(s) + 1).
+ */
+static inline int
+__wt_buf_setstr(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *s)
+{
+	return (__wt_buf_set(session, buf, s, strlen(s) + 1));
+}
+
+/*
+ * __wt_buf_set_printable --
+ *	Set the contents of the buffer to a printable representation of a
+ * byte string.
+ */ +static inline int +__wt_buf_set_printable( + WT_SESSION_IMPL *session, WT_ITEM *buf, const void *from_arg, size_t size) +{ + return (__wt_raw_to_esc_hex(session, from_arg, size, buf)); +} + +/* + * __wt_buf_free -- + * Free a buffer. + */ +static inline void +__wt_buf_free(WT_SESSION_IMPL *session, WT_ITEM *buf) +{ + __wt_free(session, buf->mem); + + memset(buf, 0, sizeof(WT_ITEM)); +} + +/* + * __wt_scr_free -- + * Release a scratch buffer. + */ +static inline void +__wt_scr_free(WT_ITEM **bufp) +{ + WT_ITEM *buf; + + if ((buf = *bufp) != NULL) { + *bufp = NULL; + + buf->data = NULL; + buf->size = 0; + F_CLR(buf, WT_ITEM_INUSE); + } +} diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h new file mode 100644 index 00000000000..b7dbd8401a9 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -0,0 +1,139 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * Tuning constants: I hesitate to call this tuning, but we want to review some + * number of pages from each file's in-memory tree for each page we evict. + */ +#define WT_EVICT_INT_SKEW (1<<20) /* Prefer leaf pages over internal + pages by this many increments of the + read generation. */ +#define WT_EVICT_WALK_PER_FILE 10 /* Pages to visit per file */ +#define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */ +#define WT_EVICT_WALK_INCR 100 /* Pages added each walk */ + +#define WT_EVICT_PASS_AGGRESSIVE 0x01 +#define WT_EVICT_PASS_ALL 0x02 +#define WT_EVICT_PASS_DIRTY 0x04 + +/* + * WT_EVICT_ENTRY -- + * Encapsulation of an eviction candidate. + */ +struct __wt_evict_entry { + WT_BTREE *btree; /* Enclosing btree object */ + WT_REF *ref; /* Page to flush/evict */ +}; + +/* + * WT_EVICT_WORKER -- + * Encapsulation of an eviction worker thread. 
+ */ + +struct __wt_evict_worker { + WT_SESSION_IMPL *session; + u_int id; + wt_thread_t tid; +#define WT_EVICT_WORKER_RUN 0x01 + uint32_t flags; +}; + +/* + * WiredTiger cache structure. + */ +struct __wt_cache { + /* + * Different threads read/write pages to/from the cache and create pages + * in the cache, so we cannot know precisely how much memory is in use + * at any specific time. However, even though the values don't have to + * be exact, they can't be garbage, we track what comes in and what goes + * out and calculate the difference as needed. + */ + uint64_t bytes_inmem; /* Bytes/pages in memory */ + uint64_t pages_inmem; + uint64_t bytes_evict; /* Bytes/pages discarded by eviction */ + uint64_t pages_evict; + uint64_t bytes_dirty; /* Bytes/pages currently dirty */ + uint64_t pages_dirty; + + /* + * Read information. + */ + uint64_t read_gen; /* Page read generation (LRU) */ + + /* + * Eviction thread information. + */ + WT_CONDVAR *evict_cond; /* Eviction server condition */ + WT_SPINLOCK evict_lock; /* Eviction LRU queue */ + WT_SPINLOCK evict_walk_lock; /* Eviction walk location */ + /* Condition signalled when the eviction server populates the queue */ + WT_CONDVAR *evict_waiter_cond; + + u_int eviction_trigger; /* Percent to trigger eviction */ + u_int eviction_target; /* Percent to end eviction */ + u_int eviction_dirty_target; /* Percent to allow dirty */ + + /* + * LRU eviction list information. + */ + WT_EVICT_ENTRY *evict; /* LRU pages being tracked */ + WT_EVICT_ENTRY *evict_current; /* LRU current page to be evicted */ + uint32_t evict_candidates; /* LRU list pages to evict */ + uint32_t evict_entries; /* LRU entries in the queue */ + volatile uint32_t evict_max; /* LRU maximum eviction slot used */ + uint32_t evict_slots; /* LRU list eviction slots */ + WT_DATA_HANDLE + *evict_file_next; /* LRU next file to search */ + + /* + * Sync/flush request information. 
+ */ + volatile uint64_t sync_request; /* File sync requests */ + volatile uint64_t sync_complete;/* File sync requests completed */ + + /* + * Cache pool information. + */ + uint64_t cp_saved_evict; /* Evict count from last pass */ + uint64_t cp_current_evict; /* Evict count from current pass */ + uint32_t cp_skip_count; /* Post change stabilization */ + uint64_t cp_reserved; /* Base size for this cache */ + WT_SESSION_IMPL *cp_session; /* May be used for cache management */ + wt_thread_t cp_tid; /* Thread ID for cache pool manager */ + + /* + * Flags. + */ +#define WT_CACHE_POOL_MANAGER 0x01 /* The active cache pool manager */ +#define WT_CACHE_POOL_RUN 0x02 /* Cache pool thread running */ +#define WT_EVICT_ACTIVE 0x04 /* Eviction server is active */ +#define WT_EVICT_CLEAR_WALKS 0x08 /* Clear eviction walks */ +#define WT_EVICT_NO_PROGRESS 0x10 /* Check if pages are being evicted */ +#define WT_EVICT_STUCK 0x20 /* Eviction server is stuck */ + uint32_t flags; +}; + +/* + * WT_CACHE_POOL -- + * A structure that represents a shared cache. + */ +struct __wt_cache_pool { + WT_SPINLOCK cache_pool_lock; + WT_CONDVAR *cache_pool_cond; + const char *name; + uint64_t size; + uint64_t chunk; + uint64_t currently_used; + uint32_t refs; /* Reference count for structure. */ + /* Locked: List of connections participating in the cache pool. */ + TAILQ_HEAD(__wt_cache_pool_qh, __wt_connection_impl) cache_pool_qh; + +#define WT_CACHE_POOL_MANAGED 0x01 /* Cache pool has a manager thread */ +#define WT_CACHE_POOL_ACTIVE 0x02 /* Cache pool is active */ + uint8_t flags_atomic; +}; diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i new file mode 100644 index 00000000000..fdb7302f4a8 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -0,0 +1,174 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */
+
+/*
+ * __wt_eviction_check --
+ *	Wake the eviction server if necessary.
+ *
+ * Sets *fullp to the cache-full percentage (bytes in use relative to the
+ * maximum cache size).  If "wake" is non-zero and usage is over the trigger
+ * or dirty thresholds, wakes the eviction server.
+ */
+static inline int
+__wt_eviction_check(WT_SESSION_IMPL *session, int *fullp, int wake)
+{
+	WT_CACHE *cache;
+	WT_CONNECTION_IMPL *conn;
+	uint64_t bytes_inuse, bytes_max, dirty_inuse;
+
+	conn = S2C(session);
+	cache = conn->cache;
+
+	/*
+	 * If we're over the maximum cache, shut out reads (which include page
+	 * allocations) until we evict to back under the maximum cache.
+	 * Eviction will keep pushing out pages so we don't run on the edge all
+	 * the time.  Avoid division by zero if the cache size has not yet been
+	 * set in a shared cache (hence the "+ 1" below).
+	 */
+	bytes_inuse = __wt_cache_bytes_inuse(cache);
+	dirty_inuse = cache->bytes_dirty;
+	bytes_max = conn->cache_size + 1;
+
+	/* Calculate the cache full percentage. */
+	*fullp = (int)((100 * bytes_inuse) / bytes_max);
+
+	/* Wake eviction when we're over the trigger cache size. */
+	if (wake &&
+	    (bytes_inuse > (cache->eviction_trigger * bytes_max) / 100 ||
+	    dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100))
+		WT_RET(__wt_evict_server_wake(session));
+	return (0);
+}
+
+/*
+ * __wt_session_can_wait --
+ *	Return if a session is available for a potentially slow operation.
+ */
+static inline int
+__wt_session_can_wait(WT_SESSION_IMPL *session)
+{
+	/*
+	 * Return if a session is available for a potentially slow operation;
+	 * for example, used by the block manager in the case of flushing
+	 * the system cache.
+	 */
+	if (!F_ISSET(session, WT_SESSION_CAN_WAIT))
+		return (0);
+
+	/*
+	 * LSM sets the no-cache-check flag when holding the LSM tree lock,
+	 * in that case, or when holding the schema lock, we don't want to
+	 * hijack the thread for eviction.
+	 */
+	if (F_ISSET(session,
+	    WT_SESSION_NO_CACHE_CHECK | WT_SESSION_SCHEMA_LOCKED))
+		return (0);
+
+	return (1);
+}
+
+/*
+ * __wt_cache_full_check --
+ *	Wait for there to be space in the cache before a read or update.
+ */ +static inline int +__wt_cache_full_check(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *txn_state; + int busy, count, full; + + /* + * LSM sets the no-cache-check flag when holding the LSM tree lock, + * in that case, or when holding the schema lock, we don't want to + * highjack the thread for eviction. + */ + if (F_ISSET(session, + WT_SESSION_NO_CACHE_CHECK | WT_SESSION_SCHEMA_LOCKED)) + return (0); + + /* + * Threads operating on trees that cannot be evicted are ignored, + * mostly because they're not contributing to the problem. + */ + if ((btree = S2BT_SAFE(session)) != NULL && + F_ISSET(btree, WT_BTREE_NO_EVICTION)) + return (0); + + /* + * Only wake the eviction server the first time through here (if the + * cache is too full). + * + * If the cache is less than 95% full, no work to be done. + */ + WT_RET(__wt_eviction_check(session, &full, 1)); + if (full < 95) + return (0); + + /* + * If we are at the API boundary and the cache is more than 95% full, + * try to evict at least one page before we start an operation. This + * helps with some eviction-dominated workloads. + * + * If the current transaction is keeping the oldest ID pinned, it is in + * the middle of an operation. This may prevent the oldest ID from + * moving forward, leading to deadlock, so only evict what we can. + * Otherwise, we are at a transaction boundary and we can work harder + * to make sure there is free space in the cache. + */ + txn_global = &S2C(session)->txn_global; + txn_state = &txn_global->states[session->id]; + busy = txn_state->id != WT_TXN_NONE || + session->nhazard > 0 || + (txn_state->snap_min != WT_TXN_NONE && + txn_global->current != txn_global->oldest_id); + if (busy && full < 100) + return (0); + count = busy ? 
1 : 10; + + for (;;) { + switch (ret = __wt_evict_lru_page(session, 1)) { + case 0: + if (--count == 0) + return (0); + break; + case EBUSY: + continue; + case WT_NOTFOUND: + break; + default: + return (ret); + } + + WT_RET(__wt_eviction_check(session, &full, 0)); + if (full < 100) + return (0); + else if (ret == 0) + continue; + + /* + * The cache is still full and no pages were found in the queue + * to evict. If this transaction is the one holding back the + * oldest ID, we can't wait forever. We'll block next time we + * are not busy. + */ + if (busy) { + __wt_txn_update_oldest(session); + if (txn_state->id == txn_global->oldest_id || + txn_state->snap_min == txn_global->oldest_id) + return (0); + } + + /* Wait for the queue to re-populate before trying again. */ + WT_RET(__wt_cond_wait(session, + S2C(session)->cache->evict_waiter_cond, 100000)); + + /* Check if things have changed so that we are busy. */ + if (!busy && txn_state->snap_min != WT_TXN_NONE && + txn_global->current != txn_global->oldest_id) + busy = count = 1; + } +} diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i new file mode 100644 index 00000000000..42c7c07a30c --- /dev/null +++ b/src/third_party/wiredtiger/src/include/cell.i @@ -0,0 +1,816 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * WT_CELL -- + * Variable-length cell type. + * + * Pages containing variable-length keys or values data (the WT_PAGE_ROW_INT, + * WT_PAGE_ROW_LEAF, WT_PAGE_COL_INT and WT_PAGE_COL_VAR page types), have + * cells after the page header. + * + * There are 4 basic cell types: keys and data (each of which has an overflow + * form), deleted cells and off-page references. The cell is usually followed + * by additional data, varying by type: a key or data cell is followed by a set + * of bytes, an address cookie follows overflow or off-page cells. 
+ *
+ * Deleted cells are place-holders for column-store files, where entries cannot
+ * be removed in order to preserve the record count.
+ *
+ * Here's the cell use by page type:
+ *
+ * WT_PAGE_ROW_INT (row-store internal page):
+ *	Keys and offpage-reference pairs (a WT_CELL_KEY or WT_CELL_KEY_OVFL
+ * cell followed by a WT_CELL_ADDR_XXX cell).
+ *
+ * WT_PAGE_ROW_LEAF (row-store leaf page):
+ *	Keys with optional data cells (a WT_CELL_KEY or WT_CELL_KEY_OVFL cell,
+ * normally followed by a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell).
+ *
+ *	WT_PAGE_ROW_LEAF pages optionally prefix-compress keys, using a single
+ * byte count immediately following the cell.
+ *
+ * WT_PAGE_COL_INT (Column-store internal page):
+ *	Off-page references (a WT_CELL_ADDR_XXX cell).
+ *
+ * WT_PAGE_COL_VAR (Column-store leaf page storing variable-length cells):
+ *	Data cells (a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell), or deleted
+ * cells (a WT_CELL_DEL cell).
+ *
+ * Each cell starts with a descriptor byte:
+ *
+ * Bits 1 and 2 are reserved for "short" key and value cells (that is, a cell
+ * carrying data less than 64B, where we can store the data length in the cell
+ * descriptor byte), with the following two-bit patterns (shown in binary, they
+ * correspond to the WT_CELL_{KEY,VALUE}_SHORT defines, values 1-3):
+ *	00	Not a short key/data cell
+ *	01	Short key cell
+ *	10	Short key cell, with a following prefix-compression byte
+ *	11	Short value cell
+ * In these cases, the other 6 bits of the descriptor byte are the data length.
+ *
+ * Bit 3 marks an 8B packed, uint64_t value following the cell description byte.
+ * (A run-length counter or a record number for variable-length column store.)
+ *
+ * Bit 4 is unused.
+ *
+ * Bits 5-8 are cell "types".
+ */ +#define WT_CELL_KEY_SHORT 0x01 /* Short key */ +#define WT_CELL_KEY_SHORT_PFX 0x02 /* Short key with prefix byte */ +#define WT_CELL_VALUE_SHORT 0x03 /* Short data */ +#define WT_CELL_SHORT_TYPE(v) ((v) & 0x03U) + +#define WT_CELL_SHORT_MAX 63 /* Maximum short key/value */ +#define WT_CELL_SHORT_SHIFT 2 /* Shift for short key/value */ + +#define WT_CELL_64V 0x04 /* Associated value */ + +/* + * We could use bit 4 as a single bit (similar to bit 3), or as a type bit in a + * backward compatible way by adding bit 4 to the type mask and adding new types + * that incorporate it. + */ +#define WT_CELL_UNUSED_BIT4 0x08 /* Unused */ + +/* + * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf + * block location, and WT_CELL_ADDR_LEAF_NO is a leaf block location where the + * page has no overflow items. (The goal is to speed up truncation as we don't + * have to read pages without overflow items in order to delete them. Note, + * WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without + * overflow items, the only guarantee is that if set, the page has no overflow + * items.) + * + * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting + * value dictionaries: if the two values are the same, we only store them once + * and have the second and subsequent use reference the original. 
+ */ +#define WT_CELL_ADDR_DEL (0) /* Address: deleted */ +#define WT_CELL_ADDR_INT (1 << 4) /* Address: internal */ +#define WT_CELL_ADDR_LEAF (2 << 4) /* Address: leaf */ +#define WT_CELL_ADDR_LEAF_NO (3 << 4) /* Address: leaf no overflow */ +#define WT_CELL_DEL (4 << 4) /* Deleted value */ +#define WT_CELL_KEY (5 << 4) /* Key */ +#define WT_CELL_KEY_OVFL (6 << 4) /* Overflow key */ +#define WT_CELL_KEY_OVFL_RM (12 << 4) /* Overflow key (removed) */ +#define WT_CELL_KEY_PFX (7 << 4) /* Key with prefix byte */ +#define WT_CELL_VALUE (8 << 4) /* Value */ +#define WT_CELL_VALUE_COPY (9 << 4) /* Value copy */ +#define WT_CELL_VALUE_OVFL (10 << 4) /* Overflow value */ +#define WT_CELL_VALUE_OVFL_RM (11 << 4) /* Overflow value (removed) */ + +#define WT_CELL_TYPE_MASK (0x0fU << 4) /* Maximum 16 cell types */ +#define WT_CELL_TYPE(v) ((v) & WT_CELL_TYPE_MASK) + +/* + * When we aren't able to create a short key or value (and, in the case of a + * value, there's no associated RLE), the key or value is at least 64B, else + * we'd have been able to store it as a short cell. Decrement/Increment the + * size before storing it, in the hopes that relatively small key/value sizes + * will pack into a single byte instead of two bytes. + */ +#define WT_CELL_SIZE_ADJUST 64 + +/* + * WT_CELL -- + * Variable-length, on-page cell header. + */ +struct __wt_cell { + /* + * Maximum of 16 bytes: + * 1: cell descriptor byte + * 1: prefix compression count + * 9: associated 64-bit value (uint64_t encoding, max 9 bytes) + * 5: data length (uint32_t encoding, max 5 bytes) + * + * This calculation is pessimistic: the prefix compression count and + * 64V value overlap, the 64V value and data length are optional. + */ + uint8_t __chunk[1 + 1 + WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE]; +}; + +/* + * WT_CELL_UNPACK -- + * Unpacked cell. + */ +struct __wt_cell_unpack { + WT_CELL *cell; /* Cell's disk image address */ + + uint64_t v; /* RLE count or recno */ + + /* + * !!! 
+ * The size and __len fields are reasonably type size_t; don't change + * the type, performance drops significantly if they're type size_t. + */ + const void *data; /* Data */ + uint32_t size; /* Data size */ + + uint32_t __len; /* Cell + data length (usually) */ + + uint8_t prefix; /* Cell prefix length */ + + uint8_t raw; /* Raw cell type (include "shorts") */ + uint8_t type; /* Cell type */ + + uint8_t ovfl; /* boolean: cell is an overflow */ +}; + +/* + * WT_CELL_FOREACH -- + * Walk the cells on a page. + */ +#define WT_CELL_FOREACH(btree, dsk, cell, unpack, i) \ + for ((cell) = \ + WT_PAGE_HEADER_BYTE(btree, dsk), (i) = (dsk)->u.entries; \ + (i) > 0; \ + (cell) = (WT_CELL *)((uint8_t *)(cell) + (unpack)->__len), --(i)) + +/* + * __wt_cell_pack_addr -- + * Pack an address cell. + */ +static inline size_t +__wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, size_t size) +{ + uint8_t *p; + + p = cell->__chunk + 1; + + if (recno == 0) + cell->__chunk[0] = cell_type; /* Type */ + else { + cell->__chunk[0] = cell_type | WT_CELL_64V; + (void)__wt_vpack_uint(&p, 0, recno); /* Record number */ + } + (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */ + return (WT_PTRDIFF(p, cell)); +} + +/* + * __wt_cell_pack_data -- + * Set a data item's WT_CELL contents. + */ +static inline size_t +__wt_cell_pack_data(WT_CELL *cell, uint64_t rle, size_t size) +{ + uint8_t byte, *p; + + /* + * Short data cells without run-length encoding have 6 bits of data + * length in the descriptor byte. 
+ */ + if (rle < 2 && size <= WT_CELL_SHORT_MAX) { + byte = (uint8_t)size; /* Type + length */ + cell->__chunk[0] = + (byte << WT_CELL_SHORT_SHIFT) | WT_CELL_VALUE_SHORT; + return (1); + } + + p = cell->__chunk + 1; + if (rle < 2) { + size -= WT_CELL_SIZE_ADJUST; + cell->__chunk[0] = WT_CELL_VALUE; /* Type */ + } else { + cell->__chunk[0] = WT_CELL_VALUE | WT_CELL_64V; + (void)__wt_vpack_uint(&p, 0, rle); /* RLE */ + } + (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */ + return (WT_PTRDIFF(p, cell)); +} + +/* + * __wt_cell_pack_data_match -- + * Return if two items would have identical WT_CELLs (except for any RLE). + */ +static inline int +__wt_cell_pack_data_match( + WT_CELL *page_cell, WT_CELL *val_cell, const uint8_t *val_data, int *matchp) +{ + const uint8_t *a, *b; + uint64_t av, bv; + int rle; + + *matchp = 0; /* Default to no-match */ + + /* + * This is a special-purpose function used by reconciliation to support + * dictionary lookups. We're passed an on-page cell and a created cell + * plus a chunk of data we're about to write on the page, and we return + * if they would match on the page. The column-store comparison ignores + * the RLE because the copied cell will have its own RLE. + */ + a = (uint8_t *)page_cell; + b = (uint8_t *)val_cell; + + if (WT_CELL_SHORT_TYPE(a[0]) == WT_CELL_VALUE_SHORT) { + av = a[0] >> WT_CELL_SHORT_SHIFT; + ++a; + } else if (WT_CELL_TYPE(a[0]) == WT_CELL_VALUE) { + rle = a[0] & WT_CELL_64V ? 1 : 0; /* Skip any RLE */ + ++a; + if (rle) + WT_RET(__wt_vunpack_uint(&a, 0, &av)); + WT_RET(__wt_vunpack_uint(&a, 0, &av)); /* Length */ + } else + return (0); + + if (WT_CELL_SHORT_TYPE(b[0]) == WT_CELL_VALUE_SHORT) { + bv = b[0] >> WT_CELL_SHORT_SHIFT; + ++b; + } else if (WT_CELL_TYPE(b[0]) == WT_CELL_VALUE) { + rle = b[0] & WT_CELL_64V ? 
1 : 0; /* Skip any RLE */ + ++b; + if (rle) + WT_RET(__wt_vunpack_uint(&b, 0, &bv)); + WT_RET(__wt_vunpack_uint(&b, 0, &bv)); /* Length */ + } else + return (0); + + if (av == bv) + *matchp = memcmp(a, val_data, av) == 0 ? 1 : 0; + return (0); +} + +/* + * __wt_cell_pack_copy -- + * Write a copy value cell. + */ +static inline size_t +__wt_cell_pack_copy(WT_CELL *cell, uint64_t rle, uint64_t v) +{ + uint8_t *p; + + p = cell->__chunk + 1; + + if (rle < 2) /* Type */ + cell->__chunk[0] = WT_CELL_VALUE_COPY; + else { /* Type */ + cell->__chunk[0] = WT_CELL_VALUE_COPY | WT_CELL_64V; + (void)__wt_vpack_uint(&p, 0, rle); /* RLE */ + } + (void)__wt_vpack_uint(&p, 0, v); /* Copy offset */ + return (WT_PTRDIFF(p, cell)); +} + +/* + * __wt_cell_pack_del -- + * Write a deleted value cell. + */ +static inline size_t +__wt_cell_pack_del(WT_CELL *cell, uint64_t rle) +{ + uint8_t *p; + + p = cell->__chunk + 1; + if (rle < 2) { /* Type */ + cell->__chunk[0] = WT_CELL_DEL; + return (1); + } + /* Type */ + cell->__chunk[0] = WT_CELL_DEL | WT_CELL_64V; + (void)__wt_vpack_uint(&p, 0, rle); /* RLE */ + return (WT_PTRDIFF(p, cell)); +} + +/* + * __wt_cell_pack_int_key -- + * Set a row-store internal page key's WT_CELL contents. + */ +static inline size_t +__wt_cell_pack_int_key(WT_CELL *cell, size_t size) +{ + uint8_t byte, *p; + + /* Short keys have 6 bits of data length in the descriptor byte. */ + if (size <= WT_CELL_SHORT_MAX) { + byte = (uint8_t)size; + cell->__chunk[0] = + (byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT; + return (1); + } + + cell->__chunk[0] = WT_CELL_KEY; /* Type */ + p = cell->__chunk + 1; + + size -= WT_CELL_SIZE_ADJUST; + (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */ + + return (WT_PTRDIFF(p, cell)); +} + +/* + * __wt_cell_pack_leaf_key -- + * Set a row-store leaf page key's WT_CELL contents. 
+ */ +static inline size_t +__wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size) +{ + uint8_t byte, *p; + + /* Short keys have 6 bits of data length in the descriptor byte. */ + if (size <= WT_CELL_SHORT_MAX) { + if (prefix == 0) { + byte = (uint8_t)size; /* Type + length */ + cell->__chunk[0] = + (byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT; + return (1); + } else { + byte = (uint8_t)size; /* Type + length */ + cell->__chunk[0] = + (byte << WT_CELL_SHORT_SHIFT) | + WT_CELL_KEY_SHORT_PFX; + cell->__chunk[1] = prefix; /* Prefix */ + return (2); + } + } + + if (prefix == 0) { + cell->__chunk[0] = WT_CELL_KEY; /* Type */ + p = cell->__chunk + 1; + } else { + cell->__chunk[0] = WT_CELL_KEY_PFX; /* Type */ + cell->__chunk[1] = prefix; /* Prefix */ + p = cell->__chunk + 2; + } + + size -= WT_CELL_SIZE_ADJUST; + (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */ + + return (WT_PTRDIFF(p, cell)); +} + +/* + * __wt_cell_pack_ovfl -- + * Pack an overflow cell. + */ +static inline size_t +__wt_cell_pack_ovfl(WT_CELL *cell, uint8_t type, uint64_t rle, size_t size) +{ + uint8_t *p; + + p = cell->__chunk + 1; + if (rle < 2) /* Type */ + cell->__chunk[0] = type; + else { + cell->__chunk[0] = type | WT_CELL_64V; + (void)__wt_vpack_uint(&p, 0, rle); /* RLE */ + } + (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */ + return (WT_PTRDIFF(p, cell)); +} + +/* + * __wt_cell_rle -- + * Return the cell's RLE value. + */ +static inline uint64_t +__wt_cell_rle(WT_CELL_UNPACK *unpack) +{ + /* + * Any item with only 1 occurrence is stored with an RLE of 0, that is, + * without any RLE at all. This code is a single place to handle that + * correction, for simplicity. + */ + return (unpack->v < 2 ? 1 : unpack->v); +} + +/* + * __wt_cell_total_len -- + * Return the cell's total length, including data. 
+ */ +static inline size_t +__wt_cell_total_len(WT_CELL_UNPACK *unpack) +{ + /* + * The length field is specially named because it's dangerous to use it: + * it represents the length of the current cell (normally used for the + * loop that walks through cells on the page), but occasionally we want + * to copy a cell directly from the page, and what we need is the cell's + * total length. The problem is dictionary-copy cells, because in that + * case, the __len field is the length of the current cell, not the cell + * for which we're returning data. To use the __len field, you must be + * sure you're not looking at a copy cell. + */ + return (unpack->__len); +} + +/* + * __wt_cell_type -- + * Return the cell's type (collapsing special types). + */ +static inline u_int +__wt_cell_type(WT_CELL *cell) +{ + u_int type; + + switch (WT_CELL_SHORT_TYPE(cell->__chunk[0])) { + case WT_CELL_KEY_SHORT: + case WT_CELL_KEY_SHORT_PFX: + return (WT_CELL_KEY); + case WT_CELL_VALUE_SHORT: + return (WT_CELL_VALUE); + } + + switch (type = WT_CELL_TYPE(cell->__chunk[0])) { + case WT_CELL_KEY_PFX: + return (WT_CELL_KEY); + case WT_CELL_KEY_OVFL_RM: + return (WT_CELL_KEY_OVFL); + case WT_CELL_VALUE_OVFL_RM: + return (WT_CELL_VALUE_OVFL); + } + return (type); +} + +/* + * __wt_cell_type_raw -- + * Return the cell's type. + */ +static inline u_int +__wt_cell_type_raw(WT_CELL *cell) +{ + return (WT_CELL_SHORT_TYPE(cell->__chunk[0]) == 0 ? + WT_CELL_TYPE(cell->__chunk[0]) : + WT_CELL_SHORT_TYPE(cell->__chunk[0])); +} + +/* + * __wt_cell_type_reset -- + * Reset the cell's type. + */ +static inline void +__wt_cell_type_reset( + WT_SESSION_IMPL *session, WT_CELL *cell, u_int old_type, u_int new_type) +{ + /* + * For all current callers of this function, this should happen once + * and only once, assert we're setting what we think we're setting. 
+ */ + WT_ASSERT(session, old_type == 0 || old_type == __wt_cell_type(cell)); + WT_UNUSED(old_type); + + cell->__chunk[0] = + (cell->__chunk[0] & ~WT_CELL_TYPE_MASK) | WT_CELL_TYPE(new_type); +} + +/* + * __wt_cell_leaf_value_parse -- + * Return the cell if it's a row-store leaf page value, otherwise return + * NULL. + */ +static inline WT_CELL * +__wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell) +{ + /* + * This function exists so there's a place for this comment. + * + * Row-store leaf pages may have a single data cell between each key, or + * keys may be adjacent (when the data cell is empty). + * + * One special case: if the last key on a page is a key without a value, + * don't walk off the end of the page: the size of the underlying disk + * image is exact, which means the end of the last cell on the page plus + * the length of the cell should be the byte immediately after the page + * disk image. + * + * !!! + * This line of code is really a call to __wt_off_page, but we know the + * cell we're given will either be on the page or past the end of page, + * so it's a simpler check. (I wouldn't bother, but the real problem is + * we can't call __wt_off_page directly, it's in btree.i which requires + * this file be included first.) + */ + if (cell >= (WT_CELL *)((uint8_t *)page->dsk + page->dsk->mem_size)) + return (NULL); + + switch (__wt_cell_type_raw(cell)) { + case WT_CELL_KEY: + case WT_CELL_KEY_OVFL: + case WT_CELL_KEY_OVFL_RM: + case WT_CELL_KEY_PFX: + case WT_CELL_KEY_SHORT: + case WT_CELL_KEY_SHORT_PFX: + return (NULL); + default: + return (cell); + } +} + +/* + * __wt_cell_unpack_safe -- + * Unpack a WT_CELL into a structure during verification. 
+ */ +static inline int +__wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end) +{ + uint64_t saved_v, v; + uint32_t saved_len; + int copied; + const uint8_t *p; + + copied = 0; + saved_len = 0; + saved_v = 0; + + /* + * The verification code specifies an end argument, a pointer to 1 past + * the end-of-page. In that case, make sure we don't go past the end + * of the page when reading. If an error occurs, we simply return the + * error code, the verification code takes care of complaining (and, in + * the case of salvage, it won't complain at all, it's OK to fail). + */ +#define WT_CELL_LEN_CHK(p, len) do { \ + if (end != NULL && (((uint8_t *)p) + (len)) > end) \ + return (WT_ERROR); \ +} while (0) + +restart: + /* + * This code is performance critical for scans through read-only trees. + * Avoid WT_CLEAR here: it makes this code run significantly slower. + */ + WT_CLEAR_INLINE(WT_CELL_UNPACK, *unpack); + WT_CELL_LEN_CHK(cell, 0); + unpack->cell = cell; + unpack->type = __wt_cell_type(cell); + unpack->raw = __wt_cell_type_raw(cell); + + /* + * Handle cells with neither an RLE count or data length: short key/data + * cells have 6 bits of data length in the descriptor byte. + */ + switch (unpack->raw) { + case WT_CELL_KEY_SHORT_PFX: + WT_CELL_LEN_CHK(cell, 1); /* skip prefix */ + unpack->prefix = cell->__chunk[1]; + + unpack->data = cell->__chunk + 2; + unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT; + unpack->__len = 2 + unpack->size; + goto done; + case WT_CELL_KEY_SHORT: + case WT_CELL_VALUE_SHORT: + unpack->data = cell->__chunk + 1; + unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT; + unpack->__len = 1 + unpack->size; + goto done; + } + + p = (uint8_t *)cell + 1; /* skip cell */ + + /* + * Check for a prefix byte that optionally follows the cell descriptor + * byte on row-store leaf pages. 
+ */ + if (unpack->raw == WT_CELL_KEY_PFX) { + ++p; /* skip prefix */ + WT_CELL_LEN_CHK(p, 0); + unpack->prefix = cell->__chunk[1]; + } + + /* + * Check for an RLE count or record number that optionally follows the + * cell descriptor byte on column-store variable-length pages. + */ + if (cell->__chunk[0] & WT_CELL_64V) /* skip value */ + WT_RET(__wt_vunpack_uint( + &p, end == NULL ? 0 : (size_t)(end - p), &unpack->v)); + + /* + * Handle special actions for a few different cell types and set the + * data length (deleted cells are fixed-size without length bytes, + * almost everything else has data length bytes). + */ + switch (unpack->raw) { + case WT_CELL_VALUE_COPY: + /* + * The cell is followed by an offset to a cell written earlier + * in the page. Save/restore the length and RLE of this cell, + * we need the length to step through the set of cells on the + * page and this RLE is probably different from the RLE of the + * earlier cell. + */ + WT_RET(__wt_vunpack_uint( + &p, end == NULL ? 0 : (size_t)(end - p), &v)); + saved_len = WT_PTRDIFF32(p, cell); + saved_v = unpack->v; + cell = (WT_CELL *)((uint8_t *)cell - v); + copied = 1; + goto restart; + + case WT_CELL_KEY_OVFL: + case WT_CELL_KEY_OVFL_RM: + case WT_CELL_VALUE_OVFL: + case WT_CELL_VALUE_OVFL_RM: + /* + * Set overflow flag. + */ + unpack->ovfl = 1; + /* FALLTHROUGH */ + + case WT_CELL_ADDR_DEL: + case WT_CELL_ADDR_INT: + case WT_CELL_ADDR_LEAF: + case WT_CELL_ADDR_LEAF_NO: + case WT_CELL_KEY: + case WT_CELL_KEY_PFX: + case WT_CELL_VALUE: + /* + * The cell is followed by a 4B data length and a chunk of + * data. + */ + WT_RET(__wt_vunpack_uint( + &p, end == NULL ? 
0 : (size_t)(end - p), &v)); + + if (unpack->raw == WT_CELL_KEY || + unpack->raw == WT_CELL_KEY_PFX || + (unpack->raw == WT_CELL_VALUE && unpack->v == 0)) + v += WT_CELL_SIZE_ADJUST; + + unpack->data = p; + unpack->size = (uint32_t)v; + unpack->__len = WT_PTRDIFF32(p + unpack->size, cell); + break; + + case WT_CELL_DEL: + unpack->__len = WT_PTRDIFF32(p, cell); + break; + default: + return (WT_ERROR); /* Unknown cell type. */ + } + + /* + * Check the original cell against the full cell length (this is a + * diagnostic as well, we may be copying the cell from the page and + * we need the right length). + */ +done: WT_CELL_LEN_CHK(cell, unpack->__len); + if (copied) { + unpack->raw = WT_CELL_VALUE_COPY; + unpack->__len = saved_len; + unpack->v = saved_v; + } + + return (0); +} + +/* + * __wt_cell_unpack -- + * Unpack a WT_CELL into a structure. + */ +static inline void +__wt_cell_unpack(WT_CELL *cell, WT_CELL_UNPACK *unpack) +{ + (void)__wt_cell_unpack_safe(cell, unpack, NULL); +} + +/* + * __cell_data_ref -- + * Set a buffer to reference the data from an unpacked cell. + */ +static inline int +__cell_data_ref(WT_SESSION_IMPL *session, + WT_PAGE *page, int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store) +{ + WT_BTREE *btree; + void *huffman; + + btree = S2BT(session); + + /* Reference the cell's data, optionally decode it. 
*/ + switch (unpack->type) { + case WT_CELL_KEY: + store->data = unpack->data; + store->size = unpack->size; + if (page_type == WT_PAGE_ROW_INT) + return (0); + + huffman = btree->huffman_key; + break; + case WT_CELL_VALUE: + store->data = unpack->data; + store->size = unpack->size; + huffman = btree->huffman_value; + break; + case WT_CELL_KEY_OVFL: + WT_RET(__wt_ovfl_read(session, page, unpack, store)); + if (page_type == WT_PAGE_ROW_INT) + return (0); + + huffman = btree->huffman_key; + break; + case WT_CELL_VALUE_OVFL: + WT_RET(__wt_ovfl_read(session, page, unpack, store)); + huffman = btree->huffman_value; + break; + WT_ILLEGAL_VALUE(session); + } + + return (huffman == NULL ? 0 : + __wt_huffman_decode( + session, huffman, store->data, store->size, store)); +} + +/* + * __wt_dsk_cell_data_ref -- + * Set a buffer to reference the data from an unpacked cell. + * + * There are two versions because of WT_CELL_VALUE_OVFL_RM type cells. When an + * overflow item is deleted, its backing blocks are removed; if there are still + * running transactions that might need to see the overflow item, we cache a + * copy of the item and reset the item's cell to WT_CELL_VALUE_OVFL_RM. If we + * find a WT_CELL_VALUE_OVFL_RM cell when reading an overflow item, we use the + * page reference to look aside into the cache. So, calling the "dsk" version + * of the function declares the cell cannot be of type WT_CELL_VALUE_OVFL_RM, + * and calling the "page" version means it might be. + */ +static inline int +__wt_dsk_cell_data_ref(WT_SESSION_IMPL *session, + int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store) +{ + WT_ASSERT(session, + __wt_cell_type_raw(unpack->cell) != WT_CELL_VALUE_OVFL_RM); + return (__cell_data_ref(session, NULL, page_type, unpack, store)); +} + +/* + * __wt_page_cell_data_ref -- + * Set a buffer to reference the data from an unpacked cell. 
+ */ +static inline int +__wt_page_cell_data_ref(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store) +{ + return (__cell_data_ref(session, page, page->type, unpack, store)); +} + +/* + * __wt_cell_data_copy -- + * Copy the data from an unpacked cell into a buffer. + */ +static inline int +__wt_cell_data_copy(WT_SESSION_IMPL *session, + int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store) +{ + /* + * We have routines to both copy and reference a cell's information. In + * most cases, all we need is a reference and we prefer that, especially + * when returning key/value items. In a few we need a real copy: call + * the standard reference function and get a reference. In some cases, + * a copy will be made (for example, when reading an overflow item from + * the underlying object. If that happens, we're done, otherwise make + * a copy. + * + * We don't require two versions of this function, no callers need to + * handle WT_CELL_VALUE_OVFL_RM cells. + */ + WT_RET(__wt_dsk_cell_data_ref(session, page_type, unpack, store)); + if (!WT_DATA_IN_ITEM(store)) + WT_RET(__wt_buf_set(session, store, store->data, store->size)); + return (0); +} diff --git a/src/third_party/wiredtiger/src/include/column.i b/src/third_party/wiredtiger/src/include/column.i new file mode 100644 index 00000000000..42c3664323d --- /dev/null +++ b/src/third_party/wiredtiger/src/include/column.i @@ -0,0 +1,201 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * __col_insert_search_match -- + * Search an column-store insert list for an exact match. + */ +static inline WT_INSERT * +__col_insert_search_match(WT_INSERT_HEAD *inshead, uint64_t recno) +{ + WT_INSERT **insp, *ret_ins; + uint64_t ins_recno; + int cmp, i; + + /* If there's no insert chain to search, we're done. 
*/ + if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL) + return (NULL); + + /* Fast path the check for values at the end of the skiplist. */ + if (recno > WT_INSERT_RECNO(ret_ins)) + return (NULL); + else if (recno == WT_INSERT_RECNO(ret_ins)) + return (ret_ins); + + /* + * The insert list is a skip list: start at the highest skip level, then + * go as far as possible at each level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) { + if (*insp == NULL) { + --i; + --insp; + continue; + } + + ins_recno = WT_INSERT_RECNO(*insp); + cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1; + + if (cmp == 0) /* Exact match: return */ + return (*insp); + else if (cmp > 0) /* Keep going at this level */ + insp = &(*insp)->next[i]; + else { /* Drop down a level */ + --i; + --insp; + } + } + + return (NULL); +} + +/* + * __col_insert_search -- + * Search a column-store insert list, creating a skiplist stack as we go. + */ +static inline WT_INSERT * +__col_insert_search(WT_INSERT_HEAD *inshead, + WT_INSERT ***ins_stack, WT_INSERT **next_stack, uint64_t recno) +{ + WT_INSERT **insp, *ret_ins; + uint64_t ins_recno; + int cmp, i; + + /* If there's no insert chain to search, we're done. */ + if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL) + return (NULL); + + /* Fast path appends. */ + if (recno >= WT_INSERT_RECNO(ret_ins)) { + for (i = 0; i < WT_SKIP_MAXDEPTH; i++) { + ins_stack[i] = (i == 0) ? &ret_ins->next[0] : + (inshead->tail[i] != NULL) ? + &inshead->tail[i]->next[i] : &inshead->head[i]; + next_stack[i] = NULL; + } + return (ret_ins); + } + + /* + * The insert list is a skip list: start at the highest skip level, then + * go as far as possible at each level before stepping down to the next. 
+ */ + for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) { + if ((ret_ins = *insp) == NULL) { + next_stack[i] = NULL; + ins_stack[i--] = insp--; + continue; + } + + ins_recno = WT_INSERT_RECNO(ret_ins); + cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1; + + if (cmp > 0) /* Keep going at this level */ + insp = &ret_ins->next[i]; + else if (cmp == 0) /* Exact match: return */ + for (; i >= 0; i--) { + next_stack[i] = ret_ins->next[i]; + ins_stack[i] = &ret_ins->next[i]; + } + else { /* Drop down a level */ + next_stack[i] = ret_ins; + ins_stack[i--] = insp--; + } + } + return (ret_ins); +} + +/* + * __col_var_last_recno -- + * Return the last record number for a variable-length column-store page. + */ +static inline uint64_t +__col_var_last_recno(WT_PAGE *page) +{ + WT_COL_RLE *repeat; + + /* + * If there's an append list (the last page), then there may be more + * records on the page. This function ignores those records, so our + * callers have to handle that explicitly, if they care. + */ + if (page->pg_var_nrepeats == 0) + return (page->pg_var_entries == 0 ? 0 : + page->pg_var_recno + (page->pg_var_entries - 1)); + + repeat = &page->pg_var_repeats[page->pg_var_nrepeats - 1]; + return ((repeat->recno + repeat->rle) - 1 + + (page->pg_var_entries - (repeat->indx + 1))); +} + +/* + * __col_fix_last_recno -- + * Return the last record number for a fixed-length column-store page. + */ +static inline uint64_t +__col_fix_last_recno(WT_PAGE *page) +{ + /* + * If there's an append list (the last page), then there may be more + * records on the page. This function ignores those records, so our + * callers have to handle that explicitly, if they care. + */ + return (page->pg_fix_entries == 0 ? 0 : + page->pg_fix_recno + (page->pg_fix_entries - 1)); +} + +/* + * __col_var_search -- + * Search a variable-length column-store page for a record. 
+ */
+static inline WT_COL *
+__col_var_search(WT_PAGE *page, uint64_t recno)
+{
+	WT_COL_RLE *repeat;
+	uint64_t start_recno;
+	uint32_t base, indx, limit, start_indx;
+
+	/*
+	 * Find the matching slot.
+	 *
+	 * This is done in two stages: first, we do a binary search among any
+	 * repeating records to find largest repeating less than the search key.
+	 * Once there, we can do a simple offset calculation to find the correct
+	 * slot for this record number, because we know any intervening records
+	 * have repeat counts of 1.
+	 */
+	for (base = 0, limit = page->pg_var_nrepeats; limit != 0; limit >>= 1) {
+		indx = base + (limit >> 1);
+
+		/* If the record is inside this repeat, we have the slot. */
+		repeat = page->pg_var_repeats + indx;
+		if (recno >= repeat->recno &&
+		    recno < repeat->recno + repeat->rle)
+			return (page->pg_var_d + repeat->indx);
+		/* Search key below this repeat: keep halving downward. */
+		if (recno < repeat->recno)
+			continue;
+		/* Search key above this repeat: move into the upper half. */
+		base = indx + 1;
+		--limit;
+	}
+
+	/*
+	 * We didn't find an exact match, move forward from the largest repeat
+	 * less than the search key.
+	 */
+	if (base == 0) {
+		start_indx = 0;
+		start_recno = page->pg_var_recno;
+	} else {
+		repeat = page->pg_var_repeats + (base - 1);
+		start_indx = repeat->indx + 1;
+		start_recno = repeat->recno + repeat->rle;
+	}
+
+	/* Record numbers past the last entry aren't on this page. */
+	if (recno >= start_recno + (page->pg_var_entries - start_indx))
+		return (NULL);
+
+	return (page->pg_var_d + start_indx + (uint32_t)(recno - start_recno));
+}
diff --git a/src/third_party/wiredtiger/src/include/compact.h b/src/third_party/wiredtiger/src/include/compact.h
new file mode 100644
index 00000000000..aa34eab4d24
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/compact.h
@@ -0,0 +1,12 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */ + +struct __wt_compact { + uint32_t lsm_count; /* Number of LSM trees seen */ + uint32_t file_count; /* Number of files seen */ + uint64_t max_time; /* Configured timeout */ +}; diff --git a/src/third_party/wiredtiger/src/include/config.h b/src/third_party/wiredtiger/src/include/config.h new file mode 100644 index 00000000000..b9c4c97fa00 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/config.h @@ -0,0 +1,85 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +struct __wt_config { + WT_SESSION_IMPL *session; + const char *orig; + const char *end; + const char *cur; + + int depth, top; + const int8_t *go; +}; + +struct __wt_config_check { + const char *name; + const char *type; + const char *checks; + const WT_CONFIG_CHECK *subconfigs; +}; + +#define WT_CONFIG_REF(session, n) \ + (S2C(session)->config_entries[WT_CONFIG_ENTRY_##n]) +struct __wt_config_entry { + const char *method; /* method name */ + +#define WT_CONFIG_BASE(session, n) (WT_CONFIG_REF(session, n)->base) + const char *base; /* configuration base */ + + const WT_CONFIG_CHECK *checks; /* check array */ +}; + +struct __wt_config_parser_impl { + WT_CONFIG_PARSER iface; + + WT_SESSION_IMPL *session; + WT_CONFIG config; + WT_CONFIG_ITEM config_item; +}; + +/* + * DO NOT EDIT: automatically built by dist/api_config.py. 
+ * configuration section: BEGIN + */ +#define WT_CONFIG_ENTRY_colgroup_meta 0 +#define WT_CONFIG_ENTRY_connection_add_collator 1 +#define WT_CONFIG_ENTRY_connection_add_compressor 2 +#define WT_CONFIG_ENTRY_connection_add_data_source 3 +#define WT_CONFIG_ENTRY_connection_add_extractor 4 +#define WT_CONFIG_ENTRY_connection_async_new_op 5 +#define WT_CONFIG_ENTRY_connection_close 6 +#define WT_CONFIG_ENTRY_connection_load_extension 7 +#define WT_CONFIG_ENTRY_connection_open_session 8 +#define WT_CONFIG_ENTRY_connection_reconfigure 9 +#define WT_CONFIG_ENTRY_cursor_close 10 +#define WT_CONFIG_ENTRY_file_meta 11 +#define WT_CONFIG_ENTRY_index_meta 12 +#define WT_CONFIG_ENTRY_session_begin_transaction 13 +#define WT_CONFIG_ENTRY_session_checkpoint 14 +#define WT_CONFIG_ENTRY_session_close 15 +#define WT_CONFIG_ENTRY_session_commit_transaction 16 +#define WT_CONFIG_ENTRY_session_compact 17 +#define WT_CONFIG_ENTRY_session_create 18 +#define WT_CONFIG_ENTRY_session_drop 19 +#define WT_CONFIG_ENTRY_session_log_printf 20 +#define WT_CONFIG_ENTRY_session_open_cursor 21 +#define WT_CONFIG_ENTRY_session_reconfigure 22 +#define WT_CONFIG_ENTRY_session_rename 23 +#define WT_CONFIG_ENTRY_session_rollback_transaction 24 +#define WT_CONFIG_ENTRY_session_salvage 25 +#define WT_CONFIG_ENTRY_session_truncate 26 +#define WT_CONFIG_ENTRY_session_upgrade 27 +#define WT_CONFIG_ENTRY_session_verify 28 +#define WT_CONFIG_ENTRY_table_meta 29 +#define WT_CONFIG_ENTRY_wiredtiger_open 30 +#define WT_CONFIG_ENTRY_wiredtiger_open_all 31 +#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 32 +#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 33 +/* + * configuration section: END + * DO NOT EDIT: automatically built by dist/flags.py. 
+ */ diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h new file mode 100644 index 00000000000..81866e39df9 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -0,0 +1,270 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/******************************************* + * Global per-process structure. + *******************************************/ +/* + * WT_PROCESS -- + * Per-process information for the library. + */ +struct __wt_process { + WT_SPINLOCK spinlock; /* Per-process spinlock */ + + /* Locked: connection queue */ + TAILQ_HEAD(__wt_connection_impl_qh, __wt_connection_impl) connqh; + WT_CACHE_POOL *cache_pool; +}; +extern WT_PROCESS __wt_process; + +/* + * WT_NAMED_COLLATOR -- + * A collator list entry + */ +struct __wt_named_collator { + const char *name; /* Name of collator */ + WT_COLLATOR *collator; /* User supplied object */ + TAILQ_ENTRY(__wt_named_collator) q; /* Linked list of collators */ +}; + +/* + * WT_NAMED_COMPRESSOR -- + * A compressor list entry + */ +struct __wt_named_compressor { + const char *name; /* Name of compressor */ + WT_COMPRESSOR *compressor; /* User supplied callbacks */ + /* Linked list of compressors */ + TAILQ_ENTRY(__wt_named_compressor) q; +}; + +/* + * WT_NAMED_DATA_SOURCE -- + * A data source list entry + */ +struct __wt_named_data_source { + const char *prefix; /* Name of data source */ + WT_DATA_SOURCE *dsrc; /* User supplied callbacks */ + /* Linked list of data sources */ + TAILQ_ENTRY(__wt_named_data_source) q; +}; + +/* + * Allocate some additional slots for internal sessions. There is a default + * session for each connection, plus a session for each server thread. 
+ */ +#define WT_NUM_INTERNAL_SESSIONS 10 + +/* + * WT_CONNECTION_IMPL -- + * Implementation of WT_CONNECTION + */ +struct __wt_connection_impl { + WT_CONNECTION iface; + + /* For operations without an application-supplied session */ + WT_SESSION_IMPL *default_session; + WT_SESSION_IMPL dummy_session; + + const char *cfg; /* Connection configuration */ + + WT_SPINLOCK api_lock; /* Connection API spinlock */ + WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */ + WT_SPINLOCK fh_lock; /* File handle queue spinlock */ + WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */ + WT_SPINLOCK schema_lock; /* Schema operation spinlock */ + + /* + * We distribute the btree page locks across a set of spin locks; it + * can't be an array, we impose cache-line alignment and gcc doesn't + * support that for arrays. Don't use too many: they are only held for + * very short operations, each one is 64 bytes, so 256 will fill the L1 + * cache on most CPUs. + */ +#define WT_PAGE_LOCKS(conn) 16 + WT_SPINLOCK *page_lock; /* Btree page spinlocks */ + u_int page_lock_cnt; /* Next spinlock to use */ + + /* Connection queue */ + TAILQ_ENTRY(__wt_connection_impl) q; + /* Cache pool queue */ + TAILQ_ENTRY(__wt_connection_impl) cpq; + + const char *home; /* Database home */ + const char *error_prefix; /* Database error prefix */ + int is_new; /* Connection created database */ + + WT_EXTENSION_API extension_api; /* Extension API */ + + /* Configuration */ + const WT_CONFIG_ENTRY **config_entries; + + void **foc; /* Free-on-close array */ + size_t foc_cnt; /* Array entries */ + size_t foc_size; /* Array size */ + + WT_FH *lock_fh; /* Lock file handle */ + + uint64_t split_gen; /* Generation number for splits */ + + WT_SPINLOCK dhandle_lock; /* Locked: dhandle sweep */ + /* Locked: data handle list */ + SLIST_HEAD(__wt_dhandle_lh, __wt_data_handle) dhlh; + /* Locked: LSM handle list. 
*/ + TAILQ_HEAD(__wt_lsm_qh, __wt_lsm_tree) lsmqh; + /* Locked: file list */ + TAILQ_HEAD(__wt_fh_qh, __wt_fh) fhqh; + /* Locked: library list */ + TAILQ_HEAD(__wt_dlh_qh, __wt_dlh) dlhqh; + + WT_SPINLOCK block_lock; /* Locked: block manager list */ + TAILQ_HEAD(__wt_block_qh, __wt_block) blockqh; + + u_int open_btree_count; /* Locked: open writable btree count */ + uint32_t next_file_id; /* Locked: file ID counter */ + + /* + * WiredTiger allocates space for 50 simultaneous sessions (threads of + * control) by default. Growing the number of threads dynamically is + * possible, but tricky since server threads are walking the array + * without locking it. + * + * There's an array of WT_SESSION_IMPL pointers that reference the + * allocated array; we do it that way because we want an easy way for + * the server thread code to avoid walking the entire array when only a + * few threads are running. + */ + WT_SESSION_IMPL *sessions; /* Session reference */ + uint32_t session_size; /* Session array size */ + uint32_t session_cnt; /* Session count */ + + /* + * WiredTiger allocates space for a fixed number of hazard pointers + * in each thread of control. 
+ */ + uint32_t hazard_max; /* Hazard array size */ + + WT_CACHE *cache; /* Page cache */ + uint64_t cache_size; + + WT_TXN_GLOBAL txn_global; /* Global transaction state */ + + WT_SPINLOCK hot_backup_lock; /* Hot backup serialization */ + int hot_backup; + + WT_SESSION_IMPL *ckpt_session; /* Checkpoint thread session */ + wt_thread_t ckpt_tid; /* Checkpoint thread */ + int ckpt_tid_set; /* Checkpoint thread set */ + WT_CONDVAR *ckpt_cond; /* Checkpoint wait mutex */ + const char *ckpt_config; /* Checkpoint configuration */ +#define WT_CKPT_LOGSIZE(conn) ((conn)->ckpt_logsize != 0) + wt_off_t ckpt_logsize; /* Checkpoint log size period */ + uint32_t ckpt_signalled; /* Checkpoint signalled */ + long ckpt_usecs; /* Checkpoint period */ + + int compact_in_memory_pass; /* Compaction serialization */ + +#define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */ +#define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */ +#define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */ +#define WT_CONN_STAT_NONE 0x08 /* don't gather statistics */ +#define WT_CONN_STAT_ON_CLOSE 0x10 /* output statistics on close */ + uint32_t stat_flags; + + WT_CONNECTION_STATS stats; /* Connection statistics */ + +#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING + /* + * Spinlock registration, so we can track which spinlocks are heavily + * used, which are blocking and where. + * + * There's an array of spinlocks, and an array of blocking IDs. 
+ */ +#define WT_SPINLOCK_MAX 1024 +#define WT_SPINLOCK_MAX_LOCATION_ID 60 + WT_SPINLOCK *spinlock_list[WT_SPINLOCK_MAX]; + + /* Spinlock blocking matrix */ + struct __wt_connection_stats_spinlock { + const char *name; /* Mutex name */ + + const char *file; /* Caller's file/line, ID location */ + int line; + + u_int total; /* Count of total, blocked calls */ + u_int blocked[WT_SPINLOCK_MAX_LOCATION_ID]; + } spinlock_block[WT_SPINLOCK_MAX_LOCATION_ID]; +#endif + + WT_ASYNC *async; /* Async structure */ + int async_cfg; /* Global async configuration */ + uint32_t async_size; /* Async op array size */ + uint32_t async_workers; /* Number of async workers */ + + WT_LSM_MANAGER lsm_manager; /* LSM worker thread information */ + + WT_SESSION_IMPL *evict_session; /* Eviction server sessions */ + wt_thread_t evict_tid; /* Eviction server thread ID */ + int evict_tid_set; /* Eviction server thread ID set */ + + uint32_t evict_workers_max;/* Max eviction workers */ + uint32_t evict_workers_min;/* Min eviction workers */ + uint32_t evict_workers; /* Number of eviction workers */ + WT_EVICT_WORKER *evict_workctx; /* Eviction worker context */ + + WT_SESSION_IMPL *stat_session; /* Statistics log session */ + wt_thread_t stat_tid; /* Statistics log thread */ + int stat_tid_set; /* Statistics log thread set */ + WT_CONDVAR *stat_cond; /* Statistics log wait mutex */ + const char *stat_format; /* Statistics log timestamp format */ + FILE *stat_fp; /* Statistics log file handle */ + char *stat_path; /* Statistics log path format */ + char **stat_sources; /* Statistics log list of objects */ + const char *stat_stamp; /* Statistics log entry timestamp */ + long stat_usecs; /* Statistics log period */ + + int logging; /* Global logging configuration */ + int archive; /* Global archive configuration */ + WT_CONDVAR *arch_cond; /* Log archive wait mutex */ + WT_SESSION_IMPL *arch_session; /* Log archive session */ + wt_thread_t arch_tid; /* Log archive thread */ + int arch_tid_set; /* 
Log archive thread set */ + WT_LOG *log; /* Logging structure */ + wt_off_t log_file_max; /* Log file max size */ + const char *log_path; /* Logging path format */ + uint32_t txn_logsync; /* Log sync configuration */ + + WT_SESSION_IMPL *sweep_session; /* Handle sweep session */ + wt_thread_t sweep_tid; /* Handle sweep thread */ + int sweep_tid_set; /* Handle sweep thread set */ + WT_CONDVAR *sweep_cond; /* Handle sweep wait mutex */ + + /* Locked: collator list */ + TAILQ_HEAD(__wt_coll_qh, __wt_named_collator) collqh; + + /* Locked: compressor list */ + TAILQ_HEAD(__wt_comp_qh, __wt_named_compressor) compqh; + + /* Locked: data source list */ + TAILQ_HEAD(__wt_dsrc_qh, __wt_named_data_source) dsrcqh; + + void *lang_private; /* Language specific private storage */ + + /* If non-zero, all buffers used for I/O will be aligned to this. */ + size_t buffer_alignment; + + uint32_t schema_gen; /* Schema generation number */ + + wt_off_t data_extend_len; /* file_extend data length */ + wt_off_t log_extend_len; /* file_extend log length */ + + uint32_t direct_io; /* O_DIRECT file type flags */ + int mmap; /* mmap configuration */ + uint32_t verbose; + + uint32_t flags; +}; diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h new file mode 100644 index 00000000000..17185499b88 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -0,0 +1,380 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * Initialize a static WT_CURSOR structure. 
+ */ +#define WT_CURSOR_STATIC_INIT(n, \ + get_key, \ + get_value, \ + set_key, \ + set_value, \ + compare, \ + next, \ + prev, \ + reset, \ + search, \ + search_near, \ + insert, \ + update, \ + remove, \ + close) \ + static const WT_CURSOR n = { \ + NULL, /* session */ \ + NULL, /* uri */ \ + NULL, /* key_format */ \ + NULL, /* value_format */ \ + (int (*)(WT_CURSOR *, ...))(get_key), \ + (int (*)(WT_CURSOR *, ...))(get_value), \ + (void (*)(WT_CURSOR *, ...))(set_key), \ + (void (*)(WT_CURSOR *, ...))(set_value), \ + (int (*)(WT_CURSOR *, WT_CURSOR *, int *))(compare), \ + next, \ + prev, \ + reset, \ + search, \ + (int (*)(WT_CURSOR *, int *))(search_near), \ + insert, \ + update, \ + remove, \ + close, \ + { NULL, NULL }, /* TAILQ_ENTRY q */ \ + 0, /* recno key */ \ + { 0 }, /* recno raw buffer */ \ + NULL, /* json_private */ \ + NULL, /* lang_private */ \ + { NULL, 0, 0, NULL, 0 }, /* WT_ITEM key */ \ + { NULL, 0, 0, NULL, 0 }, /* WT_ITEM value */ \ + 0, /* int saved_err */ \ + NULL, /* internal_uri */ \ + 0 /* uint32_t flags */ \ +} + +struct __wt_cursor_backup_entry { + char *name; /* File name */ + WT_DATA_HANDLE *handle; /* Handle */ +}; +struct __wt_cursor_backup { + WT_CURSOR iface; + + size_t next; /* Cursor position */ + FILE *bfp; /* Backup file */ + + WT_CURSOR_BACKUP_ENTRY *list; /* List of files to be copied. */ + size_t list_allocated; + size_t list_next; +}; + +struct __wt_cursor_btree { + WT_CURSOR iface; + + WT_BTREE *btree; /* Enclosing btree */ + + /* + * The following fields are set by the search functions as a precursor + * to page modification: we have a page, a WT_COL/WT_ROW slot on the + * page, an insert head, insert list and a skiplist stack (the stack of + * skiplist entries leading to the insert point). The search functions + * also return the relationship of the search key to the found key. 
+ */ + WT_REF *ref; /* Current page */ + uint32_t slot; /* WT_COL/WT_ROW 0-based slot */ + + WT_INSERT_HEAD *ins_head; /* Insert chain head */ + WT_INSERT *ins; /* Current insert node */ + /* Search stack */ + WT_INSERT **ins_stack[WT_SKIP_MAXDEPTH]; + + /* Next item(s) found during search */ + WT_INSERT *next_stack[WT_SKIP_MAXDEPTH]; + + uint64_t recno; /* Record number */ + + /* + * The search function sets compare to: + * < 1 if the found key is less than the specified key + * 0 if the found key matches the specified key + * > 1 if the found key is larger than the specified key + */ + int compare; + + /* + * The key value from a binary search of a row-store files; we keep a + * copy of the last key we retrieved in the search, it avoids having + * doing the additional work of getting the key again for return to + * the application. + */ + WT_ITEM search_key; + + /* + * It's relatively expensive to calculate the last record on a variable- + * length column-store page because of the repeat values. Calculate it + * once per page and cache it. This value doesn't include the skiplist + * of appended entries on the last page. + */ + uint64_t last_standard_recno; + + /* + * For row-store pages, we need a single item that tells us the part of + * the page we're walking (otherwise switching from next to prev and + * vice-versa is just too complicated), so we map the WT_ROW and + * WT_INSERT_HEAD insert array slots into a single name space: slot 1 + * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is + * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are + * odd-numbered slots, and WT_ROW array slots are even-numbered slots. + */ + uint32_t row_iteration_slot; /* Row-store iteration slot */ + + /* + * Variable-length column-store values are run-length encoded and may + * be overflow values or Huffman encoded. 
To avoid repeatedly reading + * overflow values or decompressing encoded values, process it once and + * store the result in a temporary buffer. The cip_saved field is used + * to determine if we've switched columns since our last cursor call. + */ + WT_COL *cip_saved; /* Last iteration reference */ + + /* + * We don't instantiate prefix-compressed keys on pages where there's no + * Huffman encoding because we don't want to waste memory if only moving + * a cursor through the page, and it's faster to build keys while moving + * through the page than to roll-forward from a previously instantiated + * key (we don't instantiate all of the keys, just the ones at binary + * search points). We can't use the application's WT_CURSOR key field + * as a copy of the last-returned key because it may have been altered + * by the API layer, for example, dump cursors. Instead we store the + * last-returned key in a temporary buffer. The rip_saved field is used + * to determine if the key in the temporary buffer has the prefix needed + * for building the current key. + */ + WT_ROW *rip_saved; /* Last-returned key reference */ + + /* + * A temporary buffer for caching RLE values for column-store files. + */ + WT_ITEM tmp; + + /* + * The update structure allocated by the row- and column-store modify + * functions, used to avoid a data copy in the WT_CURSOR.update call. + */ + WT_UPDATE *modify_update; + + /* + * Fixed-length column-store items are a single byte, and it's simpler + * and cheaper to allocate the space for it now than keep checking to + * see if we need to grow the buffer. 
+ */ + uint8_t v; /* Fixed-length return value */ + + uint8_t append_tree; /* Cursor appended to the tree */ + +#define WT_CBT_ACTIVE 0x01 /* Active in the tree */ +#define WT_CBT_ITERATE_APPEND 0x02 /* Col-store: iterating append list */ +#define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */ +#define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */ +#define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */ +#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */ + uint8_t flags; +}; + +struct __wt_cursor_bulk { + WT_CURSOR_BTREE cbt; + + WT_REF *ref; /* The leaf page */ + WT_PAGE *leaf; + + /* + * Variable-length column store compares values during bulk load as + * part of RLE compression, row-store compares keys during bulk load + * to avoid corruption. + */ + WT_ITEM last; /* Last key/value seen */ + + /* + * Variable-length column-store RLE counter (also overloaded to mean + * the first time through the bulk-load insert routine, when set to 0). + */ + uint64_t rle; + + /* + * Fixed-length column-store current entry in memory chunk count, and + * the maximum number of records per chunk. + */ + uint32_t entry; /* Entry count */ + uint32_t nrecs; /* Max records per chunk */ + + /* Special bitmap bulk load for fixed-length column stores. 
*/ + int bitmap; + + void *reconcile; /* Reconciliation information */ +}; + +struct __wt_cursor_config { + WT_CURSOR iface; +}; + +struct __wt_cursor_data_source { + WT_CURSOR iface; + + WT_COLLATOR *collator; /* Configured collator */ + int collator_owned; /* Collator needs to be terminated */ + + WT_CURSOR *source; /* Application-owned cursor */ +}; + +struct __wt_cursor_dump { + WT_CURSOR iface; + + WT_CURSOR *child; +}; + +struct __wt_cursor_index { + WT_CURSOR iface; + + WT_TABLE *table; + WT_INDEX *index; + const char *key_plan, *value_plan; + + WT_CURSOR *child; + WT_CURSOR **cg_cursors; +}; + +struct __wt_cursor_json { + char *key_buf; /* JSON formatted string */ + char *value_buf; /* JSON formatted string */ + WT_CONFIG_ITEM key_names; /* Names of key columns */ + WT_CONFIG_ITEM value_names; /* Names of value columns */ +}; + +struct __wt_cursor_log { + WT_CURSOR iface; + + WT_LSN *cur_lsn; /* LSN of current record */ + WT_LSN *next_lsn; /* LSN of next record */ + WT_ITEM *logrec; /* Copy of record for cursor */ + WT_ITEM *opkey, *opvalue; /* Op key/value copy */ + const uint8_t *stepp, *stepp_end; /* Pointer within record */ + uint32_t step_count; /* Intra-record count */ + uint32_t rectype; /* Record type */ + uint64_t txnid; /* Record txnid */ + uint32_t flags; +}; + +struct __wt_cursor_metadata { + WT_CURSOR iface; + + WT_CURSOR *file_cursor; /* Queries of regular metadata */ + +#define WT_MDC_POSITIONED 0x01 +#define WT_MDC_ONMETADATA 0x02 + uint32_t flags; +}; + +struct __wt_cursor_stat { + WT_CURSOR iface; + + int notpositioned; /* Cursor not positioned */ + + WT_STATS *stats; /* Stats owned by the cursor */ + WT_STATS *stats_first; /* First stats reference */ + int stats_base; /* Base statistics value */ + int stats_count; /* Count of stats elements */ + + union { /* Copies of the statistics */ + WT_DSRC_STATS dsrc_stats; + WT_CONNECTION_STATS conn_stats; + } u; + + int key; /* Current stats key */ + uint64_t v; /* Current stats value */ + WT_ITEM 
pv; /* Current stats value (string) */ + + /* Uses the same values as WT_CONNECTION::stat_flags field */ + uint32_t flags; +}; + +/* + * WT_CURSOR_STATS -- + * Return a reference to a statistic cursor's stats structures; use the + * WT_CURSOR.stats_first field instead of WT_CURSOR.stats because the latter + * is NULL when non-cursor memory is used to hold the statistics. + */ +#define WT_CURSOR_STATS(cursor) \ + (((WT_CURSOR_STAT *)cursor)->stats_first) + +struct __wt_cursor_table { + WT_CURSOR iface; + + WT_TABLE *table; + const char *plan; + + const char **cfg; /* Saved configuration string */ + + WT_CURSOR **cg_cursors; + WT_CURSOR **idx_cursors; +}; + +#define WT_CURSOR_PRIMARY(cursor) \ + (((WT_CURSOR_TABLE *)cursor)->cg_cursors[0]) + +#define WT_CURSOR_RECNO(cursor) WT_STREQ((cursor)->key_format, "r") + +/* + * WT_CURSOR_NEEDKEY, WT_CURSOR_NEEDVALUE -- + * Check if we have a key/value set. There's an additional semantic + * implemented here: if we're pointing into the tree, and about to perform + * a cursor operation, get a local copy of whatever we're referencing in + * the tree, there's an obvious race with the cursor moving and the key or + * value reference, and it's better to solve it here than in the underlying + * data-source layers. + * + * WT_CURSOR_CHECKKEY -- + * Check if a key is set without making a copy. + * + * WT_CURSOR_NOVALUE -- + * Release any cached value before an operation that could update the + * transaction context and free data a value is pointing to. 
+ */ +#define WT_CURSOR_CHECKKEY(cursor) do { \ + if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) \ + WT_ERR(__wt_cursor_kv_not_set(cursor, 1)); \ +} while (0) +#define WT_CURSOR_CHECKVALUE(cursor) do { \ + if (!F_ISSET(cursor, WT_CURSTD_VALUE_SET)) \ + WT_ERR(__wt_cursor_kv_not_set(cursor, 0)); \ +} while (0) +#define WT_CURSOR_NEEDKEY(cursor) do { \ + if (F_ISSET(cursor, WT_CURSTD_KEY_INT)) { \ + if (!WT_DATA_IN_ITEM(&(cursor)->key)) \ + WT_ERR(__wt_buf_set( \ + (WT_SESSION_IMPL *)(cursor)->session, \ + &(cursor)->key, \ + (cursor)->key.data, (cursor)->key.size)); \ + F_CLR(cursor, WT_CURSTD_KEY_INT); \ + F_SET(cursor, WT_CURSTD_KEY_EXT); \ + } \ + WT_CURSOR_CHECKKEY(cursor); \ +} while (0) +#define WT_CURSOR_NEEDVALUE(cursor) do { \ + if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) { \ + if (!WT_DATA_IN_ITEM(&(cursor)->value)) \ + WT_ERR(__wt_buf_set( \ + (WT_SESSION_IMPL *)(cursor)->session, \ + &(cursor)->value, \ + (cursor)->value.data, (cursor)->value.size));\ + F_CLR(cursor, WT_CURSTD_VALUE_INT); \ + F_SET(cursor, WT_CURSTD_VALUE_EXT); \ + } \ + WT_CURSOR_CHECKVALUE(cursor); \ +} while (0) +#define WT_CURSOR_NOVALUE(cursor) do { \ + F_CLR(cursor, WT_CURSTD_VALUE_INT); \ +} while (0) + +#define WT_CURSOR_RAW_OK \ + WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_PRINT | WT_CURSTD_RAW diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i new file mode 100644 index 00000000000..7f8e83643c5 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -0,0 +1,277 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * __cursor_set_recno -- + * The cursor value in the interface has to track the value in the + * underlying cursor, update them in parallel. 
+ */ +static inline void +__cursor_set_recno(WT_CURSOR_BTREE *cbt, uint64_t v) +{ + cbt->iface.recno = cbt->recno = v; +} + +/* + * __cursor_pos_clear -- + * Reset the cursor's location. + */ +static inline void +__cursor_pos_clear(WT_CURSOR_BTREE *cbt) +{ + /* + * Most of the cursor's location information that needs to be set on + * successful return is always set by a successful return, for example, + * we don't initialize the compare return value because it's always + * set by the row-store search. The other stuff gets cleared here, + * and it's a minimal set of things we need to clear. It would be a + * lot simpler to clear everything, but we call this function a lot. + */ + cbt->recno = 0; + + cbt->ins = NULL; + cbt->ins_head = NULL; + cbt->ins_stack[0] = NULL; + + cbt->cip_saved = NULL; + cbt->rip_saved = NULL; + + /* + * Don't clear the active flag, it's owned by the cursor enter/leave + * functions. + */ + F_CLR(cbt, ~WT_CBT_ACTIVE); +} + +/* + * __cursor_enter -- + * Activate a cursor. + */ +static inline int +__cursor_enter(WT_SESSION_IMPL *session) +{ + /* + * If there are no other cursors positioned in the session, check + * whether the cache is full. + */ + if (session->ncursors == 0) + WT_RET(__wt_cache_full_check(session)); + ++session->ncursors; + return (0); +} + +/* + * __cursor_leave -- + * Deactivate a cursor. + */ +static inline int +__cursor_leave(WT_SESSION_IMPL *session) +{ + /* + * Decrement the count of active cursors in the session. When that + * goes to zero, there are no active cursors, and we can release any + * snapshot we're holding for read committed isolation. + */ + WT_ASSERT(session, session->ncursors > 0); + if (--session->ncursors == 0) + __wt_txn_read_last(session); + + return (0); +} + +/* + * __curfile_enter -- + * Activate a file cursor. 
+ */ +static inline int +__curfile_enter(WT_CURSOR_BTREE *cbt) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + + WT_RET(__cursor_enter(session)); + F_SET(cbt, WT_CBT_ACTIVE); + return (0); +} + +/* + * __curfile_leave -- + * Clear a file cursor's position. + */ +static inline int +__curfile_leave(WT_CURSOR_BTREE *cbt) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + + /* If the cursor was active, deactivate it. */ + if (F_ISSET(cbt, WT_CBT_ACTIVE)) { + WT_RET(__cursor_leave(session)); + F_CLR(cbt, WT_CBT_ACTIVE); + } + + /* + * Release any page references we're holding. This can trigger + * eviction (e.g., forced eviction of big pages), so it is important to + * do it after releasing our snapshot above. + */ + WT_RET(__wt_page_release(session, cbt->ref, 0)); + cbt->ref = NULL; + return (0); +} + +/* + * __cursor_func_init -- + * Cursor call setup. + */ +static inline int +__cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + + if (reenter) + WT_RET(__curfile_leave(cbt)); + if (!F_ISSET(cbt, WT_CBT_ACTIVE)) + WT_RET(__curfile_enter(cbt)); + __wt_txn_cursor_op(session); + return (0); +} + +/* + * __cursor_reset -- + * Reset the cursor. + */ +static inline int +__cursor_reset(WT_CURSOR_BTREE *cbt) +{ + WT_DECL_RET; + + /* + * The cursor is leaving the API, and no longer holds any position, + * generally called to clean up the cursor after an error. + */ + ret = __curfile_leave(cbt); + __cursor_pos_clear(cbt); + return (ret); +} + +/* + * __cursor_row_slot_return -- + * Return a row-store leaf page slot's K/V pair. 
 */
static inline int
__cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_ITEM *kb, *vb;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	void *copy;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = S2BT(session);
	page = cbt->ref->page;

	unpack = NULL;

	/* The K/V pair is returned through the public cursor interface. */
	kb = &cbt->iface.key;
	vb = &cbt->iface.value;

	/*
	 * The row-store key can change underfoot; explicitly take a copy.
	 */
	copy = WT_ROW_KEY_COPY(rip);

	/*
	 * Get a key: we could just call __wt_row_leaf_key, but as a cursor
	 * is running through the tree, we may have additional information
	 * here (we may have the fully-built key that's immediately before
	 * the prefix-compressed key we want, so it's a faster construction).
	 *
	 * First, check for an immediately available key.
	 */
	if (__wt_row_leaf_key_info(
	    page, copy, NULL, &cell, &kb->data, &kb->size))
		goto value;

	/* Huffman encoded keys are a slow path in all cases. */
	if (btree->huffman_key != NULL)
		goto slow;

	/*
	 * Unpack the cell and deal with overflow and prefix-compressed keys.
	 * Inline building simple prefix-compressed keys from a previous key,
	 * otherwise build from scratch.
	 *
	 * The fast path requires the previous slot's key to be the one saved
	 * in cbt->tmp (rip_saved == rip - 1), so its prefix bytes are already
	 * in the buffer.
	 */
	unpack = &_unpack;
	__wt_cell_unpack(cell, unpack);
	if (unpack->type == WT_CELL_KEY &&
	    cbt->rip_saved != NULL && cbt->rip_saved == rip - 1) {
		WT_ASSERT(session, cbt->tmp.size >= unpack->prefix);

		/*
		 * Grow the buffer as necessary as well as ensure data has been
		 * copied into local buffer space, then append the suffix to the
		 * prefix already in the buffer.
		 *
		 * Don't grow the buffer unnecessarily or copy data we don't
		 * need, truncate the item's data length to the prefix bytes.
		 */
		cbt->tmp.size = unpack->prefix;
		WT_RET(__wt_buf_grow(
		    session, &cbt->tmp, cbt->tmp.size + unpack->size));
		memcpy((uint8_t *)cbt->tmp.data + cbt->tmp.size,
		    unpack->data, unpack->size);
		cbt->tmp.size += unpack->size;
	} else {
		/*
		 * Call __wt_row_leaf_key_work instead of __wt_row_leaf_key: we
		 * already did __wt_row_leaf_key's fast-path checks inline.
		 */
slow:		WT_RET(
		    __wt_row_leaf_key_work(session, page, rip, &cbt->tmp, 0));
	}
	kb->data = cbt->tmp.data;
	kb->size = cbt->tmp.size;
	cbt->rip_saved = rip;

value:
	/*
	 * If the item was ever modified, use the WT_UPDATE data.  Note the
	 * caller passes us the update: it has already resolved which one
	 * (if any) is visible.
	 */
	if (upd != NULL) {
		vb->data = WT_UPDATE_DATA(upd);
		vb->size = upd->size;
		return (0);
	}

	/* Else, simple values have their location encoded in the WT_ROW. */
	if (__wt_row_leaf_value(page, rip, vb))
		return (0);

	/*
	 * Else, take the value from the original page cell (which may be
	 * empty).
	 */
	if ((cell = __wt_row_leaf_value_cell(page, rip, unpack)) == NULL) {
		vb->data = "";
		vb->size = 0;
		return (0);
	}

	unpack = &_unpack;
	__wt_cell_unpack(cell, unpack);
	return (__wt_page_cell_data_ref(session, cbt->ref->page, unpack, vb));
}
diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h
new file mode 100644
index 00000000000..5556627c74d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/dhandle.h
@@ -0,0 +1,73 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

/*
 * XXX
 * The server threads use their own WT_SESSION_IMPL handles because they may
 * want to block (for example, the eviction server calls reconciliation, and
 * some of the reconciliation diagnostic code reads pages), and the user's
 * session handle is already blocking on a server thread.
The problem is the + * server thread needs to reference the correct btree handle, and that's + * hanging off the application's thread of control. For now, I'm just making + * it obvious where that's getting done. + */ +#define WT_SET_BTREE_IN_SESSION(s, b) ((s)->dhandle = b->dhandle) +#define WT_CLEAR_BTREE_IN_SESSION(s) ((s)->dhandle = NULL) + +#define WT_WITH_DHANDLE(s, d, e) do { \ + WT_DATA_HANDLE *__saved_dhandle = (s)->dhandle; \ + (s)->dhandle = (d); \ + e; \ + (s)->dhandle = __saved_dhandle; \ +} while (0) + +#define WT_WITH_BTREE(s, b, e) WT_WITH_DHANDLE(s, (b)->dhandle, e) + +/* + * WT_DATA_HANDLE -- + * A handle for a generic named data source. + */ +struct __wt_data_handle { + WT_RWLOCK *rwlock; /* Lock for shared/exclusive ops */ + SLIST_ENTRY(__wt_data_handle) l;/* Linked list of handles */ + + /* + * Sessions caching a connection's data handle will have a non-zero + * reference count; sessions using a connection's data handle will + * have a non-zero in-use count. + */ + uint32_t session_ref; /* Sessions referencing this handle */ + int32_t session_inuse; /* Sessions using this handle */ + time_t timeofdeath; /* Use count went to 0 */ + + uint64_t name_hash; /* Hash of name */ + const char *name; /* Object name as a URI */ + const char *checkpoint; /* Checkpoint name (or NULL) */ + const char **cfg; /* Configuration information */ + + WT_DATA_SOURCE *dsrc; /* Data source for this handle */ + void *handle; /* Generic handle */ + + /* + * Data handles can be closed without holding the schema lock; threads + * walk the list of open handles, operating on them (checkpoint is the + * best example). To avoid sources disappearing underneath checkpoint, + * lock the data handle when closing it. 
+ */ + WT_SPINLOCK close_lock; /* Lock to close the handle */ + + WT_DSRC_STATS stats; /* Data-source statistics */ + + /* Flags values over 0xff are reserved for WT_BTREE_* */ +#define WT_DHANDLE_DISCARD 0x01 /* Discard on release */ +#define WT_DHANDLE_DISCARD_CLOSE 0x02 /* Close on release */ +#define WT_DHANDLE_EXCLUSIVE 0x04 /* Need exclusive access */ +#define WT_DHANDLE_HAVE_REF 0x08 /* Already have ref */ +#define WT_DHANDLE_LOCK_ONLY 0x10 /* Handle only used as a lock */ +#define WT_DHANDLE_OPEN 0x20 /* Handle is open */ + uint32_t flags; +}; diff --git a/src/third_party/wiredtiger/src/include/dlh.h b/src/third_party/wiredtiger/src/include/dlh.h new file mode 100644 index 00000000000..3974ae2792c --- /dev/null +++ b/src/third_party/wiredtiger/src/include/dlh.h @@ -0,0 +1,15 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +struct __wt_dlh { + TAILQ_ENTRY(__wt_dlh) q; /* List of open libraries. */ + + void *handle; /* Handle returned by dlopen. */ + char *name; + + int (*terminate)(WT_CONNECTION *); /* Terminate function. */ +}; diff --git a/src/third_party/wiredtiger/src/include/error.h b/src/third_party/wiredtiger/src/include/error.h new file mode 100644 index 00000000000..9bccc80faec --- /dev/null +++ b/src/third_party/wiredtiger/src/include/error.h @@ -0,0 +1,141 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#define WT_DEBUG_POINT ((void *)0xdeadbeef) +#define WT_DEBUG_BYTE (0xab) + +/* In DIAGNOSTIC mode, yield in places where we want to encourage races. */ +#ifdef HAVE_DIAGNOSTIC +#define WT_HAVE_DIAGNOSTIC_YIELD do { \ + __wt_yield(); \ +} while (0) +#else +#define WT_HAVE_DIAGNOSTIC_YIELD +#endif + +/* Set "ret" and branch-to-err-label tests. 
 */
+/*
+ * The WT_ERR_* macros assume a local variable "int ret" and a label "err:"
+ * exist in the calling function: on failure they set ret and jump to err
+ * for cleanup.  The WT_RET_* variants below declare their own __ret and
+ * return immediately, so they need neither.
+ */
+#define	WT_ERR(a) do {							\
+	if ((ret = (a)) != 0)						\
+		goto err;						\
+} while (0)
+#define	WT_ERR_MSG(session, v, ...) do {				\
+	ret = (v);							\
+	__wt_err(session, ret, __VA_ARGS__);				\
+	goto err;							\
+} while (0)
+#define	WT_ERR_BUSY_OK(a) do {						\
+	if ((ret = (a)) != 0) {						\
+		if (ret == EBUSY)					\
+			ret = 0;					\
+		else							\
+			goto err;					\
+	}								\
+} while (0)
+#define	WT_ERR_NOTFOUND_OK(a) do {					\
+	if ((ret = (a)) != 0) {						\
+		if (ret == WT_NOTFOUND)					\
+			ret = 0;					\
+		else							\
+			goto err;					\
+	}								\
+} while (0)
+#define	WT_ERR_TEST(a, v) do {						\
+	if (a) {							\
+		ret = (v);						\
+		goto err;						\
+	}								\
+} while (0)
+
+/* Return tests. */
+#define	WT_RET(a) do {							\
+	int __ret;							\
+	if ((__ret = (a)) != 0)						\
+		return (__ret);						\
+} while (0)
+#define	WT_RET_TEST(a, v) do {						\
+	if (a)								\
+		return (v);						\
+} while (0)
+#define	WT_RET_MSG(session, v, ...) do {				\
+	int __ret = (v);						\
+	__wt_err(session, __ret, __VA_ARGS__);				\
+	return (__ret);							\
+} while (0)
+#define	WT_RET_BUSY_OK(a) do {						\
+	int __ret;							\
+	if ((__ret = (a)) != 0 && __ret != EBUSY)			\
+		return (__ret);						\
+} while (0)
+#define	WT_RET_NOTFOUND_OK(a) do {					\
+	int __ret;							\
+	if ((__ret = (a)) != 0 && __ret != WT_NOTFOUND)			\
+		return (__ret);						\
+} while (0)
+/*
+ * Set "ret" if not already set.  WT_PANIC always overrides an earlier
+ * error; otherwise ret is only overwritten when it currently holds success
+ * or a non-error sentinel (WT_DUPLICATE_KEY / WT_NOTFOUND), preserving the
+ * first significant error.
+ */
+#define	WT_TRET(a) do {							\
+	int __ret;							\
+	if ((__ret = (a)) != 0 &&					\
+	    (__ret == WT_PANIC ||					\
+	    ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND))	\
+		ret = __ret;						\
+} while (0)
+#define	WT_TRET_BUSY_OK(a) do {						\
+	int __ret;							\
+	if ((__ret = (a)) != 0 && __ret != EBUSY &&			\
+	    (__ret == WT_PANIC ||					\
+	    ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND))	\
+		ret = __ret;						\
+} while (0)
+#define	WT_TRET_NOTFOUND_OK(a) do {					\
+	int __ret;							\
+	if ((__ret = (a)) != 0 && __ret != WT_NOTFOUND &&		\
+	    (__ret == WT_PANIC ||					\
+	    ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND))	\
+		ret = __ret;						\
+} while (0)
+
+/*
+ * Return and branch-to-err-label cases for switch statements: each expands
+ * to the switch's "default:" label.
+ */
+#define	WT_ILLEGAL_VALUE(session)					\
+	default:							\
+		return (__wt_illegal_value(session, NULL))
+#define	WT_ILLEGAL_VALUE_ERR(session)					\
+	default:							\
+		WT_ERR(__wt_illegal_value(session, NULL))
+#define	WT_ILLEGAL_VALUE_SET(session)					\
+	default:							\
+		ret = __wt_illegal_value(session, NULL);		\
+		break
+
+#define	WT_PANIC_MSG(session, v, ...) do {				\
+	__wt_err(session, v, __VA_ARGS__);				\
+	(void)__wt_panic(session);					\
+} while (0)
+#define	WT_PANIC_ERR(session, v, ...) do {				\
+	WT_PANIC_MSG(session, v, __VA_ARGS__);				\
+	WT_ERR(WT_PANIC);						\
+} while (0)
+#define	WT_PANIC_RET(session, v, ...) do {				\
+	WT_PANIC_MSG(session, v, __VA_ARGS__);				\
+	/* Return WT_PANIC regardless of earlier return codes. */	\
+	return (WT_PANIC);						\
+} while (0)
+
+/*
+ * WT_ASSERT
+ *	Assert an expression, aborting in diagnostic mode.  Otherwise,
+ * "use" the session to keep the compiler quiet and don't evaluate the
+ * expression.
+ */
+#ifdef HAVE_DIAGNOSTIC
+#define	WT_ASSERT(session, exp) do {					\
+	if (!(exp))							\
+		__wt_assert(session, 0, __FILE__, __LINE__, "%s", #exp);\
+} while (0)
+#else
+/* "exp" is never evaluated here: assertions must not have side effects. */
+#define	WT_ASSERT(session, exp)						\
+	WT_UNUSED(session)
+#endif
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
new file mode 100644
index 00000000000..2ab964475d8
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -0,0 +1,650 @@
+/* DO NOT EDIT: automatically built by dist/s_prototypes.
*/ + +extern void __wt_async_stats_update(WT_SESSION_IMPL *session); +extern int __wt_async_create(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_async_destroy(WT_SESSION_IMPL *session); +extern int __wt_async_flush(WT_SESSION_IMPL *session); +extern int __wt_async_new_op(WT_SESSION_IMPL *session, const char *uri, const char *config, const char *cfg[], WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP_IMPL **opp); +extern int __wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op); +extern int __wt_async_op_init(WT_SESSION_IMPL *session); +extern void *__wt_async_worker(void *arg); +extern int __wt_block_addr_to_buffer(WT_BLOCK *block, uint8_t **pp, wt_off_t offset, uint32_t size, uint32_t cksum); +extern int __wt_block_buffer_to_addr(WT_BLOCK *block, const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump); +extern int __wt_block_addr_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int live); +extern int __wt_block_addr_string(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size); +extern int __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci); +extern int __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci); +extern int __wt_block_ckpt_init( WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name); +extern int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, uint8_t *root_addr, size_t *root_addr_sizep, int checkpoint); +extern int __wt_block_checkpoint_unload( WT_SESSION_IMPL *session, WT_BLOCK *block, int checkpoint); +extern void __wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci); +extern int __wt_block_checkpoint(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, 
int data_cksum); +extern int __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block); +extern int __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block); +extern int __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block); +extern int __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp); +extern int __wt_block_compact_page_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int *skipp); +extern int __wt_block_misplaced(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *tag, wt_off_t offset, uint32_t size, int live); +extern int __wt_block_off_remove_overlap( WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size); +extern int __wt_block_alloc( WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size); +extern int __wt_block_free(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size); +extern int __wt_block_off_free( WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size); +extern int __wt_block_extlist_check( WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *bl); +extern int __wt_block_extlist_overlap( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci); +extern int __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b); +extern int __wt_block_insert_ext( WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size); +extern int __wt_block_extlist_read_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size); +extern int __wt_block_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size); +extern int __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional); +extern int __wt_block_extlist_truncate( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el); +extern int __wt_block_extlist_init(WT_SESSION_IMPL *session, 
WT_EXTLIST *el, const char *name, const char *extname, int track_size); +extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el); +extern int __wt_block_map( WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp, void **mappingcookie); +extern int __wt_block_unmap( WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen, void **mappingcookie); +extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], int forced_salvage, int readonly, uint32_t allocsize, WT_BM **bmp); +extern int __wt_block_manager_truncate( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize); +extern int __wt_block_manager_create( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize); +extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], int forced_salvage, int readonly, uint32_t allocsize, WT_BLOCK **blockp); +extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block); +extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize); +extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats); +extern int __wt_bm_preload(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size); +extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size); +extern int __wt_block_read_off_blind( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset); +extern int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum); +extern int __wt_block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp); +extern void __wt_block_ext_free(WT_SESSION_IMPL *session, WT_EXT *ext); +extern int __wt_block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp); +extern void __wt_block_size_free(WT_SESSION_IMPL *session, WT_SIZE *sz); +extern int 
__wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max); +extern int __wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max); +extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block); +extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block); +extern int __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size); +extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp); +extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid); +extern int __wt_block_verify_start( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase); +extern int __wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block); +extern int __wt_verify_ckpt_load( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci); +extern int __wt_verify_ckpt_unload(WT_SESSION_IMPL *session, WT_BLOCK *block); +extern int __wt_block_verify_addr(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size); +extern u_int __wt_block_header(WT_BLOCK *block); +extern int __wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep); +extern int __wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int data_cksum); +extern int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump, int data_cksum, int caller_locked); +extern int __wt_bloom_create( WT_SESSION_IMPL *session, const char *uri, const char *config, uint64_t count, uint32_t factor, uint32_t k, WT_BLOOM **bloomp); +extern int __wt_bloom_open(WT_SESSION_IMPL *session, const char *uri, uint32_t factor, uint32_t k, WT_CURSOR *owner, WT_BLOOM **bloomp); +extern int __wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key); +extern int __wt_bloom_finalize(WT_BLOOM *bloom); +extern int 
__wt_bloom_hash(WT_BLOOM *bloom, WT_ITEM *key, WT_BLOOM_HASH *bhash); +extern int __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash); +extern int __wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key); +extern int __wt_bloom_close(WT_BLOOM *bloom); +extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config); +extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp); +extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next); +extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating); +extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating); +extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt); +extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt); +extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp); +extern int __wt_btcur_insert(WT_CURSOR_BTREE *cbt); +extern int __wt_btcur_update_check(WT_CURSOR_BTREE *cbt); +extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt); +extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt); +extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt); +extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp); +extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop); +extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt); +extern int __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v); +extern int __wt_debug_addr_print( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size); +extern int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, const char *ofile); +extern int __wt_debug_offset_blind( WT_SESSION_IMPL *session, wt_off_t offset, const char *ofile); +extern int __wt_debug_offset(WT_SESSION_IMPL *session, wt_off_t offset, uint32_t size, uint32_t cksum, const char *ofile); +extern int __wt_debug_disk( WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile); +extern int 
__wt_debug_tree_shape( WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile); +extern int __wt_debug_tree_all(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile); +extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile); +extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile); +extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp); +extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref); +extern int __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref); +extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref); +extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); +extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep); +extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages); +extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_INDEX *pindex, int free_pages); +extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref); +extern int __wt_evict_server_wake(WT_SESSION_IMPL *session); +extern int __wt_evict_create(WT_SESSION_IMPL *session); +extern int __wt_evict_destroy(WT_SESSION_IMPL *session); +extern int __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref); +extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session); +extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session); +extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app); +extern void __wt_cache_dump(WT_SESSION_IMPL *session); +extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]); +extern int __wt_btree_close(WT_SESSION_IMPL *session); +extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno); +extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size); +extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE 
**pagep); +extern void __wt_btree_evictable(WT_SESSION_IMPL *session, int on); +extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize); +extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session); +extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session); +extern int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size); +extern int __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int checkpoint, int compressed); +extern const char *__wt_page_type_string(u_int type); +extern const char *__wt_cell_type_string(uint8_t type); +extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf); +extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf); +extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store); +extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack); +extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell); +extern int +__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ); +extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep); +extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep); +extern int __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref); +extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd); +extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]); +extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst); +extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op); +extern int 
__wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk, size_t size); +extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf); +extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags); +extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove); +extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt); +extern int __wt_rec_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive); +extern void __wt_split_stash_discard(WT_SESSION_IMPL *session); +extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session); +extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp); +extern int __wt_split_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive); +extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell); +extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page); +extern int __wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page, uint8_t **addrp, size_t *addr_sizep, const void *value, size_t value_size); +extern int __wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, size_t addr_size, const void *value, size_t value_size); +extern void __wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page); +extern int __wt_ovfl_txnc_search( WT_PAGE *page, const uint8_t *addr, size_t addr_size, WT_ITEM *store); +extern int __wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, size_t addr_size, const void *value, size_t value_size); +extern void __wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page); 
+extern int __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page); +extern int __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page); +extern int __wt_rec_write(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags); +extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); +extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); +extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); +extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); +extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); +extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page); +extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key); +extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, int instantiate); +extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, void *ikeyp); +extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, void *ikeyp); +extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page); +extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove); +extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep); +extern int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep); +extern WT_UPDATE *__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd); +extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); +extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key); +extern int 
__wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert); +extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); +extern int __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len); +extern int __wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str); +extern int __wt_config_subinit( WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item); +extern int __wt_config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value); +extern int __wt_config_get(WT_SESSION_IMPL *session, const char **cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value); +extern int __wt_config_gets(WT_SESSION_IMPL *session, const char **cfg, const char *key, WT_CONFIG_ITEM *value); +extern int __wt_config_getone(WT_SESSION_IMPL *session, const char *config, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value); +extern int __wt_config_getones(WT_SESSION_IMPL *session, const char *config, const char *key, WT_CONFIG_ITEM *value); +extern int __wt_config_gets_def(WT_SESSION_IMPL *session, const char **cfg, const char *key, int def, WT_CONFIG_ITEM *value); +extern int __wt_config_subgetraw(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value); +extern int __wt_config_subgets(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cfg, const char *key, WT_CONFIG_ITEM *value); +extern void __wt_conn_foc_discard(WT_SESSION_IMPL *session); +extern int __wt_configure_method(WT_SESSION_IMPL *session, const char *method, const char *uri, const char *config, const char *type, const char *check); +extern int __wt_config_check(WT_SESSION_IMPL *session, const WT_CONFIG_ENTRY *entry, const char *config, size_t config_len); +extern int __wt_config_collapse( WT_SESSION_IMPL *session, const char **cfg, const char **config_ret); +extern int __wt_config_merge( WT_SESSION_IMPL *session, const char **cfg, const char **config_ret); +extern int __wt_config_concat( 
WT_SESSION_IMPL *session, const char **cfg, const char **config_ret); +extern int __wt_conn_config_init(WT_SESSION_IMPL *session); +extern void __wt_conn_config_discard(WT_SESSION_IMPL *session); +extern int __wt_ext_config_parser_open(WT_EXTENSION_API *wt_ext, WT_SESSION *wt_session, const char *config, size_t len, WT_CONFIG_PARSER **config_parserp); +extern int __wt_ext_config_get(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_CONFIG_ARG *cfg_arg, const char *key, WT_CONFIG_ITEM *cval); +extern int __wt_config_upgrade(WT_SESSION_IMPL *session, WT_ITEM *buf); +extern int __wt_collator_config(WT_SESSION_IMPL *session, const char **cfg, WT_COLLATOR **collatorp, int *ownp); +extern int __wt_conn_remove_collator(WT_SESSION_IMPL *session); +extern int __wt_conn_remove_compressor(WT_SESSION_IMPL *session); +extern int __wt_conn_remove_data_source(WT_SESSION_IMPL *session); +extern int __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]); +extern void __wt_cache_stats_update(WT_SESSION_IMPL *session); +extern int __wt_cache_destroy(WT_SESSION_IMPL *session); +extern int __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg); +extern int __wt_conn_cache_pool_open(WT_SESSION_IMPL *session); +extern int __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session); +extern void *__wt_cache_pool_server(void *arg); +extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session); +extern int __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize); +extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int force); +extern int __wt_conn_btree_get(WT_SESSION_IMPL *session, const char *name, const char *ckpt, const char *op_cfg[], uint32_t flags); +extern int 
__wt_conn_btree_apply(WT_SESSION_IMPL *session, int apply_checkpoints, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); +extern int __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); +extern void __wt_conn_btree_close(WT_SESSION_IMPL *session); +extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *name, int force); +extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, int final); +extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session); +extern int __wt_connection_init(WT_CONNECTION_IMPL *conn); +extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn); +extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session); +extern int __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]); +extern int __wt_connection_close(WT_CONNECTION_IMPL *conn); +extern int __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]); +extern void __wt_conn_stat_init(WT_SESSION_IMPL *session); +extern int __wt_statlog_log_one(WT_SESSION_IMPL *session); +extern int __wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_statlog_destroy(WT_SESSION_IMPL *session, int is_close); +extern int __wt_sweep_create(WT_SESSION_IMPL *session); +extern int __wt_sweep_destroy(WT_SESSION_IMPL *session); +extern int __wt_curbackup_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_backup_list_uri_append( WT_SESSION_IMPL *session, const char *name, int *skip); +extern int __wt_curbulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, int bitmap, int skip_sort_check); +extern int __wt_curconfig_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_curds_open( 
WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp); +extern int __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp); +extern int __wt_curfile_update_check(WT_CURSOR *cursor); +extern int __wt_curfile_create(WT_SESSION_IMPL *session, WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap, WT_CURSOR **cursorp); +extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, WT_CURSOR_JSON *json, int iskey, va_list ap); +extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor); +extern size_t __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, int force_unicode); +extern int __wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf); +extern int __wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype, const char **tokstart, size_t *toklen); +extern const char *__wt_json_tokname(int toktype); +extern int __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, const char *format, WT_CURSOR_JSON *json, int iskey, WT_ITEM *item); +extern ssize_t __wt_json_strlen(const char *src, size_t srclen); +extern int __wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen); +extern int __wt_curlog_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst); +extern int __wt_curstat_init(WT_SESSION_IMPL *session, const char *uri, 
const char *cfg[], WT_CURSOR_STAT *cst); +extern int __wt_curstat_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_cursor_notsup(WT_CURSOR *cursor); +extern int __wt_cursor_noop(WT_CURSOR *cursor); +extern void __wt_cursor_set_notsup(WT_CURSOR *cursor); +extern int __wt_cursor_config_readonly(WT_CURSOR *cursor, const char *cfg[], int def); +extern int __wt_cursor_kv_not_set(WT_CURSOR *cursor, int key); +extern int __wt_cursor_get_key(WT_CURSOR *cursor, ...); +extern void __wt_cursor_set_key(WT_CURSOR *cursor, ...); +extern int __wt_cursor_get_raw_key(WT_CURSOR *cursor, WT_ITEM *key); +extern void __wt_cursor_set_raw_key(WT_CURSOR *cursor, WT_ITEM *key); +extern int __wt_cursor_get_raw_value(WT_CURSOR *cursor, WT_ITEM *value); +extern void __wt_cursor_set_raw_value(WT_CURSOR *cursor, WT_ITEM *value); +extern int __wt_cursor_get_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap); +extern void __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap); +extern int __wt_cursor_get_value(WT_CURSOR *cursor, ...); +extern int __wt_cursor_get_valuev(WT_CURSOR *cursor, va_list ap); +extern void __wt_cursor_set_value(WT_CURSOR *cursor, ...); +extern void __wt_cursor_set_valuev(WT_CURSOR *cursor, va_list ap); +extern int __wt_cursor_close(WT_CURSOR *cursor); +extern int __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor); +extern int __wt_cursor_init(WT_CURSOR *cursor, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_curtable_get_key(WT_CURSOR *cursor, ...); +extern int __wt_curtable_get_value(WT_CURSOR *cursor, ...); +extern void __wt_curtable_set_key(WT_CURSOR *cursor, ...); +extern void __wt_curtable_set_value(WT_CURSOR *cursor, ...); +extern int __wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop); +extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); +extern int 
__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); +extern void __wt_log_written_reset(WT_SESSION_IMPL *session); +extern int __wt_log_get_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp); +extern int __wt_log_get_active_files( WT_SESSION_IMPL *session, char ***filesp, u_int *countp); +extern void __wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count); +extern int __wt_log_filename(WT_SESSION_IMPL *session, uint32_t id, WT_ITEM *buf); +extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id); +extern int __wt_log_remove(WT_SESSION_IMPL *session, uint32_t lognum); +extern int __wt_log_open(WT_SESSION_IMPL *session); +extern int __wt_log_close(WT_SESSION_IMPL *session); +extern int __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create); +extern int __wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags); +extern int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie); +extern int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags); +extern int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap); +extern int __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp); +extern void __wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp); +extern int __wt_logrec_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *rectypep); +extern int __wt_logop_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *optypep, uint32_t *opsizep); +extern int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno, WT_ITEM *value); +extern int __wt_logop_col_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep); 
+extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_col_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno); +extern int __wt_logop_col_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop); +extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_col_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t start, uint64_t stop); +extern int __wt_logop_col_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *startp, uint64_t *stopp); +extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_row_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key, WT_ITEM *value); +extern int __wt_logop_row_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep); +extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_row_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key); +extern int __wt_logop_row_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp); +extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode); +extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM 
*stopp, uint32_t *modep); +extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_log_slot_init(WT_SESSION_IMPL *session); +extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session); +extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslotp); +extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); +extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); +extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); +extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size); +extern int __wt_log_slot_free(WT_LOGSLOT *slot); +extern int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize); +extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks); +extern int __wt_clsm_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg); +extern int __wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg); +extern int __wt_lsm_manager_start(WT_SESSION_IMPL *session); +extern void __wt_lsm_manager_free_work_unit( WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT *entry); +extern int __wt_lsm_manager_destroy(WT_SESSION_IMPL *session); +extern int __wt_lsm_manager_clear_tree( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_manager_pop_entry( WT_SESSION_IMPL *session, uint32_t type, WT_LSM_WORK_UNIT **entryp); +extern int __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, uint32_t type, uint32_t flags, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_merge_update_tree(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int start_chunk, u_int nchunks, 
WT_LSM_CHUNK *chunk); +extern int __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id); +extern int __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_curstat_lsm_init( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cst); +extern int __wt_lsm_tree_close_all(WT_SESSION_IMPL *session); +extern int __wt_lsm_tree_bloom_name(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp); +extern int __wt_lsm_tree_chunk_name(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp); +extern int __wt_lsm_tree_set_chunk_size( WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk); +extern int __wt_lsm_tree_setup_chunk( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk); +extern int __wt_lsm_tree_create(WT_SESSION_IMPL *session, const char *uri, int exclusive, const char *config); +extern int __wt_lsm_tree_get(WT_SESSION_IMPL *session, const char *uri, int exclusive, WT_LSM_TREE **treep); +extern void __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern void __wt_lsm_tree_throttle( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only); +extern int __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_tree_drop( WT_SESSION_IMPL *session, const char *name, const char *cfg[]); +extern int __wt_lsm_tree_rename(WT_SESSION_IMPL *session, const char *olduri, const char *newuri, const char *cfg[]); +extern int __wt_lsm_tree_truncate( WT_SESSION_IMPL *session, const char *name, const char *cfg[]); +extern int __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_tree_writeunlock(WT_SESSION_IMPL 
*session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip); +extern int __wt_lsm_tree_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, int *), const char *cfg[], uint32_t open_flags); +extern int __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int force, WT_LSM_CHUNK **chunkp); +extern int __wt_lsm_work_switch( WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, int *ran); +extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk); +extern int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args); +extern int __wt_meta_btree_apply(WT_SESSION_IMPL *session, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); +extern int __wt_meta_checkpoint(WT_SESSION_IMPL *session, const char *fname, const char *checkpoint, WT_CKPT *ckpt); +extern int __wt_meta_checkpoint_last_name( WT_SESSION_IMPL *session, const char *fname, const char **namep); +extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname); +extern int __wt_meta_ckptlist_get( WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep); +extern int __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn); +extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase); +extern void __wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt); +extern int __wt_ext_metadata_insert(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char *value); +extern int __wt_ext_metadata_remove( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key); +extern int 
__wt_ext_metadata_search(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char **valuep); +extern int __wt_ext_metadata_update(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char *value); +extern int __wt_metadata_get_ckptlist( WT_SESSION *session, const char *name, WT_CKPT **ckptbasep); +extern void __wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase); +extern int __wt_metadata_open(WT_SESSION_IMPL *session); +extern int __wt_metadata_cursor( WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp); +extern int __wt_metadata_insert( WT_SESSION_IMPL *session, const char *key, const char *value); +extern int __wt_metadata_update( WT_SESSION_IMPL *session, const char *key, const char *value); +extern int __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key); +extern int __wt_metadata_search( WT_SESSION_IMPL *session, const char *key, const char **valuep); +extern void __wt_meta_track_discard(WT_SESSION_IMPL *session); +extern int __wt_meta_track_on(WT_SESSION_IMPL *session); +extern int __wt_meta_track_off(WT_SESSION_IMPL *session, int unroll); +extern int __wt_meta_track_sub_on(WT_SESSION_IMPL *session); +extern int __wt_meta_track_sub_off(WT_SESSION_IMPL *session); +extern int __wt_meta_track_checkpoint(WT_SESSION_IMPL *session); +extern int __wt_meta_track_insert(WT_SESSION_IMPL *session, const char *key); +extern int __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key); +extern int __wt_meta_track_fileop( WT_SESSION_IMPL *session, const char *olduri, const char *newuri); +extern int __wt_meta_track_handle_lock(WT_SESSION_IMPL *session, int created); +extern int __wt_turtle_init(WT_SESSION_IMPL *session); +extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, const char **valuep); +extern int __wt_turtle_update( WT_SESSION_IMPL *session, const char *key, const char *value); +extern void __wt_abort(WT_SESSION_IMPL *session) 
WT_GCC_ATTRIBUTE((noreturn)); +extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp); +extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); +extern int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); +extern int __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp); +extern int __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp); +extern void __wt_free_int(WT_SESSION_IMPL *session, const void *p_arg); +extern int __wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix, uint32_t flags, char ***dirlist, u_int *countp); +extern int __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp); +extern int __wt_dlsym(WT_SESSION_IMPL *session, WT_DLH *dlh, const char *name, int fail, void *sym_ret); +extern int __wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh); +extern int __wt_errno(void); +extern int __wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp); +extern void __wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh); +extern int __wt_fallocate( WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len); +extern int __wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep); +extern int __wt_filesize_name( WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep); +extern int __wt_bytelock(WT_FH *fhp, wt_off_t byte, int lock); +extern int __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh); +extern int __wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh); +extern int __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len); +extern int __wt_getline(WT_SESSION_IMPL *session, WT_ITEM *buf, FILE *fp); +extern int __wt_getopt( const char *progname, int nargc, char *const *nargv, const char *ostr); +extern int __wt_mmap(WT_SESSION_IMPL *session, WT_FH *fh, void *mapp, size_t *lenp, 
void **mappingcookie); +extern int __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size); +extern int __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size); +extern int __wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len, void **mappingcookie); +extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, int is_signalled, WT_CONDVAR **condp); +extern int __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs); +extern int __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond); +extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp); +extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name); +extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock); +extern int __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock); +extern int __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock); +extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock); +extern int __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock); +extern int __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock); +extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp); +extern int __wt_once(void (*init_routine)(void)); +extern int __wt_open(WT_SESSION_IMPL *session, const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp); +extern int __wt_close(WT_SESSION_IMPL *session, WT_FH *fh); +extern int __wt_absolute_path(const char *path); +extern const char *__wt_path_separator(void); +extern int __wt_has_priv(void); +extern int __wt_remove(WT_SESSION_IMPL *session, const char *name); +extern int __wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to); +extern int __wt_read( WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf); +extern int __wt_write(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, const void *buf); +extern 
void __wt_sleep(long seconds, long micro_seconds); +extern uint64_t __wt_strtouq(const char *nptr, char **endptr, int base); +extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, void *(*func)(void *), void *arg); +extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid); +extern void __wt_thread_id(char *buf, size_t buflen); +extern int __wt_seconds(WT_SESSION_IMPL *session, time_t *timep); +extern int __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp); +extern void __wt_yield(void); +extern int __wt_ext_struct_pack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *buffer, size_t size, const char *fmt, ...); +extern int __wt_ext_struct_size(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t *sizep, const char *fmt, ...); +extern int __wt_ext_struct_unpack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const void *buffer, size_t size, const char *fmt, ...); +extern int __wt_struct_check(WT_SESSION_IMPL *session, const char *fmt, size_t len, int *fixedp, uint32_t *fixed_lenp); +extern int __wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, ...); +extern int __wt_struct_pack(WT_SESSION_IMPL *session, void *buffer, size_t size, const char *fmt, ...); +extern int __wt_struct_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, ...); +extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep); +extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf); +extern int __wt_schema_index_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf); +extern int __wt_schema_create( WT_SESSION_IMPL *session, const char *uri, const char *config); +extern int __wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]); +extern int 
__wt_schema_get_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, int ok_incomplete, WT_TABLE **tablep); +extern void __wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE *table); +extern void __wt_schema_destroy_colgroup(WT_SESSION_IMPL *session, WT_COLGROUP *colgroup); +extern void __wt_schema_destroy_index(WT_SESSION_IMPL *session, WT_INDEX *idx); +extern void __wt_schema_destroy_table(WT_SESSION_IMPL *session, WT_TABLE *table); +extern void __wt_schema_remove_table( WT_SESSION_IMPL *session, WT_TABLE *table); +extern void __wt_schema_close_tables(WT_SESSION_IMPL *session); +extern int __wt_schema_colgroup_name(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, size_t len, WT_ITEM *buf); +extern int __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table); +extern int __wt_schema_open_index(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp); +extern int __wt_schema_open_indices(WT_SESSION_IMPL *session, WT_TABLE *table); +extern int __wt_schema_open_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, WT_TABLE **tablep); +extern int __wt_schema_get_colgroup(WT_SESSION_IMPL *session, const char *uri, WT_TABLE **tablep, WT_COLGROUP **colgroupp); +extern int __wt_schema_get_index(WT_SESSION_IMPL *session, const char *uri, WT_TABLE **tablep, WT_INDEX **indexp); +extern int __wt_schema_colcheck(WT_SESSION_IMPL *session, const char *key_format, const char *value_format, WT_CONFIG_ITEM *colconf, u_int *kcolsp, u_int *vcolsp); +extern int __wt_table_check(WT_SESSION_IMPL *session, WT_TABLE *table); +extern int __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, size_t len, int value_only, WT_ITEM *plan); +extern int __wt_struct_reformat(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, size_t len, const char *extra_cols, int value_only, WT_ITEM *format); +extern int __wt_struct_truncate(WT_SESSION_IMPL *session, const char 
*input_fmt, u_int ncols, WT_ITEM *format); +extern int __wt_schema_project_in(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, va_list ap); +extern int __wt_schema_project_out(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, va_list ap); +extern int __wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, int key_only, const char *vformat, WT_ITEM *value); +extern int __wt_schema_project_merge(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value); +extern int __wt_schema_rename(WT_SESSION_IMPL *session, const char *uri, const char *newuri, const char *cfg[]); +extern int __wt_curstat_colgroup_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst); +extern int __wt_curstat_index_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst); +extern int __wt_curstat_table_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst); +extern int __wt_schema_truncate( WT_SESSION_IMPL *session, const char *uri, const char *cfg[]); +extern int __wt_range_truncate(WT_CURSOR *start, WT_CURSOR *stop); +extern int __wt_schema_range_truncate( WT_SESSION_IMPL *session, WT_CURSOR *start, WT_CURSOR *stop); +extern WT_DATA_SOURCE *__wt_schema_get_source(WT_SESSION_IMPL *session, const char *name); +extern int __wt_str_name_check(WT_SESSION_IMPL *session, const char *str); +extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len); +extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, int *), const char *cfg[], uint32_t open_flags); +extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session); +extern int __wt_session_copy_values(WT_SESSION_IMPL *session); +extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR 
*owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_session_create_strip(WT_SESSION *wt_session, const char *v1, const char *v2, const char **value_ret); +extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp); +extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, WT_SESSION_IMPL **sessionp); +extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip); +extern int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config); +extern void __wt_session_dhandle_incr_use(WT_SESSION_IMPL *session); +extern int __wt_session_lock_btree(WT_SESSION_IMPL *session, uint32_t flags); +extern int __wt_session_release_btree(WT_SESSION_IMPL *session); +extern int __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], uint32_t flags); +extern void __wt_session_close_cache(WT_SESSION_IMPL *session); +extern int __wt_session_get_btree(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags); +extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint); +extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]); +extern uint32_t __wt_cksum(const void *chunk, size_t len); +extern void __wt_cksum_init(void); +extern void __wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler); +extern int __wt_eventv(WT_SESSION_IMPL *session, int msg_event, int error, const char *file_name, int line_number, const char *fmt, va_list ap); +extern void __wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4))); +extern void __wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...) 
WT_GCC_ATTRIBUTE((format (printf, 2, 3))); +extern int __wt_ext_err_printf( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4))); +extern int __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 2, 3))); +extern int __wt_ext_msg_printf( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4))); +extern int __wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v); +extern void __wt_assert(WT_SESSION_IMPL *session, int error, const char *file_name, int line_number, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 5, 6))); +extern int __wt_panic(WT_SESSION_IMPL *session); +extern int __wt_illegal_value(WT_SESSION_IMPL *session, const char *name); +extern int __wt_object_unsupported(WT_SESSION_IMPL *session, const char *uri); +extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri); +extern int __wt_filename(WT_SESSION_IMPL *session, const char *name, char **path); +extern int __wt_nfilename( WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path); +extern int __wt_library_init(void); +extern int __wt_breakpoint(void); +extern void __wt_attach(WT_SESSION_IMPL *session); +extern uint64_t __wt_hash_city64(const void *s, size_t len); +extern uint64_t __wt_hash_fnv64(const void *string, size_t len); +extern int +__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ); +extern int __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page); +extern void __wt_hazard_close(WT_SESSION_IMPL *session); +extern int __wt_raw_to_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to); +extern int __wt_raw_to_esc_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to); +extern int __wt_hex2byte(const u_char *from, u_char *to); +extern int 
__wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to); +extern int __wt_nhex_to_raw( WT_SESSION_IMPL *session, const char *from, size_t size, WT_ITEM *to); +extern int __wt_esc_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to); +extern int __wt_huffman_open(WT_SESSION_IMPL *session, void *symbol_frequency_array, u_int symcnt, u_int numbytes, void *retp); +extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg); +extern int __wt_print_huffman_code(void *huffman_arg, uint16_t symbol); +extern int __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf); +extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf); +extern int __wt_spin_lock_register_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t); +extern void __wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t); +extern int __wt_spin_lock_register_caller(WT_SESSION_IMPL *session, const char *name, const char *file, int line, int *idp); +extern int __wt_statlog_dump_spinlock(WT_CONNECTION_IMPL *conn, const char *tag); +extern uint32_t __wt_nlpo2_round(uint32_t v); +extern uint32_t __wt_nlpo2(uint32_t v); +extern uint32_t __wt_log2_int(uint32_t n); +extern int __wt_ispo2(uint32_t v); +extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); +extern void __wt_random_init(uint32_t *rnd); +extern uint32_t __wt_random(uint32_t *rnd); +extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size); +extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4))); +extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) 
WT_GCC_ATTRIBUTE((format (printf, 3, 4))); +extern int +__wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ); +extern void __wt_scr_discard(WT_SESSION_IMPL *session); +extern void *__wt_ext_scr_alloc( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t size); +extern void __wt_ext_scr_free(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *p); +extern void __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats); +extern void __wt_stat_refresh_dsrc_stats(void *stats_arg); +extern void __wt_stat_aggregate_dsrc_stats(const void *child, const void *parent); +extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats); +extern void __wt_stat_refresh_connection_stats(void *stats_arg); +extern int __wt_txnid_cmp(const void *v1, const void *v2); +extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session); +extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session); +extern void __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot); +extern int __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]); +extern void __wt_txn_release(WT_SESSION_IMPL *session); +extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_txn_init(WT_SESSION_IMPL *session); +extern void __wt_txn_stats_update(WT_SESSION_IMPL *session); +extern void __wt_txn_destroy(WT_SESSION_IMPL *session); +extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]); +extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session); +extern int __wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len); +extern int __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]); +extern int 
__wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_checkpoint_close(WT_SESSION_IMPL *session, int force); +extern uint64_t __wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session); +extern int __wt_ext_transaction_isolation_level( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session); +extern int __wt_ext_transaction_notify( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_TXN_NOTIFY *notify); +extern uint64_t __wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api); +extern int __wt_ext_transaction_visible( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint64_t transaction_id); +extern void __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op); +extern int __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); +extern int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_txn_checkpoint_logread( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_LSN *ckpt_lsn); +extern int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, int full, uint32_t flags, WT_LSN *lsnp); +extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop); +extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session); +extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out); +extern int __wt_txn_recover(WT_CONNECTION_IMPL *conn); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h new file mode 100644 index 00000000000..3aac7193407 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -0,0 +1,88 @@ +/* + * DO NOT EDIT: automatically built by dist/flags.py. 
+ * flags section: BEGIN + */ +#define WT_CONN_CACHE_POOL 0x00001000 +#define WT_CONN_CKPT_SYNC 0x00000800 +#define WT_CONN_EVICTION_RUN 0x00000400 +#define WT_CONN_LEAK_MEMORY 0x00000200 +#define WT_CONN_LSM_MERGE 0x00000100 +#define WT_CONN_PANIC 0x00000080 +#define WT_CONN_SERVER_ASYNC 0x00000040 +#define WT_CONN_SERVER_CHECKPOINT 0x00000020 +#define WT_CONN_SERVER_LSM 0x00000010 +#define WT_CONN_SERVER_RUN 0x00000008 +#define WT_CONN_SERVER_STATISTICS 0x00000004 +#define WT_CONN_SERVER_SWEEP 0x00000002 +#define WT_CONN_WAS_BACKUP 0x00000001 +#define WT_EVICTING 0x00000004 +#define WT_FILE_TYPE_CHECKPOINT 0x00000004 +#define WT_FILE_TYPE_DATA 0x00000002 +#define WT_FILE_TYPE_LOG 0x00000001 +#define WT_LOGSCAN_FIRST 0x00000008 +#define WT_LOGSCAN_FROM_CKP 0x00000004 +#define WT_LOGSCAN_ONE 0x00000002 +#define WT_LOGSCAN_RECOVER 0x00000001 +#define WT_LOG_DSYNC 0x00000004 +#define WT_LOG_FLUSH 0x00000002 +#define WT_LOG_FSYNC 0x00000001 +#define WT_READ_CACHE 0x00000200 +#define WT_READ_COMPACT 0x00000100 +#define WT_READ_NO_EVICT 0x00000080 +#define WT_READ_NO_GEN 0x00000040 +#define WT_READ_NO_WAIT 0x00000020 +#define WT_READ_PREV 0x00000010 +#define WT_READ_SKIP_INTL 0x00000008 +#define WT_READ_SKIP_LEAF 0x00000004 +#define WT_READ_TRUNCATE 0x00000002 +#define WT_READ_WONT_NEED 0x00000001 +#define WT_SESSION_CAN_WAIT 0x00000800 +#define WT_SESSION_DISCARD_FORCE 0x00000400 +#define WT_SESSION_INTERNAL 0x00000200 +#define WT_SESSION_LOGGING_INMEM 0x00000100 +#define WT_SESSION_NO_CACHE 0x00000080 +#define WT_SESSION_NO_CACHE_CHECK 0x00000040 +#define WT_SESSION_NO_DATA_HANDLES 0x00000020 +#define WT_SESSION_NO_LOGGING 0x00000010 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00000008 +#define WT_SESSION_SALVAGE_CORRUPT_OK 0x00000004 +#define WT_SESSION_SCHEMA_LOCKED 0x00000002 +#define WT_SESSION_SERVER_ASYNC 0x00000001 +#define WT_SKIP_UPDATE_ERR 0x00000002 +#define WT_SKIP_UPDATE_RESTORE 0x00000001 +#define WT_SYNC_CHECKPOINT 0x00000010 +#define WT_SYNC_CLOSE 0x00000008 
+#define WT_SYNC_DISCARD 0x00000004 +#define WT_SYNC_DISCARD_FORCE 0x00000002 +#define WT_SYNC_WRITE_LEAVES 0x00000001 +#define WT_TXN_LOG_CKPT_FAIL 0x00000008 +#define WT_TXN_LOG_CKPT_PREPARE 0x00000004 +#define WT_TXN_LOG_CKPT_START 0x00000002 +#define WT_TXN_LOG_CKPT_STOP 0x00000001 +#define WT_VERB_API 0x00400000 +#define WT_VERB_BLOCK 0x00200000 +#define WT_VERB_CHECKPOINT 0x00100000 +#define WT_VERB_COMPACT 0x00080000 +#define WT_VERB_EVICT 0x00040000 +#define WT_VERB_EVICTSERVER 0x00020000 +#define WT_VERB_FILEOPS 0x00010000 +#define WT_VERB_LOG 0x00008000 +#define WT_VERB_LSM 0x00004000 +#define WT_VERB_METADATA 0x00002000 +#define WT_VERB_MUTEX 0x00001000 +#define WT_VERB_OVERFLOW 0x00000800 +#define WT_VERB_READ 0x00000400 +#define WT_VERB_RECONCILE 0x00000200 +#define WT_VERB_RECOVERY 0x00000100 +#define WT_VERB_SALVAGE 0x00000080 +#define WT_VERB_SHARED_CACHE 0x00000040 +#define WT_VERB_SPLIT 0x00000020 +#define WT_VERB_TEMPORARY 0x00000010 +#define WT_VERB_TRANSACTION 0x00000008 +#define WT_VERB_VERIFY 0x00000004 +#define WT_VERB_VERSION 0x00000002 +#define WT_VERB_WRITE 0x00000001 +/* + * flags section: END + * DO NOT EDIT: automatically built by dist/flags.py. + */ diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h new file mode 100644 index 00000000000..50e237a1fed --- /dev/null +++ b/src/third_party/wiredtiger/src/include/gcc.h @@ -0,0 +1,152 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* Add GCC-specific attributes to types and function declarations. */ +#define WT_GCC_ATTRIBUTE(x) __attribute__(x) + +/* + * Attribute are only permitted on function declarations, not definitions. + * This macro is a marker for function definitions that is rewritten by + * dist/s_prototypes to create extern.h. 
+ */ +#define WT_GCC_FUNC_ATTRIBUTE(x) + +/* + * Atomic writes: + * + * WiredTiger requires pointers (void *) and some variables to be read/written + * atomically, that is, in a single cycle. This is not write ordering -- to be + * clear, the requirement is that no partial value can ever be read or written. + * For example, if 8-bits of a 32-bit quantity were written, then the rest of + * the 32-bits were written, and another thread of control was able to read the + * memory location after the first 8-bits were written and before the subsequent + * 24-bits were written, WiredTiger would break. Or, if two threads of control + * attempt to write the same location simultaneously, the result must be one or + * the other of the two values, not some combination of both. + * + * To reduce memory requirements, we use a 32-bit type on 64-bit machines, which + * is OK if the compiler doesn't accumulate two adjacent 32-bit variables into a + * single 64-bit write, that is, there needs to be a single load/store of the 32 + * bits, not a load/store of 64 bits, where the 64 bits is comprised of two + * adjacent 32-bit locations. The problem is when two threads are cooperating + * (thread X finds 32-bits set to 0, writes in a new value, flushes memory; + * thread Y reads 32-bits that are non-zero, does some operation, resets the + * memory location to 0 and flushes). If thread X were to read the 32 bits + * adjacent to a different 32 bits, and write them both, the two threads could + * race. If that can happen, you must increase the size of the memory type to + * a type guaranteed to be written atomically in a single cycle, without writing + * an adjacent memory location. + * + * WiredTiger additionally requires atomic writes for 64-bit memory locations, + * and so cannot run on machines with a 32-bit memory bus. 
+ * + * We don't depend on writes across cache lines being atomic, and to make sure + * that never happens, we check address alignment: we know of no architectures + * with cache lines other than a multiple of 4 bytes in size, so aligned 4-byte + * accesses will always be in a single cache line. + * + * Atomic writes are often associated with memory barriers, implemented by the + * WT_READ_BARRIER and WT_WRITE_BARRIER macros. WiredTiger's requirement as + * described by the Solaris membar_enter description: + * + * No stores from after the memory barrier will reach visibility and + * no loads from after the barrier will be resolved before the lock + * acquisition reaches global visibility + * + * In other words, the WT_WRITE_BARRIER macro must ensure that memory stores by + * the processor, made before the WT_WRITE_BARRIER call, be visible to all + * processors in the system before any memory stores by the processor, made + * after the WT_WRITE_BARRIER call, are visible to any processor. The + * WT_READ_BARRIER macro ensures that all loads before the barrier are complete + * before any loads after the barrier. The compiler cannot reorder or cache + * values across a barrier. + * + * Lock and unlock operations imply both read and write barriers. In other + * words, barriers are not required for values protected by locking. + * + * Data locations may also be marked volatile, forcing the compiler to re-load + * the data on each access. This is a weaker semantic than barriers provide, + * only ensuring that the compiler will not cache values. It makes no ordering + * guarantees and may have no effect on systems with weaker cache guarantees. + * + * In summary, locking > barriers > volatile. + * + * To avoid locking shared data structures such as statistics and to permit + * atomic state changes, we rely on the WT_ATOMIC_ADD and WT_ATOMIC_CAS + * (compare and swap) operations. 
+ */ +#define __WT_ATOMIC_ADD(v, val, n) \ + (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_add_and_fetch(&(v), val)) +#define __WT_ATOMIC_FETCH_ADD(v, val, n) \ + (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_fetch_and_add(&(v), val)) +#define __WT_ATOMIC_CAS(v, old, new, n) \ + (WT_STATIC_ASSERT(sizeof(v) == (n)), \ + __sync_bool_compare_and_swap(&(v), old, new)) +#define __WT_ATOMIC_CAS_VAL(v, old, new, n) \ + (WT_STATIC_ASSERT(sizeof(v) == (n)), \ + __sync_val_compare_and_swap(&(v), old, new)) +#define __WT_ATOMIC_STORE(v, val, n) \ + (WT_STATIC_ASSERT(sizeof(v) == (n)), \ + __sync_lock_test_and_set(&(v), val)) +#define __WT_ATOMIC_SUB(v, val, n) \ + (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_sub_and_fetch(&(v), val)) + +#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1) +#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 1) +#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1) +#define WT_ATOMIC_CAS_VAL1(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 1) +#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1) +#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1) + +#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2) +#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 2) +#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new, 2) +#define WT_ATOMIC_CAS_VAL2(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 2) +#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2) +#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2) + +#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4) +#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 4) +#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4) +#define WT_ATOMIC_CAS_VAL4(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 4) +#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4) +#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4) + +#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8) +#define 
WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 8) +#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new, 8) +#define WT_ATOMIC_CAS_VAL8(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 8) +#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val, 8) +#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8) + +/* Compile read-write barrier */ +#define WT_BARRIER() __asm__ volatile("" ::: "memory") + +/* Pause instruction to prevent excess processor bus usage */ +#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory") + +#if defined(x86_64) || defined(__x86_64__) +#define WT_FULL_BARRIER() do { \ + __asm__ volatile ("mfence" ::: "memory"); \ +} while (0) +#define WT_READ_BARRIER() do { \ + __asm__ volatile ("lfence" ::: "memory"); \ +} while (0) +#define WT_WRITE_BARRIER() do { \ + __asm__ volatile ("sfence" ::: "memory"); \ +} while (0) + +#elif defined(i386) || defined(__i386__) +#define WT_FULL_BARRIER() do { \ + __asm__ volatile ("lock; addl $0, 0(%%esp)" ::: "memory"); \ +} while (0) +#define WT_READ_BARRIER() WT_FULL_BARRIER() +#define WT_WRITE_BARRIER() WT_FULL_BARRIER() + +#else +#error "No write barrier implementation for this hardware" +#endif diff --git a/src/third_party/wiredtiger/src/include/hardware.h b/src/third_party/wiredtiger/src/include/hardware.h new file mode 100644 index 00000000000..720f512cf2d --- /dev/null +++ b/src/third_party/wiredtiger/src/include/hardware.h @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * Publish a value to a shared location. All previous stores must complete + * before the value is made public. + */ +#define WT_PUBLISH(v, val) do { \ + WT_WRITE_BARRIER(); \ + (v) = (val); \ +} while (0) + +/* + * Read a shared location and guarantee that subsequent reads do not see any + * earlier state. 
+ */ +#define WT_ORDERED_READ(v, val) do { \ + (v) = (val); \ + WT_READ_BARRIER(); \ +} while (0) + +/* + * Atomic versions of the flag set/clear macros. + */ +#define F_ISSET_ATOMIC(p, mask) ((p)->flags_atomic & (uint8_t)(mask)) + +#define F_SET_ATOMIC(p, mask) do { \ + uint8_t __orig; \ + do { \ + __orig = (p)->flags_atomic; \ + } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \ + __orig, __orig | (uint8_t)(mask))); \ +} while (0) + +#define F_CAS_ATOMIC(p, mask, ret) do { \ + uint8_t __orig; \ + ret = 0; \ + do { \ + __orig = (p)->flags_atomic; \ + if ((__orig & (uint8_t)(mask)) != 0) { \ + ret = EBUSY; \ + break; \ + } \ + } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \ + __orig, __orig | (uint8_t)(mask))); \ +} while (0) + +#define F_CLR_ATOMIC(p, mask) do { \ + uint8_t __orig; \ + do { \ + __orig = (p)->flags_atomic; \ + } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \ + __orig, __orig & ~(uint8_t)(mask))); \ +} while (0) + +#define WT_CACHE_LINE_ALIGNMENT 64 /* Cache line alignment */ diff --git a/src/third_party/wiredtiger/src/include/intpack.i b/src/third_party/wiredtiger/src/include/intpack.i new file mode 100644 index 00000000000..01559657acd --- /dev/null +++ b/src/third_party/wiredtiger/src/include/intpack.i @@ -0,0 +1,371 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * Variable-length integer encoding. + * We need up to 64 bits, signed and unsigned. Further, we want the packed + * representation to have the same lexicographic ordering as the integer + * values. This avoids the need for special-purpose comparison code. + * + * Try hard to keep small values small (up to ~2 bytes): that gives the biggest + * benefit for common cases storing small values. 
After that, just encode the + * length in the first byte: we could squeeze in a couple of extra bits, but + * the marginal benefit is small, and we want this code to be relatively + * easy to implement in client code or scripting APIs. + * + * First byte | Next | | + * byte | bytes| Min Value | Max Value + * ------------+------+------------------------+-------------------------------- + * [00 00xxxx] | free | N/A | N/A + * [00 01llll] | llll | -2^64 | -2^13 - 2^6 + * [00 1xxxxx] | 1 | -2^13 - 2^6 | -2^6 - 1 + * [01 xxxxxx] | 0 | -2^6 | -1 + * [10 xxxxxx] | 0 | 0 | 2^6 - 1 + * [11 0xxxxx] | 1 | 2^6 | 2^13 + 2^6 - 1 + * [11 10llll] | llll | 2^13 + 2^6 | 2^64 - 1 + * [11 11xxxx] | free | N/A | N/A + */ + +#define NEG_MULTI_MARKER (uint8_t)0x10 +#define NEG_2BYTE_MARKER (uint8_t)0x20 +#define NEG_1BYTE_MARKER (uint8_t)0x40 +#define POS_1BYTE_MARKER (uint8_t)0x80 +#define POS_2BYTE_MARKER (uint8_t)0xc0 +#define POS_MULTI_MARKER (uint8_t)0xe0 + +#define NEG_1BYTE_MIN ((-1) << 6) +#define NEG_2BYTE_MIN (((-1) << 13) + NEG_1BYTE_MIN) +#define POS_1BYTE_MAX ((1 << 6) - 1) +#define POS_2BYTE_MAX ((1 << 13) + POS_1BYTE_MAX) + +/* Extract bits <start> to <end> from a value (counting from LSB == 0). */ +#define GET_BITS(x, start, end) \ + (((uint64_t)(x) & ((1U << (start)) - 1U)) >> (end)) + +#define WT_SIZE_CHECK(l, maxl) \ + WT_RET_TEST((maxl) != 0 && (size_t)(l) > (maxl), ENOMEM) + +/* Count the leading zero bytes. */ +#if defined(__GNUC__) +#define WT_LEADING_ZEROS(x, i) \ + (i = (x == 0) ? 
(int)sizeof (x) : __builtin_clzll(x) >> 3)
#elif defined(_MSC_VER)
/* MSVC: find the highest set bit, convert to a leading-zero-byte count. */
#define WT_LEADING_ZEROS(x, i) do { \
	if (x == 0) i = (int)sizeof(x); \
	else { \
		unsigned long __index; \
		_BitScanReverse64(&__index, x); \
		__index = 63 ^ __index; \
		i = (int)(__index >> 3); } \
	} while (0)
#else
/* Portable fallback: probe one byte at a time from the most significant. */
#define WT_LEADING_ZEROS(x, i) do { \
	uint64_t __x = (x); \
	uint64_t __m = (uint64_t)0xff << 56; \
	for (i = 0; !(__x & __m) && i != 8; i++) \
		__m >>= 8; \
} while (0)
#endif

/*
 * __wt_vpack_posint --
 *	Packs a positive variable-length integer in the specified location.
 *	The caller has already stored the marker in *pp's first byte; we OR
 *	the byte count into its low four bits, then append the value bytes
 *	most-significant first (preserving lexicographic ordering).  Updates
 *	*pp past the packed bytes; returns 0 or ENOMEM if maxlen is too small.
 */
static inline int
__wt_vpack_posint(uint8_t **pp, size_t maxlen, uint64_t x)
{
	uint8_t *p;
	int len, lz, shift;

	WT_LEADING_ZEROS(x, lz);
	len = (int)sizeof (x) - lz;
	WT_SIZE_CHECK(len + 1, maxlen);
	p = *pp;

	/* There are four bits we can use in the first byte. */
	*p++ |= (len & 0xf);

	/* Emit the significant bytes, most significant first. */
	for (shift = (len - 1) << 3; len != 0; --len, shift -= 8)
		*p++ = (uint8_t)(x >> shift);

	*pp = p;
	return (0);
}

/*
 * __wt_vpack_negint --
 *	Packs a negative variable-length integer in the specified location.
 *	Mirror of __wt_vpack_posint for two's-complement negative values;
 *	updates *pp past the packed bytes; returns 0 or ENOMEM.
 */
static inline int
__wt_vpack_negint(uint8_t **pp, size_t maxlen, uint64_t x)
{
	uint8_t *p;
	int len, lz, shift;

	/* Leading 0xff bytes of x are the leading zero bytes of ~x. */
	WT_LEADING_ZEROS(~x, lz);
	len = (int)sizeof (x) - lz;
	WT_SIZE_CHECK(len + 1, maxlen);
	p = *pp;

	/*
	 * There are four size bits we can use in the first byte.
	 * For negative numbers, we store the number of leading 0xff bytes
	 * to maintain ordering (if this is not obvious, it may help to
	 * remember that -1 is the largest negative number).
	 */
	*p++ |= (lz & 0xf);

	for (shift = (len - 1) << 3; len != 0; shift -= 8, --len)
		*p++ = (uint8_t)(x >> shift);

	*pp = p;
	return (0);
}

/*
 * __wt_vunpack_posint --
 *	Reads a variable-length positive integer from the specified location.
 */
static inline int
__wt_vunpack_posint(const uint8_t **pp, size_t maxlen, uint64_t *retp)
{
	uint64_t x;
	const uint8_t *p;
	uint8_t len;

	/* There are four length bits in the first byte. */
	p = *pp;
	len = (*p++ & 0xf);
	WT_SIZE_CHECK(len + 1, maxlen);

	/* Accumulate the value bytes, most significant first. */
	for (x = 0; len != 0; --len)
		x = (x << 8) | *p++;

	*retp = x;
	*pp = p;
	return (0);
}

/*
 * __wt_vunpack_negint --
 *	Reads a variable-length negative integer from the specified location.
 *	The first byte stores the count of leading 0xff bytes (see
 *	__wt_vpack_negint), so the number of value bytes is sizeof(x) minus
 *	that count.
 */
static inline int
__wt_vunpack_negint(const uint8_t **pp, size_t maxlen, uint64_t *retp)
{
	uint64_t x;
	const uint8_t *p;
	uint8_t len;

	/* There are four length bits in the first byte. */
	p = *pp;
	len = (int)sizeof (x) - (*p++ & 0xf);
	WT_SIZE_CHECK(len + 1, maxlen);

	/* Start from all-ones: bytes not stored are implicitly 0xff. */
	for (x = UINT64_MAX; len != 0; --len)
		x = (x << 8) | *p++;

	*retp = x;
	*pp = p;
	return (0);
}

/*
 * __wt_vpack_uint --
 *	Variable-sized packing for unsigned integers: choose the 1-byte,
 *	2-byte or multi-byte encoding from the table at the top of this
 *	file, updating *pp past the packed bytes.
 */
static inline int
__wt_vpack_uint(uint8_t **pp, size_t maxlen, uint64_t x)
{
	uint8_t *p;

	WT_SIZE_CHECK(1, maxlen);
	p = *pp;
	if (x <= POS_1BYTE_MAX)
		*p++ = POS_1BYTE_MARKER | GET_BITS(x, 6, 0);
	else if (x <= POS_2BYTE_MAX) {
		WT_SIZE_CHECK(2, maxlen);
		/* Bias by the values the 1-byte form covers: no overlap. */
		x -= POS_1BYTE_MAX + 1;
		*p++ = POS_2BYTE_MARKER | GET_BITS(x, 13, 8);
		*p++ = GET_BITS(x, 8, 0);
	} else {
		/* Bias by the values the smaller forms cover. */
		x -= POS_2BYTE_MAX + 1;
		*p = POS_MULTI_MARKER;
		return (__wt_vpack_posint(pp, maxlen, x));
	}

	*pp = p;
	return (0);
}

/*
 * __wt_vpack_int --
 *	Variable-sized packing for signed integers: the signed analogue
 *	of __wt_vpack_uint, deferring non-negative values to the
 *	unsigned code.
 */
static inline int
__wt_vpack_int(uint8_t **pp, size_t maxlen, int64_t x)
{
	uint8_t *p;

	WT_SIZE_CHECK(1, maxlen);
	p = *pp;
	if (x < NEG_2BYTE_MIN) {
		*p = NEG_MULTI_MARKER;
		return (__wt_vpack_negint(pp, maxlen, (uint64_t)x));
	} else if (x < NEG_1BYTE_MIN) {
		WT_SIZE_CHECK(2, maxlen);
		x -= NEG_2BYTE_MIN;
		*p++ = NEG_2BYTE_MARKER | GET_BITS(x, 13, 8);
		*p++ = GET_BITS(x, 8, 0);
	} else if (x < 0) {
		x -= NEG_1BYTE_MIN;
		*p++
= NEG_1BYTE_MARKER | GET_BITS(x, 6, 0); + } else + /* For non-negative values, use the unsigned code above. */ + return (__wt_vpack_uint(pp, maxlen, (uint64_t)x)); + + *pp = p; + return (0); +} + +/* + * __wt_vunpack_uint -- + * Variable-sized unpacking for unsigned integers + */ +static inline int +__wt_vunpack_uint(const uint8_t **pp, size_t maxlen, uint64_t *xp) +{ + const uint8_t *p; + + WT_SIZE_CHECK(1, maxlen); + p = *pp; + switch (*p & 0xf0) { + case POS_1BYTE_MARKER: + case POS_1BYTE_MARKER | 0x10: + case POS_1BYTE_MARKER | 0x20: + case POS_1BYTE_MARKER | 0x30: + *xp = GET_BITS(*p, 6, 0); + p += 1; + break; + case POS_2BYTE_MARKER: + case POS_2BYTE_MARKER | 0x10: + WT_SIZE_CHECK(2, maxlen); + *xp = GET_BITS(*p++, 5, 0) << 8; + *xp |= *p++; + *xp += POS_1BYTE_MAX + 1; + break; + case POS_MULTI_MARKER: + WT_RET(__wt_vunpack_posint(pp, maxlen, xp)); + *xp += POS_2BYTE_MAX + 1; + return (0); + default: + return (EINVAL); + } + + *pp = p; + return (0); +} + +/* + * __wt_vunpack_int -- + * Variable-sized packing for signed integers + */ +static inline int +__wt_vunpack_int(const uint8_t **pp, size_t maxlen, int64_t *xp) +{ + const uint8_t *p; + + WT_SIZE_CHECK(1, maxlen); + p = *pp; + switch (*p & 0xf0) { + case NEG_MULTI_MARKER: + WT_RET(__wt_vunpack_negint(pp, maxlen, (uint64_t *)xp)); + return (0); + case NEG_2BYTE_MARKER: + case NEG_2BYTE_MARKER | 0x10: + WT_SIZE_CHECK(2, maxlen); + *xp = (int64_t)(GET_BITS(*p++, 5, 0) << 8); + *xp |= *p++; + *xp += NEG_2BYTE_MIN; + p += 2; + break; + case NEG_1BYTE_MARKER: + case NEG_1BYTE_MARKER | 0x10: + case NEG_1BYTE_MARKER | 0x20: + case NEG_1BYTE_MARKER | 0x30: + *xp = NEG_1BYTE_MIN + (int64_t)GET_BITS(*p, 6, 0); + p += 1; + break; + default: + /* Identical to the unsigned case. */ + return (__wt_vunpack_uint(pp, maxlen, (uint64_t *)xp)); + } + + *pp = p; + return (0); +} + +/* + * __wt_vsize_posint -- + * Return the packed size of a positive variable-length integer. 
+ */ +static inline size_t +__wt_vsize_posint(uint64_t x) +{ + int lz; + + WT_LEADING_ZEROS(x, lz); + return ((size_t)(WT_INTPACK64_MAXSIZE - lz)); +} + +/* + * __wt_vsize_negint -- + * Return the packed size of a negative variable-length integer. + */ +static inline size_t +__wt_vsize_negint(uint64_t x) +{ + int lz; + + WT_LEADING_ZEROS(~x, lz); + return (size_t)(WT_INTPACK64_MAXSIZE - lz); +} + +/* + * __wt_vsize_uint -- + * Return the packed size of an unsigned integer. + */ +static inline size_t +__wt_vsize_uint(uint64_t x) +{ + if (x <= POS_1BYTE_MAX) + return (1); + else if (x <= POS_2BYTE_MAX) { + return (2); + } else { + x -= POS_2BYTE_MAX + 1; + return (__wt_vsize_posint(x)); + } +} + +/* + * __wt_vsize_int -- + * Return the packed size of a signed integer. + */ +static inline size_t +__wt_vsize_int(int64_t x) +{ + if (x < NEG_2BYTE_MIN) { + return (__wt_vsize_negint((uint64_t)x)); + } else if (x < NEG_1BYTE_MIN) { + return (2); + } else if (x < 0) { + return (1); + } else + /* For non-negative values, use the unsigned code above. */ + return (__wt_vsize_uint((uint64_t)x)); +} diff --git a/src/third_party/wiredtiger/src/include/lint.h b/src/third_party/wiredtiger/src/include/lint.h new file mode 100644 index 00000000000..7c0a103a8ee --- /dev/null +++ b/src/third_party/wiredtiger/src/include/lint.h @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#define WT_GCC_ATTRIBUTE(x) +#define WT_GCC_FUNC_ATTRIBUTE(x) + +#define __WT_ATOMIC_ADD(v, val) \ + ((v) += (val)) +#define __WT_ATOMIC_FETCH_ADD(v, val) \ + ((v) += (val), (v)) +#define __WT_ATOMIC_CAS(v, old, new) \ + ((v) = ((v) == (old) ? (new) : (old)), (v) == (old)) +#define __WT_ATOMIC_CAS_VAL(v, old, new) \ + ((v) = ((v) == (old) ? 
(new) : (old)), (v) == (old)) +#define __WT_ATOMIC_STORE(v, val) \ + ((v) = (val)) +#define __WT_ATOMIC_SUB(v, val) \ + ((v) -= (val), (v)) + +#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val) +#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val) +#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new) +#define WT_ATOMIC_CAS_VAL1(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new) +#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val) +#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val) + +#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val) +#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val) +#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new) +#define WT_ATOMIC_CAS_VAL2(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new) +#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val) +#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val) + +#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val) +#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val) +#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new) +#define WT_ATOMIC_CAS_VAL4(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new) +#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val) +#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val) + +#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val) +#define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val) +#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new) +#define WT_ATOMIC_CAS_VAL8(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new) +#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val) +#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val) + +static inline void WT_BARRIER(void) { return; } +static inline void WT_FULL_BARRIER(void) { return; } +static inline void WT_PAUSE(void) { return; } +static inline void WT_READ_BARRIER(void) { return; } +static inline void WT_WRITE_BARRIER(void) { return; } diff --git a/src/third_party/wiredtiger/src/include/log.h 
b/src/third_party/wiredtiger/src/include/log.h new file mode 100644 index 00000000000..15054e34906 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/log.h @@ -0,0 +1,177 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#define WT_LOG_FILENAME "WiredTigerLog" /* Log file name */ + +/* Logging subsystem declarations. */ +#define LOG_ALIGN 128 +#define WT_LOG_SLOT_BUF_INIT_SIZE 64 * 1024 + +#define INIT_LSN(l) do { \ + (l)->file = 1; \ + (l)->offset = 0; \ +} while (0) + +#define IS_INIT_LSN(l) ((l)->file == 1 && (l)->offset == 0) + +/* + * Both of the macros below need to change if the content of __wt_lsn + * ever changes. The value is the following: + * txnid, record type, operation type, file id, operation key, operation value + */ +#define LOGC_KEY_FORMAT WT_UNCHECKED_STRING(IqI) +#define LOGC_VALUE_FORMAT WT_UNCHECKED_STRING(qIIIuu) + +#define LOG_SKIP_HEADER(data) \ + ((const uint8_t *)(data) + offsetof(WT_LOG_RECORD, record)) +#define LOG_REC_SIZE(size) \ + ((size) - offsetof(WT_LOG_RECORD, record)) + +#define MAX_LSN(l) do { \ + (l)->file = UINT32_MAX; \ + (l)->offset = INT64_MAX; \ +} while (0) + +/* + * Compare 2 LSNs, return -1 if lsn0 < lsn1, 0 if lsn0 == lsn1 + * and 1 if lsn0 > lsn1. + */ +#define LOG_CMP(lsn1, lsn2) \ + ((lsn1)->file != (lsn2)->file ? \ + ((lsn1)->file < (lsn2)->file ? -1 : 1) : \ + ((lsn1)->offset != (lsn2)->offset ? \ + ((lsn1)->offset < (lsn2)->offset ? -1 : 1) : 0)) + +/* + * Possible values for the consolidation array slot states: + * < WT_LOG_SLOT_DONE - threads are actively writing to the log. + * WT_LOG_SLOT_DONE - all activity on this slot is complete. + * WT_LOG_SLOT_FREE - slot is available for allocation. + * WT_LOG_SLOT_PENDING - slot is transitioning from ready to active. + * WT_LOG_SLOT_READY - slot is ready for threads to join. + * > WT_LOG_SLOT_READY - threads are actively consolidating on this slot. 
+ */ +#define WT_LOG_SLOT_DONE 0 +#define WT_LOG_SLOT_FREE 1 +#define WT_LOG_SLOT_PENDING 2 +#define WT_LOG_SLOT_READY 3 +typedef struct { + int64_t slot_state; /* Slot state */ + uint64_t slot_group_size; /* Group size */ + int32_t slot_error; /* Error value */ +#define SLOT_INVALID_INDEX 0xffffffff + uint32_t slot_index; /* Active slot index */ + wt_off_t slot_start_offset; /* Starting file offset */ + WT_LSN slot_release_lsn; /* Slot release LSN */ + WT_LSN slot_start_lsn; /* Slot starting LSN */ + WT_LSN slot_end_lsn; /* Slot ending LSN */ + WT_FH *slot_fh; /* File handle for this group */ + WT_ITEM slot_buf; /* Buffer for grouped writes */ + int32_t slot_churn; /* Active slots are scarce. */ + +#define SLOT_BUF_GROW 0x01 /* Grow buffer on release */ +#define SLOT_BUFFERED 0x02 /* Buffer writes */ +#define SLOT_CLOSEFH 0x04 /* Close old fh on release */ +#define SLOT_SYNC 0x08 /* Needs sync on release */ + uint32_t flags; /* Flags */ +} WT_LOGSLOT WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT))); + +typedef struct { + WT_LOGSLOT *slot; + wt_off_t offset; +} WT_MYSLOT; + + /* Offset of first record */ +#define LOG_FIRST_RECORD log->allocsize + +typedef struct { + uint32_t allocsize; /* Allocation alignment size */ + wt_off_t log_written; /* Amount of log written this period */ + /* + * Log file information + */ + uint32_t fileid; /* Current log file number */ + WT_FH *log_fh; /* Logging file handle */ + WT_FH *log_close_fh; /* Logging file handle to close */ + + /* + * System LSNs + */ + WT_LSN alloc_lsn; /* Next LSN for allocation */ + WT_LSN ckpt_lsn; /* Last checkpoint LSN */ + WT_LSN first_lsn; /* First LSN */ + WT_LSN sync_lsn; /* LSN of the last sync */ + WT_LSN trunc_lsn; /* End LSN for recovery truncation */ + WT_LSN write_lsn; /* Last LSN written to log file */ + + /* + * Synchronization resources + */ + WT_SPINLOCK log_lock; /* Locked: Logging fields */ + WT_SPINLOCK log_slot_lock; /* Locked: Consolidation array */ + WT_SPINLOCK log_sync_lock; /* 
Locked: Single-thread fsync */ + + WT_RWLOCK *log_archive_lock; /* Archive and log cursors */ + + /* Notify any waiting threads when sync_lsn is updated. */ + WT_CONDVAR *log_sync_cond; + + /* + * Consolidation array information + * SLOT_ACTIVE must be less than SLOT_POOL. + * Our testing shows that the more consolidation we generate the + * better the performance we see which equates to an active slot + * slot count of one. + */ +#define SLOT_ACTIVE 1 +#define SLOT_POOL 16 + uint32_t pool_index; /* Global pool index */ + WT_LOGSLOT *slot_array[SLOT_ACTIVE]; /* Active slots */ + WT_LOGSLOT slot_pool[SLOT_POOL]; /* Pool of all slots */ + +#define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */ + uint32_t flags; +} WT_LOG; + +typedef struct { + uint32_t len; /* 00-03: Record length including hdr */ + uint32_t checksum; /* 04-07: Checksum of the record */ + uint8_t unused[8]; /* 08-15: Padding */ + uint8_t record[0]; /* Beginning of actual data */ +} WT_LOG_RECORD; + +/* + * WT_LOG_DESC -- + * The log file's description. + */ +struct __wt_log_desc { +#define WT_LOG_MAGIC 0x101064 + uint32_t log_magic; /* 00-03: Magic number */ +#define WT_LOG_MAJOR_VERSION 1 + uint16_t majorv; /* 04-05: Major version */ +#define WT_LOG_MINOR_VERSION 0 + uint16_t minorv; /* 06-07: Minor version */ + uint64_t log_size; /* 08-15: Log file size */ +}; + +/* + * WT_LOG_REC_DESC -- + * A descriptor for a log record type. + */ +struct __wt_log_rec_desc { + const char *fmt; + int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end); +}; + +/* + * WT_LOG_OP_DESC -- + * A descriptor for a log operation type. 
+ */ +struct __wt_log_op_desc { + const char *fmt; + int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end); +}; diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h new file mode 100644 index 00000000000..99532b97850 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/lsm.h @@ -0,0 +1,232 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * WT_LSM_WORKER_COOKIE -- + * State for an LSM worker thread. + */ +struct __wt_lsm_worker_cookie { + WT_LSM_CHUNK **chunk_array; + size_t chunk_alloc; + u_int nchunks; +}; + +/* + * WT_LSM_WORKER_ARGS -- + * State for an LSM worker thread. + */ +struct __wt_lsm_worker_args { + WT_SESSION_IMPL *session; /* Session */ + WT_CONDVAR *work_cond; /* Owned by the manager */ + wt_thread_t tid; /* Thread id */ + u_int id; /* My manager slot id */ + uint32_t type; /* Types of operations handled */ +#define WT_LSM_WORKER_RUN 0x01 + uint32_t flags; /* Worker flags */ +}; + +/* + * WT_CURSOR_LSM -- + * An LSM cursor. + */ +struct __wt_cursor_lsm { + WT_CURSOR iface; + + WT_LSM_TREE *lsm_tree; + uint64_t dsk_gen; + + u_int nchunks; /* Number of chunks in the cursor */ + u_int nupdates; /* Updates needed (including + snapshot isolation checks). */ + WT_BLOOM **blooms; /* Bloom filter handles. */ + size_t bloom_alloc; + + WT_CURSOR **cursors; /* Cursor handles. */ + size_t cursor_alloc; + + WT_CURSOR *current; /* The current cursor for iteration */ + WT_LSM_CHUNK *primary_chunk; /* The current primary chunk */ + + uint64_t *switch_txn; /* Switch txn for each chunk */ + size_t txnid_alloc; + + u_int update_count; /* Updates performed. 
*/ + +#define WT_CLSM_ACTIVE 0x01 /* Incremented the session count */ +#define WT_CLSM_ITERATE_NEXT 0x02 /* Forward iteration */ +#define WT_CLSM_ITERATE_PREV 0x04 /* Backward iteration */ +#define WT_CLSM_MERGE 0x08 /* Merge cursor, don't update */ +#define WT_CLSM_MINOR_MERGE 0x10 /* Minor merge, include tombstones */ +#define WT_CLSM_MULTIPLE 0x20 /* Multiple cursors have values for the + current key */ +#define WT_CLSM_OPEN_READ 0x40 /* Open for reads */ +#define WT_CLSM_OPEN_SNAPSHOT 0x80 /* Open for snapshot isolation */ + uint32_t flags; +}; + +/* + * WT_LSM_CHUNK -- + * A single chunk (file) in an LSM tree. + */ +struct __wt_lsm_chunk { + const char *uri; /* Data source for this chunk */ + const char *bloom_uri; /* URI of Bloom filter, if any */ + struct timespec create_ts; /* Creation time (for rate limiting) */ + uint64_t count; /* Approximate count of records */ + uint64_t size; /* Final chunk size */ + + uint64_t switch_txn; /* + * Largest transaction that can write + * to this chunk, set by a worker + * thread when the chunk is switched + * out, or by compact to get the most + * recent chunk flushed. + */ + + uint32_t id; /* ID used to generate URIs */ + uint32_t generation; /* Merge generation */ + uint32_t refcnt; /* Number of worker thread references */ + uint32_t bloom_busy; /* Number of worker thread references */ + + int8_t empty; /* 1/0: checkpoint missing */ + int8_t evicted; /* 1/0: in-memory chunk was evicted */ + +#define WT_LSM_CHUNK_BLOOM 0x01 +#define WT_LSM_CHUNK_MERGING 0x02 +#define WT_LSM_CHUNK_ONDISK 0x04 +#define WT_LSM_CHUNK_STABLE 0x08 + uint32_t flags; +} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT))); + +/* + * Different types of work units. Used by LSM worker threads to choose which + * type of work they will execute, and by work units to define which action + * is required. 
+ */ +#define WT_LSM_WORK_BLOOM 0x01 /* Create a bloom filter */ +#define WT_LSM_WORK_DROP 0x02 /* Drop unused chunks */ +#define WT_LSM_WORK_FLUSH 0x04 /* Flush a chunk to disk */ +#define WT_LSM_WORK_MERGE 0x08 /* Look for a tree merge */ +#define WT_LSM_WORK_SWITCH 0x10 /* Switch to new in-memory chunk */ + +/* + * WT_LSM_WORK_UNIT -- + * A definition of maintenance that an LSM tree needs done. + */ +struct __wt_lsm_work_unit { + TAILQ_ENTRY(__wt_lsm_work_unit) q; /* Worker unit queue */ + uint32_t type; /* Type of operation */ +#define WT_LSM_WORK_FORCE 0x0001 /* Force operation */ + uint32_t flags; /* Flags for operation */ + WT_LSM_TREE *lsm_tree; +}; + +/* + * WT_LSM_MANAGER -- + * A structure that holds resources used to manage any LSM trees in a + * database. + */ +struct __wt_lsm_manager { + /* + * Queues of work units for LSM worker threads. We maintain three + * queues, to allow us to keep each queue FIFO, rather than needing + * to manage the order of work by shuffling the queue order. + * One queue for switches - since switches should never wait for other + * work to be done. + * One queue for application requested work. For example flushing + * and creating bloom filters. + * One queue that is for longer running operations such as merges. + */ + TAILQ_HEAD(__wt_lsm_work_switch_qh, __wt_lsm_work_unit) switchqh; + TAILQ_HEAD(__wt_lsm_work_app_qh, __wt_lsm_work_unit) appqh; + TAILQ_HEAD(__wt_lsm_work_manager_qh, __wt_lsm_work_unit) managerqh; + WT_SPINLOCK switch_lock; /* Lock for switch queue */ + WT_SPINLOCK app_lock; /* Lock for application queue */ + WT_SPINLOCK manager_lock; /* Lock for manager queue */ + WT_CONDVAR *work_cond; /* Used to notify worker of activity */ + uint32_t lsm_workers; /* Current number of LSM workers */ + uint32_t lsm_workers_max; +#define WT_LSM_MAX_WORKERS 20 + WT_LSM_WORKER_ARGS lsm_worker_cookies[WT_LSM_MAX_WORKERS]; +}; + +/* + * WT_LSM_TREE -- + * An LSM tree. 
+ */ +struct __wt_lsm_tree { + const char *name, *config, *filename; + const char *key_format, *value_format; + const char *bloom_config, *file_config; + + WT_COLLATOR *collator; + const char *collator_name; + + int refcnt; /* Number of users of the tree */ +#define LSM_TREE_MAX_QUEUE 100 + int queue_ref; + WT_RWLOCK *rwlock; + TAILQ_ENTRY(__wt_lsm_tree) q; + + WT_DSRC_STATS stats; /* LSM-level statistics */ + + uint64_t dsk_gen; + + long ckpt_throttle; /* Rate limiting due to checkpoints */ + long merge_throttle; /* Rate limiting due to merges */ + uint64_t chunk_fill_ms; /* Estimate of time to fill a chunk */ + struct timespec last_flush_ts; /* Timestamp last flush finished */ + struct timespec work_push_ts; /* Timestamp last work unit added */ + uint64_t merge_progressing; /* Bumped when merges are active */ + uint32_t merge_syncing; /* Bumped when merges are syncing */ + + /* Configuration parameters */ + uint32_t bloom_bit_count; + uint32_t bloom_hash_count; + uint64_t chunk_size; + uint64_t chunk_max; + u_int merge_min, merge_max; + + u_int merge_idle; /* Count of idle merge threads */ + +#define WT_LSM_BLOOM_MERGED 0x00000001 +#define WT_LSM_BLOOM_OFF 0x00000002 +#define WT_LSM_BLOOM_OLDEST 0x00000004 + uint32_t bloom; /* Bloom creation policy */ + + WT_LSM_CHUNK **chunk; /* Array of active LSM chunks */ + size_t chunk_alloc; /* Space allocated for chunks */ + u_int nchunks; /* Number of active chunks */ + uint32_t last; /* Last allocated ID */ + int modified; /* Have there been updates? 
*/ + + WT_LSM_CHUNK **old_chunks; /* Array of old LSM chunks */ + size_t old_alloc; /* Space allocated for old chunks */ + u_int nold_chunks; /* Number of old chunks */ + int freeing_old_chunks; /* Whether chunks are being freed */ + uint32_t merge_aggressiveness; /* Increase amount of work per merge */ + +#define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */ +#define WT_LSM_TREE_COMPACTING 0x02 /* Tree being compacted */ +#define WT_LSM_TREE_NEED_SWITCH 0x04 /* New chunk needs creating */ +#define WT_LSM_TREE_OPEN 0x08 /* The tree is open */ +#define WT_LSM_TREE_THROTTLE 0x10 /* Throttle updates */ + uint32_t flags; + +#define WT_LSM_TREE_EXCLUSIVE 0x01 /* Tree is opened exclusively */ + uint8_t flags_atomic; +}; + +/* + * WT_LSM_DATA_SOURCE -- + * Implementation of the WT_DATA_SOURCE interface for LSM. + */ +struct __wt_lsm_data_source { + WT_DATA_SOURCE iface; + + WT_RWLOCK *rwlock; +}; diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h new file mode 100644 index 00000000000..e4d7fd64f94 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/meta.h @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#define WT_WIREDTIGER "WiredTiger" /* Version file */ +#define WT_SINGLETHREAD "WiredTiger.lock" /* Locking file */ + +#define WT_BASECONFIG "WiredTiger.basecfg" /* Configuration */ +#define WT_USERCONFIG "WiredTiger.config" /* Configuration */ + +#define WT_METADATA_BACKUP "WiredTiger.backup" /* Hot backup file */ + +#define WT_METADATA_TURTLE "WiredTiger.turtle" /* Metadata metadata */ +#define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */ + +#define WT_METADATA_URI "metadata:" /* Metadata alias */ +#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata file URI */ +#define WT_IS_METADATA(dh) \ + (strcmp((dh)->name, WT_METAFILE_URI) == 0) +#define WT_METAFILE_ID 0 /* Metadata file ID */ + +#define WT_METADATA_VERSION "WiredTiger version" /* Version keys */ +#define WT_METADATA_VERSION_STR "WiredTiger version string" + +/* + * WT_CKPT -- + * Encapsulation of checkpoint information, shared by the metadata, the + * btree engine, and the block manager. + */ +#define WT_CHECKPOINT "WiredTigerCheckpoint" +#define WT_CKPT_FOREACH(ckptbase, ckpt) \ + for ((ckpt) = (ckptbase); (ckpt)->name != NULL; ++(ckpt)) + +struct __wt_ckpt { + char *name; /* Name or NULL */ + + WT_ITEM addr; /* Checkpoint cookie string */ + WT_ITEM raw; /* Checkpoint cookie raw */ + + int64_t order; /* Checkpoint order */ + + uintmax_t sec; /* Timestamp */ + + uint64_t ckpt_size; /* Checkpoint size */ + + uint64_t write_gen; /* Write generation */ + + void *bpriv; /* Block manager private */ + +#define WT_CKPT_ADD 0x01 /* Checkpoint to be added */ +#define WT_CKPT_DELETE 0x02 /* Checkpoint to be deleted */ +#define WT_CKPT_FAKE 0x04 /* Checkpoint is a fake */ +#define WT_CKPT_UPDATE 0x08 /* Checkpoint requires update */ + uint32_t flags; +}; diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h new file mode 100644 index 00000000000..bf2c4ccb8cf --- /dev/null +++ b/src/third_party/wiredtiger/src/include/misc.h @@ 
-0,0 +1,221 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * Quiet compiler warnings about unused function parameters and variables, + * and unused function return values. + */ +#define WT_UNUSED(var) (void)(var) + +/* Basic constants. */ +#define WT_MILLION (1000000) +#define WT_BILLION (1000000000) + +#define WT_KILOBYTE (1024) +#define WT_MEGABYTE (1048576) +#define WT_GIGABYTE (1073741824) +#define WT_TERABYTE ((uint64_t)1099511627776) +#define WT_PETABYTE ((uint64_t)1125899906842624) + +/* + * Number of directory entries can grow dynamically. + */ +#define WT_DIR_ENTRY 32 + +#define WT_DIRLIST_EXCLUDE 0x1 /* Exclude files matching prefix */ +#define WT_DIRLIST_INCLUDE 0x2 /* Include files matching prefix */ + +/* + * Sizes that cannot be larger than 2**32 are stored in uint32_t fields in + * common structures to save space. To minimize conversions from size_t to + * uint32_t through the code, we use the following macros. + */ +#define WT_STORE_SIZE(s) ((uint32_t)(s)) +#define WT_PTRDIFF(end, begin) \ + ((size_t)((uint8_t *)(end) - (uint8_t *)(begin))) +#define WT_PTRDIFF32(end, begin) \ + WT_STORE_SIZE(WT_PTRDIFF((end), (begin))) +#define WT_BLOCK_FITS(p, len, begin, maxlen) \ + ((uint8_t *)(p) >= (uint8_t *)(begin) && \ + ((uint8_t *)(p) + (len) <= (uint8_t *)(begin) + (maxlen))) +#define WT_PTR_IN_RANGE(p, begin, maxlen) \ + WT_BLOCK_FITS((p), 1, (begin), (maxlen)) + +/* + * Align an unsigned value of any type to a specified power-of-2, including the + * offset result of a pointer subtraction; do the calculation using the largest + * unsigned integer type available. + */ +#define WT_ALIGN(n, v) \ + ((((uintmax_t)(n)) + ((v) - 1)) & ~(((uintmax_t)(v)) - 1)) + +/* Min, max. */ +#define WT_MIN(a, b) ((a) < (b) ? (a) : (b)) +#define WT_MAX(a, b) ((a) < (b) ? (b) : (a)) + +/* Elements in an array. 
*/ +#define WT_ELEMENTS(a) (sizeof(a) / sizeof(a[0])) + +/* 10 level skip lists, 1/4 have a link to the next element. */ +#define WT_SKIP_MAXDEPTH 10 +#define WT_SKIP_PROBABILITY (UINT32_MAX >> 2) + +/* + * __wt_calloc_def -- + * Simple calls don't need separate sizeof arguments. + */ +#define __wt_calloc_def(session, number, addr) \ + __wt_calloc(session, (size_t)(number), sizeof(**(addr)), addr) + +/* + * __wt_realloc_def -- + * Common case allocate-and-grow function. + * Starts by allocating the requested number of items (at least 10), then + * doubles each time the list needs to grow. + */ +#define __wt_realloc_def(session, sizep, number, addr) \ + (((number) * sizeof(**(addr)) <= *(sizep)) ? 0 : \ + __wt_realloc(session, sizep, WT_MAX(*(sizep) * 2, \ + WT_MAX(10, (number)) * sizeof(**(addr))), addr)) +/* + * Our internal free function clears the underlying address atomically so there + * is a smaller chance of racing threads seeing intermediate results while a + * structure is being free'd. (That would be a bug, of course, but I'd rather + * not drop core, just the same.) That's a non-standard "free" API, and the + * resulting bug is a mother to find -- make sure we get it right, don't make + * the caller remember to put the & operator on the pointer. + */ +#define __wt_free(session, p) do { \ + if ((p) != NULL) \ + __wt_free_int(session, (void *)&(p)); \ +} while (0) +#ifdef HAVE_DIAGNOSTIC +#define __wt_overwrite_and_free(session, p) do { \ + memset(p, WT_DEBUG_BYTE, sizeof(*(p))); \ + __wt_free(session, p); \ +} while (0) +#define __wt_overwrite_and_free_len(session, p, len) do { \ + memset(p, WT_DEBUG_BYTE, len); \ + __wt_free(session, p); \ +} while (0) +#else +#define __wt_overwrite_and_free(session, p) __wt_free(session, p) +#define __wt_overwrite_and_free_len(session, p, len) __wt_free(session, p) +#endif + +/* + * Flag set, clear and test. 
+ * + * They come in 3 flavors: F_XXX (handles a field named "flags" in the structure + * referenced by its argument), LF_XXX (handles a local variable named "flags"), + * and FLD_XXX (handles any variable, anywhere). + * + * Flags are unsigned 32-bit values -- we cast to keep the compiler quiet (the + * hex constant might be a negative integer), and to ensure the hex constant is + * the correct size before applying the bitwise not operator. + */ +#define F_CLR(p, mask) ((p)->flags &= ~((uint32_t)(mask))) +#define F_ISSET(p, mask) ((p)->flags & ((uint32_t)(mask))) +#define F_SET(p, mask) ((p)->flags |= ((uint32_t)(mask))) + +#define LF_CLR(mask) ((flags) &= ~((uint32_t)(mask))) +#define LF_ISSET(mask) ((flags) & ((uint32_t)(mask))) +#define LF_SET(mask) ((flags) |= ((uint32_t)(mask))) + +#define FLD_CLR(field, mask) ((field) &= ~((uint32_t)(mask))) +#define FLD_ISSET(field, mask) ((field) & ((uint32_t)(mask))) +#define FLD_SET(field, mask) ((field) |= ((uint32_t)(mask))) + +/* Verbose messages. */ +#ifdef HAVE_VERBOSE +#define WT_VERBOSE_ISSET(session, f) \ + (FLD_ISSET(S2C(session)->verbose, f)) +#else +#define WT_VERBOSE_ISSET(session, f) 0 +#endif + +/* + * Clear a structure, two flavors: inline when we want to guarantee there's + * no function call or setup/tear-down of a loop, and the default where the + * compiler presumably chooses. Gcc 4.3 is supposed to get this right, but + * we've seen problems when calling memset to clear structures in performance + * critical paths. + */ +#define WT_CLEAR_INLINE(type, s) do { \ + static const type __clear; \ + s = __clear; \ +} while (0) +#define WT_CLEAR(s) \ + memset(&(s), 0, sizeof(s)) + +/* Check if a string matches a prefix. */ +#define WT_PREFIX_MATCH(str, pfx) \ + (((const char *)str)[0] == ((const char *)pfx)[0] && \ + strncmp((str), (pfx), strlen(pfx)) == 0) + +/* Check if a non-nul-terminated string matches a prefix. 
*/ +#define WT_PREFIX_MATCH_LEN(str, len, pfx) \ + ((len) >= strlen(pfx) && WT_PREFIX_MATCH(str, pfx)) + +/* Check if a string matches a prefix, and move past it. */ +#define WT_PREFIX_SKIP(str, pfx) \ + (WT_PREFIX_MATCH(str, pfx) ? ((str) += strlen(pfx), 1) : 0) + +/* + * Check if a variable string equals a constant string. Inline the common + * case for WiredTiger of a single byte string. This is required because not + * all compilers optimize this case in strcmp (e.g., clang). + */ +#define WT_STREQ(s, cs) \ + (sizeof(cs) == 2 ? (s)[0] == (cs)[0] && (s)[1] == '\0' : \ + strcmp(s, cs) == 0) + +/* Check if a string matches a byte string of len bytes. */ +#define WT_STRING_MATCH(str, bytes, len) \ + (((const char *)str)[0] == ((const char *)bytes)[0] && \ + strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0') + +/* + * Macro that produces a string literal that isn't wrapped in quotes, to avoid + * tripping up spell checkers. + */ +#define WT_UNCHECKED_STRING(str) #str + +/* Function return value and scratch buffer declaration and initialization. */ +#define WT_DECL_ITEM(i) WT_ITEM *i = NULL +#define WT_DECL_RET int ret = 0 + +/* If a WT_ITEM data field points somewhere in its allocated memory. */ +#define WT_DATA_IN_ITEM(i) \ + ((i)->mem != NULL && (i)->data >= (i)->mem && \ + WT_PTRDIFF((i)->data, (i)->mem) < (i)->memsize) + +/* Copy the data and size fields of an item. */ +#define WT_ITEM_SET(dst, src) do { \ + (dst).data = (src).data; \ + (dst).size = (src).size; \ +} while (0) + +/* + * In diagnostic mode we track the locations from which hazard pointers and + * scratch buffers were acquired. 
+ */ +#ifdef HAVE_DIAGNOSTIC +#define __wt_scr_alloc(session, size, scratchp) \ + __wt_scr_alloc_func(session, size, scratchp, __FILE__, __LINE__) +#define __wt_page_in(session, ref, flags) \ + __wt_page_in_func(session, ref, flags, __FILE__, __LINE__) +#define __wt_page_swap(session, held, want, flags) \ + __wt_page_swap_func(session, held, want, flags, __FILE__, __LINE__) +#else +#define __wt_scr_alloc(session, size, scratchp) \ + __wt_scr_alloc_func(session, size, scratchp) +#define __wt_page_in(session, ref, flags) \ + __wt_page_in_func(session, ref, flags) +#define __wt_page_swap(session, held, want, flags) \ + __wt_page_swap_func(session, held, want, flags) +#endif diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i new file mode 100644 index 00000000000..73caed09c8c --- /dev/null +++ b/src/third_party/wiredtiger/src/include/misc.i @@ -0,0 +1,32 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * __wt_verbose -- + * Verbose message. + */ +static inline int +__wt_verbose(WT_SESSION_IMPL *session, int flag, const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3))) +{ +#ifdef HAVE_VERBOSE + WT_DECL_RET; + va_list ap; + + if (WT_VERBOSE_ISSET(session, flag)) { + va_start(ap, fmt); + ret = __wt_eventv(session, 1, 0, NULL, 0, fmt, ap); + va_end(ap); + } + return (ret); +#else + WT_UNUSED(session); + WT_UNUSED(fmt); + WT_UNUSED(flag); + return (0); +#endif +} diff --git a/src/third_party/wiredtiger/src/include/msvc.h b/src/third_party/wiredtiger/src/include/msvc.h new file mode 100644 index 00000000000..8f44a329940 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/msvc.h @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ +#include <intrin.h> + +#ifndef _M_AMD64 +#error "Only x64 is supported with MSVC" +#endif + +#define inline __inline + +#define WT_GCC_ATTRIBUTE(x) +#define WT_GCC_FUNC_ATTRIBUTE(x) + +#define __WT_ATOMIC_ADD(v, val, n, s, t) \ + (WT_STATIC_ASSERT(sizeof(v) == (n)), \ + _InterlockedExchangeAdd ## s((t*)&(v), (t)(val)) + (val)) +#define __WT_ATOMIC_CAS(v, old, new, n, s, t) \ + (WT_STATIC_ASSERT(sizeof(v) == (n)), \ + _InterlockedCompareExchange ## s \ + ((t*)&(v), (t)(new), (t)(old)) == (t)(old)) +#define __WT_ATOMIC_CAS_VAL(v, old, new, n, s, t) \ + (WT_STATIC_ASSERT(sizeof(v) == (n)), \ + _InterlockedCompareExchange ## s((t*)&(v), (t)(new), (t)(old))) +#define __WT_ATOMIC_STORE(v, val, n, s, t) \ + (WT_STATIC_ASSERT(sizeof(v) == (n)), \ + _InterlockedExchange ## s((t*)&(v), (t)(val))) +#define __WT_ATOMIC_SUB(v, val, n, s, t) \ + (WT_STATIC_ASSERT(sizeof(v) == (n)), \ + _InterlockedExchangeAdd ## s((t*)&(v), -(t) val) - (val)) + +#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1, 8, char) +#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1, 8, char) +#define WT_ATOMIC_CAS_VAL1(v, old, new) \ + __WT_ATOMIC_CAS_VAL(v, old, new, 1, 8, char) +#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1, 8, char) +#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1, 8, char) + +#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2, 16, short) +#define WT_ATOMIC_CAS2(v, old, new) \ + __WT_ATOMIC_CAS(v, old, new, 2, 16, short) +#define WT_ATOMIC_CAS_VAL2(v, old, new) \ + __WT_ATOMIC_CAS_VAL(v, old, new, 2, 16, short) +#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2, 16, short) +#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2, 16, short) + +#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4, , long) +#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4, , long) +#define WT_ATOMIC_CAS_VAL4(v, old, new) \ + __WT_ATOMIC_CAS_VAL(v, old, new, 4, , long) +#define WT_ATOMIC_STORE4(v, val) 
__WT_ATOMIC_STORE(v, val, 4, , long) +#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4, , long) + +#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8, 64, __int64) +#define WT_ATOMIC_CAS8(v, old, new) \ + __WT_ATOMIC_CAS(v, old, new, 8, 64, __int64) +#define WT_ATOMIC_CAS_VAL8(v, old, new) \ + __WT_ATOMIC_CAS_VAL(v, old, new, 8, 64, __int64) +#define WT_ATOMIC_STORE8(v, val) \ + __WT_ATOMIC_STORE(v, val, 8, 64, __int64) +#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8, 64, __int64) + +static inline void WT_BARRIER(void) { _ReadWriteBarrier(); } +static inline void WT_FULL_BARRIER(void) { _mm_mfence(); } +static inline void WT_PAUSE(void) { _mm_pause(); } +static inline void WT_READ_BARRIER(void) { _mm_lfence(); } +static inline void WT_WRITE_BARRIER(void) { _mm_sfence(); } diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h new file mode 100644 index 00000000000..b71496dd595 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/mutex.h @@ -0,0 +1,73 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * Condition variables: + * + * WiredTiger uses condition variables to signal between threads, and for + * locking operations that are expected to block. + */ +struct __wt_condvar { + const char *name; /* Mutex name for debugging */ + + wt_mutex_t mtx; /* Mutex */ + wt_cond_t cond; /* Condition variable */ + + int waiters; /* Numbers of waiters, or + -1 if signalled with no waiters. */ +}; + +/* + * Read/write locks: + * + * WiredTiger uses read/write locks for shared/exclusive access to resources. 
+ */ +struct __wt_rwlock { + const char *name; /* Lock name for debugging */ + + wt_rwlock_t rwlock; /* Read/write lock */ +}; + +/* + * Spin locks: + * + * WiredTiger uses spinlocks for fast mutual exclusion (where operations done + * while holding the spin lock are expected to complete in a small number of + * instructions). + */ +#define SPINLOCK_GCC 0 +#define SPINLOCK_PTHREAD_MUTEX 1 +#define SPINLOCK_PTHREAD_MUTEX_ADAPTIVE 2 +#define SPINLOCK_PTHREAD_MUTEX_LOGGING 3 +#define SPINLOCK_MSVC 4 + +#if SPINLOCK_TYPE == SPINLOCK_GCC + +typedef volatile int + WT_SPINLOCK WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT))); + +#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\ + SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\ + SPINLOCK_TYPE == SPINLOCK_MSVC ||\ + SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING + +typedef struct { + wt_mutex_t lock; + + uint64_t counter; /* Statistics: counter */ + + const char *name; /* Statistics: mutex name */ + int8_t id; /* Statistics: current holder ID */ + + int8_t initialized; /* Lock initialized, for cleanup */ +} WT_SPINLOCK WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT))); + +#else + +#error Unknown spinlock type + +#endif diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i new file mode 100644 index 00000000000..0d5a8586051 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/mutex.i @@ -0,0 +1,368 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * Spin locks: + * + * These used for cases where fast mutual exclusion is needed (where operations + * done while holding the spin lock are expected to complete in a small number + * of instructions. 
 + */
+
+#if SPINLOCK_TYPE == SPINLOCK_GCC
+
+#define	WT_DECL_SPINLOCK_ID(i)
+#define	__wt_spin_trylock(session, lock, idp)				\
+	__wt_spin_trylock_func(session, lock)
+
+/* Default to spinning 1000 times before yielding. */
+#ifndef WT_SPIN_COUNT
+#define	WT_SPIN_COUNT 1000
+#endif
+
+/*
+ * __wt_spin_init --
+ *	Initialize a spinlock.
+ */
+static inline int
+__wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name)
+{
+	WT_UNUSED(session);
+	WT_UNUSED(name);
+
+	/* GCC spinlocks are a bare int: 0 is unlocked, non-zero locked. */
+	*(t) = 0;
+	return (0);
+}
+
+/*
+ * __wt_spin_destroy --
+ *	Destroy a spinlock.
+ */
+static inline void
+__wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	WT_UNUSED(session);
+
+	*(t) = 0;
+}
+
+/*
+ * __wt_spin_trylock_func --
+ *	Try to lock a spinlock or fail immediately if it is busy.
+ */
+static inline int
+__wt_spin_trylock_func(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	WT_UNUSED(session);
+
+	/*
+	 * __sync_lock_test_and_set is GCC's atomic exchange with acquire
+	 * semantics; it returns the previous value, so 0 means the lock
+	 * was free and is now ours.
+	 */
+	return (__sync_lock_test_and_set(t, 1) == 0 ? 0 : EBUSY);
+}
+
+/*
+ * __wt_spin_lock --
+ *	Spin until the lock is acquired.
+ */
+static inline void
+__wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	int i;
+
+	WT_UNUSED(session);
+
+	while (__sync_lock_test_and_set(t, 1)) {
+		/*
+		 * Spin on a plain read before retrying the atomic
+		 * exchange: cheaper than bouncing the cache line with
+		 * repeated atomic writes while the lock is held.
+		 */
+		for (i = 0; *t && i < WT_SPIN_COUNT; i++)
+			WT_PAUSE();
+		/* Still held after spinning: yield the CPU. */
+		if (*t)
+			__wt_yield();
+	}
+}
+
+/*
+ * __wt_spin_unlock --
+ *	Release the spinlock.
+ */
+static inline void
+__wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	WT_UNUSED(session);
+
+	/* __sync_lock_release stores 0 with release semantics. */
+	__sync_lock_release(t);
+}
+
+#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
+	SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\
+	SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+/*
+ * __wt_spin_init --
+ *	Initialize a spinlock.
+ */ +static inline int +__wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) +{ +#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE + pthread_mutexattr_t attr; + + WT_RET(pthread_mutexattr_init(&attr)); + WT_RET(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP)); + WT_RET(pthread_mutex_init(&t->lock, &attr)); +#else + WT_RET(pthread_mutex_init(&t->lock, NULL)); +#endif + + t->name = name; + t->initialized = 1; + +#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING + WT_RET(__wt_spin_lock_register_lock(session, t)); +#endif + + WT_UNUSED(session); + return (0); +} + +/* + * __wt_spin_destroy -- + * Destroy a spinlock. + */ +static inline void +__wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t) +{ + WT_UNUSED(session); + +#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING + __wt_spin_lock_unregister_lock(session, t); +#endif + if (t->initialized) { + (void)pthread_mutex_destroy(&t->lock); + t->initialized = 0; + } +} + +#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\ + SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE + +#define WT_DECL_SPINLOCK_ID(i) +#define __wt_spin_trylock(session, lock, idp) \ + __wt_spin_trylock_func(session, lock) + +/* + * __wt_spin_trylock_func -- + * Try to lock a spinlock or fail immediately if it is busy. + */ +static inline int +__wt_spin_trylock_func(WT_SESSION_IMPL *session, WT_SPINLOCK *t) +{ + WT_UNUSED(session); + + return (pthread_mutex_trylock(&t->lock)); +} + +/* + * __wt_spin_lock -- + * Spin until the lock is acquired. + */ +static inline void +__wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) +{ + WT_UNUSED(session); + + pthread_mutex_lock(&t->lock); +} + +#endif + +#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING + +/* + * When logging statistics, we track which spinlocks block and why. 
+ */ +#define WT_DECL_SPINLOCK_ID(i) \ + static int i = WT_SPINLOCK_REGISTER +#define WT_SPINLOCK_REGISTER -1 +#define WT_SPINLOCK_REGISTER_FAILED -2 +#define __wt_spin_trylock(session, lock, idp) \ + __wt_spin_trylock_func(session, lock, idp, __FILE__, __LINE__) +#define __wt_spin_lock(session, lock) do { \ + WT_DECL_SPINLOCK_ID(__id); \ + __wt_spin_lock_func(session, lock, &__id, __FILE__, __LINE__); \ +} while (0) + +/* + * __wt_spin_trylock_func -- + * Try to lock a spinlock or fail immediately if it is busy. + */ +static inline int +__wt_spin_trylock_func(WT_SESSION_IMPL *session, + WT_SPINLOCK *t, int *idp, const char *file, int line) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + conn = S2C_SAFE(session); + /* If we're not maintaining statistics, it's simple. */ + if (session == NULL || !FLD_ISSET(conn->stat_flags, WT_STAT_CONN_FAST)) + return (pthread_mutex_trylock(&t->lock)); + + /* + * If this caller hasn't yet registered, do so. The caller's location + * ID is a static offset into a per-connection structure, and that has + * problems: first, if there are multiple connections, we'll need to + * hold some kind of lock to avoid racing when setting that value, and + * second, if/when there are multiple connections and/or a single + * connection is closed and re-opened, the variable may be initialized + * and the underlying connection information may not. Check both. + */ + if (*idp == WT_SPINLOCK_REGISTER || + conn->spinlock_block[*idp].name == NULL) + WT_RET(__wt_spin_lock_register_caller( + session, t->name, file, line, idp)); + + /* + * Try to acquire the mutex: on failure, update blocking statistics, on + * success, set our ID as the mutex holder. + * + * Note the race between acquiring the lock and setting our ID as the + * holder, this can appear in the output as mutexes blocking in ways + * that can't actually happen (although still an indicator of a mutex + * that's busier than we'd like). 
+ */ + if ((ret = pthread_mutex_trylock(&t->lock)) == 0) + t->id = *idp; + else + if (*idp >= 0) { + ++conn->spinlock_block[*idp].total; + if (t->id >= 0) + ++conn->spinlock_block[*idp].blocked[t->id]; + } + + /* Update the mutex counter and flush to minimize the windows. */ + ++t->counter; + WT_FULL_BARRIER(); + return (ret); +} + +/* + * __wt_spin_lock_func -- + * Spin until the lock is acquired. + */ +static inline void +__wt_spin_lock_func(WT_SESSION_IMPL *session, + WT_SPINLOCK *t, int *idp, const char *file, int line) +{ + /* If we're not maintaining statistics, it's simple. */ + if (session == NULL || + !FLD_ISSET(conn->stat_flags, WT_STAT_CONN_FAST)) { + pthread_mutex_lock(&t->lock); + return; + } + + /* Try to acquire the mutex. */ + if (__wt_spin_trylock_func(session, t, idp, file, line) == 0) + return; + + /* + * On failure, wait on the mutex; once acquired, set our ID as the + * holder and flush to minimize the windows. + */ + pthread_mutex_lock(&t->lock); + t->id = *idp; + WT_FULL_BARRIER(); +} + +#endif + +/* + * __wt_spin_unlock -- + * Release the spinlock. + */ +static inline void +__wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) +{ + WT_UNUSED(session); + + pthread_mutex_unlock(&t->lock); +} + +#elif SPINLOCK_TYPE == SPINLOCK_MSVC + +#define WT_DECL_SPINLOCK_ID(i) +#define WT_SPINLOCK_REGISTER -1 +#define WT_SPINLOCK_REGISTER_FAILED -2 + +#define __wt_spin_trylock(session, lock, idp) \ + __wt_spin_trylock_func(session, lock) + +/* + * __wt_spin_init -- + * Initialize a spinlock. + */ +static inline int +__wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) +{ + WT_UNUSED(session); + WT_UNUSED(name); + + InitializeCriticalSectionAndSpinCount(&t->lock, 4000); + + return (0); +} + +/* + * __wt_spin_destroy -- + * Destroy a spinlock. 
+ */ +static inline void +__wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t) +{ + WT_UNUSED(session); + + DeleteCriticalSection(&t->lock); +} + +/* + * __wt_spin_trylock_func -- + * Try to lock a spinlock or fail immediately if it is busy. + */ +static inline int +__wt_spin_trylock_func(WT_SESSION_IMPL *session, WT_SPINLOCK *t) +{ + WT_UNUSED(session); + + BOOL b = TryEnterCriticalSection(&t->lock); + return (b == 0 ? EBUSY : 0); +} + +/* + * __wt_spin_lock -- + * Spin until the lock is acquired. + */ +static inline void +__wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) +{ + WT_UNUSED(session); + + EnterCriticalSection(&t->lock); +} + +/* + * __wt_spin_unlock -- + * Release the spinlock. + */ +static inline void +__wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) +{ + WT_UNUSED(session); + + LeaveCriticalSection(&t->lock); +} + +#else + +#error Unknown spinlock type + +#endif diff --git a/src/third_party/wiredtiger/src/include/os.h b/src/third_party/wiredtiger/src/include/os.h new file mode 100644 index 00000000000..846249294fe --- /dev/null +++ b/src/third_party/wiredtiger/src/include/os.h @@ -0,0 +1,72 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#define WT_SYSCALL_RETRY(call, ret) do { \ + int __retry; \ + for (__retry = 0; __retry < 10; ++__retry) { \ + if ((call) == 0) { \ + (ret) = 0; \ + break; \ + } \ + switch ((ret) = __wt_errno()) { \ + case 0: \ + /* The call failed but didn't set errno. 
*/ \ + (ret) = WT_ERROR; \ + break; \ + case EAGAIN: \ + case EBUSY: \ + case EINTR: \ + case EIO: \ + case EMFILE: \ + case ENFILE: \ + case ENOSPC: \ + __wt_sleep(0L, 500000L); \ + continue; \ + default: \ + break; \ + } \ + break; \ + } \ +} while (0) + +#define WT_TIMEDIFF(end, begin) \ + (1000000000 * (uint64_t)((end).tv_sec - (begin).tv_sec) + \ + (uint64_t)(end).tv_nsec - (uint64_t)(begin).tv_nsec) +#define WT_TIMECMP(t1, t2) \ + ((t1).tv_sec < (t2).tv_sec ? -1 : \ + (t1).tv_sec == (t2.tv_sec) ? \ + (t1).tv_nsec < (t2).tv_nsec ? -1 : \ + (t1).tv_nsec == (t2).tv_nsec ? 0 : 1 : 1) + +struct __wt_fh { + char *name; /* File name */ + TAILQ_ENTRY(__wt_fh) q; /* List of open handles */ + + u_int ref; /* Reference count */ + +#ifndef _WIN32 + int fd; /* POSIX file handle */ +#else + HANDLE filehandle; /* Windows file handle */ + HANDLE filehandle_secondary; /* Windows file handle + for file size changes */ +#endif + wt_off_t size; /* File size */ + wt_off_t extend_size; /* File extended size */ + wt_off_t extend_len; /* File extend chunk size */ + + int direct_io; /* O_DIRECT configured */ + + int fallocate_available; /* fallocate/posix_fallocate */ + int fallocate_requires_locking; +}; + +#ifndef _WIN32 +#define WT_SIZET_FMT "zu" /* size_t format string */ +#else +#define WT_SIZET_FMT "Iu" /* size_t format string */ +#endif diff --git a/src/third_party/wiredtiger/src/include/os_windows.h b/src/third_party/wiredtiger/src/include/os_windows.h new file mode 100644 index 00000000000..fcae531184f --- /dev/null +++ b/src/third_party/wiredtiger/src/include/os_windows.h @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */
+
+/*
+ * Define WT threading and concurrency primitives
+ * Assumes Windows 7+/2008 R2+ (CONDITION_VARIABLE and SRWLOCK require
+ * Vista at minimum; the stated baseline is Windows 7 / Server 2008 R2).
+ */
+typedef CONDITION_VARIABLE	wt_cond_t;
+typedef CRITICAL_SECTION	wt_mutex_t;
+typedef HANDLE			wt_thread_t;
+typedef SRWLOCK			wt_rwlock_t;
+
+/* Timespec is a POSIX structure not defined in Windows */
+struct timespec {
+	time_t tv_sec;		/* seconds */
+	long tv_nsec;		/* nanoseconds */
+};
+
+/* Map the POSIX name onto the MSVC CRT equivalent. */
+#define strncasecmp _strnicmp
+
+/*
+ * Windows Portability stuff
+ * These are POSIX types which Windows lacks
+ * Eventually WiredTiger will migrate away from these types
+ */
+typedef uint32_t u_int;
+typedef unsigned char u_char;
+typedef unsigned long u_long;
+
+/*
+ * Before Visual Studio 2015 (_MSC_VER 1900) the CRT has no C99-conforming
+ * snprintf; fall back to _snprintf.  NOTE(review): _snprintf does not
+ * NUL-terminate on truncation and returns a negative value instead of the
+ * required length -- callers must not rely on C99 semantics here.
+ */
+#if _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+
+/*
+ * Windows does not provide ssize_t; the Python headers may define it as
+ * well, so guard the typedef.
+ * NOTE(review): "int" is 32 bits on Win64, so this truncates relative to
+ * POSIX ssize_t -- SSIZE_T/intptr_t would be width-correct; confirm no
+ * caller depends on a 64-bit ssize_t.
+ */
+#ifndef HAVE_SSIZE_T
+typedef int ssize_t;
+#endif
+
+/*
+ * Provide a custom version of vsnprintf that returns the
+ * needed buffer length instead of -1 on truncation
+ */
+#define vsnprintf _wt_vsnprintf
+
+_Check_return_opt_ int __cdecl _wt_vsnprintf(
+    _Out_writes_(_MaxCount) char * _DstBuf,
+    _In_ size_t _MaxCount,
+    _In_z_ _Printf_format_string_ const char * _Format,
+    va_list _ArgList);
+
+/* Provide a custom version of localtime_r */
+struct tm *localtime_r(const time_t* timer, struct tm* result);
diff --git a/src/third_party/wiredtiger/src/include/packing.i b/src/third_party/wiredtiger/src/include/packing.i
new file mode 100644
index 00000000000..6e0e7be13eb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/packing.i
@@ -0,0 +1,685 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
The short version is that we have less cases to deal with + * because the compiler promotes shorter types to int or unsigned int. + */ +typedef struct { + union { + int64_t i; + uint64_t u; + const char *s; + WT_ITEM item; + } u; + uint32_t size; + int8_t havesize; + char type; +} WT_PACK_VALUE; + +#define WT_PACK_VALUE_INIT { { 0 }, 0, 0, 0 } +#define WT_DECL_PACK_VALUE(pv) WT_PACK_VALUE pv = WT_PACK_VALUE_INIT + +typedef struct { + WT_SESSION_IMPL *session; + const char *cur, *end, *orig; + unsigned long repeats; + WT_PACK_VALUE lastv; +} WT_PACK; + +#define WT_PACK_INIT { NULL, NULL, NULL, NULL, 0, WT_PACK_VALUE_INIT } +#define WT_DECL_PACK(pack) WT_PACK pack = WT_PACK_INIT + +typedef struct { + WT_CONFIG config; + char buf[20]; + int count; + int iskey; + int genname; +} WT_PACK_NAME; + +/* + * __pack_initn -- + * Initialize a pack iterator with the specified string and length. + */ +static inline int +__pack_initn( + WT_SESSION_IMPL *session, WT_PACK *pack, const char *fmt, size_t len) +{ + if (*fmt == '@' || *fmt == '<' || *fmt == '>') + return (EINVAL); + if (*fmt == '.') + ++fmt; + + pack->session = session; + pack->cur = pack->orig = fmt; + pack->end = fmt + len; + pack->repeats = 0; + return (0); +} + +/* + * __pack_init -- + * Initialize a pack iterator with the specified string. + */ +static inline int +__pack_init(WT_SESSION_IMPL *session, WT_PACK *pack, const char *fmt) +{ + return (__pack_initn(session, pack, fmt, strlen(fmt))); +} + +/* + * __pack_name_init -- + * Initialize the name of a pack iterator. + */ +static inline int +__pack_name_init(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *names, + int iskey, WT_PACK_NAME *pn) +{ + WT_CLEAR(*pn); + pn->iskey = iskey; + + if (names->str != NULL) + WT_RET(__wt_config_subinit(session, &pn->config, names)); + else + pn->genname = 1; + + return (0); +} + +/* + * __pack_name_next -- + * Get the next field type from a pack iterator. 
+ */ +static inline int +__pack_name_next(WT_PACK_NAME *pn, WT_CONFIG_ITEM *name) +{ + WT_CONFIG_ITEM ignore; + + if (pn->genname) { + (void)snprintf(pn->buf, sizeof(pn->buf), + (pn->iskey ? "key%d" : "value%d"), pn->count); + WT_CLEAR(*name); + name->str = pn->buf; + name->len = strlen(pn->buf); + name->type = WT_CONFIG_ITEM_STRING; + pn->count++; + } + else + WT_RET(__wt_config_next(&pn->config, name, &ignore)); + + return (0); +} + +/* + * __pack_next -- + * Next pack iterator. + */ +static inline int +__pack_next(WT_PACK *pack, WT_PACK_VALUE *pv) +{ + char *endsize; + + if (pack->repeats > 0) { + *pv = pack->lastv; + --pack->repeats; + return (0); + } + +next: if (pack->cur == pack->end) + return (WT_NOTFOUND); + + if (isdigit(*pack->cur)) { + pv->havesize = 1; + pv->size = WT_STORE_SIZE(strtoul(pack->cur, &endsize, 10)); + pack->cur = endsize; + } else { + pv->havesize = 0; + pv->size = 1; + } + + pv->type = *pack->cur++; + pack->repeats = 0; + + switch (pv->type) { + case 'S': + case 's': + case 'x': + return (0); + case 't': + if (pv->size < 1 || pv->size > 8) + WT_RET_MSG(pack->session, EINVAL, + "Bitfield sizes must be between 1 and 8 bits " + "in format '%.*s'", + (int)(pack->end - pack->orig), pack->orig); + return (0); + case 'u': + case 'U': + /* Special case for items with a size prefix. */ + pv->type = (!pv->havesize && *pack->cur != '\0') ? 'U' : 'u'; + return (0); + case 'b': + case 'h': + case 'i': + case 'B': + case 'H': + case 'I': + case 'l': + case 'L': + case 'q': + case 'Q': + case 'r': + case 'R': + /* Integral types repeat <size> times. 
*/ + if (pv->size == 0) + goto next; + pack->repeats = pv->size - 1; + pack->lastv = *pv; + return (0); + default: + WT_RET_MSG(pack->session, EINVAL, + "Invalid type '%c' found in format '%.*s'", + pv->type, (int)(pack->end - pack->orig), pack->orig); + } + +} + +#define WT_PACK_GET(session, pv, ap) do { \ + WT_ITEM *__item; \ + switch (pv.type) { \ + case 'x': \ + break; \ + case 's': \ + case 'S': \ + pv.u.s = va_arg(ap, const char *); \ + break; \ + case 'U': \ + case 'u': \ + __item = va_arg(ap, WT_ITEM *); \ + pv.u.item.data = __item->data; \ + pv.u.item.size = __item->size; \ + break; \ + case 'b': \ + case 'h': \ + case 'i': \ + pv.u.i = va_arg(ap, int); \ + break; \ + case 'B': \ + case 'H': \ + case 'I': \ + case 't': \ + pv.u.u = va_arg(ap, unsigned int); \ + break; \ + case 'l': \ + pv.u.i = va_arg(ap, long); \ + break; \ + case 'L': \ + pv.u.u = va_arg(ap, unsigned long); \ + break; \ + case 'q': \ + pv.u.i = va_arg(ap, int64_t); \ + break; \ + case 'Q': \ + case 'r': \ + case 'R': \ + pv.u.u = va_arg(ap, uint64_t); \ + break; \ + /* User format strings have already been validated. */ \ + WT_ILLEGAL_VALUE(session); \ + } \ +} while (0) + +/* + * __pack_size -- + * Get the size of a packed value. + */ +static inline size_t +__pack_size(WT_SESSION_IMPL *session, WT_PACK_VALUE *pv) +{ + size_t s, pad; + + switch (pv->type) { + case 'x': + return (pv->size); + case 'j': + case 'J': + if (pv->type == 'j' || pv->havesize) + s = pv->size; + else { + ssize_t len; + + /* The string was previously validated. 
*/ + len = __wt_json_strlen(pv->u.item.data, + pv->u.item.size); + WT_ASSERT(session, len >= 0); + s = (size_t)len + 1; + } + return (s); + case 's': + case 'S': + if (pv->type == 's' || pv->havesize) + s = pv->size; + else + s = strlen(pv->u.s) + 1; + return (s); + case 'U': + case 'u': + s = pv->u.item.size; + pad = 0; + if (pv->havesize && pv->size < s) + s = pv->size; + else if (pv->havesize) + pad = pv->size - s; + if (pv->type == 'U') + s += __wt_vsize_uint(s + pad); + return (s + pad); + case 'b': + case 'B': + case 't': + return (1); + case 'h': + case 'i': + case 'l': + case 'q': + return (__wt_vsize_int(pv->u.i)); + case 'H': + case 'I': + case 'L': + case 'Q': + case 'r': + return (__wt_vsize_uint(pv->u.u)); + case 'R': + return (sizeof(uint64_t)); + } + + __wt_err(session, EINVAL, "unknown pack-value type: %c", (int)pv->type); + return ((size_t)-1); +} + +/* + * __pack_write -- + * Pack a value into a buffer. + */ +static inline int +__pack_write( + WT_SESSION_IMPL *session, WT_PACK_VALUE *pv, uint8_t **pp, size_t maxlen) +{ + uint8_t *oldp; + size_t s, pad; + + switch (pv->type) { + case 'x': + WT_SIZE_CHECK(pv->size, maxlen); + memset(*pp, 0, pv->size); + *pp += pv->size; + break; + case 's': + case 'S': + /* + * XXX if pv->havesize, only want to know if there is a + * '\0' in the first pv->size characters. 
+ */ + s = strlen(pv->u.s); + if ((pv->type == 's' || pv->havesize) && pv->size < s) { + s = pv->size; + pad = 0; + } else if (pv->havesize) + pad = pv->size - s; + else + pad = 1; + WT_SIZE_CHECK(s + pad, maxlen); + if (s > 0) + memcpy(*pp, pv->u.s, s); + *pp += s; + if (pad > 0) { + memset(*pp, 0, pad); + *pp += pad; + } + break; + case 'j': + case 'J': + s = pv->u.item.size; + if ((pv->type == 'j' || pv->havesize) && pv->size < s) { + s = pv->size; + pad = 0; + } else if (pv->havesize) + pad = pv->size - s; + else + pad = 1; + if (s > 0) { + oldp = *pp; + WT_RET(__wt_json_strncpy((char **)pp, maxlen, + pv->u.item.data, s)); + maxlen -= (size_t)(*pp - oldp); + } + if (pad > 0) { + WT_SIZE_CHECK(pad, maxlen); + memset(*pp, 0, pad); + *pp += pad; + } + break; + case 'U': + case 'u': + s = pv->u.item.size; + pad = 0; + if (pv->havesize && pv->size < s) + s = pv->size; + else if (pv->havesize) + pad = pv->size - s; + if (pv->type == 'U') { + oldp = *pp; + WT_RET(__wt_vpack_uint(pp, maxlen, s + pad)); + maxlen -= (size_t)(*pp - oldp); + } + WT_SIZE_CHECK(s + pad, maxlen); + if (s > 0) + memcpy(*pp, pv->u.item.data, s); + *pp += s; + if (pad > 0) { + memset(*pp, 0, pad); + *pp += pad; + } + break; + case 'b': + /* Translate to maintain ordering with the sign bit. */ + WT_SIZE_CHECK(1, maxlen); + **pp = (uint8_t)(pv->u.i + 0x80); + *pp += 1; + break; + case 'B': + case 't': + WT_SIZE_CHECK(1, maxlen); + **pp = (uint8_t)pv->u.u; + *pp += 1; + break; + case 'h': + case 'i': + case 'l': + case 'q': + WT_RET(__wt_vpack_int(pp, maxlen, pv->u.i)); + break; + case 'H': + case 'I': + case 'L': + case 'Q': + case 'r': + WT_RET(__wt_vpack_uint(pp, maxlen, pv->u.u)); + break; + case 'R': + WT_SIZE_CHECK(sizeof(uint64_t), maxlen); + *(uint64_t *)*pp = pv->u.u; + *pp += sizeof(uint64_t); + break; + default: + WT_RET_MSG(session, EINVAL, + "unknown pack-value type: %c", (int)pv->type); + } + + return (0); +} + +/* + * __unpack_read -- + * Read a packed value from a buffer. 
+ */ +static inline int +__unpack_read(WT_SESSION_IMPL *session, + WT_PACK_VALUE *pv, const uint8_t **pp, size_t maxlen) +{ + size_t s; + + switch (pv->type) { + case 'x': + WT_SIZE_CHECK(pv->size, maxlen); + *pp += pv->size; + break; + case 's': + case 'S': + if (pv->type == 's' || pv->havesize) + s = pv->size; + else + s = strlen((const char *)*pp) + 1; + if (s > 0) + pv->u.s = (const char *)*pp; + WT_SIZE_CHECK(s, maxlen); + *pp += s; + break; + case 'U': + WT_RET(__wt_vunpack_uint(pp, maxlen, &pv->u.u)); + /* FALLTHROUGH */ + case 'u': + if (pv->havesize) + s = pv->size; + else if (pv->type == 'U') + s = (size_t)pv->u.u; + else + s = maxlen; + WT_SIZE_CHECK(s, maxlen); + pv->u.item.data = *pp; + pv->u.item.size = s; + *pp += s; + break; + case 'b': + /* Translate to maintain ordering with the sign bit. */ + WT_SIZE_CHECK(1, maxlen); + pv->u.i = (int8_t)(*(*pp)++ - 0x80); + break; + case 'B': + case 't': + WT_SIZE_CHECK(1, maxlen); + pv->u.u = *(*pp)++; + break; + case 'h': + case 'i': + case 'l': + case 'q': + WT_RET(__wt_vunpack_int(pp, maxlen, &pv->u.i)); + break; + case 'H': + case 'I': + case 'L': + case 'Q': + case 'r': + WT_RET(__wt_vunpack_uint(pp, maxlen, &pv->u.u)); + break; + case 'R': + WT_SIZE_CHECK(sizeof(uint64_t), maxlen); + pv->u.u = *(uint64_t *)*pp; + *pp += sizeof(uint64_t); + break; + default: + WT_RET_MSG(session, EINVAL, + "unknown pack-value type: %c", (int)pv->type); + } + + return (0); +} + +#define WT_UNPACK_PUT(session, pv, ap) do { \ + WT_ITEM *__item; \ + switch (pv.type) { \ + case 'x': \ + break; \ + case 's': \ + case 'S': \ + *va_arg(ap, const char **) = pv.u.s; \ + break; \ + case 'U': \ + case 'u': \ + __item = va_arg(ap, WT_ITEM *); \ + __item->data = pv.u.item.data; \ + __item->size = pv.u.item.size; \ + break; \ + case 'b': \ + *va_arg(ap, int8_t *) = (int8_t)pv.u.i; \ + break; \ + case 'h': \ + *va_arg(ap, int16_t *) = (short)pv.u.i; \ + break; \ + case 'i': \ + case 'l': \ + *va_arg(ap, int32_t *) = (int32_t)pv.u.i; \ + 
break; \ + case 'q': \ + *va_arg(ap, int64_t *) = pv.u.i; \ + break; \ + case 'B': \ + case 't': \ + *va_arg(ap, uint8_t *) = (uint8_t)pv.u.u; \ + break; \ + case 'H': \ + *va_arg(ap, uint16_t *) = (uint16_t)pv.u.u; \ + break; \ + case 'I': \ + case 'L': \ + *va_arg(ap, uint32_t *) = (uint32_t)pv.u.u; \ + break; \ + case 'Q': \ + case 'r': \ + case 'R': \ + *va_arg(ap, uint64_t *) = pv.u.u; \ + break; \ + /* User format strings have already been validated. */ \ + WT_ILLEGAL_VALUE(session); \ + } \ +} while (0) + +/* + * __wt_struct_packv -- + * Pack a byte string (va_list version). + */ +static inline int +__wt_struct_packv(WT_SESSION_IMPL *session, + void *buffer, size_t size, const char *fmt, va_list ap) +{ + WT_DECL_PACK_VALUE(pv); + WT_DECL_RET; + WT_PACK pack; + uint8_t *p, *end; + + p = buffer; + end = p + size; + + if (fmt[0] != '\0' && fmt[1] == '\0') { + pv.type = fmt[0]; + WT_PACK_GET(session, pv, ap); + return (__pack_write(session, &pv, &p, size)); + } + + WT_RET(__pack_init(session, &pack, fmt)); + while ((ret = __pack_next(&pack, &pv)) == 0) { + WT_PACK_GET(session, pv, ap); + WT_RET(__pack_write(session, &pv, &p, (size_t)(end - p))); + } + + /* Be paranoid - __pack_write should never overflow. */ + WT_ASSERT(session, p <= end); + + if (ret != WT_NOTFOUND) + return (ret); + + return (0); +} + +/* + * __wt_struct_sizev -- + * Calculate the size of a packed byte string (va_list version). 
+ */
+static inline int
+__wt_struct_sizev(
+    WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, va_list ap)
+{
+	WT_DECL_PACK_VALUE(pv);
+	WT_PACK pack;
+	size_t total;
+
+	/* Fast path for a single-entry format string. */
+	if (fmt[0] != '\0' && fmt[1] == '\0') {
+		pv.type = fmt[0];
+		WT_PACK_GET(session, pv, ap);
+		*sizep = __pack_size(session, &pv);
+		return (0);
+	}
+
+	/*
+	 * NOTE(review): the loop deliberately ignores a non-zero return
+	 * from __pack_next (format strings are validated before we get
+	 * here), so a malformed format yields a partial total rather than
+	 * an error -- confirm all callers pre-validate formats.
+	 */
+	WT_RET(__pack_init(session, &pack, fmt));
+	for (total = 0; __pack_next(&pack, &pv) == 0;) {
+		WT_PACK_GET(session, pv, ap);
+		total += __pack_size(session, &pv);
+	}
+	*sizep = total;
+	return (0);
+}
+
+/*
+ * __wt_struct_unpackv --
+ *	Unpack a byte string (va_list version).
+ */
+static inline int
+__wt_struct_unpackv(WT_SESSION_IMPL *session,
+    const void *buffer, size_t size, const char *fmt, va_list ap)
+{
+	WT_DECL_PACK_VALUE(pv);
+	WT_DECL_RET;
+	WT_PACK pack;
+	const uint8_t *p, *end;
+
+	p = buffer;
+	end = p + size;
+
+	/* Fast path for a single-entry format string. */
+	if (fmt[0] != '\0' && fmt[1] == '\0') {
+		pv.type = fmt[0];
+		if ((ret = __unpack_read(session, &pv, &p, size)) == 0)
+			WT_UNPACK_PUT(session, pv, ap);
+		/*
+		 * Propagate any error from __unpack_read: the previous
+		 * code returned 0 unconditionally here, silently
+		 * discarding failures and leaving the caller's output
+		 * argument unset.  The multi-entry path below (and the
+		 * fast path in __wt_struct_packv) already propagate.
+		 */
+		return (ret);
+	}
+
+	WT_RET(__pack_init(session, &pack, fmt));
+	while ((ret = __pack_next(&pack, &pv)) == 0) {
+		WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
+		WT_UNPACK_PUT(session, pv, ap);
+	}
+
+	/* Be paranoid - __unpack_read should never overflow. */
+	WT_ASSERT(session, p <= end);
+
+	/* WT_NOTFOUND is the normal loop-termination value. */
+	if (ret != WT_NOTFOUND)
+		return (ret);
+
+	return (0);
+}
+
+/*
+ * __wt_struct_size_adjust --
+ *	Adjust the size field for a packed structure.
+ *
+ * Sometimes we want to include the size as a field in a packed structure.
+ * This is done by calling __wt_struct_size with the expected format and
+ * a size of zero.  Then we want to pack the structure using the final
+ * size.  This function adjusts the size appropriately (taking into
+ * account the size of the final size or the size field itself).
+ */ +static inline void +__wt_struct_size_adjust(WT_SESSION_IMPL *session, size_t *sizep) +{ + size_t prev_size = 1; + size_t orig_size = *sizep; + size_t field_size0 = __wt_vsize_uint(orig_size); + size_t field_size1 = + __wt_vsize_uint(orig_size + field_size0 - prev_size); + *sizep += field_size1 - prev_size; + + /* + * Make sure the field size we calculated matches the adjusted size. + * This relies on the fact that we are only adjusting by a small number + * of bytes, so we won't cross multiple boundaries in the packing + * routine. If that were not true, we would need to iterate here until + * the field size stops growing. + */ + WT_ASSERT(session, field_size1 == __wt_vsize_uint(*sizep)); +} diff --git a/src/third_party/wiredtiger/src/include/posix.h b/src/third_party/wiredtiger/src/include/posix.h new file mode 100644 index 00000000000..e3b43ea38ab --- /dev/null +++ b/src/third_party/wiredtiger/src/include/posix.h @@ -0,0 +1,47 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* Some systems don't configure 64-bit MIN/MAX by default. */ +#ifndef ULLONG_MAX +#define ULLONG_MAX 0xffffffffffffffffULL +#endif +#ifndef LLONG_MAX +#define LLONG_MAX 0x7fffffffffffffffLL +#endif +#ifndef LLONG_MIN +#define LLONG_MIN (-0x7fffffffffffffffLL - 1) +#endif + +/* Define O_BINARY for Posix systems */ +#define O_BINARY 0 + +/* + * Define WT threading and concurrency primitives + */ +typedef pthread_cond_t wt_cond_t; +typedef pthread_mutex_t wt_mutex_t; +typedef pthread_t wt_thread_t; + +/* + * !!! + * Don't touch this structure without understanding the read/write + * locking functions. + */ +typedef union { /* Read/write lock */ +#ifdef WORDS_BIGENDIAN + WiredTiger read/write locks require modification for big-endian systems. 
+#else + uint64_t u; + uint32_t us; + struct { + uint16_t writers; + uint16_t readers; + uint16_t users; + uint16_t pad; + } s; +#endif +} wt_rwlock_t; diff --git a/src/third_party/wiredtiger/src/include/queue.h b/src/third_party/wiredtiger/src/include/queue.h new file mode 100644 index 00000000000..42e736e7b09 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/queue.h @@ -0,0 +1,559 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + * $FreeBSD: src/sys/sys/queue.h,v 1.54 2002/08/05 05:18:43 alfred Exp $ + */ + +#ifndef _DB_QUEUE_H_ +#define _DB_QUEUE_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * This file defines four types of data structures: singly-linked lists, + * singly-linked tail queues, lists and tail queues. + * + * A singly-linked list is headed by a single forward pointer. The elements + * are singly linked for minimum space and pointer manipulation overhead at + * the expense of O(n) removal for arbitrary elements. New elements can be + * added to the list after an existing element or at the head of the list. + * Elements being removed from the head of the list should use the explicit + * macro for this purpose for optimum efficiency. A singly-linked list may + * only be traversed in the forward direction. Singly-linked lists are ideal + * for applications with large datasets and few or no removals or for + * implementing a LIFO queue. + * + * A singly-linked tail queue is headed by a pair of pointers, one to the + * head of the list and the other to the tail of the list. The elements are + * singly linked for minimum space and pointer manipulation overhead at the + * expense of O(n) removal for arbitrary elements. New elements can be added + * to the list after an existing element, at the head of the list, or at the + * end of the list. Elements being removed from the head of the tail queue + * should use the explicit macro for this purpose for optimum efficiency. + * A singly-linked tail queue may only be traversed in the forward direction. + * Singly-linked tail queues are ideal for applications with large datasets + * and few or no removals or for implementing a FIFO queue. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. 
New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may only be traversed in the forward direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * For details on the use of these macros, see the queue(3) manual page. + * + * + * SLIST LIST STAILQ TAILQ + * _HEAD + + + + + * _HEAD_INITIALIZER + + + + + * _ENTRY + + + + + * _INIT + + + + + * _EMPTY + + + + + * _FIRST + + + + + * _NEXT + + + + + * _PREV - - - + + * _LAST - - + + + * _FOREACH + + + + + * _FOREACH_REVERSE - - - + + * _INSERT_HEAD + + + + + * _INSERT_BEFORE - + - + + * _INSERT_AFTER + + + + + * _INSERT_TAIL - - + + + * _CONCAT - - + + + * _REMOVE_HEAD + - + - + * _REMOVE + + + + + * + */ + +/* + * XXX + * We #undef all of the macros because there are incompatible versions of this + * file and these macros on various systems. What makes the problem worse is + * they are included and/or defined by system include files which we may have + * already loaded into Berkeley DB before getting here. For example, FreeBSD's + * <rpc/rpc.h> includes its system <sys/queue.h>, and VxWorks UnixLib.h defines + * several of the LIST_XXX macros. Visual C.NET 7.0 also defines some of these + * same macros in Vc7\PlatformSDK\Include\WinNT.h. Make sure we use ours. 
+ */ +#undef LIST_EMPTY +#undef LIST_ENTRY +#undef LIST_FIRST +#undef LIST_FOREACH +#undef LIST_HEAD +#undef LIST_HEAD_INITIALIZER +#undef LIST_INIT +#undef LIST_INSERT_AFTER +#undef LIST_INSERT_BEFORE +#undef LIST_INSERT_HEAD +#undef LIST_NEXT +#undef LIST_REMOVE +#undef QMD_TRACE_ELEM +#undef QMD_TRACE_HEAD +#undef QUEUE_MACRO_DEBUG +#undef SLIST_EMPTY +#undef SLIST_ENTRY +#undef SLIST_FIRST +#undef SLIST_FOREACH +#undef SLIST_FOREACH_PREVPTR +#undef SLIST_HEAD +#undef SLIST_HEAD_INITIALIZER +#undef SLIST_INIT +#undef SLIST_INSERT_AFTER +#undef SLIST_INSERT_HEAD +#undef SLIST_NEXT +#undef SLIST_REMOVE +#undef SLIST_REMOVE_HEAD +#undef STAILQ_CONCAT +#undef STAILQ_EMPTY +#undef STAILQ_ENTRY +#undef STAILQ_FIRST +#undef STAILQ_FOREACH +#undef STAILQ_HEAD +#undef STAILQ_HEAD_INITIALIZER +#undef STAILQ_INIT +#undef STAILQ_INSERT_AFTER +#undef STAILQ_INSERT_HEAD +#undef STAILQ_INSERT_TAIL +#undef STAILQ_LAST +#undef STAILQ_NEXT +#undef STAILQ_REMOVE +#undef STAILQ_REMOVE_HEAD +#undef STAILQ_REMOVE_HEAD_UNTIL +#undef TAILQ_CONCAT +#undef TAILQ_EMPTY +#undef TAILQ_ENTRY +#undef TAILQ_FIRST +#undef TAILQ_FOREACH +#undef TAILQ_FOREACH_REVERSE +#undef TAILQ_HEAD +#undef TAILQ_HEAD_INITIALIZER +#undef TAILQ_INIT +#undef TAILQ_INSERT_AFTER +#undef TAILQ_INSERT_BEFORE +#undef TAILQ_INSERT_HEAD +#undef TAILQ_INSERT_TAIL +#undef TAILQ_LAST +#undef TAILQ_NEXT +#undef TAILQ_PREV +#undef TAILQ_REMOVE +#undef TRACEBUF +#undef TRASHIT + +#define QUEUE_MACRO_DEBUG 0 +#if QUEUE_MACRO_DEBUG +/* Store the last 2 places the queue element or head was altered */ +struct qm_trace { + char * lastfile; + int lastline; + char * prevfile; + int prevline; +}; + +#define TRACEBUF struct qm_trace trace; +#define TRASHIT(x) do {(x) = (void *)-1;} while (0) + +#define QMD_TRACE_HEAD(head) do { \ + (head)->trace.prevline = (head)->trace.lastline; \ + (head)->trace.prevfile = (head)->trace.lastfile; \ + (head)->trace.lastline = __LINE__; \ + (head)->trace.lastfile = __FILE__; \ +} while (0) + +#define 
QMD_TRACE_ELEM(elem) do { \ + (elem)->trace.prevline = (elem)->trace.lastline; \ + (elem)->trace.prevfile = (elem)->trace.lastfile; \ + (elem)->trace.lastline = __LINE__; \ + (elem)->trace.lastfile = __FILE__; \ +} while (0) + +#else +#define QMD_TRACE_ELEM(elem) +#define QMD_TRACE_HEAD(head) +#define TRACEBUF +#define TRASHIT(x) +#endif /* QUEUE_MACRO_DEBUG */ + +/* + * Singly-linked List declarations. + */ +#define SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. + */ +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) + +#define SLIST_FIRST(head) ((head)->slh_first) + +#define SLIST_FOREACH(var, head, field) \ + for ((var) = SLIST_FIRST((head)); \ + (var); \ + (var) = SLIST_NEXT((var), field)) + +#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ + for ((varp) = &SLIST_FIRST((head)); \ + ((var) = *(varp)) != NULL; \ + (varp) = &SLIST_NEXT((var), field)) + +#define SLIST_INIT(head) do { \ + SLIST_FIRST((head)) = NULL; \ +} while (0) + +#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ + SLIST_NEXT((slistelm), field) = (elm); \ +} while (0) + +#define SLIST_INSERT_HEAD(head, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ + SLIST_FIRST((head)) = (elm); \ +} while (0) + +#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#define SLIST_REMOVE(head, elm, type, field) do { \ + if (SLIST_FIRST((head)) == (elm)) { \ + SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = SLIST_FIRST((head)); \ + while (SLIST_NEXT(curelm, field) != (elm)) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_NEXT(curelm, field) = \ + SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ + } \ +} while (0) + +#define SLIST_REMOVE_HEAD(head, field) 
do { \ + SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ +} while (0) + +/* + * Singly-linked Tail queue declarations. + */ +#define STAILQ_HEAD(name, type) \ +struct name { \ + struct type *stqh_first;/* first element */ \ + struct type **stqh_last;/* addr of last next element */ \ +} + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } + +#define STAILQ_ENTRY(type) \ +struct { \ + struct type *stqe_next; /* next element */ \ +} + +/* + * Singly-linked Tail queue functions. + */ +#define STAILQ_CONCAT(head1, head2) do { \ + if (!STAILQ_EMPTY((head2))) { \ + *(head1)->stqh_last = (head2)->stqh_first; \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_INIT((head2)); \ + } \ +} while (0) + +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) + +#define STAILQ_FIRST(head) ((head)->stqh_first) + +#define STAILQ_FOREACH(var, head, field) \ + for ((var) = STAILQ_FIRST((head)); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_INIT(head) do { \ + STAILQ_FIRST((head)) = NULL; \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT((tqelm), field) = (elm); \ +} while (0) + +#define STAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_FIRST((head)) = (elm); \ +} while (0) + +#define STAILQ_INSERT_TAIL(head, elm, field) do { \ + STAILQ_NEXT((elm), field) = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY((head)) ? 
\ + NULL : \ + ((struct type *) \ + ((char *)((head)->stqh_last) - __offsetof(struct type, field)))) + +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +#define STAILQ_REMOVE(head, elm, type, field) do { \ + if (STAILQ_FIRST((head)) == (elm)) { \ + STAILQ_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = STAILQ_FIRST((head)); \ + while (STAILQ_NEXT(curelm, field) != (elm)) \ + curelm = STAILQ_NEXT(curelm, field); \ + if ((STAILQ_NEXT(curelm, field) = \ + STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ + } \ +} while (0) + +#define STAILQ_REMOVE_HEAD(head, field) do { \ + if ((STAILQ_FIRST((head)) = \ + STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ + if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +/* + * List declarations. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. 
+ */ + +#define LIST_EMPTY(head) ((head)->lh_first == NULL) + +#define LIST_FIRST(head) ((head)->lh_first) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = LIST_FIRST((head)); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_INIT(head) do { \ + LIST_FIRST((head)) = NULL; \ +} while (0) + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ + LIST_NEXT((listelm), field)->field.le_prev = \ + &LIST_NEXT((elm), field); \ + LIST_NEXT((listelm), field) = (elm); \ + (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ +} while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + LIST_NEXT((elm), field) = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ +} while (0) + +#define LIST_INSERT_HEAD(head, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ + LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ + LIST_FIRST((head)) = (elm); \ + (elm)->field.le_prev = &LIST_FIRST((head)); \ +} while (0) + +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +#define LIST_REMOVE(elm, field) do { \ + if (LIST_NEXT((elm), field) != NULL) \ + LIST_NEXT((elm), field)->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = LIST_NEXT((elm), field); \ +} while (0) + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +/* + * Tail queue functions. 
+ */ +#define TAILQ_CONCAT(head1, head2, field) do { \ + if (!TAILQ_EMPTY(head2)) { \ + *(head1)->tqh_last = (head2)->tqh_first; \ + (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ + (head1)->tqh_last = (head2)->tqh_last; \ + TAILQ_INIT((head2)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_HEAD(head2); \ + } \ +} while (0) + +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_INIT(head) do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ +} while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else { \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + } \ + TAILQ_NEXT((listelm), field) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&listelm->field); \ +} while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&listelm->field); \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = 
(elm); \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + TAILQ_NEXT((elm), field) = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +#define TAILQ_REMOVE(head, elm, field) do { \ + if ((TAILQ_NEXT((elm), field)) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else { \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + QMD_TRACE_HEAD(head); \ + } \ + *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ + TRASHIT((elm)->field.tqe_next); \ + TRASHIT((elm)->field.tqe_prev); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#if defined(__cplusplus) +} +#endif +#endif /* !_DB_QUEUE_H_ */ diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h new file mode 100644 index 00000000000..e24a19b03ca --- /dev/null +++ b/src/third_party/wiredtiger/src/include/schema.h @@ -0,0 +1,101 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +/* Character constants for projection plans */ +#define WT_PROJ_KEY 'k' /* Go to key in cursor <arg> */ +#define WT_PROJ_NEXT 'n' /* Process the next item (<arg> repeats) */ +#define WT_PROJ_REUSE 'r' /* Reuse the previous item (<arg> repeats) */ +#define WT_PROJ_SKIP 's' /* Skip a column in the cursor (<arg> repeats) */ +#define WT_PROJ_VALUE 'v' /* Go to the value in cursor <arg> */ + +struct __wt_colgroup { + const char *name; /* Logical name */ + const char *source; /* Underlying data source */ + const char *config; /* Configuration string */ + + WT_CONFIG_ITEM colconf; /* List of columns from config */ +}; + +struct __wt_index { + const char *name; /* Logical name */ + const char *source; /* Underlying data source */ + const char *config; /* Configuration string */ + + WT_CONFIG_ITEM colconf; /* List of columns from config */ + + const char *idxkey_format; /* Index key format (hides primary) */ + const char *key_format; /* Key format */ + const char *key_plan; /* Key projection plan */ + const char *value_plan; /* Value projection plan */ +}; + +/* + * WT_TABLE -- + * Handle for a logical table. A table consists of one or more column + * groups, each of which holds some set of columns all sharing a primary + * key; and zero or more indices, each of which holds some set of columns + * in an index key that can be used to reconstruct the primary key. + */ +struct __wt_table { + const char *name, *config, *plan; + const char *key_format, *value_format; + + WT_CONFIG_ITEM cgconf, colconf; + + WT_COLGROUP **cgroups; + WT_INDEX **indices; + size_t idx_alloc; + + TAILQ_ENTRY(__wt_table) q; + + int cg_complete, idx_complete, is_simple; + u_int ncolgroups, nindices, nkey_columns; + + uint32_t refcnt; /* Number of open cursors */ + uint32_t schema_gen; /* Cached schema generation number */ +}; + +/* + * Tables without explicit column groups have a single default column group + * containing all of the columns. 
+ */ +#define WT_COLGROUPS(t) WT_MAX((t)->ncolgroups, 1) + +/* + * WT_WITH_SCHEMA_LOCK -- + * Acquire the schema lock, perform an operation, drop the lock. + */ +#define WT_WITH_SCHEMA_LOCK(session, op) do { \ + WT_ASSERT(session, \ + F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) || \ + !F_ISSET(session, WT_SESSION_NO_SCHEMA_LOCK)); \ + if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) { \ + (op); \ + } else { \ + __wt_spin_lock(session, &S2C(session)->schema_lock); \ + F_SET(session, WT_SESSION_SCHEMA_LOCKED); \ + (op); \ + __wt_spin_unlock(session, &S2C(session)->schema_lock); \ + F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \ + } \ +} while (0) + +/* + * WT_WITHOUT_SCHEMA_LOCK -- + * Drop the schema lock, perform an operation, re-acquire the lock. + */ +#define WT_WITHOUT_SCHEMA_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) { \ + __wt_spin_unlock(session, &S2C(session)->schema_lock); \ + F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \ + (op); \ + __wt_spin_lock(session, &S2C(session)->schema_lock); \ + F_SET(session, WT_SESSION_SCHEMA_LOCKED); \ + } else { \ + (op); \ + } \ +} while (0) diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i new file mode 100644 index 00000000000..70dc6b8764d --- /dev/null +++ b/src/third_party/wiredtiger/src/include/serial.i @@ -0,0 +1,329 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * __page_write_gen_wrapped_check -- + * Confirm the page's write generation number won't wrap. + */ +static inline int +__page_write_gen_wrapped_check(WT_PAGE *page) +{ + return (page->modify->write_gen > + UINT32_MAX - WT_MILLION ? WT_RESTART : 0); +} + +/* + * __insert_serial_func -- + * Worker function to add a WT_INSERT entry to a skiplist. 
+ */ +static inline int +__insert_serial_func(WT_SESSION_IMPL *session, + WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT *new_ins, + u_int skipdepth) +{ + u_int i; + + WT_UNUSED(session); + + /* + * Confirm we are still in the expected position, and no item has been + * added where our insert belongs. Take extra care at the beginning + * and end of the list (at each level): retry if we race there. + * + * !!! + * Note the test for ins_stack[0] == NULL: that's the test for an + * uninitialized cursor, ins_stack[0] is cleared as part of + * initializing a cursor for a search. + */ + for (i = 0; i < skipdepth; i++) { + if (ins_stack[i] == NULL || + *ins_stack[i] != new_ins->next[i]) + return (WT_RESTART); + if (new_ins->next[i] == NULL && + ins_head->tail[i] != NULL && + ins_stack[i] != &ins_head->tail[i]->next[i]) + return (WT_RESTART); + } + + /* Update the skiplist elements referencing the new WT_INSERT item. */ + for (i = 0; i < skipdepth; i++) { + if (ins_head->tail[i] == NULL || + ins_stack[i] == &ins_head->tail[i]->next[i]) + ins_head->tail[i] = new_ins; + *ins_stack[i] = new_ins; + } + + return (0); +} + +/* + * __col_append_serial_func -- + * Worker function to allocate a record number as necessary, then add a + * WT_INSERT entry to a skiplist. + */ +static inline int +__col_append_serial_func(WT_SESSION_IMPL *session, + WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT *new_ins, + uint64_t *recnop, u_int skipdepth) +{ + WT_BTREE *btree; + uint64_t recno; + u_int i; + + btree = S2BT(session); + + /* + * If the application didn't specify a record number, allocate a new one + * and set up for an append. + */ + if ((recno = WT_INSERT_RECNO(new_ins)) == 0) { + recno = WT_INSERT_RECNO(new_ins) = btree->last_recno + 1; + WT_ASSERT(session, WT_SKIP_LAST(ins_head) == NULL || + recno > WT_INSERT_RECNO(WT_SKIP_LAST(ins_head))); + for (i = 0; i < skipdepth; i++) + ins_stack[i] = ins_head->tail[i] == NULL ? 
+ &ins_head->head[i] : &ins_head->tail[i]->next[i]; + } + + /* Confirm position and insert the new WT_INSERT item. */ + WT_RET(__insert_serial_func( + session, ins_head, ins_stack, new_ins, skipdepth)); + + /* + * Set the calling cursor's record number. + * If we extended the file, update the last record number. + */ + *recnop = recno; + if (recno > btree->last_recno) + btree->last_recno = recno; + + return (0); +} + +/* + * __update_serial_func -- + * Worker function to add an WT_UPDATE entry in the page array. + */ +static inline int +__update_serial_func(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_UPDATE **upd_entry, WT_UPDATE *upd) +{ + WT_DECL_RET; + WT_UPDATE *obsolete; + WT_DECL_SPINLOCK_ID(id); /* Must appear last */ + + /* + * Swap the update into place. If that fails, a new update was added + * after our search, we raced. Check if our update is still permitted, + * and if it is, do a full-barrier to ensure the update's next pointer + * is set before we update the linked list and try again. + */ + while (!WT_ATOMIC_CAS8(*upd_entry, upd->next, upd)) { + WT_RET(__wt_txn_update_check(session, upd->next = *upd_entry)); + WT_WRITE_BARRIER(); + } + + /* + * If there are subsequent WT_UPDATE structures, we're evicting pages + * and the page-scanning mutex isn't held, discard obsolete WT_UPDATE + * structures. Serialization is needed so only one thread does the + * obsolete check at a time, and to protect updates from disappearing + * under reconciliation. + */ + if (upd->next != NULL && + F_ISSET(S2C(session)->cache, WT_EVICT_ACTIVE)) { + F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret); + /* If we can't lock it, don't scan, that's okay. */ + if (ret != 0) + return (0); + obsolete = __wt_update_obsolete_check(session, upd->next); + F_CLR_ATOMIC(page, WT_PAGE_SCANNING); + if (obsolete != NULL) + __wt_update_obsolete_free(session, page, obsolete); + } + return (0); +} + +/* + * DO NOT EDIT: automatically built by dist/serial.py. 
+ * Serialization function section: BEGIN + */ + +static inline int +__wt_col_append_serial( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *ins_head, + WT_INSERT ***ins_stack, WT_INSERT **new_insp, size_t new_ins_size, + uint64_t *recnop, u_int skipdepth) +{ + WT_INSERT *new_ins = *new_insp; + WT_DECL_RET; + size_t incr_mem; + + /* Clear references to memory we now own. */ + *new_insp = NULL; + + /* + * Check to see if the page's write generation is about to wrap (wildly + * unlikely as it implies 4B updates between clean page reconciliations, + * but technically possible), and fail the update. + * + * The check is outside of the serialization mutex because the page's + * write generation is going to be a hot cache line, so technically it's + * possible for the page's write generation to wrap between the test and + * our subsequent modification of it. However, the test is (4B-1M), and + * there cannot be a million threads that have done the test but not yet + * completed their modification. + */ + WT_RET(__page_write_gen_wrapped_check(page)); + + /* Acquire the page's spinlock, call the worker function. */ + WT_PAGE_LOCK(session, page); + ret = __col_append_serial_func( + session, ins_head, ins_stack, new_ins, recnop, skipdepth); + WT_PAGE_UNLOCK(session, page); + + /* Free unused memory on error. */ + if (ret != 0) { + __wt_free(session, new_ins); + + return (ret); + } + + /* + * Increment in-memory footprint after releasing the mutex: that's safe + * because the structures we added cannot be discarded while visible to + * any running transaction, and we're a running transaction, which means + * there can be no corresponding delete until we complete. + */ + incr_mem = 0; + WT_ASSERT(session, new_ins_size != 0); + incr_mem += new_ins_size; + if (incr_mem != 0) + __wt_cache_page_inmem_incr(session, page, incr_mem); + + /* Mark the page dirty after updating the footprint. 
*/ + __wt_page_modify_set(session, page); + + return (0); +} + +static inline int +__wt_insert_serial( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *ins_head, + WT_INSERT ***ins_stack, WT_INSERT **new_insp, size_t new_ins_size, + u_int skipdepth) +{ + WT_INSERT *new_ins = *new_insp; + WT_DECL_RET; + size_t incr_mem; + + /* Clear references to memory we now own. */ + *new_insp = NULL; + + /* + * Check to see if the page's write generation is about to wrap (wildly + * unlikely as it implies 4B updates between clean page reconciliations, + * but technically possible), and fail the update. + * + * The check is outside of the serialization mutex because the page's + * write generation is going to be a hot cache line, so technically it's + * possible for the page's write generation to wrap between the test and + * our subsequent modification of it. However, the test is (4B-1M), and + * there cannot be a million threads that have done the test but not yet + * completed their modification. + */ + WT_RET(__page_write_gen_wrapped_check(page)); + + /* Acquire the page's spinlock, call the worker function. */ + WT_PAGE_LOCK(session, page); + ret = __insert_serial_func( + session, ins_head, ins_stack, new_ins, skipdepth); + WT_PAGE_UNLOCK(session, page); + + /* Free unused memory on error. */ + if (ret != 0) { + __wt_free(session, new_ins); + + return (ret); + } + + /* + * Increment in-memory footprint after releasing the mutex: that's safe + * because the structures we added cannot be discarded while visible to + * any running transaction, and we're a running transaction, which means + * there can be no corresponding delete until we complete. + */ + incr_mem = 0; + WT_ASSERT(session, new_ins_size != 0); + incr_mem += new_ins_size; + if (incr_mem != 0) + __wt_cache_page_inmem_incr(session, page, incr_mem); + + /* Mark the page dirty after updating the footprint. 
*/ + __wt_page_modify_set(session, page); + + return (0); +} + +static inline int +__wt_update_serial( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE **srch_upd, + WT_UPDATE **updp, size_t upd_size) +{ + WT_UPDATE *upd = *updp; + WT_DECL_RET; + size_t incr_mem; + + /* Clear references to memory we now own. */ + *updp = NULL; + + /* + * Check to see if the page's write generation is about to wrap (wildly + * unlikely as it implies 4B updates between clean page reconciliations, + * but technically possible), and fail the update. + * + * The check is outside of the serialization mutex because the page's + * write generation is going to be a hot cache line, so technically it's + * possible for the page's write generation to wrap between the test and + * our subsequent modification of it. However, the test is (4B-1M), and + * there cannot be a million threads that have done the test but not yet + * completed their modification. + */ + WT_RET(__page_write_gen_wrapped_check(page)); + + ret = __update_serial_func( + session, page, srch_upd, upd); + + /* Free unused memory on error. */ + if (ret != 0) { + __wt_free(session, upd); + + return (ret); + } + + /* + * Increment in-memory footprint after releasing the mutex: that's safe + * because the structures we added cannot be discarded while visible to + * any running transaction, and we're a running transaction, which means + * there can be no corresponding delete until we complete. + */ + incr_mem = 0; + WT_ASSERT(session, upd_size != 0); + incr_mem += upd_size; + if (incr_mem != 0) + __wt_cache_page_inmem_incr(session, page, incr_mem); + + /* Mark the page dirty after updating the footprint. */ + __wt_page_modify_set(session, page); + + return (0); +} + +/* + * Serialization function section: END + * DO NOT EDIT: automatically built by dist/serial.py. 
+ */ diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h new file mode 100644 index 00000000000..788ffe5eb45 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/session.h @@ -0,0 +1,156 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * WT_DATA_HANDLE_CACHE -- + * Per-session cache of handles to avoid synchronization when opening + * cursors. + */ +struct __wt_data_handle_cache { + WT_DATA_HANDLE *dhandle; + + SLIST_ENTRY(__wt_data_handle_cache) l; +}; + +/* + * WT_HAZARD -- + * A hazard pointer. + */ +struct __wt_hazard { + WT_PAGE *page; /* Page address */ +#ifdef HAVE_DIAGNOSTIC + const char *file; /* File/line where hazard acquired */ + int line; +#endif +}; + +/* Get the connection implementation for a session */ +#define S2C(session) ((WT_CONNECTION_IMPL *)(session)->iface.connection) +#define S2C_SAFE(session) ((session) == NULL ? NULL : S2C(session)) + +/* Get the btree for a session */ +#define S2BT(session) ((WT_BTREE *)(session)->dhandle->handle) +#define S2BT_SAFE(session) ((session)->dhandle == NULL ? NULL : S2BT(session)) + +/* + * WT_SESSION_IMPL -- + * Implementation of WT_SESSION. 
+ */ +struct __wt_session_impl { + WT_SESSION iface; + + void *lang_private; /* Language specific private storage */ + + u_int active; /* Non-zero if the session is in-use */ + + const char *name; /* Name */ + const char *lastop; /* Last operation */ + uint32_t id; /* UID, offset in session array */ + + WT_CONDVAR *cond; /* Condition variable */ + + uint32_t rnd[2]; /* Random number generation state */ + + WT_EVENT_HANDLER *event_handler;/* Application's event handlers */ + + WT_DATA_HANDLE *dhandle; /* Current data handle */ + + /* Session handle reference list */ + SLIST_HEAD(__dhandles, __wt_data_handle_cache) dhandles; +#define WT_DHANDLE_SWEEP_WAIT 60 /* Wait before discarding */ +#define WT_DHANDLE_SWEEP_PERIOD 20 /* Only sweep every 20 seconds */ + time_t last_sweep; /* Last sweep for dead handles */ + + WT_CURSOR *cursor; /* Current cursor */ + /* Cursors closed with the session */ + TAILQ_HEAD(__cursors, __wt_cursor) cursors; + + WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */ + WT_COMPACT *compact; /* Compact state */ + + WT_BTREE *metafile; /* Metadata file */ + void *meta_track; /* Metadata operation tracking */ + void *meta_track_next; /* Current position */ + void *meta_track_sub; /* Child transaction / save point */ + size_t meta_track_alloc; /* Currently allocated */ + int meta_track_nest; /* Nesting level of meta transaction */ +#define WT_META_TRACKING(session) (session->meta_track_next != NULL) + + TAILQ_HEAD(__tables, __wt_table) tables; + + WT_ITEM **scratch; /* Temporary memory for any function */ + u_int scratch_alloc; /* Currently allocated */ +#ifdef HAVE_DIAGNOSTIC + /* + * It's hard to figure out from where a buffer was allocated after it's + * leaked, so in diagnostic mode we track them; DIAGNOSTIC can't simply + * add additional fields to WT_ITEM structures because they are visible + * to applications, create a parallel structure instead. 
+ */ + struct __wt_scratch_track { + const char *file; /* Allocating file, line */ + int line; + } *scratch_track; +#endif + + WT_TXN_ISOLATION isolation; + WT_TXN txn; /* Transaction state */ + u_int ncursors; /* Count of active file cursors. */ + + WT_REF **excl; /* Eviction exclusive list */ + u_int excl_next; /* Next empty slot */ + size_t excl_allocated; /* Bytes allocated */ + + void *block_manager; /* Block-manager support */ + int (*block_manager_cleanup)(WT_SESSION_IMPL *); + + WT_DATA_HANDLE **ckpt_handle; /* Checkpoint support */ + u_int ckpt_handle_next; /* Next empty slot */ + size_t ckpt_handle_allocated; /* Bytes allocated */ + + void *reconcile; /* Reconciliation support */ + int (*reconcile_cleanup)(WT_SESSION_IMPL *); + + int compaction; /* Compaction did some work */ + + /* + * The split stash memory and hazard information persist past session + * close, because they are accessed by threads of control other than + * the thread owning the session. They live at the end of the + * structure so it's somewhat easier to clear everything but the fields + * that persist. + */ +#define WT_SESSION_CLEAR_SIZE(s) \ + (WT_PTRDIFF(&(s)->flags, s) + sizeof((s)->flags)) + uint32_t flags; + + /* + * Splits can "free" memory that may still be in use, and we use a + * split generation number to track it, that is, the session stores a + * reference to the memory and allocates a split generation; when no + * session is reading from that split generation, the memory can be + * freed for real. + */ + struct __wt_split_stash { + uint64_t split_gen; /* Split generation */ + void *p; /* Memory, length */ + size_t len; + } *split_stash; /* Split stash array */ + size_t split_stash_cnt; /* Array entries */ + size_t split_stash_alloc; /* Allocated bytes */ + + uint64_t split_gen; /* Reading split generation */ + + /* + * Hazard pointers. + * The number of hazard pointers that can be in use grows dynamically. 
+ */ +#define WT_HAZARD_INCR 10 + uint32_t hazard_size; /* Allocated slots in hazard array. */ + uint32_t nhazard; /* Count of active hazard pointers */ + WT_HAZARD *hazard; /* Hazard pointer array */ +} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT))); diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h new file mode 100644 index 00000000000..11f42ac5500 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -0,0 +1,332 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +struct __wt_stats { + const char *desc; /* text description */ + uint64_t v; /* 64-bit value */ +}; + +/* + * Read/write statistics without any test for statistics configuration. + */ +#define WT_STAT(stats, fld) \ + ((stats)->fld.v) +#define WT_STAT_ATOMIC_DECRV(stats, fld, value) do { \ + (void)WT_ATOMIC_SUB8(WT_STAT(stats, fld), (value)); \ +} while (0) +#define WT_STAT_ATOMIC_DECR(stats, fld) WT_STAT_ATOMIC_DECRV(stats, fld, 1) +#define WT_STAT_ATOMIC_INCRV(stats, fld, value) do { \ + (void)WT_ATOMIC_ADD8(WT_STAT(stats, fld), (value)); \ +} while (0) +#define WT_STAT_ATOMIC_INCR(stats, fld) WT_ATOMIC_ADD(WT_STAT(stats, fld), 1) +#define WT_STAT_DECRV(stats, fld, value) do { \ + (stats)->fld.v -= (value); \ +} while (0) +#define WT_STAT_DECR(stats, fld) WT_STAT_DECRV(stats, fld, 1) +#define WT_STAT_INCRV(stats, fld, value) do { \ + (stats)->fld.v += (value); \ +} while (0) +#define WT_STAT_INCR(stats, fld) WT_STAT_INCRV(stats, fld, 1) +#define WT_STAT_SET(stats, fld, value) do { \ + (stats)->fld.v = (uint64_t)(value); \ +} while (0) + +/* + * Read/write statistics if "fast" statistics are configured. 
+ */ +#define WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, value) do { \ + if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \ + WT_STAT_ATOMIC_DECRV(stats, fld, value); \ +} while (0) +#define WT_STAT_FAST_ATOMIC_DECR(session, stats, fld) \ + WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, 1) +#define WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, value) do { \ + if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \ + WT_STAT_ATOMIC_INCRV(stats, fld, value); \ +} while (0) +#define WT_STAT_FAST_ATOMIC_INCR(session, stats, fld) \ + WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, 1) +#define WT_STAT_FAST_DECRV(session, stats, fld, value) do { \ + if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \ + WT_STAT_DECRV(stats, fld, value); \ +} while (0) +#define WT_STAT_FAST_DECR(session, stats, fld) \ + WT_STAT_FAST_DECRV(session, stats, fld, 1) +#define WT_STAT_FAST_INCRV(session, stats, fld, value) do { \ + if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \ + WT_STAT_INCRV(stats, fld, value); \ +} while (0) +#define WT_STAT_FAST_INCR(session, stats, fld) \ + WT_STAT_FAST_INCRV(session, stats, fld, 1) +#define WT_STAT_FAST_SET(session, stats, fld, value) do { \ + if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \ + WT_STAT_SET(stats, fld, value); \ +} while (0) + +/* + * Read/write connection handle statistics if "fast" statistics are configured. 
+ */ +#define WT_STAT_FAST_CONN_ATOMIC_DECRV(session, fld, value) \ + WT_STAT_FAST_ATOMIC_DECRV(session, &S2C(session)->stats, fld, value) +#define WT_STAT_FAST_CONN_ATOMIC_DECR(session, fld) \ + WT_STAT_FAST_ATOMIC_DECR(session, &S2C(session)->stats, fld) +#define WT_STAT_FAST_CONN_ATOMIC_INCRV(session, fld, value) \ + WT_STAT_FAST_ATOMIC_INCRV(session, &S2C(session)->stats, fld, value) +#define WT_STAT_FAST_CONN_ATOMIC_INCR(session, fld) \ + WT_STAT_FAST_ATOMIC_INCR(session, &S2C(session)->stats, fld) +#define WT_STAT_FAST_CONN_DECR(session, fld) \ + WT_STAT_FAST_DECR(session, &S2C(session)->stats, fld) +#define WT_STAT_FAST_CONN_DECRV(session, fld, value) \ + WT_STAT_FAST_DECRV(session, &S2C(session)->stats, fld, value) +#define WT_STAT_FAST_CONN_INCR(session, fld) \ + WT_STAT_FAST_INCR(session, &S2C(session)->stats, fld) +#define WT_STAT_FAST_CONN_INCRV(session, fld, value) \ + WT_STAT_FAST_INCRV(session, &S2C(session)->stats, fld, value) +#define WT_STAT_FAST_CONN_SET(session, fld, value) \ + WT_STAT_FAST_SET(session, &S2C(session)->stats, fld, value) + +/* + * Read/write data-source handle statistics if the data-source handle is set + * and "fast" statistics are configured. + * + * XXX + * We shouldn't have to check if the data-source handle is NULL, but it's + * useful until everything is converted to using data-source handles. 
+ */ +#define WT_STAT_FAST_DATA_DECRV(session, fld, value) do { \ + if ((session)->dhandle != NULL) \ + WT_STAT_FAST_DECRV( \ + session, &(session)->dhandle->stats, fld, value); \ +} while (0) +#define WT_STAT_FAST_DATA_DECR(session, fld) \ + WT_STAT_FAST_DATA_DECRV(session, fld, 1) +#define WT_STAT_FAST_DATA_INCRV(session, fld, value) do { \ + if ((session)->dhandle != NULL) \ + WT_STAT_FAST_INCRV( \ + session, &(session)->dhandle->stats, fld, value); \ +} while (0) +#define WT_STAT_FAST_DATA_INCR(session, fld) \ + WT_STAT_FAST_DATA_INCRV(session, fld, 1) +#define WT_STAT_FAST_DATA_SET(session, fld, value) do { \ + if ((session)->dhandle != NULL) \ + WT_STAT_FAST_SET( \ + session, &(session)->dhandle->stats, fld, value); \ +} while (0) + +/* + * DO NOT EDIT: automatically built by dist/stat.py. + */ +/* Statistics section: BEGIN */ + +/* + * Statistics entries for connections. + */ +#define WT_CONNECTION_STATS_BASE 1000 +struct __wt_connection_stats { + WT_STATS async_alloc_race; + WT_STATS async_alloc_view; + WT_STATS async_cur_queue; + WT_STATS async_flush; + WT_STATS async_full; + WT_STATS async_max_queue; + WT_STATS async_nowork; + WT_STATS async_op_alloc; + WT_STATS async_op_compact; + WT_STATS async_op_insert; + WT_STATS async_op_remove; + WT_STATS async_op_search; + WT_STATS async_op_update; + WT_STATS block_byte_map_read; + WT_STATS block_byte_read; + WT_STATS block_byte_write; + WT_STATS block_map_read; + WT_STATS block_preload; + WT_STATS block_read; + WT_STATS block_write; + WT_STATS cache_bytes_dirty; + WT_STATS cache_bytes_inuse; + WT_STATS cache_bytes_max; + WT_STATS cache_bytes_read; + WT_STATS cache_bytes_write; + WT_STATS cache_eviction_checkpoint; + WT_STATS cache_eviction_clean; + WT_STATS cache_eviction_deepen; + WT_STATS cache_eviction_dirty; + WT_STATS cache_eviction_fail; + WT_STATS cache_eviction_force; + WT_STATS cache_eviction_force_fail; + WT_STATS cache_eviction_hazard; + WT_STATS cache_eviction_internal; + WT_STATS 
cache_eviction_queue_empty; + WT_STATS cache_eviction_queue_not_empty; + WT_STATS cache_eviction_server_evicting; + WT_STATS cache_eviction_server_not_evicting; + WT_STATS cache_eviction_slow; + WT_STATS cache_eviction_split; + WT_STATS cache_eviction_walk; + WT_STATS cache_pages_dirty; + WT_STATS cache_pages_inuse; + WT_STATS cache_read; + WT_STATS cache_write; + WT_STATS cond_wait; + WT_STATS cursor_create; + WT_STATS cursor_insert; + WT_STATS cursor_next; + WT_STATS cursor_prev; + WT_STATS cursor_remove; + WT_STATS cursor_reset; + WT_STATS cursor_search; + WT_STATS cursor_search_near; + WT_STATS cursor_update; + WT_STATS dh_session_handles; + WT_STATS dh_session_sweeps; + WT_STATS file_open; + WT_STATS log_buffer_grow; + WT_STATS log_buffer_size; + WT_STATS log_bytes_user; + WT_STATS log_bytes_written; + WT_STATS log_close_yields; + WT_STATS log_max_filesize; + WT_STATS log_reads; + WT_STATS log_scan_records; + WT_STATS log_scan_rereads; + WT_STATS log_scans; + WT_STATS log_slot_closes; + WT_STATS log_slot_consolidated; + WT_STATS log_slot_joins; + WT_STATS log_slot_races; + WT_STATS log_slot_switch_fails; + WT_STATS log_slot_toobig; + WT_STATS log_slot_toosmall; + WT_STATS log_slot_transitions; + WT_STATS log_sync; + WT_STATS log_writes; + WT_STATS lsm_checkpoint_throttle; + WT_STATS lsm_merge_throttle; + WT_STATS lsm_rows_merged; + WT_STATS lsm_work_queue_app; + WT_STATS lsm_work_queue_manager; + WT_STATS lsm_work_queue_max; + WT_STATS lsm_work_queue_switch; + WT_STATS lsm_work_units_created; + WT_STATS lsm_work_units_discarded; + WT_STATS lsm_work_units_done; + WT_STATS memory_allocation; + WT_STATS memory_free; + WT_STATS memory_grow; + WT_STATS read_io; + WT_STATS rec_pages; + WT_STATS rec_pages_eviction; + WT_STATS rec_split_stashed_bytes; + WT_STATS rec_split_stashed_objects; + WT_STATS rwlock_read; + WT_STATS rwlock_write; + WT_STATS session_cursor_open; + WT_STATS session_open; + WT_STATS txn_begin; + WT_STATS txn_checkpoint; + WT_STATS 
txn_checkpoint_running; + WT_STATS txn_commit; + WT_STATS txn_fail_cache; + WT_STATS txn_pinned_range; + WT_STATS txn_rollback; + WT_STATS write_io; +}; + +/* + * Statistics entries for data sources. + */ +#define WT_DSRC_STATS_BASE 2000 +struct __wt_dsrc_stats { + WT_STATS allocation_size; + WT_STATS block_alloc; + WT_STATS block_checkpoint_size; + WT_STATS block_extension; + WT_STATS block_free; + WT_STATS block_magic; + WT_STATS block_major; + WT_STATS block_minor; + WT_STATS block_reuse_bytes; + WT_STATS block_size; + WT_STATS bloom_count; + WT_STATS bloom_false_positive; + WT_STATS bloom_hit; + WT_STATS bloom_miss; + WT_STATS bloom_page_evict; + WT_STATS bloom_page_read; + WT_STATS bloom_size; + WT_STATS btree_column_deleted; + WT_STATS btree_column_fix; + WT_STATS btree_column_internal; + WT_STATS btree_column_variable; + WT_STATS btree_compact_rewrite; + WT_STATS btree_entries; + WT_STATS btree_fixed_len; + WT_STATS btree_maximum_depth; + WT_STATS btree_maxintlitem; + WT_STATS btree_maxintlpage; + WT_STATS btree_maxleafitem; + WT_STATS btree_maxleafpage; + WT_STATS btree_overflow; + WT_STATS btree_row_internal; + WT_STATS btree_row_leaf; + WT_STATS cache_bytes_read; + WT_STATS cache_bytes_write; + WT_STATS cache_eviction_checkpoint; + WT_STATS cache_eviction_clean; + WT_STATS cache_eviction_dirty; + WT_STATS cache_eviction_fail; + WT_STATS cache_eviction_hazard; + WT_STATS cache_eviction_internal; + WT_STATS cache_overflow_value; + WT_STATS cache_read; + WT_STATS cache_read_overflow; + WT_STATS cache_write; + WT_STATS compress_raw_fail; + WT_STATS compress_raw_fail_temporary; + WT_STATS compress_raw_ok; + WT_STATS compress_read; + WT_STATS compress_write; + WT_STATS compress_write_fail; + WT_STATS compress_write_too_small; + WT_STATS cursor_create; + WT_STATS cursor_insert; + WT_STATS cursor_insert_bulk; + WT_STATS cursor_insert_bytes; + WT_STATS cursor_next; + WT_STATS cursor_prev; + WT_STATS cursor_remove; + WT_STATS cursor_remove_bytes; + WT_STATS 
cursor_reset; + WT_STATS cursor_search; + WT_STATS cursor_search_near; + WT_STATS cursor_update; + WT_STATS cursor_update_bytes; + WT_STATS lsm_checkpoint_throttle; + WT_STATS lsm_chunk_count; + WT_STATS lsm_generation_max; + WT_STATS lsm_lookup_no_bloom; + WT_STATS lsm_merge_throttle; + WT_STATS rec_dictionary; + WT_STATS rec_multiblock_internal; + WT_STATS rec_multiblock_leaf; + WT_STATS rec_multiblock_max; + WT_STATS rec_overflow_key_internal; + WT_STATS rec_overflow_key_leaf; + WT_STATS rec_overflow_value; + WT_STATS rec_page_delete; + WT_STATS rec_page_match; + WT_STATS rec_pages; + WT_STATS rec_pages_eviction; + WT_STATS rec_prefix_compression; + WT_STATS rec_suffix_compression; + WT_STATS session_compact; + WT_STATS session_cursor_open; + WT_STATS txn_update_conflict; +}; + +/* Statistics section: END */ diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h new file mode 100644 index 00000000000..c28a9231750 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -0,0 +1,139 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#define WT_TXN_NONE 0 /* No txn running in a session. */ +#define WT_TXN_ABORTED UINT64_MAX /* Update rolled back, ignore. */ + +/* + * Transaction ID comparison dealing with edge cases. + * + * WT_TXN_ABORTED is the largest possible ID (never visible to a running + * transaction), WT_TXN_NONE is smaller than any possible ID (visible to all + * running transactions). + */ +#define TXNID_LE(t1, t2) \ + ((t1) <= (t2)) + +#define TXNID_LT(t1, t2) \ + ((t1) != (t2) && TXNID_LE(t1, t2)) + +#define WT_SESSION_TXN_STATE(s) (&S2C(s)->txn_global.states[(s)->id]) + +struct __wt_txn_state { + volatile uint64_t id; + volatile uint64_t snap_min; +} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT))); + +struct __wt_txn_global { + volatile uint64_t current; /* Current transaction ID. 
*/ + + /* The oldest running transaction ID (may race). */ + uint64_t last_running; + + /* + * The oldest transaction ID that is not yet visible to some + * transaction in the system. + */ + volatile uint64_t oldest_id; + + /* The oldest session found in the last scan. */ + uint32_t oldest_session; + + /* Count of scanning threads, or -1 for exclusive access. */ + volatile int32_t scan_count; + + WT_TXN_STATE *states; /* Per-session transaction states */ +}; + +typedef enum __wt_txn_isolation { + TXN_ISO_EVICTION, /* Internal: eviction context */ + TXN_ISO_READ_UNCOMMITTED, + TXN_ISO_READ_COMMITTED, + TXN_ISO_SNAPSHOT +} WT_TXN_ISOLATION; + +/* + * WT_TXN_OP -- + * A transactional operation. Each transaction builds an in-memory array + * of these operations as it runs, then uses the array to either write log + * records during commit or undo the operations during rollback. + */ +struct __wt_txn_op { + uint32_t fileid; + enum { + TXN_OP_BASIC, + TXN_OP_INMEM, + TXN_OP_REF, + TXN_OP_TRUNCATE_COL, + TXN_OP_TRUNCATE_ROW + } type; + union { + /* TXN_OP_BASIC, TXN_OP_INMEM */ + WT_UPDATE *upd; + /* TXN_OP_REF */ + WT_REF *ref; + /* TXN_OP_TRUNCATE_COL */ + struct { + uint64_t start, stop; + } truncate_col; + /* TXN_OP_TRUNCATE_ROW */ + struct { + WT_ITEM start, stop; + enum { + TXN_TRUNC_ALL, + TXN_TRUNC_BOTH, + TXN_TRUNC_START, + TXN_TRUNC_STOP + } mode; + } truncate_row; + } u; +}; + +/* + * WT_TXN -- + * Per-session transaction context. + */ +struct __wt_txn { + uint64_t id; + + WT_TXN_ISOLATION isolation; + + /* + * Snapshot data: + * ids < snap_min are visible, + * ids > snap_max are invisible, + * everything else is visible unless it is in the snapshot. + */ + uint64_t snap_min, snap_max; + uint64_t *snapshot; + uint32_t snapshot_count; + uint32_t txn_logsync; /* Log sync configuration */ + + /* Array of modifications by this transaction. */ + WT_TXN_OP *mod; + size_t mod_alloc; + u_int mod_count; + + /* Scratch buffer for in-memory log records. 
*/ + WT_ITEM *logrec; + + /* Requested notification when transactions are resolved. */ + WT_TXN_NOTIFY *notify; + + /* Checkpoint status. */ + WT_LSN ckpt_lsn; + int full_ckpt; + uint32_t ckpt_nsnapshot; + WT_ITEM *ckpt_snapshot; + +#define TXN_AUTOCOMMIT 0x01 +#define TXN_ERROR 0x02 +#define TXN_HAS_ID 0x04 +#define TXN_HAS_SNAPSHOT 0x08 +#define TXN_RUNNING 0x10 + uint32_t flags; +}; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i new file mode 100644 index 00000000000..127176c67ea --- /dev/null +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -0,0 +1,382 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +static inline int __wt_txn_id_check(WT_SESSION_IMPL *session); +static inline void __wt_txn_read_last(WT_SESSION_IMPL *session); + +/* + * __txn_next_op -- + * Mark a WT_UPDATE object modified by the current transaction. + */ +static inline int +__txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp) +{ + WT_TXN *txn; + + txn = &session->txn; + *opp = NULL; + + /* + * We're about to perform an update. + * Make sure we have allocated a transaction ID. + */ + WT_RET(__wt_txn_id_check(session)); + WT_ASSERT(session, F_ISSET(txn, TXN_HAS_ID)); + + WT_RET(__wt_realloc_def(session, &txn->mod_alloc, + txn->mod_count + 1, &txn->mod)); + + *opp = &txn->mod[txn->mod_count++]; + WT_CLEAR(**opp); + (*opp)->fileid = S2BT(session)->id; + return (0); +} + +/* + * __wt_txn_unmodify -- + * If threads race making updates, they may discard the last referenced + * WT_UPDATE item while the transaction is still active. This function + * removes the last update item from the "log". 
+ */ +static inline void +__wt_txn_unmodify(WT_SESSION_IMPL *session) +{ + WT_TXN *txn; + + txn = &session->txn; + if (F_ISSET(txn, TXN_HAS_ID)) { + WT_ASSERT(session, txn->mod_count > 0); + txn->mod_count--; + } +} + +/* + * __wt_txn_modify -- + * Mark a WT_UPDATE object modified by the current transaction. + */ +static inline int +__wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd) +{ + WT_DECL_RET; + WT_TXN_OP *op; + + WT_RET(__txn_next_op(session, &op)); + op->type = F_ISSET(session, WT_SESSION_LOGGING_INMEM) ? + TXN_OP_INMEM : TXN_OP_BASIC; + op->u.upd = upd; + upd->txnid = session->txn.id; + return (ret); +} + +/* + * __wt_txn_modify_ref -- + * Remember a WT_REF object modified by the current transaction. + */ +static inline int +__wt_txn_modify_ref(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_TXN_OP *op; + + WT_RET(__txn_next_op(session, &op)); + op->type = TXN_OP_REF; + op->u.ref = ref; + return (__wt_txn_log_op(session, NULL)); +} + +/* + * __wt_txn_visible_all -- + * Check if a given transaction ID is "globally visible". This is, if + * all sessions in the system will see the transaction ID. + */ +static inline int +__wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id) +{ + uint64_t oldest_id; + + oldest_id = S2C(session)->txn_global.oldest_id; + return (TXNID_LT(id, oldest_id)); +} + +/* + * __wt_txn_visible -- + * Can the current transaction see the given ID? + */ +static inline int +__wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) +{ + WT_TXN *txn; + + txn = &session->txn; + + /* + * Eviction only sees globally visible updates, or if there is a + * checkpoint transaction running, use its transaction. + */ + if (txn->isolation == TXN_ISO_EVICTION) + return (__wt_txn_visible_all(session, id)); + + /* Nobody sees the results of aborted transactions. */ + if (id == WT_TXN_ABORTED) + return (0); + + /* Changes with no associated transaction are always visible. 
*/ + if (id == WT_TXN_NONE) + return (1); + + /* + * Read-uncommitted transactions see all other changes. + * + * All metadata reads are at read-uncommitted isolation. That's + * because once a schema-level operation completes, subsequent + * operations must see the current version of checkpoint metadata, or + * they may try to read blocks that may have been freed from a file. + * Metadata updates use non-transactional techniques (such as the + * schema and metadata locks) to protect access to in-flight updates. + */ + if (txn->isolation == TXN_ISO_READ_UNCOMMITTED || + S2BT_SAFE(session) == session->metafile) + return (1); + + /* Transactions see their own changes. */ + if (id == txn->id) + return (1); + + /* + * TXN_ISO_SNAPSHOT, TXN_ISO_READ_COMMITTED: the ID is visible if it is + * not the result of a concurrent transaction, that is, if was + * committed before the snapshot was taken. + * + * The order here is important: anything newer than the maximum ID we + * saw when taking the snapshot should be invisible, even if the + * snapshot is empty. + */ + if (TXNID_LE(txn->snap_max, id)) + return (0); + if (txn->snapshot_count == 0 || TXNID_LT(id, txn->snap_min)) + return (1); + + return (bsearch(&id, txn->snapshot, txn->snapshot_count, + sizeof(uint64_t), __wt_txnid_cmp) == NULL); +} + +/* + * __wt_txn_read -- + * Get the first visible update in a list (or NULL if none are visible). + */ +static inline WT_UPDATE * +__wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd) +{ + while (upd != NULL && !__wt_txn_visible(session, upd->txnid)) + upd = upd->next; + + return (upd); +} + +/* + * __wt_txn_autocommit_check -- + * If an auto-commit transaction is required, start one. 
+*/ +static inline int +__wt_txn_autocommit_check(WT_SESSION_IMPL *session) +{ + WT_TXN *txn; + + txn = &session->txn; + if (F_ISSET(txn, TXN_AUTOCOMMIT)) { + F_CLR(txn, TXN_AUTOCOMMIT); + return (__wt_txn_begin(session, NULL)); + } + return (0); +} + +/* + * __wt_txn_new_id -- + * Allocate a new transaction ID. + */ +static inline uint64_t +__wt_txn_new_id(WT_SESSION_IMPL *session) +{ + /* + * We want the global value to lead the allocated values, so that any + * allocated transaction ID eventually becomes globally visible. When + * there are no transactions running, the oldest_id will reach the + * global current ID, so we want post-increment semantics. Our atomic + * add primitive does pre-increment, so adjust the result here. + */ + return (WT_ATOMIC_ADD8(S2C(session)->txn_global.current, 1) - 1); +} + +/* + * __wt_txn_id_check -- + * A transaction is going to do an update, start an auto commit + * transaction if required and allocate a transaction ID. + */ +static inline int +__wt_txn_id_check(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *txn_state; + + txn = &session->txn; + + WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING)); + if (!F_ISSET(txn, TXN_HAS_ID)) { + conn = S2C(session); + txn_global = &conn->txn_global; + txn_state = &txn_global->states[session->id]; + + WT_ASSERT(session, txn_state->id == WT_TXN_NONE); + + /* + * Allocate a transaction ID. + * + * We use an atomic compare and swap to ensure that we get a + * unique ID that is published before the global counter is + * updated. + * + * If two threads race to allocate an ID, only the latest ID + * will proceed. The winning thread can be sure its snapshot + * contains all of the earlier active IDs. Threads that race + * and get an earlier ID may not appear in the snapshot, but + * they will loop and allocate a new ID before proceeding to + * make any updates. 
+ * + * This potentially wastes transaction IDs when threads race to + * begin transactions: that is the price we pay to keep this + * path latch free. + */ + do { + txn_state->id = txn->id = txn_global->current; + } while (!WT_ATOMIC_CAS8( + txn_global->current, txn->id, txn->id + 1)); + + /* + * If we have used 64-bits of transaction IDs, there is nothing + * more we can do. + */ + if (txn->id == WT_TXN_ABORTED) + WT_RET_MSG(session, ENOMEM, "Out of transaction IDs"); + F_SET(txn, TXN_HAS_ID); + } + + return (0); +} + +/* + * __wt_txn_update_check -- + * Check if the current transaction can update an item. + */ +static inline int +__wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd) +{ + WT_TXN *txn; + + txn = &session->txn; + if (txn->isolation == TXN_ISO_SNAPSHOT) + while (upd != NULL && !__wt_txn_visible(session, upd->txnid)) { + if (upd->txnid != WT_TXN_ABORTED) { + WT_STAT_FAST_DATA_INCR( + session, txn_update_conflict); + return (WT_ROLLBACK); + } + upd = upd->next; + } + + return (0); +} + +/* + * __wt_txn_read_last -- + * Called when the last page for a session is released. + */ +static inline void +__wt_txn_read_last(WT_SESSION_IMPL *session) +{ + WT_TXN *txn; + + txn = &session->txn; + + /* Release the snap_min ID we put in the global table. */ + if (!F_ISSET(txn, TXN_RUNNING) || + txn->isolation != TXN_ISO_SNAPSHOT) + __wt_txn_release_snapshot(session); +} + +/* + * __wt_txn_cursor_op -- + * Called for each cursor operation. + */ +static inline void +__wt_txn_cursor_op(WT_SESSION_IMPL *session) +{ + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *txn_state; + + txn = &session->txn; + txn_global = &S2C(session)->txn_global; + txn_state = &txn_global->states[session->id]; + + /* + * If there is no transaction running (so we don't have an ID), and no + * snapshot allocated, put an ID in the global table to prevent any + * update that we are reading from being trimmed to save memory. 
Do a + * read before the write because this shared data is accessed a lot. + * + * !!! + * Note: We are updating the global table unprotected, so the + * oldest_id may move past this ID if a scan races with this + * value being published. That said, read-uncommitted operations + * always take the most recent version of a value, so for that version + * to be freed, two newer versions would have to be committed. Putting + * this snap_min ID in the table prevents the oldest ID from moving + * further forward, so that once a read-uncommitted cursor is + * positioned on a value, it can't be freed. + */ + if (txn->isolation == TXN_ISO_READ_UNCOMMITTED && + !F_ISSET(txn, TXN_HAS_ID) && + TXNID_LT(txn_state->snap_min, txn_global->last_running)) + txn_state->snap_min = txn_global->last_running; + + if (txn->isolation != TXN_ISO_READ_UNCOMMITTED && + !F_ISSET(txn, TXN_HAS_SNAPSHOT)) + __wt_txn_refresh(session, 1); +} + +/* + * __wt_txn_am_oldest -- + * Am I the oldest transaction in the system? + */ +static inline int +__wt_txn_am_oldest(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *s; + uint64_t id; + uint32_t i, session_cnt; + + conn = S2C(session); + txn = &session->txn; + txn_global = &conn->txn_global; + + if (txn->id == WT_TXN_NONE) + return (0); + + WT_ORDERED_READ(session_cnt, conn->session_cnt); + for (i = 0, s = txn_global->states; + i < session_cnt; + i++, s++) + if ((id = s->id) != WT_TXN_NONE && + TXNID_LT(id, txn->id)) + return (0); + + return (1); +} diff --git a/src/third_party/wiredtiger/src/include/verify_build.h b/src/third_party/wiredtiger/src/include/verify_build.h new file mode 100644 index 00000000000..5f05db11c4b --- /dev/null +++ b/src/third_party/wiredtiger/src/include/verify_build.h @@ -0,0 +1,75 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#undef ALIGN_CHECK +#undef SIZE_CHECK + +/* + * NOTE: If you see a compile failure in this file, your compiler is laying out + * structs in memory in a way WiredTiger does not expect. Please refer to the + * build instructions in the documentation (docs/html/install.html) for more + * information. + */ + +/* + * Compile time assertions. + * + * If the argument to WT_STATIC_ASSERT is zero, the macro evaluates to: + * + * (void)sizeof(char[-1]) + * + * which fails to compile (which is what we want, the assertion failed). + * If the value of the argument to WT_STATIC_ASSERT is non-zero, then the + * macro evaluates to: + * + * (void)sizeof(char[1]); + * + * which compiles with no warnings, and produces no code. + * + * For more details about why this works, see + * http://scaryreasoner.wordpress.com/2009/02/28/ + */ +#define WT_STATIC_ASSERT(cond) (void)sizeof(char[1 - 2 * !(cond)]) + +#define SIZE_CHECK(type, e) do { \ + char __check_##type[1 - 2 * !(sizeof(type) == (e))]; \ + (void)__check_##type; \ +} while (0) + +#define ALIGN_CHECK(type, a) \ + WT_STATIC_ASSERT(WT_ALIGN(sizeof(type), (a)) == sizeof(type)) + +/* + * __wt_verify_build -- + * This function is never called: it exists so there is a place for code + * that checks build-time conditions. + */ +static inline void +__wt_verify_build(void) +{ + /* Check specific structures weren't padded. */ + SIZE_CHECK(WT_BLOCK_DESC, WT_BLOCK_DESC_SIZE); + SIZE_CHECK(WT_REF, WT_REF_SIZE); + + /* + * The btree code encodes key/value pairs in size_t's, and requires at + * least 8B size_t's. + */ + WT_STATIC_ASSERT(sizeof(size_t) >= 8); + + /* + * We require a wt_off_t fit into an 8B chunk because 8B is the largest + * integral value we can encode into an address cookie. + * + * WiredTiger has never been tested on a system with 4B file offsets, + * disallow them for now. 
+ */ + WT_STATIC_ASSERT(sizeof(wt_off_t) == 8); +} + +#undef ALIGN_CHECK +#undef SIZE_CHECK diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in new file mode 100644 index 00000000000..09cbca89f17 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -0,0 +1,3463 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#ifndef __WIREDTIGER_H_ +#define __WIREDTIGER_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/******************************************* + * Version information + *******************************************/ +#define WIREDTIGER_VERSION_MAJOR @VERSION_MAJOR@ +#define WIREDTIGER_VERSION_MINOR @VERSION_MINOR@ +#define WIREDTIGER_VERSION_PATCH @VERSION_PATCH@ +#define WIREDTIGER_VERSION_STRING @VERSION_STRING@ + +/******************************************* + * Required includes + *******************************************/ +@wiredtiger_includes_decl@ + +/******************************************* + * Portable type names + *******************************************/ +@off_t_decl@ +@uintmax_t_decl@ +@uintptr_t_decl@ + +#if defined(DOXYGEN) || defined(SWIG) +#define __F(func) func +#else +#define __F(func) (*func) +#endif + +#ifdef SWIG +%{ +#include <wiredtiger.h> +%} +#endif + +/*! + * @defgroup wt WiredTiger API + * The functions, handles and methods applications use to access and manage + * data with WiredTiger. 
+ * + * @{ + */ + +/******************************************* + * Public forward structure declarations + *******************************************/ +struct __wt_async_callback; + typedef struct __wt_async_callback WT_ASYNC_CALLBACK; +struct __wt_async_op; typedef struct __wt_async_op WT_ASYNC_OP; +struct __wt_collator; typedef struct __wt_collator WT_COLLATOR; +struct __wt_compressor; typedef struct __wt_compressor WT_COMPRESSOR; +struct __wt_config_item; typedef struct __wt_config_item WT_CONFIG_ITEM; +struct __wt_config_parser; + typedef struct __wt_config_parser WT_CONFIG_PARSER; +struct __wt_connection; typedef struct __wt_connection WT_CONNECTION; +struct __wt_cursor; typedef struct __wt_cursor WT_CURSOR; +struct __wt_data_source; typedef struct __wt_data_source WT_DATA_SOURCE; +struct __wt_event_handler; typedef struct __wt_event_handler WT_EVENT_HANDLER; +struct __wt_extension_api; typedef struct __wt_extension_api WT_EXTENSION_API; +struct __wt_extractor; typedef struct __wt_extractor WT_EXTRACTOR; +struct __wt_item; typedef struct __wt_item WT_ITEM; +struct __wt_lsn; typedef struct __wt_lsn WT_LSN; +struct __wt_session; typedef struct __wt_session WT_SESSION; + +#if defined(SWIGJAVA) +#define WT_HANDLE_NULLABLE(typename) typename##_NULLABLE +#define WT_HANDLE_CLOSED(typename) typename##_CLOSED +typedef WT_CURSOR WT_CURSOR_NULLABLE; +typedef WT_CURSOR WT_CURSOR_CLOSED; +typedef WT_SESSION WT_SESSION_CLOSED; +typedef WT_CONNECTION WT_CONNECTION_CLOSED; +#elif !defined(DOXYGEN) +#define WT_HANDLE_NULLABLE(typename) typename +#define WT_HANDLE_CLOSED(typename) typename +#endif + +/*! + * A raw item of data to be managed, including a pointer to the data and a + * length. + * + * WT_ITEM structures do not need to be cleared before use. + */ +struct __wt_item { + /*! + * The memory reference of the data item. + * + * For items returned by a WT_CURSOR, the pointer is only valid until + * the next operation on that cursor. 
Applications that need to keep + * an item across multiple cursor operations must make a copy. + */ + const void *data; + + /*! + * The number of bytes in the data item. + * + * The maximum length of a single column stored in a table is not fixed + * (as it partially depends on the underlying file configuration), but + * is always a small number of bytes less than 4GB. + */ + size_t size; + +#ifndef DOXYGEN +#define WT_ITEM_ALIGNED 0x00000001 +#define WT_ITEM_INUSE 0x00000002 + /* This appears in the middle of the struct to avoid padding. */ + /*! Object flags (internal use). */ + uint32_t flags; + + /*! Managed memory chunk (internal use). */ + void *mem; + /*! Managed memory size (internal use). */ + size_t memsize; +#endif +}; + +/* + * We rely on this structure being aligned at 64 bits by the compiler, + * if we were paranoid we could add an unused field to ensure the padding + * is correct. + * + * NOTE: If you change the contents of this structure you must also update + * the macros in log.h. + */ +/*! + * A log sequence number, representing a position in the transaction log. + */ +struct __wt_lsn { + uint32_t file; /*!< Log file number */ + wt_off_t offset; /*!< Log file offset */ +}; + +/*! + * The maximum packed size of a 64-bit integer. The ::wiredtiger_struct_pack + * function will pack single long integers into at most this many bytes. + */ +#define WT_INTPACK64_MAXSIZE ((int)sizeof (int64_t) + 1) + +/*! + * The maximum packed size of a 32-bit integer. The ::wiredtiger_struct_pack + * function will pack single integers into at most this many bytes. + */ +#define WT_INTPACK32_MAXSIZE ((int)sizeof (int32_t) + 1) + +/*! + * A WT_CURSOR handle is the interface to a cursor. + * + * Cursors allow data to be searched, iterated and modified, implementing the + * CRUD (create, read, update and delete) operations. Cursors are opened in + * the context of a session. 
If a transaction is started, cursors operate in + * the context of the transaction until the transaction is resolved. + * + * Raw data is represented by key/value pairs of WT_ITEM structures, but + * cursors can also provide access to fields within the key and value if the + * formats are described in the WT_SESSION::create method. + * + * In the common case, a cursor is used to access records in a table. However, + * cursors can be used on subsets of tables (such as a single column or a + * projection of multiple columns), as an interface to statistics, configuration + * data or application-specific data sources. See WT_SESSION::open_cursor for + * more information. + * + * <b>Thread safety:</b> A WT_CURSOR handle is not usually shared between + * threads, see @ref threads for more information. + */ +struct __wt_cursor { + WT_SESSION *session; /*!< The session handle for this cursor. */ + + /*! + * The name of the data source for the cursor, matches the \c uri + * parameter to WT_SESSION::open_cursor used to open the cursor. + */ + const char *uri; + + /*! + * The format of the data packed into key items. See @ref packing for + * details. If not set, a default value of "u" is assumed, and + * applications must use WT_ITEM structures to manipulate untyped byte + * arrays. + */ + const char *key_format; + + /*! + * The format of the data packed into value items. See @ref packing + * for details. If not set, a default value of "u" is assumed, and + * applications must use WT_ITEM structures to manipulate untyped byte + * arrays. + */ + const char *value_format; + + /*! + * @name Data access + * @{ + */ + /*! + * Get the key for the current record. + * + * @snippet ex_all.c Get the cursor's string key + * + * @snippet ex_all.c Get the cursor's record number key + * + * @param cursor the cursor handle + * @param ... pointers to hold key fields corresponding to + * WT_CURSOR::key_format. + * @errors + */ + int __F(get_key)(WT_CURSOR *cursor, ...); + + /*! 
+ * Get the value for the current record. + * + * @snippet ex_all.c Get the cursor's string value + * + * @snippet ex_all.c Get the cursor's raw value + * + * @param cursor the cursor handle + * @param ... pointers to hold value fields corresponding to + * WT_CURSOR::value_format. + * @errors + */ + int __F(get_value)(WT_CURSOR *cursor, ...); + + /*! + * Set the key for the next operation. + * + * @snippet ex_all.c Set the cursor's string key + * + * @snippet ex_all.c Set the cursor's record number key + * + * @param cursor the cursor handle + * @param ... key fields corresponding to WT_CURSOR::key_format. + * + * If an error occurs during this operation, a flag will be set in the + * cursor, and the next operation to access the key will fail. This + * simplifies error handling in applications. + */ + void __F(set_key)(WT_CURSOR *cursor, ...); + + /*! + * Set the value for the next operation. + * + * @snippet ex_all.c Set the cursor's string value + * + * @snippet ex_all.c Set the cursor's raw value + * + * @param cursor the cursor handle + * @param ... value fields corresponding to WT_CURSOR::value_format. + * + * If an error occurs during this operation, a flag will be set in the + * cursor, and the next operation to access the value will fail. This + * simplifies error handling in applications. + */ + void __F(set_value)(WT_CURSOR *cursor, ...); + /*! @} */ + + /*! + * @name Cursor positioning + * @{ + */ + /*! + * Return the ordering relationship between two cursors: both cursors + * must have the same data source and have valid keys. + * + * @snippet ex_all.c Cursor comparison + * + * @param cursor the cursor handle + * @param other another cursor handle + * @param comparep the status of the comparison: < 0 if + * <code>cursor</code> refers to a key that appears before + * <code>other</code>, 0 if the cursors refer to the same key, + * and > 0 if <code>cursor</code> refers to a key that appears after + * <code>other</code>. 
+ * @errors + */ + int __F(compare)(WT_CURSOR *cursor, WT_CURSOR *other, int *comparep); + + /*! + * Return the next record. + * + * @snippet ex_all.c Return the next record + * + * @param cursor the cursor handle + * @errors + */ + int __F(next)(WT_CURSOR *cursor); + + /*! + * Return the previous record. + * + * @snippet ex_all.c Return the previous record + * + * @param cursor the cursor handle + * @errors + */ + int __F(prev)(WT_CURSOR *cursor); + + /*! + * Reset the position of the cursor. Any resources held by the cursor + * are released, and the cursor's key and position are no longer valid. + * A subsequent iteration with WT_CURSOR::next will move to the first + * record, or with WT_CURSOR::prev will move to the last record. + * + * @snippet ex_all.c Reset the cursor + * + * @param cursor the cursor handle + * @errors + */ + int __F(reset)(WT_CURSOR *cursor); + + /*! + * Return the record matching the key. The key must first be set. + * + * @snippet ex_all.c Search for an exact match + * + * On success, the cursor ends positioned at the returned record; to + * minimize cursor resources, the WT_CURSOR::reset method should be + * called as soon as the record has been retrieved and the cursor no + * longer needs that position. + * + * @param cursor the cursor handle + * @errors + */ + int __F(search)(WT_CURSOR *cursor); + + /*! + * Return the record matching the key if it exists, or an adjacent + * record. An adjacent record is either the smallest record larger + * than the key or the largest record smaller than the key (in other + * words, a logically adjacent key). + * + * The key must first be set. 
+ * + * An example of a search for an exact or adjacent match: + * + * @snippet ex_all.c Search for an exact or adjacent match + * + * An example of a forward scan through the table, where all keys + * greater than or equal to a specified prefix are included in the + * scan: + * + * @snippet ex_all.c Forward scan greater than or equal + * + * An example of a backward scan through the table, where all keys + * less than a specified prefix are included in the scan: + * + * @snippet ex_all.c Backward scan less than + * + * On success, the cursor ends positioned at the returned record; to + * minimize cursor resources, the WT_CURSOR::reset method should be + * called as soon as the record has been retrieved and the cursor no + * longer needs that position. + * + * @param cursor the cursor handle + * @param exactp the status of the search: 0 if an exact match is + * found, < 0 if a smaller key is returned, > 0 if a larger key is + * returned + * @errors + */ + int __F(search_near)(WT_CURSOR *cursor, int *exactp); + /*! @} */ + + /*! + * @name Data modification + * @{ + */ + /*! + * Insert a record and optionally update an existing record. + * + * If the cursor was configured with "overwrite=true" (the default), + * both the key and value must be set; if the record already exists, + * the key's value will be updated, otherwise, the record will be + * inserted. + * + * @snippet ex_all.c Insert a new record or overwrite an existing record + * + * If the cursor was not configured with "overwrite=true", both the key + * and value must be set and the record must not already exist; the + * record will be inserted. + * + * @snippet ex_all.c Insert a new record and fail if the record exists + * + * If a cursor with record number keys was configured with + * "append=true" (not the default), the value must be set; a new record + * will be appended and the record number set as the cursor key value. 
+ * + * @snippet ex_all.c Insert a new record and assign a record number + * + * The cursor ends with no position, and a subsequent call to the + * WT_CURSOR::next (WT_CURSOR::prev) method will iterate from the + * beginning (end) of the table. + * + * Inserting a new record after the current maximum record in a + * fixed-length bit field column-store (that is, a store with an + * 'r' type key and 't' type value) may implicitly create the missing + * records as records with a value of 0. + * + * When loading a large amount of data into a new object, using + * a cursor with the \c bulk configuration string enabled and + * loading the data in sorted order will be much faster than doing + * out-of-order inserts. See @ref tune_bulk_load for more information. + * + * The maximum length of a single column stored in a table is not fixed + * (as it partially depends on the underlying file configuration), but + * is always a small number of bytes less than 4GB. + * + * @param cursor the cursor handle + * @errors + * In particular, if \c overwrite is not configured and a record with + * the specified key already exists, ::WT_DUPLICATE_KEY is returned. + */ + int __F(insert)(WT_CURSOR *cursor); + + /*! + * Update a record and optionally insert an existing record. + * + * If the cursor was configured with "overwrite=true" (the default), + * both the key and value must be set; if the record already exists, the + * key's value will be updated, otherwise, the record will be inserted. + * + * @snippet ex_all.c Update an existing record or insert a new record + * + * If the cursor was not configured with "overwrite=true", both the key + * and value must be set and the record must already exist; the + * record will be updated. 
+ * + * @snippet ex_all.c Update an existing record and fail if DNE + * + * On success, the cursor ends positioned at the modified record; to + * minimize cursor resources, the WT_CURSOR::reset method should be + * called as soon as the cursor no longer needs that position. + * + * The maximum length of a single column stored in a table is not fixed + * (as it partially depends on the underlying file configuration), but + * is always a small number of bytes less than 4GB. + * + * @param cursor the cursor handle + * @errors + * In particular, if \c overwrite is not configured and no record with + * the specified key exists, ::WT_NOTFOUND is returned. + */ + int __F(update)(WT_CURSOR *cursor); + + /*! + * Remove a record. + * + * If the cursor was configured with "overwrite=true" (the default), + * the key must be set; the key's record will be removed if it exists, + * no error will be returned if the record does not exist. + * + * @snippet ex_all.c Remove a record + * + * If the cursor was not configured with "overwrite=true", the key must + * be set and the key's record must exist; the record will be removed. + * + * @snippet ex_all.c Remove a record and fail if DNE + * + * Removing a record in a fixed-length bit field column-store + * (that is, a store with an 'r' type key and 't' type value) is + * identical to setting the record's value to 0. + * + * On success, the cursor ends positioned at the removed record; to + * minimize cursor resources, the WT_CURSOR::reset method should be + * called as soon as the cursor no longer needs that position. + * + * @param cursor the cursor handle + * @errors + * In particular, if \c overwrite is not configured and no record with + * the specified key exists, ::WT_NOTFOUND is returned. + */ + int __F(remove)(WT_CURSOR *cursor); + /*! @} */ + + /*! + * Close the cursor. + * + * This releases the resources associated with the cursor handle. 
+ * Cursors are closed implicitly by ending the enclosing connection or + * closing the session in which they were opened. + * + * @snippet ex_all.c Close the cursor + * + * @param cursor the cursor handle + * @errors + */ + int __F(close)(WT_HANDLE_CLOSED(WT_CURSOR) *cursor); + + /* + * Protected fields, only to be used by cursor implementations. + */ +#if !defined(SWIG) && !defined(DOXYGEN) + /* + * !!! + * Explicit representations of structures from queue.h. + * TAILQ_ENTRY(wt_cursor) q; + */ + struct { + WT_CURSOR *tqe_next; + WT_CURSOR **tqe_prev; + } q; /* Linked list of WT_CURSORs. */ + + uint64_t recno; /* Record number, normal and raw mode */ + uint8_t raw_recno_buf[WT_INTPACK64_MAXSIZE]; + + void *json_private; /* JSON specific storage */ + void *lang_private; /* Language specific private storage */ + + WT_ITEM key, value; + int saved_err; /* Saved error in set_{key,value}. */ + /* + * URI used internally, may differ from the URI provided by the + * user on open. + */ + const char *internal_uri; + +#define WT_CURSTD_APPEND 0x0001 +#define WT_CURSTD_BULK 0x0002 +#define WT_CURSTD_DATA_SOURCE 0x0004 +#define WT_CURSTD_DUMP_HEX 0x0008 +#define WT_CURSTD_DUMP_JSON 0x0010 +#define WT_CURSTD_DUMP_PRINT 0x0020 +#define WT_CURSTD_KEY_EXT 0x0040 /* Key points out of the tree. */ +#define WT_CURSTD_KEY_INT 0x0080 /* Key points into the tree. */ +#define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT) +#define WT_CURSTD_OPEN 0x0100 +#define WT_CURSTD_OVERWRITE 0x0200 +#define WT_CURSTD_RAW 0x0400 +#define WT_CURSTD_VALUE_EXT 0x0800 /* Value points out of the tree. */ +#define WT_CURSTD_VALUE_INT 0x1000 /* Value points into the tree. */ +#define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) + uint32_t flags; +#endif +}; + +/*! Asynchronous operation types. 
*/ +typedef enum { + WT_AOP_NONE=0, /*!< No operation type set */ + WT_AOP_COMPACT, /*!< WT_ASYNC_OP::compact */ + WT_AOP_INSERT, /*!< WT_ASYNC_OP::insert */ + WT_AOP_REMOVE, /*!< WT_ASYNC_OP::remove */ + WT_AOP_SEARCH, /*!< WT_ASYNC_OP::search */ + WT_AOP_UPDATE /*!< WT_ASYNC_OP::update */ +} WT_ASYNC_OPTYPE; + +/*! + * A WT_ASYNC_OP handle is the interface to an asynchronous operation. + * + * An asynchronous operation describes a data manipulation to be performed + * asynchronously by a WiredTiger worker thread. These operations implement + * the CRUD (create, read, update and delete) operations. Each operation + * is a self-contained work unit. The operation will be performed in the + * context of the worker thread's session. Each operation is performed + * within the context of a transaction. The application is notified of its + * completion with a callback. The transaction is resolved once the callback + * returns. + * + * The table referenced in an operation must already exist. + * + * Raw data is represented by key/value pairs of WT_ITEM structures, but + * operations can also provide access to fields within the key and value if + * the formats are described in the WT_SESSION::create method. + * + * <b>Thread safety:</b> A WT_ASYNC_OP handle may not be shared between + * threads, see @ref threads for more information. + */ +struct __wt_async_op { + /*! The connection for this operation. */ + WT_CONNECTION *connection; + + /*! + * The format of the data packed into key items. See @ref packing for + * details. If not set, a default value of "u" is assumed, and + * applications must use WT_ITEM structures to manipulate untyped byte + * arrays. + */ + const char *key_format; + + /*! + * The format of the data packed into value items. See @ref packing + * for details. If not set, a default value of "u" is assumed, and + * applications must use WT_ITEM structures to manipulate untyped byte + * arrays. 
+ */ + const char *value_format; + + /* + * Don't expose app_private to non-C language bindings - they have + * their own way to attach data to an operation. + */ +#if !defined(SWIG) + /*! + * A location for applications to store information that will be + * available in the callback from an async operation. + */ + void *app_private; +#endif + + /*! + * @name Data access + * @{ + */ + /*! + * Invoke the underlying WT_CURSOR::get_key method; see that method + * for configuration, return and error values. + * + * @param op the operation handle + * @returns as described for WT_CURSOR::get_key + */ + int __F(get_key)(WT_ASYNC_OP *op, ...); + + /*! + * Invoke the underlying WT_CURSOR::get_value method; see that method + * for configuration, return and error values. + * + * @param op the operation handle + * @returns as described for WT_CURSOR::get_value + */ + int __F(get_value)(WT_ASYNC_OP *op, ...); + + /*! + * Invoke the underlying WT_CURSOR::set_key method; see that method + * for configuration, return and error values. + * + * @param op the operation handle + */ + void __F(set_key)(WT_ASYNC_OP *op, ...); + + /*! + * Invoke the underlying WT_CURSOR::set_value method; see that method + * for configuration, return and error values. + * + * @param op the operation handle + */ + void __F(set_value)(WT_ASYNC_OP *op, ...); + /*! @} */ + + /*! + * @name Positioning + * @{ + */ + /*! + * Invoke the underlying WT_CURSOR::search method; see that method + * for configuration, return and error values. + * + * @param op the operation handle + * @returns via the callback as described for WT_CURSOR::search + */ + int __F(search)(WT_ASYNC_OP *op); + /*! @} */ + + /*! + * @name Data modification + * @{ + */ + /*! + * Invoke the underlying WT_CURSOR::insert method; see that method + * for configuration, return and error values. + * + * @param op the operation handle + * @returns via the callback as described for WT_CURSOR::insert + */ + int __F(insert)(WT_ASYNC_OP *op); + + /*! 
+ * Invoke the underlying WT_CURSOR::update method; see that method + * for configuration, return and error values. + * + * @param op the operation handle + * @returns via the callback as described for WT_CURSOR::update + */ + int __F(update)(WT_ASYNC_OP *op); + + /*! + * Invoke the underlying WT_CURSOR::remove method; see that method + * for configuration, return and error values. + * + * @param op the operation handle + * @returns via the callback as described for WT_CURSOR::remove + */ + int __F(remove)(WT_ASYNC_OP *op); + /*! @} */ + + /*! + * @name Table operations + * @{ + */ + /*! + * Invoke the underlying WT_SESSION::compact method; see that method + * for configuration, return and error values. + * + * @param op the operation handle + * @returns via the callback as described for WT_SESSION::compact + */ + int __F(compact)(WT_ASYNC_OP *op); + /*! @} */ + + /*! + * Get the unique identifier for this operation. + * + * @snippet ex_async.c async get identifier + * + * @param op the operation handle + * @returns the id of the operation + */ + uint64_t __F(get_id)(WT_ASYNC_OP *op); + + /*! + * Get the type for this operation. + * + * @snippet ex_async.c async get type + * + * @param op the operation handle + * @returns the ::WT_ASYNC_OPTYPE of the operation + */ + WT_ASYNC_OPTYPE __F(get_type)(WT_ASYNC_OP *op); + + /* + * Protected fields, only to be used by internal implementation. + * Everything we need for maintaining the key/value is part of + * a cursor. So, include one here so that we can use the cursor + * functions to manage them. + */ +#if !defined(SWIG) && !defined(DOXYGEN) + WT_CURSOR c; +#endif +}; + +/*! + * All data operations are performed in the context of a WT_SESSION. This + * encapsulates the thread and transactional context of the operation. + * + * <b>Thread safety:</b> A WT_SESSION handle is not usually shared between + * threads, see @ref threads for more information. + */ +struct __wt_session { + /*! The connection for this session. 
*/ + WT_CONNECTION *connection; + + /*! + * Close the session handle. + * + * This will release the resources associated with the session handle, + * including rolling back any active transactions and closing any + * cursors that remain open in the session. + * + * @snippet ex_all.c Close a session + * + * @param session the session handle + * @configempty{session.close, see dist/api_data.py} + * @errors + */ + int __F(close)(WT_HANDLE_CLOSED(WT_SESSION) *session, + const char *config); + + /*! + * Reconfigure a session handle. + * + * @snippet ex_all.c Reconfigure a session + * + * WT_SESSION::reconfigure will fail if a transaction is in progress + * in the session. + * + * All cursors are reset. + * + * @param session the session handle + * @configstart{session.reconfigure, see dist/api_data.py} + * @config{isolation, the default isolation level for operations in this + * session., a string\, chosen from the following options: \c + * "read-uncommitted"\, \c "read-committed"\, \c "snapshot"; default \c + * read-committed.} + * @configend + * @errors + */ + int __F(reconfigure)(WT_SESSION *session, const char *config); + + /*! + * @name Cursor handles + * @{ + */ + + /*! + * Open a new cursor on a data source or duplicate an existing cursor. + * + * @snippet ex_all.c Open a cursor + * + * An existing cursor can be duplicated by passing it as the \c to_dup + * parameter and setting the \c uri parameter to \c NULL: + * + * @snippet ex_all.c Duplicate a cursor + * + * Cursors being duplicated must have a key set, and successfully + * duplicated cursors are positioned at the same place in the data + * source as the original. + * + * To reconfigure a cursor, duplicate it with a new configuration value: + * + * @snippet ex_all.c Reconfigure a cursor + * + * Cursor handles should be discarded by calling WT_CURSOR::close. + * + * Cursors capable of supporting transactional operations operate in the + * context of the current transaction, if any. 
+ * + * WT_SESSION::rollback_transaction implicitly resets all cursors. + * + * Cursors are relatively light-weight objects but may hold references + * to heavier-weight objects; applications should re-use cursors when + * possible, but instantiating new cursors is not so expensive that + * applications need to cache cursors at all cost. + * + * @param session the session handle + * @param uri the data source on which the cursor operates; cursors + * are usually opened on tables, however, cursors can be opened on + * any data source, regardless of whether it is ultimately stored + * in a table. Some cursor types may have limited functionality + * (for example, they may be read-only or not support transactional + * updates). See @ref data_sources for more information. + * <br> + * @copydoc doc_cursor_types + * @param to_dup a cursor to duplicate + * @configstart{session.open_cursor, see dist/api_data.py} + * @config{append, append the value as a new record\, creating a new + * record number key; valid only for cursors with record number keys., a + * boolean flag; default \c false.} + * @config{bulk, configure the cursor for bulk-loading\, a fast\, + * initial load path (see @ref tune_bulk_load for more information). + * Bulk-load may only be used for newly created objects and cursors + * configured for bulk-load only support the WT_CURSOR::insert and + * WT_CURSOR::close methods. When bulk-loading row-store objects\, keys + * must be loaded in sorted order. The value is usually a true/false + * flag; when bulk-loading fixed-length column store objects\, the + * special value \c bitmap allows chunks of a memory resident bitmap to + * be loaded directly into a file by passing a \c WT_ITEM to + * WT_CURSOR::set_value where the \c size field indicates the number of + * records in the bitmap (as specified by the object's \c value_format + * configuration). 
Bulk-loaded bitmap values must end on a byte boundary + * relative to the bit count (except for the last set of values + * loaded)., a string; default \c false.} + * @config{checkpoint, the name of a checkpoint to open (the reserved + * name "WiredTigerCheckpoint" opens the most recent internal checkpoint + * taken for the object). The cursor does not support data + * modification., a string; default empty.} + * @config{dump, configure the cursor for dump format inputs and + * outputs: "hex" selects a simple hexadecimal format\, "json" selects a + * JSON format with each record formatted as fields named by column + * names if available\, and "print" selects a format where only + * non-printing characters are hexadecimal encoded. These formats are + * compatible with the @ref util_dump and @ref util_load commands., a + * string\, chosen from the following options: \c "hex"\, \c "json"\, \c + * "print"; default empty.} + * @config{next_random, configure the cursor to return a pseudo-random + * record from the object; valid only for row-store cursors. Cursors + * configured with \c next_random=true only support the WT_CURSOR::next + * and WT_CURSOR::close methods. See @ref cursor_random for details., a + * boolean flag; default \c false.} + * @config{overwrite, configures whether the cursor's insert\, update + * and remove methods check the existing state of the record. If \c + * overwrite is \c false\, WT_CURSOR::insert fails with + * ::WT_DUPLICATE_KEY if the record exists\, WT_CURSOR::update and + * WT_CURSOR::remove fail with ::WT_NOTFOUND if the record does not + * exist., a boolean flag; default \c true.} + * @config{raw, ignore the encodings for the key and value\, manage data + * as if the formats were \c "u". See @ref cursor_raw for details., a + * boolean flag; default \c false.} + * @config{readonly, only query operations are supported by this cursor. + * An error is returned if a modification is attempted using the cursor. 
+ * The default is false for all cursor types except for log and metadata + * cursors., a boolean flag; default \c false.} + * @config{statistics, Specify the statistics to be gathered. Choosing + * "all" gathers statistics regardless of cost and may include + * traversing on-disk files; "fast" gathers a subset of relatively + * inexpensive statistics. The selection must agree with the database + * \c statistics configuration specified to ::wiredtiger_open or + * WT_CONNECTION::reconfigure. For example\, "all" or "fast" can be + * configured when the database is configured with "all"\, but the + * cursor open will fail if "all" is specified when the database is + * configured with "fast"\, and the cursor open will fail in all cases + * when the database is configured with "none". If \c statistics is not + * configured\, the default configuration is the database configuration. + * The "clear" configuration resets statistics after gathering them\, + * where appropriate (for example\, a cache size statistic is not + * cleared\, while the count of cursor insert operations will be + * cleared). See @ref statistics for more information., a list\, with + * values chosen from the following options: \c "all"\, \c "fast"\, \c + * "clear"; default empty.} + * @config{target, if non-empty\, backup the list of objects; valid only + * for a backup data source., a list of strings; default empty.} + * @configend + * @param[out] cursorp a pointer to the newly opened cursor + * @errors + */ + int __F(open_cursor)(WT_SESSION *session, + const char *uri, WT_HANDLE_NULLABLE(WT_CURSOR) *to_dup, + const char *config, WT_CURSOR **cursorp); + /*! @} */ + + /*! + * @name Table operations + * @{ + */ + /*! + * Create a table, column group, index or file. + * + * @snippet ex_all.c Create a table + * + * @param session the session handle + * @param name the URI of the object to create, such as + * \c "table:stock". For a description of URI formats + * see @ref data_sources. 
+ * @configstart{session.create, see dist/api_data.py}
+ * @config{allocation_size, the file unit allocation size\, in bytes\,
+ * must be a power-of-two; smaller values decrease the file space
+ * required by overflow items\, and the default value of 4KB is a good
+ * choice absent requirements from the operating system or storage
+ * device., an integer between 512B and 128MB; default \c 4KB.}
+ * @config{app_metadata, application-owned metadata for this object., a
+ * string; default empty.}
+ * @config{block_allocation, configure block allocation. Permitted
+ * values are \c "first" or \c "best"; the \c "first" configuration uses
+ * a first-available algorithm during block allocation\, the \c "best"
+ * configuration uses a best-fit algorithm., a string\, chosen from the
+ * following options: \c "first"\, \c "best"; default \c best.}
+ * @config{block_compressor, configure a compressor for file blocks.
+ * Permitted values are empty (off) or \c "bzip2"\, \c "snappy" or
+ * custom compression engine \c "name" created with
+ * WT_CONNECTION::add_compressor. See @ref compression for more
+ * information., a string; default empty.}
+ * @config{cache_resident, do not ever evict the object's pages; see
+ * @ref tuning_cache_resident for more information., a boolean flag;
+ * default \c false.}
+ * @config{checksum, configure block checksums; permitted values are
+ * <code>on</code> (checksum all blocks)\, <code>off</code> (checksum no
+ * blocks) and <code>uncompressed</code> (checksum only blocks which
+ * are not compressed for any reason). The \c uncompressed setting is
+ * for applications which can rely on decompression to fail if a block
+ * has been corrupted., a string\, chosen from the following options: \c
+ * "on"\, \c "off"\, \c "uncompressed"; default \c uncompressed.}
+ * @config{colgroups, comma-separated list of names of column groups.
+ * Each column group is stored separately\, keyed by the primary key of
+ * the table. 
If no column groups are specified\, all columns are + * stored together in a single file. All value columns in the table + * must appear in at least one column group. Each column group must be + * created with a separate call to WT_SESSION::create., a list of + * strings; default empty.} + * @config{collator, configure custom collation for keys. Value must be + * a collator name created with WT_CONNECTION::add_collator., a string; + * default empty.} + * @config{columns, list of the column names. Comma-separated list of + * the form <code>(column[\,...])</code>. For tables\, the number of + * entries must match the total number of values in \c key_format and \c + * value_format. For colgroups and indices\, all column names must + * appear in the list of columns for the table., a list of strings; + * default empty.} + * @config{dictionary, the maximum number of unique values remembered in + * the Btree row-store leaf page value dictionary; see @ref + * file_formats_compression for more information., an integer greater + * than or equal to 0; default \c 0.} + * @config{exclusive, fail if the object exists. When false (the + * default)\, if the object exists\, check that its settings match the + * specified configuration., a boolean flag; default \c false.} + * @config{format, the file format., a string\, chosen from the + * following options: \c "btree"; default \c btree.} + * @config{huffman_key, configure Huffman encoding for keys. Permitted + * values are empty (off)\, \c "english"\, \c "utf8<file>" or \c + * "utf16<file>". See @ref huffman for more information., a string; + * default empty.} + * @config{huffman_value, configure Huffman encoding for values. + * Permitted values are empty (off)\, \c "english"\, \c "utf8<file>" or + * \c "utf16<file>". See @ref huffman for more information., a string; + * default empty.} + * @config{internal_item_max, the largest key stored within an internal + * node\, in bytes. 
If non-zero\, any key larger than the specified + * size will be stored as an overflow item (which may require additional + * I/O to access). If zero\, a default size is chosen that permits at + * least 8 keys per internal page., an integer greater than or equal to + * 0; default \c 0.} + * @config{internal_key_truncate, configure internal key truncation\, + * discarding unnecessary trailing bytes on internal keys (ignored for + * custom collators)., a boolean flag; default \c true.} + * @config{internal_page_max, the maximum page size for internal nodes\, + * in bytes; the size must be a multiple of the allocation size and is + * significant for applications wanting to avoid excessive L2 cache + * misses while searching the tree. The page maximum is the bytes of + * uncompressed data\, that is\, the limit is applied before any block + * compression is done., an integer between 512B and 512MB; default \c + * 4KB.} + * @config{key_format, the format of the data packed into key items. + * See @ref schema_format_types for details. By default\, the + * key_format is \c 'u' and applications use WT_ITEM structures to + * manipulate raw byte arrays. By default\, records are stored in + * row-store files: keys of type \c 'r' are record numbers and records + * referenced by record number are stored in column-store files., a + * format string; default \c u.} + * @config{leaf_item_max, the largest key or value stored within a leaf + * node\, in bytes. If non-zero\, any key or value larger than the + * specified size will be stored as an overflow item (which may require + * additional I/O to access). 
If zero\, a default size is chosen that + * permits at least 4 key and value pairs per leaf page., an integer + * greater than or equal to 0; default \c 0.} + * @config{leaf_page_max, the maximum page size for leaf nodes\, in + * bytes; the size must be a multiple of the allocation size\, and is + * significant for applications wanting to maximize sequential data + * transfer from a storage device. The page maximum is the bytes of + * uncompressed data\, that is\, the limit is applied before any block + * compression is done., an integer between 512B and 512MB; default \c + * 32KB.} + * @config{lsm = (, options only relevant for LSM data sources., a set + * of related configuration options defined below.} + * @config{ auto_throttle, Throttle inserts into + * LSM trees if flushing to disk isn't keeping up., a boolean flag; + * default \c true.} + * @config{ bloom, create bloom + * filters on LSM tree chunks as they are merged., a boolean flag; + * default \c true.} + * @config{ bloom_bit_count, + * the number of bits used per item for LSM bloom filters., an integer + * between 2 and 1000; default \c 16.} + * @config{ bloom_config, config string used when + * creating Bloom filter files\, passed to WT_SESSION::create., a + * string; default empty.} + * @config{ bloom_hash_count, the number of hash + * values per item used for LSM bloom filters., an integer between 2 and + * 100; default \c 8.} + * @config{ bloom_oldest, + * create a bloom filter on the oldest LSM tree chunk. Only supported + * if bloom filters are enabled., a boolean flag; default \c false.} + * @config{ chunk_max, the maximum size a single + * chunk can be. Chunks larger than this size are not considered for + * further merges. This is a soft limit\, and chunks larger than this + * value can be created. Must be larger than chunk_size., an integer + * between 100MB and 10TB; default \c 5GB.} + * @config{ chunk_size, the maximum size of the + * in-memory chunk of an LSM tree. 
This limit is soft - it is possible + * for chunks to be temporarily larger than this value. This overrides + * the \c memory_page_max setting., an integer between 512K and 500MB; + * default \c 10MB.} + * @config{ merge_max, the + * maximum number of chunks to include in a merge operation., an integer + * between 2 and 100; default \c 15.} + * @config{ merge_min, the minimum number of + * chunks to include in a merge operation. If set to 0 or 1 half the + * value of merge_max is used., an integer no more than 100; default \c + * 0.} + * @config{ ),,} + * @config{memory_page_max, the maximum size a page can grow to in + * memory before being reconciled to disk. The specified size will be + * adjusted to a lower bound of <code>50 * leaf_page_max</code>\, and an + * upper bound of <code>cache_size / 2</code>. This limit is soft - it + * is possible for pages to be temporarily larger than this value. This + * setting is ignored for LSM trees\, see \c chunk_size., an integer + * between 512B and 10TB; default \c 5MB.} + * @config{os_cache_dirty_max, maximum dirty system buffer cache usage\, + * in bytes. If non-zero\, schedule writes for dirty blocks belonging + * to this object in the system buffer cache after that many bytes from + * this object are written into the buffer cache., an integer greater + * than or equal to 0; default \c 0.} + * @config{os_cache_max, maximum system buffer cache usage\, in bytes. 
+ * If non-zero\, evict object blocks from the system buffer cache after + * that many bytes from this object are read or written into the buffer + * cache., an integer greater than or equal to 0; default \c 0.} + * @config{prefix_compression, configure prefix compression on row-store + * leaf pages., a boolean flag; default \c false.} + * @config{prefix_compression_min, minimum gain before prefix + * compression will be used on row-store leaf pages., an integer greater + * than or equal to 0; default \c 4.} + * @config{split_pct, the Btree page split size as a percentage of the + * maximum Btree page size\, that is\, when a Btree page is split\, it + * will be split into smaller pages\, where each page is the specified + * percentage of the maximum Btree page size., an integer between 25 and + * 100; default \c 75.} + * @config{type, set the type of data source used to store a column + * group\, index or simple table. By default\, a \c "file:" URI is + * derived from the object name. The \c type configuration can be used + * to switch to a different data source\, such as LSM or an extension + * configured by the application., a string; default \c file.} + * @config{value_format, the format of the data packed into value items. + * See @ref schema_format_types for details. By default\, the + * value_format is \c 'u' and applications use a WT_ITEM structure to + * manipulate raw byte arrays. Value items of type 't' are bitfields\, + * and when configured with record number type keys\, will be stored + * using a fixed-length store., a format string; default \c u.} + * @configend + * @errors + */ + int __F(create)(WT_SESSION *session, + const char *name, const char *config); + + /*! + * Compact a live row- or column-store btree or LSM tree. 
+ * + * @snippet ex_all.c Compact a table + * + * @param session the session handle + * @param name the URI of the object to compact, such as + * \c "table:stock" + * @configstart{session.compact, see dist/api_data.py} + * @config{timeout, maximum amount of time to allow for compact in + * seconds. The actual amount of time spent in compact may exceed the + * configured value. A value of zero disables the timeout., an integer; + * default \c 1200.} + * @configend + * @errors + */ + int __F(compact)(WT_SESSION *session, + const char *name, const char *config); + + /*! + * Drop (delete) an object. + * + * @snippet ex_all.c Drop a table + * + * @param session the session handle + * @param name the URI of the object to drop, such as \c "table:stock" + * @configstart{session.drop, see dist/api_data.py} + * @config{force, return success if the object does not exist., a + * boolean flag; default \c false.} + * @config{remove_files, should the underlying files be removed?., a + * boolean flag; default \c true.} + * @configend + * @ebusy_errors + */ + int __F(drop)(WT_SESSION *session, + const char *name, const char *config); + + /*! + * Insert a ::WT_LOGREC_MESSAGE type record in the database log files + * (the database must be configured for logging when this method is + * called). + * + * @param session the session handle + * @param fmt a printf format specifier + * @errors + */ + int __F(log_printf)(WT_SESSION *session, const char *fmt, ...); + + /*! + * Rename an object. + * + * @snippet ex_all.c Rename a table + * + * @param session the session handle + * @param uri the current URI of the object, such as \c "table:old" + * @param newuri the new URI of the object, such as \c "table:new" + * @configempty{session.rename, see dist/api_data.py} + * @ebusy_errors + */ + int __F(rename)(WT_SESSION *session, + const char *uri, const char *newuri, const char *config); + + /*! 
+ * Salvage a file or table + * + * Salvage rebuilds the file, or files of which a table is comprised, + * discarding any corrupted file blocks. + * + * Previously deleted records may re-appear, and inserted records may + * disappear, when salvage is done, so salvage should not be run + * unless it is known to be necessary. Normally, salvage should be + * called after a file or table has been corrupted, as reported by the + * WT_SESSION::verify method. + * + * Files are rebuilt in place, the salvage method overwrites the + * existing files. + * + * @snippet ex_all.c Salvage a table + * + * @param session the session handle + * @param name the URI of the file or table to salvage + * @configstart{session.salvage, see dist/api_data.py} + * @config{force, force salvage even of files that do not appear to be + * WiredTiger files., a boolean flag; default \c false.} + * @configend + * @ebusy_errors + */ + int __F(salvage)(WT_SESSION *session, + const char *name, const char *config); + + /*! + * Truncate a file, table or cursor range. + * + * Truncate a file or table. + * @snippet ex_all.c Truncate a table + * + * Truncate a cursor range. When truncating based on a cursor position, + * it is not required the cursor reference a record in the object, only + * that the key be set. This allows applications to discard portions of + * the object name space without knowing exactly what records the object + * contains. 
+ * @snippet ex_all.c Truncate a range + * + * @param session the session handle + * @param name the URI of the file or table to truncate + * @param start optional cursor marking the first record discarded; + * if <code>NULL</code>, the truncate starts from the beginning of + * the object + * @param stop optional cursor marking the last record discarded; + * if <code>NULL</code>, the truncate continues to the end of the + * object + * @configempty{session.truncate, see dist/api_data.py} + * @ebusy_errors + */ + int __F(truncate)(WT_SESSION *session, + const char *name, + WT_HANDLE_NULLABLE(WT_CURSOR) *start, + WT_HANDLE_NULLABLE(WT_CURSOR) *stop, + const char *config); + + /*! + * Upgrade a file or table. + * + * Upgrade upgrades a file or table, if upgrade is required. + * + * @snippet ex_all.c Upgrade a table + * + * @param session the session handle + * @param name the URI of the file or table to upgrade + * @configempty{session.upgrade, see dist/api_data.py} + * @ebusy_errors + */ + int __F(upgrade)(WT_SESSION *session, + const char *name, const char *config); + + /*! + * Verify a file or table. + * + * Verify reports if a file, or the files of which a table is + * comprised, have been corrupted. 
The WT_SESSION::salvage method
+ * can be used to repair a corrupted file.
+ *
+ * @snippet ex_all.c Verify a table
+ *
+ * @param session the session handle
+ * @param name the URI of the file or table to verify
+ * @configstart{session.verify, see dist/api_data.py}
+ * @config{dump_address, Display addresses and page types as pages are
+ * verified\, using the application's message handler\, intended for
+ * debugging., a boolean flag; default \c false.}
+ * @config{dump_blocks, Display the contents of on-disk blocks as they
+ * are verified\, using the application's message handler\, intended for
+ * debugging., a boolean flag; default \c false.}
+ * @config{dump_offsets, Display the contents of specific on-disk
+ * blocks\, using the application's message handler\, intended for
+ * debugging., a list of strings; default empty.}
+ * @config{dump_pages, Display the contents of in-memory pages as they
+ * are verified\, using the application's message handler\, intended for
+ * debugging., a boolean flag; default \c false.}
+ * @configend
+ * @ebusy_errors
+ */
+ int __F(verify)(WT_SESSION *session,
+ const char *name, const char *config);
+ /*! @} */
+
+ /*!
+ * @name Transactions
+ * @{
+ */
+ /*!
+ * Start a transaction in this session.
+ *
+ * The transaction remains active until ended by
+ * WT_SESSION::commit_transaction or WT_SESSION::rollback_transaction.
+ * Operations performed on cursors capable of supporting transactional
+ * operations that are already open in this session, or which are opened
+ * before the transaction ends, will operate in the context of the
+ * transaction.
+ *
+ * WT_SESSION::begin_transaction will fail if a transaction is already
+ * in progress in the session. 
+ * + * @snippet ex_all.c transaction commit/rollback + * + * @param session the session handle + * @configstart{session.begin_transaction, see dist/api_data.py} + * @config{isolation, the isolation level for this transaction; defaults + * to the session's isolation level., a string\, chosen from the + * following options: \c "read-uncommitted"\, \c "read-committed"\, \c + * "snapshot"; default empty.} + * @config{name, name of the transaction for tracing and debugging., a + * string; default empty.} + * @config{priority, priority of the transaction for resolving + * conflicts. Transactions with higher values are less likely to + * abort., an integer between -100 and 100; default \c 0.} + * @config{sync, whether to sync log records when the transaction + * commits\, inherited from ::wiredtiger_open \c transaction_sync., a + * boolean flag; default empty.} + * @configend + * @errors + */ + int __F(begin_transaction)(WT_SESSION *session, const char *config); + + /*! + * Commit the current transaction. + * + * A transaction must be in progress when this method is called. + * + * If WT_SESSION::commit_transaction returns an error, the transaction + * was rolled back, not committed. + * + * @snippet ex_all.c transaction commit/rollback + * + * @param session the session handle + * @configempty{session.commit_transaction, see dist/api_data.py} + * @errors + */ + int __F(commit_transaction)(WT_SESSION *session, const char *config); + + /*! + * Roll back the current transaction. + * + * A transaction must be in progress when this method is called. + * + * All cursors are reset. + * + * @snippet ex_all.c transaction commit/rollback + * + * @param session the session handle + * @configempty{session.rollback_transaction, see dist/api_data.py} + * @errors + */ + int __F(rollback_transaction)(WT_SESSION *session, const char *config); + + /*! + * Write a transactionally consistent snapshot of a database or set of + * objects. 
The checkpoint includes all transactions committed before + * the checkpoint starts. Additionally, checkpoints may optionally be + * discarded. + * + * @snippet ex_all.c Checkpoint examples + * + * @param session the session handle + * @configstart{session.checkpoint, see dist/api_data.py} + * @config{drop, specify a list of checkpoints to drop. The list may + * additionally contain one of the following keys: \c "from=all" to drop + * all checkpoints\, \c "from=<checkpoint>" to drop all checkpoints + * after and including the named checkpoint\, or \c "to=<checkpoint>" to + * drop all checkpoints before and including the named checkpoint. + * Checkpoints cannot be dropped while a hot backup is in progress or if + * open in a cursor., a list of strings; default empty.} + * @config{force, by default\, checkpoints may be skipped if the + * underlying object has not been modified\, this option forces the + * checkpoint., a boolean flag; default \c false.} + * @config{name, if non-empty\, specify a name for the checkpoint (note + * that checkpoints including LSM trees may not be named)., a string; + * default empty.} + * @config{target, if non-empty\, checkpoint the list of objects., a + * list of strings; default empty.} + * @configend + * @errors + */ + int __F(checkpoint)(WT_SESSION *session, const char *config); + + /*! + * Return the transaction ID range pinned by the session handle. + * + * The ID range is approximate and is calculated based on the oldest + * ID needed for the active transaction in this session, compared + * to the newest transaction in the system. + * + * @snippet ex_all.c transaction pinned range + * + * @param session the session handle + * @param[out] range the range of IDs pinned by this session. Zero if + * there is no active transaction. + * @errors + */ + int __F(transaction_pinned_range)(WT_SESSION* session, uint64_t *range); + + /*! @} */ +}; + +/*! + * A connection to a WiredTiger database. 
The connection may be opened within + * the same address space as the caller or accessed over a socket connection. + * + * Most applications will open a single connection to a database for each + * process. The first process to open a connection to a database will access + * the database in its own address space. Subsequent connections (if allowed) + * will communicate with the first process over a socket connection to perform + * their operations. + * + * <b>Thread safety:</b> A WT_CONNECTION handle may be shared between threads, + * see @ref threads for more information. + */ +struct __wt_connection { + /*! + * @name Async operation handles + * @{ + */ + /*! + * Wait for all outstanding operations to complete. + * + * @snippet ex_async.c async flush + * + * @param connection the connection handle + * @errors + */ + int __F(async_flush)(WT_CONNECTION *connection); + + /*! + * Return an async operation handle + * + * @snippet ex_async.c async handle allocation + * + * @param connection the connection handle + * @param uri the connection handle + * @configstart{connection.async_new_op, see dist/api_data.py} + * @config{append, append the value as a new record\, creating a new + * record number key; valid only for operations with record number + * keys., a boolean flag; default \c false.} + * @config{overwrite, configures whether the cursor's insert\, update + * and remove methods check the existing state of the record. If \c + * overwrite is \c false\, WT_CURSOR::insert fails with + * ::WT_DUPLICATE_KEY if the record exists\, WT_CURSOR::update and + * WT_CURSOR::remove fail with ::WT_NOTFOUND if the record does not + * exist., a boolean flag; default \c true.} + * @config{raw, ignore the encodings for the key and value\, manage data + * as if the formats were \c "u". See @ref cursor_raw for details., a + * boolean flag; default \c false.} + * @config{timeout, maximum amount of time to allow for compact in + * seconds. 
The actual amount of time spent in compact may exceed the + * configured value. A value of zero disables the timeout., an integer; + * default \c 1200.} + * @configend + * @param callback the operation callback + * @param[out] asyncopp the new op handle + * @errors + * If there are no available handles, \c EBUSY is returned. + */ + int __F(async_new_op)(WT_CONNECTION *connection, + const char *uri, const char *config, WT_ASYNC_CALLBACK *callback, + WT_ASYNC_OP **asyncopp); + /*! @} */ + + /*! + * Close a connection. + * + * Any open sessions will be closed. + * + * @snippet ex_all.c Close a connection + * + * @param connection the connection handle + * @configstart{connection.close, see dist/api_data.py} + * @config{leak_memory, don't free memory during close., a boolean flag; + * default \c false.} + * @configend + * @errors + */ + int __F(close)(WT_HANDLE_CLOSED(WT_CONNECTION) *connection, + const char *config); + + /*! + * Reconfigure a connection handle. + * + * @snippet ex_all.c Reconfigure a connection + * + * @param connection the connection handle + * @configstart{connection.reconfigure, see dist/api_data.py} + * @config{async = (, asynchronous operations configuration options., a + * set of related configuration options defined below.} + * @config{ enabled, enable asynchronous + * operation., a boolean flag; default \c false.} + * @config{ ops_max, maximum number of expected + * simultaneous asynchronous operations., an integer between 10 and + * 4096; default \c 1024.} + * @config{ threads, the + * number of worker threads to service asynchronous requests., an + * integer between 1 and 20; default \c 2.} + * @config{ ),,} + * @config{cache_size, maximum heap memory to allocate for the cache. 
A + * database should configure either a cache_size or a shared_cache not + * both., an integer between 1MB and 10TB; default \c 100MB.} + * @config{checkpoint = (, periodically checkpoint the database., a set + * of related configuration options defined below.} + * @config{ log_size, wait for this amount of log + * record bytes to be written to the log between each checkpoint. A + * database can configure both log_size and wait to set an upper bound + * for checkpoints; setting this value above 0 configures periodic + * checkpoints., an integer between 0 and 2GB; default \c 0.} + * @config{ name, the checkpoint name., a string; + * default \c "WiredTigerCheckpoint".} + * @config{ wait, seconds to wait between each + * checkpoint; setting this value above 0 configures periodic + * checkpoints., an integer between 0 and 100000; default \c 0.} + * @config{ ),,} + * @config{error_prefix, prefix string for error messages., a string; + * default empty.} + * @config{eviction = (, eviction configuration options., a set of + * related configuration options defined below.} + * @config{ threads_max, maximum number of + * threads WiredTiger will start to help evict pages from cache. The + * number of threads started will vary depending on the current eviction + * load., an integer between 1 and 20; default \c 1.} + * @config{ threads_min, minimum number of + * threads WiredTiger will start to help evict pages from cache. The + * number of threads currently running will vary depending on the + * current eviction load., an integer between 1 and 20; default \c 1.} + * @config{ ),,} + * @config{eviction_dirty_target, continue evicting until the cache has + * less dirty memory than the value\, as a percentage of the total cache + * size. 
Dirty pages will only be evicted if the cache is full enough + * to trigger eviction., an integer between 10 and 99; default \c 80.} + * @config{eviction_target, continue evicting until the cache has less + * total memory than the value\, as a percentage of the total cache + * size. Must be less than \c eviction_trigger., an integer between 10 + * and 99; default \c 80.} + * @config{eviction_trigger, trigger eviction when the cache is using + * this much memory\, as a percentage of the total cache size., an + * integer between 10 and 99; default \c 95.} + * @config{lsm_manager = (, configure database wide options for LSM tree + * management., a set of related configuration options defined below.} + * @config{ merge, merge LSM chunks where + * possible., a boolean flag; default \c true.} + * @config{ worker_thread_max, Configure a set of + * threads to manage merging LSM trees in the database., an integer + * between 3 and 20; default \c 4.} + * @config{ ),,} + * @config{shared_cache = (, shared cache configuration options. A + * database should configure either a cache_size or a shared_cache not + * both., a set of related configuration options defined below.} + * @config{ chunk, the granularity that a shared + * cache is redistributed., an integer between 1MB and 10TB; default \c + * 10MB.} + * @config{ name, name of a cache that is + * shared between databases., a string; default empty.} + * @config{ reserve, amount of cache this + * database is guaranteed to have available from the shared cache. This + * setting is per database. Defaults to the chunk size., an integer; + * default \c 0.} + * @config{ size, maximum memory + * to allocate for the shared cache. Setting this will update the value + * if one is already set., an integer between 1MB and 10TB; default \c + * 500MB.} + * @config{ ),,} + * @config{statistics, Maintain database statistics\, which may impact + * performance. 
Choosing "all" maintains all statistics regardless of + * cost\, "fast" maintains a subset of statistics that are relatively + * inexpensive\, "none" turns off all statistics. The "clear" + * configuration resets statistics after they are gathered\, where + * appropriate (for example\, a cache size statistic is not cleared\, + * while the count of cursor insert operations will be cleared). When + * "clear" is configured for the database\, gathered statistics are + * reset each time a statistics cursor is used to gather statistics\, as + * well as each time statistics are logged using the \c statistics_log + * configuration. See @ref statistics for more information., a list\, + * with values chosen from the following options: \c "all"\, \c "fast"\, + * \c "none"\, \c "clear"; default \c none.} + * @config{statistics_log = (, log any statistics the database is + * configured to maintain\, to a file. See @ref statistics for more + * information., a set of related configuration options defined below.} + * @config{ on_close, log statistics on database + * close., a boolean flag; default \c false.} + * @config{ path, the pathname to a file into + * which the log records are written\, may contain ISO C standard + * strftime conversion specifications. If the value is not an absolute + * path name\, the file is created relative to the database home., a + * string; default \c "WiredTigerStat.%d.%H".} + * @config{ sources, if non-empty\, include + * statistics for the list of data source URIs\, if they are open at the + * time of the statistics logging. 
The list may include URIs matching a + * single data source ("table:mytable")\, or a URI matching all data + * sources of a particular type ("table:")., a list of strings; default + * empty.} + * @config{ timestamp, a timestamp + * prepended to each log record\, may contain strftime conversion + * specifications., a string; default \c "%b %d %H:%M:%S".} + * @config{ wait, seconds to wait between each + * write of the log records., an integer between 0 and 100000; default + * \c 0.} + * @config{ ),,} + * @config{verbose, enable messages for various events. Only available + * if WiredTiger is configured with --enable-verbose. Options are given + * as a list\, such as <code>"verbose=[evictserver\,read]"</code>., a + * list\, with values chosen from the following options: \c "api"\, \c + * "block"\, \c "checkpoint"\, \c "compact"\, \c "evict"\, \c + * "evictserver"\, \c "fileops"\, \c "log"\, \c "lsm"\, \c "metadata"\, + * \c "mutex"\, \c "overflow"\, \c "read"\, \c "reconcile"\, \c + * "recovery"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, \c + * "temporary"\, \c "transaction"\, \c "verify"\, \c "version"\, \c + * "write"; default empty.} + * @configend + * @errors + */ + int __F(reconfigure)(WT_CONNECTION *connection, const char *config); + + /*! + * The home directory of the connection. + * + * @snippet ex_all.c Get the database home directory + * + * @param connection the connection handle + * @returns a pointer to a string naming the home directory + */ + const char *__F(get_home)(WT_CONNECTION *connection); + + /*! + * Add configuration options for a method. See + * @ref custom_ds_config_add for more information. 
+ * + * @snippet ex_all.c Configure method configuration + * + * @param connection the connection handle + * @param method the name of the method + * @param uri the object type or NULL for all object types + * @param config the additional configuration's name and default value + * @param type the additional configuration's type (must be one of + * \c "boolean"\, \c "int", \c "list" or \c "string") + * @param check the additional configuration check string, or NULL if + * none + * @errors + */ + int __F(configure_method)(WT_CONNECTION *connection, + const char *method, const char *uri, + const char *config, const char *type, const char *check); + + /*! + * Return if opening this handle created the database. + * + * @snippet ex_all.c Check if the database is newly created + * + * @param connection the connection handle + * @returns false (zero) if the connection existed before the call to + * ::wiredtiger_open, true (non-zero) if it was created by opening this + * handle. + */ + int __F(is_new)(WT_CONNECTION *connection); + + /*! + * @name Session handles + * @{ + */ + /*! + * Open a session. + * + * @snippet ex_all.c Open a session + * + * @param connection the connection handle + * @param errhandler An error handler. If <code>NULL</code>, the + * connection's error handler is used + * @configstart{connection.open_session, see dist/api_data.py} + * @config{isolation, the default isolation level for operations in this + * session., a string\, chosen from the following options: \c + * "read-uncommitted"\, \c "read-committed"\, \c "snapshot"; default \c + * read-committed.} + * @configend + * @param[out] sessionp the new session handle + * @errors + */ + int __F(open_session)(WT_CONNECTION *connection, + WT_EVENT_HANDLER *errhandler, const char *config, + WT_SESSION **sessionp); + /*! @} */ + + /*! + * @name Extensions + * @{ + */ + /*! + * Load an extension. 
+ * + * @snippet ex_all.c Load an extension + * + * @param connection the connection handle + * @param path the filename of the extension module, or \c "local" to + * search the current application binary for the initialization + * function, see @ref extensions for more details. + * @configstart{connection.load_extension, see dist/api_data.py} + * @config{config, configuration string passed to the entry point of the + * extension as its WT_CONFIG_ARG argument., a string; default empty.} + * @config{entry, the entry point of the extension\, called to + * initialize the extension when it is loaded. The signature of the + * function must match ::wiredtiger_extension_init., a string; default + * \c wiredtiger_extension_init.} + * @config{terminate, an optional function in the extension that is + * called before the extension is unloaded during WT_CONNECTION::close. + * The signature of the function must match + * ::wiredtiger_extension_terminate., a string; default \c + * wiredtiger_extension_terminate.} + * @configend + * @errors + */ + int __F(load_extension)(WT_CONNECTION *connection, + const char *path, const char *config); + + /*! + * Add a custom data source. See @ref custom_data_sources for more + * information. + * + * The application must first implement the WT_DATA_SOURCE interface + * and then register the implementation with WiredTiger: + * + * @snippet ex_data_source.c WT_DATA_SOURCE register + * + * @param connection the connection handle + * @param prefix the URI prefix for this data source, e.g., "file:" + * @param data_source the application-supplied implementation of + * WT_DATA_SOURCE to manage this data source. + * @configempty{connection.add_data_source, see dist/api_data.py} + * @errors + */ + int __F(add_data_source)(WT_CONNECTION *connection, const char *prefix, + WT_DATA_SOURCE *data_source, const char *config); + + /*! + * Add a custom collation function. 
+ * + * The application must first implement the WT_COLLATOR interface and + * then register the implementation with WiredTiger: + * + * @snippet ex_all.c WT_COLLATOR register + * + * @param connection the connection handle + * @param name the name of the collation to be used in calls to + * WT_SESSION::create + * @param collator the application-supplied collation handler + * @configempty{connection.add_collator, see dist/api_data.py} + * @errors + */ + int __F(add_collator)(WT_CONNECTION *connection, + const char *name, WT_COLLATOR *collator, const char *config); + + /*! + * Add a compression function. + * + * The application must first implement the WT_COMPRESSOR interface + * and then register the implementation with WiredTiger: + * + * @snippet nop_compress.c WT_COMPRESSOR initialization structure + * + * @snippet nop_compress.c WT_COMPRESSOR initialization function + * + * @param connection the connection handle + * @param name the name of the compression function to be used in calls + * to WT_SESSION::create + * @param compressor the application-supplied compression handler + * @configempty{connection.add_compressor, see dist/api_data.py} + * @errors + */ + int __F(add_compressor)(WT_CONNECTION *connection, + const char *name, WT_COMPRESSOR *compressor, const char *config); + + /*! + * Add a custom extractor for index keys or column groups. + * @notyet{custom extractors} + * + * The application must first implement the WT_EXTRACTOR interface and + * then register the implementation with WiredTiger: + * + * @snippet ex_all.c WT_EXTRACTOR register + * + * @param connection the connection handle + * @param name the name of the extractor to be used in calls to + * WT_SESSION::create + * @param extractor the application-supplied extractor + * @configempty{connection.add_extractor, see dist/api_data.py} + * @errors + */ + int __F(add_extractor)(WT_CONNECTION *connection, const char *name, + WT_EXTRACTOR *extractor, const char *config); + + /*! 
+ * Return a reference to the WiredTiger extension functions. + * + * @snippet ex_data_source.c WT_EXTENSION_API declaration + * + * @param wt_conn the WT_CONNECTION handle + * @returns a reference to a WT_EXTENSION_API structure. + */ + WT_EXTENSION_API *__F(get_extension_api)(WT_CONNECTION *wt_conn); + /*! @} */ +}; + +/*! + * Open a connection to a database. + * + * @snippet ex_all.c Open a connection + * + * @param home The path to the database home directory. See @ref home + * for more information. + * @param errhandler An error handler. If <code>NULL</code>, a builtin error + * handler is installed that writes error messages to stderr + * @configstart{wiredtiger_open, see dist/api_data.py} + * @config{async = (, asynchronous operations configuration options., a set of + * related configuration options defined below.} + * @config{ enabled, enable asynchronous operation., a + * boolean flag; default \c false.} + * @config{ ops_max, + * maximum number of expected simultaneous asynchronous operations., an integer + * between 10 and 4096; default \c 1024.} + * @config{ threads, the number of worker threads to + * service asynchronous requests., an integer between 1 and 20; default \c 2.} + * @config{ ),,} + * @config{buffer_alignment, in-memory alignment (in bytes) for buffers used for + * I/O. The default value of -1 indicates a platform-specific alignment value + * should be used (4KB on Linux systems\, zero elsewhere)., an integer between + * -1 and 1MB; default \c -1.} + * @config{cache_size, maximum heap memory to allocate for the cache. A + * database should configure either a cache_size or a shared_cache not both., an + * integer between 1MB and 10TB; default \c 100MB.} + * @config{checkpoint = (, periodically checkpoint the database., a set of + * related configuration options defined below.} + * @config{ log_size, wait for this amount of log record + * bytes to be written to the log between each checkpoint. 
A database can + * configure both log_size and wait to set an upper bound for checkpoints; + * setting this value above 0 configures periodic checkpoints., an integer + * between 0 and 2GB; default \c 0.} + * @config{ name, the + * checkpoint name., a string; default \c "WiredTigerCheckpoint".} + * @config{ wait, seconds to wait between each + * checkpoint; setting this value above 0 configures periodic checkpoints., an + * integer between 0 and 100000; default \c 0.} + * @config{ ),,} + * @config{checkpoint_sync, flush files to stable storage when closing or + * writing checkpoints., a boolean flag; default \c true.} + * @config{config_base, write the base configuration file if creating the + * database\, see @ref config_base for more information., a boolean flag; + * default \c true.} + * @config{create, create the database if it does not exist., a boolean flag; + * default \c false.} + * @config{direct_io, Use \c O_DIRECT to access files. Options are given as a + * list\, such as <code>"direct_io=[data]"</code>. Configuring \c direct_io + * requires care\, see @ref tuning_system_buffer_cache_direct_io for important + * warnings. Including \c "data" will cause WiredTiger data files to use \c + * O_DIRECT\, including \c "log" will cause WiredTiger log files to use \c + * O_DIRECT\, and including \c "checkpoint" will cause WiredTiger data files + * opened at a checkpoint (i.e: read only) to use \c O_DIRECT., a list\, with + * values chosen from the following options: \c "checkpoint"\, \c "data"\, \c + * "log"; default empty.} + * @config{error_prefix, prefix string for error messages., a string; default + * empty.} + * @config{eviction = (, eviction configuration options., a set of related + * configuration options defined below.} + * @config{ threads_max, maximum number of threads + * WiredTiger will start to help evict pages from cache. 
The number of threads + * started will vary depending on the current eviction load., an integer between + * 1 and 20; default \c 1.} + * @config{ threads_min, minimum + * number of threads WiredTiger will start to help evict pages from cache. The + * number of threads currently running will vary depending on the current + * eviction load., an integer between 1 and 20; default \c 1.} + * @config{ ),,} + * @config{eviction_dirty_target, continue evicting until the cache has less + * dirty memory than the value\, as a percentage of the total cache size. Dirty + * pages will only be evicted if the cache is full enough to trigger eviction., + * an integer between 10 and 99; default \c 80.} + * @config{eviction_target, continue evicting until the cache has less total + * memory than the value\, as a percentage of the total cache size. Must be + * less than \c eviction_trigger., an integer between 10 and 99; default \c 80.} + * @config{eviction_trigger, trigger eviction when the cache is using this much + * memory\, as a percentage of the total cache size., an integer between 10 and + * 99; default \c 95.} + * @config{exclusive, fail if the database already exists\, generally used with + * the \c create option., a boolean flag; default \c false.} + * @config{extensions, list of shared library extensions to load (using dlopen). + * Any values specified to an library extension are passed to + * WT_CONNECTION::load_extension as the \c config parameter (for example\, + * <code>extensions=(/path/ext.so={entry=my_entry})</code>)., a list of strings; + * default empty.} + * @config{file_extend, file extension configuration. If set\, extend files of + * the set type in allocations of the set size\, instead of a block at a time as + * each new block is written. 
For example\, + * <code>file_extend=(data=16MB)</code>., a list\, with values chosen from the + * following options: \c "data"\, \c "log"; default empty.} + * @config{hazard_max, maximum number of simultaneous hazard pointers per + * session handle., an integer greater than or equal to 15; default \c 1000.} + * @config{log = (, enable logging., a set of related configuration options + * defined below.} + * @config{ archive, automatically + * archive unneeded log files., a boolean flag; default \c true.} + * @config{ enabled, enable logging subsystem., a boolean + * flag; default \c false.} + * @config{ file_max, the + * maximum size of log files., an integer between 100KB and 2GB; default \c + * 100MB.} + * @config{ path, the path to a directory into + * which the log files are written. If the value is not an absolute path name\, + * the files are created relative to the database home., a string; default \c + * "".} + * @config{ ),,} + * @config{lsm_manager = (, configure database wide options for LSM tree + * management., a set of related configuration options defined below.} + * @config{ merge, merge LSM chunks where possible., a + * boolean flag; default \c true.} + * @config{ worker_thread_max, Configure a set of threads + * to manage merging LSM trees in the database., an integer between 3 and 20; + * default \c 4.} + * @config{ ),,} + * @config{mmap, Use memory mapping to access files when possible., a boolean + * flag; default \c true.} + * @config{multiprocess, permit sharing between processes (will automatically + * start an RPC server for primary processes and use RPC for secondary + * processes). <b>Not yet supported in WiredTiger</b>., a boolean flag; default + * \c false.} + * @config{session_max, maximum expected number of sessions (including server + * threads)., an integer greater than or equal to 1; default \c 100.} + * @config{shared_cache = (, shared cache configuration options. 
A database + * should configure either a cache_size or a shared_cache not both., a set of + * related configuration options defined below.} + * @config{ chunk, the granularity that a shared cache is + * redistributed., an integer between 1MB and 10TB; default \c 10MB.} + * @config{ name, name of a cache that is shared between + * databases., a string; default empty.} + * @config{ reserve, amount of cache this database is + * guaranteed to have available from the shared cache. This setting is per + * database. Defaults to the chunk size., an integer; default \c 0.} + * @config{ size, maximum memory to allocate for the + * shared cache. Setting this will update the value if one is already set., an + * integer between 1MB and 10TB; default \c 500MB.} + * @config{ ),,} + * @config{statistics, Maintain database statistics\, which may impact + * performance. Choosing "all" maintains all statistics regardless of cost\, + * "fast" maintains a subset of statistics that are relatively inexpensive\, + * "none" turns off all statistics. The "clear" configuration resets statistics + * after they are gathered\, where appropriate (for example\, a cache size + * statistic is not cleared\, while the count of cursor insert operations will + * be cleared). When "clear" is configured for the database\, gathered + * statistics are reset each time a statistics cursor is used to gather + * statistics\, as well as each time statistics are logged using the \c + * statistics_log configuration. See @ref statistics for more information., a + * list\, with values chosen from the following options: \c "all"\, \c "fast"\, + * \c "none"\, \c "clear"; default \c none.} + * @config{statistics_log = (, log any statistics the database is configured to + * maintain\, to a file. 
See @ref statistics for more information., a set of + * related configuration options defined below.} + * @config{ on_close, log statistics on database close., + * a boolean flag; default \c false.} + * @config{ path, the + * pathname to a file into which the log records are written\, may contain ISO C + * standard strftime conversion specifications. If the value is not an absolute + * path name\, the file is created relative to the database home., a string; + * default \c "WiredTigerStat.%d.%H".} + * @config{ sources, + * if non-empty\, include statistics for the list of data source URIs\, if they + * are open at the time of the statistics logging. The list may include URIs + * matching a single data source ("table:mytable")\, or a URI matching all data + * sources of a particular type ("table:")., a list of strings; default empty.} + * @config{ timestamp, a timestamp prepended to each log + * record\, may contain strftime conversion specifications., a string; default + * \c "%b %d %H:%M:%S".} + * @config{ wait, seconds to wait + * between each write of the log records., an integer between 0 and 100000; + * default \c 0.} + * @config{ ),,} + * @config{transaction_sync = (, how to sync log records when the transaction + * commits., a set of related configuration options defined below.} + * @config{ enabled, whether to sync the log on every + * commit by default\, can be overridden by the \c sync setting to + * WT_SESSION::begin_transaction., a boolean flag; default \c false.} + * @config{ method, the method used to ensure log records + * are stable on disk\, see @ref tune_durability for more information., a + * string\, chosen from the following options: \c "dsync"\, \c "fsync"\, \c + * "none"; default \c fsync.} + * @config{ ),,} + * @config{use_environment_priv, use the \c WIREDTIGER_CONFIG and \c + * WIREDTIGER_HOME environment variables regardless of whether or not the + * process is running with special privileges. 
See @ref home for more + * information., a boolean flag; default \c false.} + * @config{verbose, enable messages for various events. Only available if + * WiredTiger is configured with --enable-verbose. Options are given as a + * list\, such as <code>"verbose=[evictserver\,read]"</code>., a list\, with + * values chosen from the following options: \c "api"\, \c "block"\, \c + * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\, + * \c "log"\, \c "lsm"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c + * "read"\, \c "reconcile"\, \c "recovery"\, \c "salvage"\, \c "shared_cache"\, + * \c "split"\, \c "temporary"\, \c "transaction"\, \c "verify"\, \c "version"\, + * \c "write"; default empty.} + * @configend + * Additionally, if files named \c WiredTiger.config or \c WiredTiger.basecfg + * appear in the WiredTiger home directory, they are read for configuration + * values (see @ref config_file and @ref config_base for details). + * See @ref config_order for ordering of the configuration mechanisms. + * @param[out] connectionp A pointer to the newly opened connection handle + * @errors + */ +int wiredtiger_open(const char *home, + WT_EVENT_HANDLER *errhandler, const char *config, + WT_CONNECTION **connectionp); + +/*! + * Return information about an error as a string; wiredtiger_strerror is a + * superset of the ISO C99/POSIX 1003.1-2001 function strerror. + * + * @snippet ex_all.c Display an error + * + * @param err a return value from a WiredTiger, C library or POSIX function + * @returns a string representation of the error + */ +const char *wiredtiger_strerror(int err); + +#if !defined(SWIG) +/*! + * The interface implemented by applications to accept notifications + * of the completion of asynchronous operations. + * + * Applications register their implementation with WiredTiger by calling + * WT_CONNECTION::async_new_op. + * + * @snippet ex_async.c async handle allocation + */ +struct __wt_async_callback { + /*! 
+ * Callback to receive completion notification. + * + * @param[in] op the operation handle + * @param[in] op_ret the result of the async operation + * @param[in] flags currently unused + * @returns zero for success, non-zero to indicate an error. + * + * @snippet ex_async.c async example callback implementation + */ + int (*notify)(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, + int op_ret, uint32_t flags); +}; +#endif + +/*! + * The interface implemented by applications to handle error, informational and + * progress messages. Entries set to NULL are ignored and the default handlers + * will continue to be used. + */ +struct __wt_event_handler { + /*! + * Callback to handle error messages; by default, error messages are + * written to the stderr stream. + * + * Error handler returns are not ignored: if the handler returns + * non-zero, the error may cause the WiredTiger function posting the + * event to fail, and may even cause operation or library failure. + * + * @param session the WiredTiger session handle in use when the error + * was generated. The handle may have been created by the application + * or automatically by WiredTiger. + * @param error a WiredTiger, C99 or POSIX error code, which can + * be converted to a string using ::wiredtiger_strerror + * @param message an error string + */ + int (*handle_error)(WT_EVENT_HANDLER *handler, + WT_SESSION *session, int error, const char *message); + + /*! + * Callback to handle informational messages; by default, informational + * messages are written to the stdout stream. + * + * Message handler returns are not ignored: if the handler returns + * non-zero, the error may cause the WiredTiger function posting the + * event to fail, and may even cause operation or library failure. + * + * @param session the WiredTiger session handle in use when the message + * was generated. The handle may have been created by the application + * or automatically by WiredTiger. 
+ * @param message an informational string + */ + int (*handle_message)(WT_EVENT_HANDLER *handler, + WT_SESSION *session, const char *message); + + /*! + * Callback to handle progress messages; by default, no progress + * messages are written. + * + * Progress handler returns are not ignored: if the handler returns + * non-zero, the error may cause the WiredTiger function posting the + * event to fail, and may even cause operation or library failure. + * + * @param session the WiredTiger session handle in use when the + * progress message was generated. The handle may have been created by + * the application or automatically by WiredTiger. + * @param operation a string representation of the operation + * @param progress a counter + */ + int (*handle_progress)(WT_EVENT_HANDLER *handler, + WT_SESSION *session, const char *operation, uint64_t progress); + + /*! + * Callback to handle automatic close of a WiredTiger handle. + * + * Close handler returns are not ignored: if the handler returns + * non-zero, the error may cause the WiredTiger function posting the + * event to fail, and may even cause operation or library failure. + * + * @param session The session handle that is being closed if the + * cursor parameter is NULL. + * @param cursor The cursor handle that is being closed, or NULL if + * it is a session handle being closed. + */ + int (*handle_close)(WT_EVENT_HANDLER *handler, + WT_SESSION *session, WT_CURSOR *cursor); +}; + +/*! + * @name Data packing and unpacking + * @{ + */ + +/*! + * Pack a structure into a buffer. + * + * See @ref packing for a description of the permitted format strings. + * + * @section pack_examples Packing Examples + * + * For example, the string <code>"iSh"</code> will pack a 32-bit integer + * followed by a NUL-terminated string, followed by a 16-bit integer. The + * default, big-endian encoding will be used, with no alignment. 
This could be + * used in C as follows: + * + * @snippet ex_all.c Pack fields into a buffer + * + * Then later, the values can be unpacked as follows: + * + * @snippet ex_all.c Unpack fields from a buffer + * + * @param session the session handle + * @param buffer a pointer to a packed byte array + * @param size the number of valid bytes in the buffer + * @param format the data format, see @ref packing + * @errors + */ +int wiredtiger_struct_pack(WT_SESSION *session, + void *buffer, size_t size, const char *format, ...); + +/*! + * Calculate the size required to pack a structure. + * + * Note that for variable-sized fields including variable-sized strings and + * integers, the calculated size merely reflects the expected sizes specified + * in the format string itself. + * + * @snippet ex_all.c Get the packed size + * + * @param session the session handle + * @param sizep a location where the number of bytes needed for the + * matching call to ::wiredtiger_struct_pack is returned + * @param format the data format, see @ref packing + * @errors + */ +int wiredtiger_struct_size(WT_SESSION *session, + size_t *sizep, const char *format, ...); + +/*! + * Unpack a structure from a buffer. + * + * Reverse of ::wiredtiger_struct_pack: gets values out of a + * packed byte string. + * + * @snippet ex_all.c Unpack fields from a buffer + * + * @param session the session handle + * @param buffer a pointer to a packed byte array + * @param size the number of valid bytes in the buffer + * @param format the data format, see @ref packing + * @errors + */ +int wiredtiger_struct_unpack(WT_SESSION *session, + const void *buffer, size_t size, const char *format, ...); + +#if !defined(SWIG) + +/*! + * Streaming interface to packing. + * + * This allows applications to pack or unpack records one field at a time. + * This is an opaque handle returned by ::wiredtiger_pack_start or + * ::wiredtiger_unpack_start. It must be closed with ::wiredtiger_pack_close. 
+ */ +typedef struct __wt_pack_stream WT_PACK_STREAM; + +/*! + * Start a packing operation into a buffer with the given format string. This + * should be followed by a series of calls to ::wiredtiger_pack_item, + * ::wiredtiger_pack_int, ::wiredtiger_pack_str or ::wiredtiger_pack_uint + * to fill in the values. + * + * @param session the session handle + * @param format the data format, see @ref packing + * @param buffer a pointer to memory to hold the packed data + * @param size the size of the buffer + * @param[out] psp the new packing stream handle + * @errors + */ +int wiredtiger_pack_start(WT_SESSION *session, + const char *format, void *buffer, size_t size, WT_PACK_STREAM **psp); + +/*! + * Start an unpacking operation from a buffer with the given format string. + * This should be followed by a series of calls to ::wiredtiger_unpack_item, + * ::wiredtiger_unpack_int, ::wiredtiger_unpack_str or ::wiredtiger_unpack_uint + * to retrieve the packed values. + * + * @param session the session handle + * @param format the data format, see @ref packing + * @param buffer a pointer to memory holding the packed data + * @param size the size of the buffer + * @param[out] psp the new packing stream handle + * @errors + */ +int wiredtiger_unpack_start(WT_SESSION *session, + const char *format, const void *buffer, size_t size, WT_PACK_STREAM **psp); + +/*! + * Close a packing stream. + * + * @param ps the packing stream handle + * @param[out] usedp the number of bytes in the buffer used by the stream + * @errors + */ +int wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp); + +/*! + * Pack an item into a packing stream. + * + * @param ps the packing stream handle + * @param item an item to pack + * @errors + */ +int wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item); + +/*! + * Pack a signed integer into a packing stream. 
+ * + * @param ps the packing stream handle + * @param i a signed integer to pack + * @errors + */ +int wiredtiger_pack_int(WT_PACK_STREAM *ps, int64_t i); + +/*! + * Pack a string into a packing stream. + * + * @param ps the packing stream handle + * @param s a string to pack + * @errors + */ +int wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s); + +/*! + * Pack an unsigned integer into a packing stream. + * + * @param ps the packing stream handle + * @param u an unsigned integer to pack + * @errors + */ +int wiredtiger_pack_uint(WT_PACK_STREAM *ps, uint64_t u); + +/*! + * Unpack an item from a packing stream. + * + * @param ps the packing stream handle + * @param item an item to unpack + * @errors + */ +int wiredtiger_unpack_item(WT_PACK_STREAM *ps, WT_ITEM *item); + +/*! + * Unpack a signed integer from a packing stream. + * + * @param ps the packing stream handle + * @param[out] ip the unpacked signed integer + * @errors + */ +int wiredtiger_unpack_int(WT_PACK_STREAM *ps, int64_t *ip); + +/*! + * Unpack a string from a packing stream. + * + * @param ps the packing stream handle + * @param[out] sp the unpacked string + * @errors + */ +int wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp); + +/*! + * Unpack an unsigned integer from a packing stream. + * + * @param ps the packing stream handle + * @param[out] up the unpacked unsigned integer + * @errors + */ +int wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up); +/*! @} */ + +/*! + * @name Configuration string parsing + * @{ + */ + +/*! + * The configuration information returned by the WiredTiger configuration + * parsing functions in the WT_EXTENSION_API and the public API. + */ +struct __wt_config_item { + /*! + * The value of a configuration string. + * + * Regardless of the type of the configuration string (boolean, int, + * list or string), the \c str field will reference the value of the + * configuration string. 
 + * + * The bytes referenced by \c str are <b>not</b> nul-terminated, + * use the \c len field instead of a terminating nul byte. + */ + const char *str; + + /*! The number of bytes in the value referenced by \c str. */ + size_t len; + + /*! + * The value of a configuration boolean or integer. + * + * If the configuration string's value is "true" or "false", the + * \c val field will be set to 1 (true), or 0 (false). + * + * If the configuration string can be legally interpreted as an integer, + * using the strtoll function rules as specified in ISO/IEC 9899:1999 + * ("ISO C99"), that integer will be stored in the \c val field. + */ + int64_t val; + + /*! Permitted values of the \c type field. */ + enum { + /*! A string value with quotes stripped. */ + WT_CONFIG_ITEM_STRING, + /*! A boolean literal ("true" or "false"). */ + WT_CONFIG_ITEM_BOOL, + /*! An unquoted identifier: a string value without quotes. */ + WT_CONFIG_ITEM_ID, + /*! A numeric value. */ + WT_CONFIG_ITEM_NUM, + /*! A nested structure or list, including brackets. */ + WT_CONFIG_ITEM_STRUCT + } + /*! + * The type of value determined by the parser. In all cases, + * the \c str and \c len fields are set. + */ + type; +}; + +/*! + * Create a handle that can be used to parse or create configuration strings + * compatible with WiredTiger APIs. + * This API is outside the scope of a WiredTiger connection handle, since + * applications may need to generate configuration strings prior to calling + * ::wiredtiger_open. + * @param session the session handle to be used for error reporting. If NULL + * error messages will be written to stdout. + * @param config the configuration string being parsed. The string must + * remain valid for the lifetime of the parser handle. 
+ * @param len the number of valid bytes in \c config + * @param[out] config_parserp A pointer to the newly opened handle + * @errors + */ +int wiredtiger_config_parser_open(WT_SESSION *session, + const char *config, size_t len, WT_CONFIG_PARSER **config_parserp); + +/*! + * A handle that can be used to search and traverse configuration strings + * compatible with WiredTiger APIs. + * To parse the contents of a list or nested configuration string use a new + * configuration parser handle based on the content of the ::WT_CONFIG_ITEM + * retrieved from the parent configuration string. + * + * @section config_parse_examples Configuration String Parsing examples + * + * This could be used in C to create a configuration parser as follows: + * + * @snippet ex_config_parse.c Create a configuration parser + * + * Once the parser has been created the content can be queried directly: + * + * @snippet ex_config_parse.c get + * + * Or the content can be traversed linearly: + * + * @snippet ex_config_parse.c next + * + * Nested configuration values can be queried using a shorthand notation: + * + * @snippet ex_config_parse.c nested get + * + * Nested configuration values can be traversed using multiple + * ::WT_CONFIG_PARSER handles: + * + * @snippet ex_config_parse.c nested traverse + */ +struct __wt_config_parser { + + /*! + * Close the configuration scanner releasing any resources. + * + * @param config_parser the configuration parser handle + * @errors + * + */ + int __F(close)(WT_CONFIG_PARSER *config_parser); + + /*! + * Return the next key/value pair. + * + * When iteration would pass the end of the configuration string + * ::WT_NOTFOUND will be returned. + * + * If an item has no explicitly assigned value, the item will be + * returned in \c key and the \c value will be set to the boolean + * \c "true" value. 
+ * + * @param config_parser the configuration parser handle + * @param key the returned key + * @param value the returned value + * @errors + * + */ + int __F(next)(WT_CONFIG_PARSER *config_parser, + WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value); + + /*! + * Return the value of an item in the configuration string. + * + * @param config_parser the configuration parser handle + * @param key configuration key string + * @param value the returned value + * @errors + * + */ + int __F(get)(WT_CONFIG_PARSER *config_parser, + const char *key, WT_CONFIG_ITEM *value); +}; + +#endif /* !defined(SWIG) */ +/*! @} */ + +/*! + * Get version information. + * + * @snippet ex_all.c Get the WiredTiger library version #1 + * @snippet ex_all.c Get the WiredTiger library version #2 + * + * @param majorp a location where the major version number is returned + * @param minorp a location where the minor version number is returned + * @param patchp a location where the patch version number is returned + * @returns a string representation of the version + */ +const char *wiredtiger_version(int *majorp, int *minorp, int *patchp); + +/******************************************* + * Error returns + *******************************************/ +/*! + * @anchor error_returns + * @name Error returns + * Most functions and methods in WiredTiger return an integer code indicating + * whether the operation succeeded or failed. A return of zero indicates + * success, all non-zero return values indicate some kind of failure. + * + * WiredTiger reserves all values from -31,800 to -31,999 as possible error + * return values. WiredTiger may also return C99/POSIX error codes such as + * \c ENOMEM, \c EINVAL and \c ENOTSUP, with the usual meanings. + * + * The following are all of the WiredTiger-specific error returns: + * @{ + */ +/* + * DO NOT EDIT: automatically built by dist/api_err.py. + * Error return section: BEGIN + */ +/*! + * Attempt to insert an existing key. 
+ * This error is generated when the application attempts to insert a record with + * the same key as an existing record without the 'overwrite' configuration to + * WT_SESSION::open_cursor. + */ +#define WT_DUPLICATE_KEY -31800 +/*! + * Non-specific WiredTiger error. + * This error is returned when an error is not covered by a specific error + * return. + */ +#define WT_ERROR -31801 +/*! + * Item not found. + * This error indicates an operation did not find a value to return. This + * includes cursor search and other operations where no record matched the + * cursor's search key such as WT_CURSOR::update or WT_CURSOR::remove. + */ +#define WT_NOTFOUND -31802 +/*! + * WiredTiger library panic. + * This error indicates an underlying problem that requires the application exit + * and restart. + */ +#define WT_PANIC -31803 +/*! @cond internal */ +/*! Restart the operation (internal). */ +#define WT_RESTART -31804 +/*! @endcond */ +/*! + * Conflict between concurrent operations. + * This error is generated when an operation cannot be completed due to a + * conflict with concurrent operations. The operation may be retried; if a + * transaction is in progress, it should be rolled back and the operation + * retried in a new transaction. + */ +#define WT_ROLLBACK -31805 +/* + * Error return section: END + * DO NOT EDIT: automatically built by dist/api_err.py. + */ +/*! @} */ + +#ifndef DOXYGEN +#define WT_DEADLOCK WT_ROLLBACK /* Backward compatibility */ +#endif + +/*! @} */ + +/*! + * @defgroup wt_ext WiredTiger Extension API + * The functions and interfaces applications use to customize and extend the + * behavior of WiredTiger. + * @{ + */ + +/******************************************* + * Forward structure declarations for the extension API + *******************************************/ +struct __wt_config_arg; typedef struct __wt_config_arg WT_CONFIG_ARG; + +/*! + * The interface implemented by applications to provide custom ordering of + * records. 
+ * + * Applications register their implementation with WiredTiger by calling + * WT_CONNECTION::add_collator. + * + * @snippet ex_extending.c add collator nocase + * + * @snippet ex_extending.c add collator prefix10 + */ +struct __wt_collator { + /*! + * Callback to compare keys. + * + * @param[out] cmp set to -1 if <code>key1 < key2</code>, + * 0 if <code>key1 == key2</code>, + * 1 if <code>key1 > key2</code>. + * @returns zero for success, non-zero to indicate an error. + * + * @snippet ex_all.c Implement WT_COLLATOR + * + * @snippet ex_extending.c case insensitive comparator + * + * @snippet ex_extending.c n character comparator + */ + int (*compare)(WT_COLLATOR *collator, WT_SESSION *session, + const WT_ITEM *key1, const WT_ITEM *key2, int *cmp); + + /*! + * If non-NULL, this callback is called to customize the collator + * for each data source. If the callback returns a non-NULL + * collator, that instance is used instead of this one for all + * comparisons. + */ + int (*customize)(WT_COLLATOR *collator, WT_SESSION *session, + const char *uri, WT_CONFIG_ITEM *appcfg, WT_COLLATOR **customp); + + /*! + * If non-NULL, a callback performed when the database is closed. + * + * The WT_COLLATOR::terminate callback is intended to allow cleanup, + * the handle will not be subsequently accessed by WiredTiger. + */ + int (*terminate)(WT_COLLATOR *collator, WT_SESSION *session); +}; + +/*! + * The interface implemented by applications to provide custom compression. + * + * Compressors must implement the WT_COMPRESSOR interface: the + * WT_COMPRESSOR::compress and WT_COMPRESSOR::decompress callbacks must be + * specified, and WT_COMPRESSOR::pre_size is optional. To build your own + * compressor, use one of the compressors in \c ext/compressors as a template: + * \c ext/nop_compress is a simple compressor that passes through data + * unchanged, and is a reasonable starting point. 
+ * + * Applications register their implementation with WiredTiger by calling + * WT_CONNECTION::add_compressor. + * + * @snippet nop_compress.c WT_COMPRESSOR initialization structure + * @snippet nop_compress.c WT_COMPRESSOR initialization function + */ +struct __wt_compressor { + /*! + * Callback to compress a chunk of data. + * + * WT_COMPRESSOR::compress takes a source buffer and a destination + * buffer, by default of the same size. If the callback can compress + * the buffer to a smaller size in the destination, it does so, sets + * the \c compression_failed return to 0 and returns 0. If compression + * does not produce a smaller result, the callback sets the + * \c compression_failed return to 1 and returns 0. If another + * error occurs, it returns an errno or WiredTiger error code. + * + * On entry, \c src will point to memory, with the length of the memory + * in \c src_len. After successful completion, the callback should + * return \c 0 and set \c result_lenp to the number of bytes required + * for the compressed representation. + * + * On entry, \c dst points to the destination buffer with a length + * of \c dst_len. If the WT_COMPRESSOR::pre_size method is specified, + * the destination buffer will be at least the size returned by that + * method; otherwise, the destination buffer will be at least as large + * as \c src_len. + * + * If compression would not shrink the data or the \c dst buffer is not + * large enough to hold the compressed data, the callback should set + * \c compression_failed to a non-zero value and return 0. 
+ * + * @param[in] src the data to compress + * @param[in] src_len the length of the data to compress + * @param[in] dst the destination buffer + * @param[in] dst_len the length of the destination buffer + * @param[out] result_lenp the length of the compressed data + * @param[out] compression_failed non-zero if compression did not + * decrease the length of the data (compression may not have completed) + * @returns zero for success, non-zero to indicate an error. + * + * @snippet nop_compress.c WT_COMPRESSOR compress + */ + int (*compress)(WT_COMPRESSOR *compressor, WT_SESSION *session, + uint8_t *src, size_t src_len, + uint8_t *dst, size_t dst_len, + size_t *result_lenp, int *compression_failed); + + /*! + * Callback to compress a list of byte strings. + * + * WT_COMPRESSOR::compress_raw gives applications fine-grained control + * over disk block size when writing row-store or variable-length + * column-store pages. Where this level of control is not required by + * the underlying storage device, set the WT_COMPRESSOR::compress_raw + * callback to \c NULL and WiredTiger will internally split each page + * into blocks, each block then compressed by WT_COMPRESSOR::compress. + * + * WT_COMPRESSOR::compress_raw takes a source buffer and an array of + * 0-based offsets of byte strings in that buffer. The callback then + * encodes none, some or all of the byte strings and copies the encoded + * representation into a destination buffer. The callback returns the + * number of byte strings encoded and the bytes needed for the encoded + * representation. The encoded representation has header information + * prepended and is written as a block to the underlying file object. + * + * On entry, \c page_max is the configured maximum size for objects of + * this type. (This value is provided for convenience, and will be + * either the \c internal_page_max or \c leaf_page_max value specified + * to WT_SESSION::create when the object was created.) 
+ * + * On entry, \c split_pct is the configured Btree page split size for + * this object. (This value is provided for convenience, and will be + * the \c split_pct value specified to WT_SESSION::create when the + * object was created.) + * + * On entry, \c extra is a count of additional bytes that will be added + * to the encoded representation before it is written. In other words, + * if the target write size is 8KB, the returned encoded representation + * should be less than or equal to (8KB - \c extra). The method does + * not need to skip bytes in the destination buffer based on \c extra, + * the method should only use \c extra to decide how many bytes to store + * into the destination buffer for its ideal block size. + * + * On entry, \c src points to the source buffer; \c offsets is an array + * of \c slots 0-based offsets into \c src, where each offset is the + * start of a byte string, except for the last offset, which is the + * offset of the first byte past the end of the last byte string. (In + * other words, <code>offsets[0]</code> will be 0, the offset of the + * first byte of the first byte string in \c src, and + * <code>offsets[slots]</code> is the total length of all of the byte + * strings in the \c src buffer.) + * + * On entry, \c dst points to the destination buffer with a length + * of \c dst_len. If the WT_COMPRESSOR::pre_size method is specified, + * the destination buffer will be at least the size returned by that + * method; otherwise, the destination buffer will be at least the + * maximum size for the page being written (that is, when writing a + * row-store leaf page, the destination buffer will be at least as + * large as the \c leaf_page_max configuration value). + * + * After successful completion, the callback should return \c 0, and + * set \c result_slotsp to the number of byte strings encoded and + * \c result_lenp to the bytes needed for the encoded representation. 
+ * + * There is no requirement the callback encode any or all of the byte + * strings passed by WiredTiger. If the callback does not encode any + * of the byte strings and compression should not be retried, the + * callback should set \c result_slotsp to 0. + * + * If the callback does not encode any of the byte strings and + * compression should be retried with additional byte strings, the + * callback must return \c EAGAIN. In that case, WiredTiger will + * accumulate more rows and repeat the call. + * + * If there are no more rows to accumulate or the callback indicates + * that it cannot be retried, WiredTiger writes the remaining rows + * using \c WT_COMPRESSOR::compress. + * + * On entry, \c final is zero if there are more rows to be written as + * part of this page (if there will be additional data provided to the + * callback), and non-zero if there are no more rows to be written as + * part of this page. If \c final is set and the callback fails to + * encode any rows, WiredTiger writes the remaining rows without further + * calls to the callback. If \c final is set and the callback encodes + * any number of rows, WiredTiger continues to call the callback until + * all of the rows are encoded or the callback fails to encode any rows. + * + * The WT_COMPRESSOR::compress_raw callback is intended for applications + * wanting to create disk blocks in specific sizes. + * WT_COMPRESSOR::compress_raw is not a replacement for + * WT_COMPRESSOR::compress: objects which WT_COMPRESSOR::compress_raw + * cannot handle (for example, overflow key or value items), or which + * WT_COMPRESSOR::compress_raw chooses not to compress for any reason + * (for example, if WT_COMPRESSOR::compress_raw callback chooses not to + * compress a small number of rows, but the page being written has no + * more rows to accumulate), will be passed to WT_COMPRESSOR::compress. 
+ * + * The WT_COMPRESSOR::compress_raw callback is only called for objects + * where it is applicable, that is, for row-store and variable-length + * column-store objects, where both row-store key prefix compression + * and row-store and variable-length column-store dictionary compression + * are \b not configured. When WT_COMPRESSOR::compress_raw is not + * applicable, the WT_COMPRESSOR::compress callback is used instead. + * + * @param[in] page_max the configured maximum page size for this object + * @param[in] split_pct the configured page split size for this object + * @param[in] extra the count of the additional bytes + * @param[in] src the data to compress + * @param[in] offsets the byte offsets of the byte strings in src + * @param[in] slots the number of entries in offsets + * @param[in] dst the destination buffer + * @param[in] dst_len the length of the destination buffer + * @param[in] final non-zero if there are no more rows to accumulate + * @param[out] result_lenp the length of the compressed data + * @param[out] result_slotsp the number of byte offsets taken + * @returns zero for success, non-zero to indicate an error. + */ + int (*compress_raw)(WT_COMPRESSOR *compressor, WT_SESSION *session, + size_t page_max, int split_pct, size_t extra, + uint8_t *src, uint32_t *offsets, uint32_t slots, + uint8_t *dst, size_t dst_len, + int final, + size_t *result_lenp, uint32_t *result_slotsp); + + /*! + * Callback to decompress a chunk of data. + * + * WT_COMPRESSOR::decompress takes a source buffer and a destination + * buffer. The contents are switched from \c compress: the + * source buffer is the compressed value, and the destination buffer is + * sized to be the original size. If the callback successfully + * decompresses the source buffer to the destination buffer, it returns + * 0. If an error occurs, it returns an errno or WiredTiger error code. 
+ * The source buffer that WT_COMPRESSOR::decompress takes may have a + * size that is rounded up from the size originally produced by + * WT_COMPRESSOR::compress, with the remainder of the buffer set to + * zeroes. Most compressors do not care about this difference if the + * size to be decompressed can be implicitly discovered from the + * compressed data. If your compressor cares, you may need to allocate + * space for, and store, the actual size in the compressed buffer. See + * the source code for the included snappy compressor for an example. + * + * On entry, \c src will point to memory, with the length of the memory + * in \c src_len. After successful completion, the callback should + * return \c 0 and set \c result_lenp to the number of bytes required + * for the decompressed representation. + * + * If the \c dst buffer is not big enough to hold the decompressed + * data, the callback should return an error. + * + * @param[in] src the data to decompress + * @param[in] src_len the length of the data to decompress + * @param[in] dst the destination buffer + * @param[in] dst_len the length of the destination buffer + * @param[out] result_lenp the length of the decompressed data + * @returns zero for success, non-zero to indicate an error. + * + * @snippet nop_compress.c WT_COMPRESSOR decompress + */ + int (*decompress)(WT_COMPRESSOR *compressor, WT_SESSION *session, + uint8_t *src, size_t src_len, + uint8_t *dst, size_t dst_len, + size_t *result_lenp); + + /*! + * Callback to size a destination buffer for compression + * + * WT_COMPRESSOR::pre_size is an optional callback that, given the + * source buffer and size, produces the size of the destination buffer + * to be given to WT_COMPRESSOR::compress. This is useful for + * compressors that assume that the output buffer is sized for the + * worst case and thus no overrun checks are made. If your compressor + * works like this, WT_COMPRESSOR::pre_size will need to be defined. 
 + * See the source code for the snappy compressor for an example. + * However, if your compressor detects and avoids overruns against its + * target buffer, you will not need to define WT_COMPRESSOR::pre_size. + * When WT_COMPRESSOR::pre_size is set to NULL, the destination buffer + * is sized the same as the source buffer. This is always sufficient, + * since a compression result that is larger than the source buffer is + * discarded by WiredTiger. + * + * If not NULL, this callback is called before each call to + * WT_COMPRESSOR::compress to determine the size of the destination + * buffer to provide. If the callback is NULL, the destination + * buffer will be the same size as the source buffer. + * + * The callback should set \c result_lenp to a suitable buffer size + * for compression, typically the maximum length required by + * WT_COMPRESSOR::compress. + * + * This callback function is for compressors that require an output + * buffer larger than the source buffer (for example, that do not + * check for buffer overflow during compression). + * + * @param[in] src the data to compress + * @param[in] src_len the length of the data to compress + * @param[out] result_lenp the required destination buffer size + * @returns zero for success, non-zero to indicate an error. + * + * @snippet nop_compress.c WT_COMPRESSOR presize + */ + int (*pre_size)(WT_COMPRESSOR *compressor, WT_SESSION *session, + uint8_t *src, size_t src_len, size_t *result_lenp); + + /*! + * If non-NULL, a callback performed when the database is closed. + * + * The WT_COMPRESSOR::terminate callback is intended to allow cleanup, + * the handle will not be subsequently accessed by WiredTiger. + * + * @snippet nop_compress.c WT_COMPRESSOR terminate + */ + int (*terminate)(WT_COMPRESSOR *compressor, WT_SESSION *session); +}; + +/*! + * Applications can extend WiredTiger by providing new implementations of the + * WT_DATA_SOURCE class. 
Each data source supports a different URI scheme for + * data sources to WT_SESSION::create, WT_SESSION::open_cursor and related + * methods. See @ref custom_data_sources for more information. + * + * <b>Thread safety:</b> WiredTiger may invoke methods on the WT_DATA_SOURCE + * interface from multiple threads concurrently. It is the responsibility of + * the implementation to protect any shared data. + * + * Applications register their implementation with WiredTiger by calling + * WT_CONNECTION::add_data_source. + * + * @snippet ex_data_source.c WT_DATA_SOURCE register + */ +struct __wt_data_source { + /*! + * Callback to create a new object. + * + * @snippet ex_data_source.c WT_DATA_SOURCE create + */ + int (*create)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, + const char *uri, WT_CONFIG_ARG *config); + + /*! + * Callback to compact an object. + * + * @snippet ex_data_source.c WT_DATA_SOURCE compact + */ + int (*compact)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, + const char *uri, WT_CONFIG_ARG *config); + + /*! + * Callback to drop an object. + * + * @snippet ex_data_source.c WT_DATA_SOURCE drop + */ + int (*drop)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, + const char *uri, WT_CONFIG_ARG *config); + + /*! + * Callback to initialize a cursor. + * + * @snippet ex_data_source.c WT_DATA_SOURCE open_cursor + */ + int (*open_cursor)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, + const char *uri, WT_CONFIG_ARG *config, WT_CURSOR **new_cursor); + + /*! + * Callback to rename an object. + * + * @snippet ex_data_source.c WT_DATA_SOURCE rename + */ + int (*rename)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, + const char *uri, const char *newuri, WT_CONFIG_ARG *config); + + /*! + * Callback to salvage an object. + * + * @snippet ex_data_source.c WT_DATA_SOURCE salvage + */ + int (*salvage)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, + const char *uri, WT_CONFIG_ARG *config); + + /*! + * Callback to truncate an object. 
+ * + * @snippet ex_data_source.c WT_DATA_SOURCE truncate + */ + int (*truncate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, + const char *uri, WT_CONFIG_ARG *config); + + /*! + * Callback to truncate a range of an object. + * + * @snippet ex_data_source.c WT_DATA_SOURCE range truncate + */ + int (*range_truncate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, + WT_CURSOR *start, WT_CURSOR *stop); + + /*! + * Callback to verify an object. + * + * @snippet ex_data_source.c WT_DATA_SOURCE verify + */ + int (*verify)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, + const char *uri, WT_CONFIG_ARG *config); + + /*! + * Callback to checkpoint the database. + * + * @snippet ex_data_source.c WT_DATA_SOURCE checkpoint + */ + int (*checkpoint)( + WT_DATA_SOURCE *dsrc, WT_SESSION *session, WT_CONFIG_ARG *config); + + /*! + * If non-NULL, a callback performed when the database is closed. + * + * The WT_DATA_SOURCE::terminate callback is intended to allow cleanup, + * the handle will not be subsequently accessed by WiredTiger. + * + * @snippet ex_data_source.c WT_DATA_SOURCE terminate + */ + int (*terminate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session); +}; + +/*! + * The interface implemented by applications to provide custom extraction of + * index keys or column group values. + * + * Applications register implementations with WiredTiger by calling + * WT_CONNECTION::add_extractor. + * + * @snippet ex_all.c WT_EXTRACTOR register + */ +struct __wt_extractor { + /*! + * Callback to extract a value for an index or column group. + * + * @errors + * + * @snippet ex_all.c WT_EXTRACTOR + */ + int (*extract)(WT_EXTRACTOR *extractor, WT_SESSION *session, + const WT_ITEM *key, const WT_ITEM *value, WT_ITEM *result); +}; + +/*! + * Entry point to an extension, called when the extension is loaded. 
+ * + * @param connection the connection handle + * @param config the config information passed to WT_CONNECTION::load_extension + * @errors + */ +extern int wiredtiger_extension_init( + WT_CONNECTION *connection, WT_CONFIG_ARG *config); + +/*! + * Optional cleanup function for an extension, called during + * WT_CONNECTION::close. + * + * @param connection the connection handle + * @errors + */ +extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); + +/*! @} */ + +/******************************************* + * Statistic reference. + *******************************************/ +/*! + * @addtogroup wt + * @{ + */ +/* + * DO NOT EDIT: automatically built by dist/api_stat.py. + * Statistics section: BEGIN + */ + +/*! + * @name Connection statistics + * @anchor statistics_keys + * @anchor statistics_conn + * Statistics are accessed through cursors with \c "statistics:" URIs. + * Individual statistics can be queried through the cursor using the following + * keys. See @ref data_statistics for more information. + * @{ + */ +/*! async: number of allocation state races */ +#define WT_STAT_CONN_ASYNC_ALLOC_RACE 1000 +/*! async: number of op slots viewed for alloc */ +#define WT_STAT_CONN_ASYNC_ALLOC_VIEW 1001 +/*! async: current work queue length */ +#define WT_STAT_CONN_ASYNC_CUR_QUEUE 1002 +/*! async: number of async flush calls */ +#define WT_STAT_CONN_ASYNC_FLUSH 1003 +/*! async: number of times op allocation failed */ +#define WT_STAT_CONN_ASYNC_FULL 1004 +/*! async: maximum work queue length */ +#define WT_STAT_CONN_ASYNC_MAX_QUEUE 1005 +/*! async: number of times worker found no work */ +#define WT_STAT_CONN_ASYNC_NOWORK 1006 +/*! async: op allocations */ +#define WT_STAT_CONN_ASYNC_OP_ALLOC 1007 +/*! async: op compact calls */ +#define WT_STAT_CONN_ASYNC_OP_COMPACT 1008 +/*! async: op insert calls */ +#define WT_STAT_CONN_ASYNC_OP_INSERT 1009 +/*! async: op remove calls */ +#define WT_STAT_CONN_ASYNC_OP_REMOVE 1010 +/*! 
async: op search calls */ +#define WT_STAT_CONN_ASYNC_OP_SEARCH 1011 +/*! async: op update calls */ +#define WT_STAT_CONN_ASYNC_OP_UPDATE 1012 +/*! block manager: mapped bytes read */ +#define WT_STAT_CONN_BLOCK_BYTE_MAP_READ 1013 +/*! block manager: bytes read */ +#define WT_STAT_CONN_BLOCK_BYTE_READ 1014 +/*! block manager: bytes written */ +#define WT_STAT_CONN_BLOCK_BYTE_WRITE 1015 +/*! block manager: mapped blocks read */ +#define WT_STAT_CONN_BLOCK_MAP_READ 1016 +/*! block manager: blocks pre-loaded */ +#define WT_STAT_CONN_BLOCK_PRELOAD 1017 +/*! block manager: blocks read */ +#define WT_STAT_CONN_BLOCK_READ 1018 +/*! block manager: blocks written */ +#define WT_STAT_CONN_BLOCK_WRITE 1019 +/*! cache: tracked dirty bytes in the cache */ +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1020 +/*! cache: bytes currently in the cache */ +#define WT_STAT_CONN_CACHE_BYTES_INUSE 1021 +/*! cache: maximum bytes configured */ +#define WT_STAT_CONN_CACHE_BYTES_MAX 1022 +/*! cache: bytes read into cache */ +#define WT_STAT_CONN_CACHE_BYTES_READ 1023 +/*! cache: bytes written from cache */ +#define WT_STAT_CONN_CACHE_BYTES_WRITE 1024 +/*! cache: checkpoint blocked page eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1025 +/*! cache: unmodified pages evicted */ +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1026 +/*! cache: page split during eviction deepened the tree */ +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1027 +/*! cache: modified pages evicted */ +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1028 +/*! cache: pages selected for eviction unable to be evicted */ +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1029 +/*! cache: pages evicted because they exceeded the in-memory maximum */ +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1030 +/*! cache: failed eviction of pages that exceeded the in-memory maximum */ +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1031 +/*! cache: hazard pointer blocked page eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1032 +/*! 
cache: internal pages evicted */ +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1033 +/*! cache: eviction server candidate queue empty when topping up */ +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1034 +/*! cache: eviction server candidate queue not empty when topping up */ +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1035 +/*! cache: eviction server evicting pages */ +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1036 +/*! cache: eviction server populating queue, but not evicting pages */ +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1037 +/*! cache: eviction server unable to reach eviction goal */ +#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1038 +/*! cache: pages split during eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1039 +/*! cache: pages walked for eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1040 +/*! cache: tracked dirty pages in the cache */ +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1041 +/*! cache: pages currently held in the cache */ +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1042 +/*! cache: pages read into cache */ +#define WT_STAT_CONN_CACHE_READ 1043 +/*! cache: pages written from cache */ +#define WT_STAT_CONN_CACHE_WRITE 1044 +/*! conn: pthread mutex condition wait calls */ +#define WT_STAT_CONN_COND_WAIT 1045 +/*! Btree: cursor create calls */ +#define WT_STAT_CONN_CURSOR_CREATE 1046 +/*! Btree: cursor insert calls */ +#define WT_STAT_CONN_CURSOR_INSERT 1047 +/*! Btree: cursor next calls */ +#define WT_STAT_CONN_CURSOR_NEXT 1048 +/*! Btree: cursor prev calls */ +#define WT_STAT_CONN_CURSOR_PREV 1049 +/*! Btree: cursor remove calls */ +#define WT_STAT_CONN_CURSOR_REMOVE 1050 +/*! Btree: cursor reset calls */ +#define WT_STAT_CONN_CURSOR_RESET 1051 +/*! Btree: cursor search calls */ +#define WT_STAT_CONN_CURSOR_SEARCH 1052 +/*! Btree: cursor search near calls */ +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1053 +/*! Btree: cursor update calls */ +#define WT_STAT_CONN_CURSOR_UPDATE 1054 +/*! 
dhandle: session dhandles swept */ +#define WT_STAT_CONN_DH_SESSION_HANDLES 1055 +/*! dhandle: session sweep attempts */ +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1056 +/*! conn: files currently open */ +#define WT_STAT_CONN_FILE_OPEN 1057 +/*! log: log buffer size increases */ +#define WT_STAT_CONN_LOG_BUFFER_GROW 1058 +/*! log: total log buffer size */ +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1059 +/*! log: user provided log bytes written */ +#define WT_STAT_CONN_LOG_BYTES_USER 1060 +/*! log: log bytes written */ +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1061 +/*! log: yields waiting for previous log file close */ +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1062 +/*! log: maximum log file size */ +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1063 +/*! log: log read operations */ +#define WT_STAT_CONN_LOG_READS 1064 +/*! log: records processed by log scan */ +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1065 +/*! log: log scan records requiring two reads */ +#define WT_STAT_CONN_LOG_SCAN_REREADS 1066 +/*! log: log scan operations */ +#define WT_STAT_CONN_LOG_SCANS 1067 +/*! log: consolidated slot closures */ +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1068 +/*! log: logging bytes consolidated */ +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1069 +/*! log: consolidated slot joins */ +#define WT_STAT_CONN_LOG_SLOT_JOINS 1070 +/*! log: consolidated slot join races */ +#define WT_STAT_CONN_LOG_SLOT_RACES 1071 +/*! log: slots selected for switching that were unavailable */ +#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1072 +/*! log: record size exceeded maximum */ +#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1073 +/*! log: failed to find a slot large enough for record */ +#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1074 +/*! log: consolidated slot join transitions */ +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1075 +/*! log: log sync operations */ +#define WT_STAT_CONN_LOG_SYNC 1076 +/*! log: log write operations */ +#define WT_STAT_CONN_LOG_WRITES 1077 +/*! 
LSM: sleep for LSM checkpoint throttle */ +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1078 +/*! LSM: sleep for LSM merge throttle */ +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1079 +/*! LSM: rows merged in an LSM tree */ +#define WT_STAT_CONN_LSM_ROWS_MERGED 1080 +/*! LSM: App work units currently queued */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1081 +/*! LSM: Merge work units currently queued */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1082 +/*! LSM: tree queue hit maximum */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1083 +/*! LSM: Switch work units currently queued */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1084 +/*! LSM: tree maintenance operations scheduled */ +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1085 +/*! LSM: tree maintenance operations discarded */ +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1086 +/*! LSM: tree maintenance operations executed */ +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1087 +/*! conn: memory allocations */ +#define WT_STAT_CONN_MEMORY_ALLOCATION 1088 +/*! conn: memory frees */ +#define WT_STAT_CONN_MEMORY_FREE 1089 +/*! conn: memory re-allocations */ +#define WT_STAT_CONN_MEMORY_GROW 1090 +/*! conn: total read I/Os */ +#define WT_STAT_CONN_READ_IO 1091 +/*! reconciliation: page reconciliation calls */ +#define WT_STAT_CONN_REC_PAGES 1092 +/*! reconciliation: page reconciliation calls for eviction */ +#define WT_STAT_CONN_REC_PAGES_EVICTION 1093 +/*! reconciliation: split bytes currently awaiting free */ +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1094 +/*! reconciliation: split objects currently awaiting free */ +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1095 +/*! conn: pthread mutex shared lock read-lock calls */ +#define WT_STAT_CONN_RWLOCK_READ 1096 +/*! conn: pthread mutex shared lock write-lock calls */ +#define WT_STAT_CONN_RWLOCK_WRITE 1097 +/*! session: open cursor count */ +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1098 +/*! session: open session count */ +#define WT_STAT_CONN_SESSION_OPEN 1099 +/*! 
txn: transaction begins */ +#define WT_STAT_CONN_TXN_BEGIN 1100 +/*! txn: transaction checkpoints */ +#define WT_STAT_CONN_TXN_CHECKPOINT 1101 +/*! txn: transaction checkpoint currently running */ +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1102 +/*! txn: transactions committed */ +#define WT_STAT_CONN_TXN_COMMIT 1103 +/*! txn: transaction failures due to cache overflow */ +#define WT_STAT_CONN_TXN_FAIL_CACHE 1104 +/*! txn: transaction range of IDs currently pinned */ +#define WT_STAT_CONN_TXN_PINNED_RANGE 1105 +/*! txn: transactions rolled back */ +#define WT_STAT_CONN_TXN_ROLLBACK 1106 +/*! conn: total write I/Os */ +#define WT_STAT_CONN_WRITE_IO 1107 + +/*! + * @} + * @name Statistics for data sources + * @anchor statistics_dsrc + * @{ + */ +/*! block manager: file allocation unit size */ +#define WT_STAT_DSRC_ALLOCATION_SIZE 2000 +/*! block manager: blocks allocated */ +#define WT_STAT_DSRC_BLOCK_ALLOC 2001 +/*! block manager: checkpoint size */ +#define WT_STAT_DSRC_BLOCK_CHECKPOINT_SIZE 2002 +/*! block manager: allocations requiring file extension */ +#define WT_STAT_DSRC_BLOCK_EXTENSION 2003 +/*! block manager: blocks freed */ +#define WT_STAT_DSRC_BLOCK_FREE 2004 +/*! block manager: file magic number */ +#define WT_STAT_DSRC_BLOCK_MAGIC 2005 +/*! block manager: file major version number */ +#define WT_STAT_DSRC_BLOCK_MAJOR 2006 +/*! block manager: minor version number */ +#define WT_STAT_DSRC_BLOCK_MINOR 2007 +/*! block manager: file bytes available for reuse */ +#define WT_STAT_DSRC_BLOCK_REUSE_BYTES 2008 +/*! block manager: file size in bytes */ +#define WT_STAT_DSRC_BLOCK_SIZE 2009 +/*! LSM: bloom filters in the LSM tree */ +#define WT_STAT_DSRC_BLOOM_COUNT 2010 +/*! LSM: bloom filter false positives */ +#define WT_STAT_DSRC_BLOOM_FALSE_POSITIVE 2011 +/*! LSM: bloom filter hits */ +#define WT_STAT_DSRC_BLOOM_HIT 2012 +/*! LSM: bloom filter misses */ +#define WT_STAT_DSRC_BLOOM_MISS 2013 +/*! 
LSM: bloom filter pages evicted from cache */ +#define WT_STAT_DSRC_BLOOM_PAGE_EVICT 2014 +/*! LSM: bloom filter pages read into cache */ +#define WT_STAT_DSRC_BLOOM_PAGE_READ 2015 +/*! LSM: total size of bloom filters */ +#define WT_STAT_DSRC_BLOOM_SIZE 2016 +/*! btree: column-store variable-size deleted values */ +#define WT_STAT_DSRC_BTREE_COLUMN_DELETED 2017 +/*! btree: column-store fixed-size leaf pages */ +#define WT_STAT_DSRC_BTREE_COLUMN_FIX 2018 +/*! btree: column-store internal pages */ +#define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2019 +/*! btree: column-store variable-size leaf pages */ +#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2020 +/*! btree: pages rewritten by compaction */ +#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2021 +/*! btree: number of key/value pairs */ +#define WT_STAT_DSRC_BTREE_ENTRIES 2022 +/*! btree: fixed-record size */ +#define WT_STAT_DSRC_BTREE_FIXED_LEN 2023 +/*! btree: maximum tree depth */ +#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2024 +/*! btree: maximum internal page item size */ +#define WT_STAT_DSRC_BTREE_MAXINTLITEM 2025 +/*! btree: maximum internal page size */ +#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2026 +/*! btree: maximum leaf page item size */ +#define WT_STAT_DSRC_BTREE_MAXLEAFITEM 2027 +/*! btree: maximum leaf page size */ +#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2028 +/*! btree: overflow pages */ +#define WT_STAT_DSRC_BTREE_OVERFLOW 2029 +/*! btree: row-store internal pages */ +#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2030 +/*! btree: row-store leaf pages */ +#define WT_STAT_DSRC_BTREE_ROW_LEAF 2031 +/*! cache: bytes read into cache */ +#define WT_STAT_DSRC_CACHE_BYTES_READ 2032 +/*! cache: bytes written from cache */ +#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2033 +/*! cache: checkpoint blocked page eviction */ +#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2034 +/*! cache: unmodified pages evicted */ +#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2035 +/*! 
cache: modified pages evicted */ +#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2036 +/*! cache: data source pages selected for eviction unable to be evicted */ +#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2037 +/*! cache: hazard pointer blocked page eviction */ +#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2038 +/*! cache: internal pages evicted */ +#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2039 +/*! cache: overflow values cached in memory */ +#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2040 +/*! cache: pages read into cache */ +#define WT_STAT_DSRC_CACHE_READ 2041 +/*! cache: overflow pages read into cache */ +#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2042 +/*! cache: pages written from cache */ +#define WT_STAT_DSRC_CACHE_WRITE 2043 +/*! compression: raw compression call failed, no additional data available */ +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2044 +/*! compression: raw compression call failed, additional data available */ +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2045 +/*! compression: raw compression call succeeded */ +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2046 +/*! compression: compressed pages read */ +#define WT_STAT_DSRC_COMPRESS_READ 2047 +/*! compression: compressed pages written */ +#define WT_STAT_DSRC_COMPRESS_WRITE 2048 +/*! compression: page written failed to compress */ +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2049 +/*! compression: page written was too small to compress */ +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2050 +/*! cursor: create calls */ +#define WT_STAT_DSRC_CURSOR_CREATE 2051 +/*! cursor: insert calls */ +#define WT_STAT_DSRC_CURSOR_INSERT 2052 +/*! cursor: bulk-loaded cursor-insert calls */ +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2053 +/*! cursor: cursor-insert key and value bytes inserted */ +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2054 +/*! cursor: next calls */ +#define WT_STAT_DSRC_CURSOR_NEXT 2055 +/*! cursor: prev calls */ +#define WT_STAT_DSRC_CURSOR_PREV 2056 +/*! 
cursor: remove calls */ +#define WT_STAT_DSRC_CURSOR_REMOVE 2057 +/*! cursor: cursor-remove key bytes removed */ +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2058 +/*! cursor: reset calls */ +#define WT_STAT_DSRC_CURSOR_RESET 2059 +/*! cursor: search calls */ +#define WT_STAT_DSRC_CURSOR_SEARCH 2060 +/*! cursor: search near calls */ +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2061 +/*! cursor: update calls */ +#define WT_STAT_DSRC_CURSOR_UPDATE 2062 +/*! cursor: cursor-update value bytes updated */ +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2063 +/*! LSM: sleep for LSM checkpoint throttle */ +#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2064 +/*! LSM: chunks in the LSM tree */ +#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2065 +/*! LSM: highest merge generation in the LSM tree */ +#define WT_STAT_DSRC_LSM_GENERATION_MAX 2066 +/*! LSM: queries that could have benefited from a Bloom filter that did + * not exist */ +#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2067 +/*! LSM: sleep for LSM merge throttle */ +#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2068 +/*! reconciliation: dictionary matches */ +#define WT_STAT_DSRC_REC_DICTIONARY 2069 +/*! reconciliation: internal page multi-block writes */ +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2070 +/*! reconciliation: leaf page multi-block writes */ +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2071 +/*! reconciliation: maximum blocks required for a page */ +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2072 +/*! reconciliation: internal-page overflow keys */ +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2073 +/*! reconciliation: leaf-page overflow keys */ +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2074 +/*! reconciliation: overflow values written */ +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2075 +/*! reconciliation: pages deleted */ +#define WT_STAT_DSRC_REC_PAGE_DELETE 2076 +/*! reconciliation: page checksum matches */ +#define WT_STAT_DSRC_REC_PAGE_MATCH 2077 +/*! 
reconciliation: page reconciliation calls */ +#define WT_STAT_DSRC_REC_PAGES 2078 +/*! reconciliation: page reconciliation calls for eviction */ +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2079 +/*! reconciliation: leaf page key bytes discarded using prefix compression */ +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2080 +/*! reconciliation: internal page key bytes discarded using suffix + * compression */ +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2081 +/*! session: object compaction */ +#define WT_STAT_DSRC_SESSION_COMPACT 2082 +/*! session: open cursor count */ +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2083 +/*! txn: update conflicts */ +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2084 +/*! @} */ +/* + * Statistics section: END + * DO NOT EDIT: automatically built by dist/api_stat.py. + */ +/*! + * @name Log record and operation types + * @anchor log_types + * @{ + */ +/* + * DO NOT EDIT: automatically built by dist/log.py. + * Log record declarations: BEGIN + */ +/*! invalid operation */ +#define WT_LOGOP_INVALID 0 +/*! checkpoint */ +#define WT_LOGREC_CHECKPOINT 0 +/*! transaction commit */ +#define WT_LOGREC_COMMIT 1 +/*! file sync */ +#define WT_LOGREC_FILE_SYNC 2 +/*! message */ +#define WT_LOGREC_MESSAGE 3 +/*! column put */ +#define WT_LOGOP_COL_PUT 1 +/*! column remove */ +#define WT_LOGOP_COL_REMOVE 2 +/*! column truncate */ +#define WT_LOGOP_COL_TRUNCATE 3 +/*! row put */ +#define WT_LOGOP_ROW_PUT 4 +/*! row remove */ +#define WT_LOGOP_ROW_REMOVE 5 +/*! row truncate */ +#define WT_LOGOP_ROW_TRUNCATE 6 +/* + * Log record declarations: END + * DO NOT EDIT: automatically built by dist/log.py. + */ +/*! @} */ +/*! 
 @} */
+
+#undef __F
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* __WIREDTIGER_H_ */
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger_ext.h b/src/third_party/wiredtiger/src/include/wiredtiger_ext.h
new file mode 100644
index 00000000000..fd0282cd50c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/wiredtiger_ext.h
@@ -0,0 +1,398 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#ifndef __WIREDTIGER_EXT_H_
+#define __WIREDTIGER_EXT_H_
+
+#include <wiredtiger.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if !defined(SWIG)
+
+/*!
+ * @addtogroup wt_ext
+ * @{
+ */
+
+/*!
+ * Read-committed isolation level, returned by
+ * WT_EXTENSION_API::transaction_isolation_level.
+ */
+#define WT_TXN_ISO_READ_COMMITTED 1
+/*!
+ * Read-uncommitted isolation level, returned by
+ * WT_EXTENSION_API::transaction_isolation_level.
+ */
+#define WT_TXN_ISO_READ_UNCOMMITTED 2
+/*!
+ * Snapshot isolation level, returned by
+ * WT_EXTENSION_API::transaction_isolation_level.
+ */
+#define WT_TXN_ISO_SNAPSHOT 3
+
+typedef struct __wt_txn_notify WT_TXN_NOTIFY;
+/*!
+ * A handler registered with WT_EXTENSION_API::transaction_notify,
+ * called when the session's current transaction is resolved.
+ */
+struct __wt_txn_notify {
+ /*!
+ * A method called when the session's current transaction is committed
+ * or rolled back.
+ *
+ * @param notify a pointer to the event handler
+ * @param session the current session handle
+ * @param txnid the transaction ID
+ * @param committed an integer value which is non-zero if the
+ * transaction is being committed.
+ */
+ int (*notify)(WT_TXN_NOTIFY *notify, WT_SESSION *session,
+ uint64_t txnid, int committed);
+};
+
+/*!
+ * Table of WiredTiger extension methods.
+ *
+ * This structure is used to provide a set of WiredTiger methods to extension
+ * modules without needing to link the modules with the WiredTiger library.
+ * + * The extension methods may be used both by modules that are linked with + * the WiredTiger library (for example, a data source configured using the + * WT_CONNECTION::add_data_source method), and by modules not linked with the + * WiredTiger library (for example, a compression module configured using the + * WT_CONNECTION::add_compressor method). + * + * To use these functions: + * - include the wiredtiger_ext.h header file, + * - declare a variable which references a WT_EXTENSION_API structure, and + * - initialize the variable using WT_CONNECTION::get_extension_api method. + * + * @snippet ex_data_source.c WT_EXTENSION_API declaration + * + * The following code is from the sample compression module, where compression + * extension functions are configured in the extension's entry point: + * + * @snippet nop_compress.c WT_COMPRESSOR initialization structure + * @snippet nop_compress.c WT_COMPRESSOR initialization function + */ +struct __wt_extension_api { +/* !!! To maintain backwards compatibility, this structure is append-only. */ +#if !defined(DOXYGEN) + /* + * Private fields. + */ + WT_CONNECTION *conn; /* Enclosing connection */ +#endif + /*! + * Insert an error message into the WiredTiger error stream. + * + * @param wt_api the extension handle + * @param session the session handle (or NULL if none available) + * @param fmt a printf-like format specification + * @errors + * + * @snippet ex_data_source.c WT_EXTENSION_API err_printf + */ + int (*err_printf)(WT_EXTENSION_API *wt_api, + WT_SESSION *session, const char *fmt, ...); + + /*! + * Insert a message into the WiredTiger message stream. + * + * @param wt_api the extension handle + * @param session the session handle (or NULL if none available) + * @param fmt a printf-like format specification + * @errors + * + * @snippet ex_data_source.c WT_EXTENSION_API msg_printf + */ + int (*msg_printf)( + WT_EXTENSION_API *, WT_SESSION *session, const char *fmt, ...); + + /*! 
+ * Return information about an error as a string; the strerror method + * is a superset of the ISO C99/POSIX 1003.1-2001 function strerror. + * + * @snippet ex_data_source.c WT_EXTENSION_API strerror + * + * @param err a return value from a WiredTiger, C library or POSIX + * function + * @returns a string representation of the error + */ + const char *(*strerror)(int err); + + /*! + * Allocate short-term use scratch memory. + * + * @param wt_api the extension handle + * @param session the session handle (or NULL if none available) + * @param bytes the number of bytes of memory needed + * @returns A valid memory reference on success or NULL on error + * + * @snippet ex_data_source.c WT_EXTENSION_API scr_alloc + */ + void *(*scr_alloc)( + WT_EXTENSION_API *wt_api, WT_SESSION *session, size_t bytes); + + /*! + * Free short-term use scratch memory. + * + * @param wt_api the extension handle + * @param session the session handle (or NULL if none available) + * @param ref a memory reference returned by WT_EXTENSION_API::scr_alloc + * + * @snippet ex_data_source.c WT_EXTENSION_API scr_free + */ + void (*scr_free)(WT_EXTENSION_API *, WT_SESSION *session, void *ref); + + /*! + * Configure the extension collator method. + * + * @param wt_api the extension handle + * @param session the session handle (or NULL if none available) + * @param config the configuration information passed to an application + * @param collatorp the selector collator, if any + * @param ownp set if the collator terminate method should be called + * when no longer needed + * @errors + * + * @snippet ex_data_source.c WT_EXTENSION collator config + */ + int (*collator_config)(WT_EXTENSION_API *wt_api, WT_SESSION *session, + WT_CONFIG_ARG *config, WT_COLLATOR **collatorp, int *ownp); + + /*! + * The extension collator method. 
+ * + * @param wt_api the extension handle + * @param session the session handle (or NULL if none available) + * @param collator the collator (or NULL if none available) + * @param first first item + * @param second second item + * @param[out] cmp set less than 0 if \c first collates less than + * \c second, set equal to 0 if \c first collates equally to \c second, + * set greater than 0 if \c first collates greater than \c second + * @errors + * + * @snippet ex_data_source.c WT_EXTENSION collate + */ + int (*collate)(WT_EXTENSION_API *wt_api, WT_SESSION *session, + WT_COLLATOR *collator, WT_ITEM *first, WT_ITEM *second, int *cmp); + + /*! + * @copydoc wiredtiger_config_parser_open + */ + int (*config_parser_open)(WT_EXTENSION_API *wt_api, WT_SESSION *session, + const char *config, size_t len, WT_CONFIG_PARSER **config_parserp); + + /*! + * Return the value of a configuration string. + * + * @param wt_api the extension handle + * @param session the session handle (or NULL if none available) + * @param key configuration key string + * @param config the configuration information passed to an application + * @param value the returned value + * @errors + * + * @snippet ex_data_source.c WT_EXTENSION config_get + */ + int (*config_get)(WT_EXTENSION_API *wt_api, WT_SESSION *session, + WT_CONFIG_ARG *config, const char *key, WT_CONFIG_ITEM *value); + + /*! + * Insert a row into the metadata if it does not already exist. + * + * @param wt_api the extension handle + * @param session the session handle (or NULL if none available) + * @param key row key + * @param value row value + * @errors + * + * @snippet ex_data_source.c WT_EXTENSION metadata insert + */ + int (*metadata_insert)(WT_EXTENSION_API *wt_api, + WT_SESSION *session, const char *key, const char *value); + + /*! + * Remove a row from the metadata. 
+ * + * @param wt_api the extension handle + * @param session the session handle (or NULL if none available) + * @param key row key + * @errors + * + * @snippet ex_data_source.c WT_EXTENSION metadata remove + */ + int (*metadata_remove)( + WT_EXTENSION_API *wt_api, WT_SESSION *session, const char *key); + + /*! + * Return a row from the metadata. + * + * @param wt_api the extension handle + * @param session the session handle (or NULL if none available) + * @param key row key + * @param [out] valuep the row value + * @errors + * + * @snippet ex_data_source.c WT_EXTENSION metadata search + */ + int (*metadata_search)(WT_EXTENSION_API *wt_api, + WT_SESSION *session, const char *key, const char **valuep); + + /*! + * Update a row in the metadata by either inserting a new record or + * updating an existing record. + * + * @param wt_api the extension handle + * @param session the session handle (or NULL if none available) + * @param key row key + * @param value row value + * @errors + * + * @snippet ex_data_source.c WT_EXTENSION metadata update + */ + int (*metadata_update)(WT_EXTENSION_API *wt_api, + WT_SESSION *session, const char *key, const char *value); + + /*! + * Pack a structure into a buffer. + * See ::wiredtiger_struct_pack for details. + * + * @param wt_api the extension handle + * @param session the session handle + * @param buffer a pointer to a packed byte array + * @param size the number of valid bytes in the buffer + * @param format the data format, see @ref packing + * @errors + */ + int (*struct_pack)(WT_EXTENSION_API *wt_api, WT_SESSION *session, + void *buffer, size_t size, const char *format, ...); + + /*! + * Calculate the size required to pack a structure. + * See ::wiredtiger_struct_size for details. 
+ * + * @param wt_api the extension handle + * @param session the session handle + * @param sizep a location where the number of bytes needed for the + * matching call to WT_EXTENSION_API::struct_pack is returned + * @param format the data format, see @ref packing + * @errors + */ + int (*struct_size)(WT_EXTENSION_API *wt_api, WT_SESSION *session, + size_t *sizep, const char *format, ...); + + /*! + * Unpack a structure from a buffer. + * See ::wiredtiger_struct_unpack for details. + * + * @param wt_api the extension handle + * @param session the session handle + * @param buffer a pointer to a packed byte array + * @param size the number of valid bytes in the buffer + * @param format the data format, see @ref packing + * @errors + */ + int (*struct_unpack)(WT_EXTENSION_API *wt_api, WT_SESSION *session, + const void *buffer, size_t size, const char *format, ...); + + /*! + * Return the current transaction ID. + * + * @param wt_api the extension handle + * @param session the session handle + * @returns the current transaction ID. + * + * @snippet ex_data_source.c WT_EXTENSION transaction ID + */ + uint64_t (*transaction_id)(WT_EXTENSION_API *wt_api, + WT_SESSION *session); + + /*! + * Return the current transaction's isolation level; returns one of + * ::WT_TXN_ISO_READ_COMMITTED, ::WT_TXN_ISO_READ_UNCOMMITTED, or + * ::WT_TXN_ISO_SNAPSHOT. + * + * @param wt_api the extension handle + * @param session the session handle + * @returns the current transaction's isolation level. + * + * @snippet ex_data_source.c WT_EXTENSION transaction isolation level + */ + int (*transaction_isolation_level)(WT_EXTENSION_API *wt_api, + WT_SESSION *session); + + /*! + * Request notification of transaction resolution by specifying a + * function to be called when the session's current transaction is + * either committed or rolled back. If the transaction is being + * committed, but the notification function returns an error, the + * transaction will be rolled back. 
	/*!
	 * Return the oldest transaction ID not yet visible to a running
	 * transaction.
	 *
	 * @param wt_api the extension handle
	 * @returns the oldest transaction ID not yet visible to a running
	 * transaction.
	 *
	 * @snippet ex_data_source.c WT_EXTENSION transaction oldest
	 */
	uint64_t (*transaction_oldest)(WT_EXTENSION_API *wt_api);
+ * + * See the file LICENSE for redistribution information. + */ + +#if defined(__cplusplus) +extern "C" { +#endif + +/******************************************* + * WiredTiger public include file, and configuration control. + *******************************************/ +#include "wiredtiger_config.h" +#include "wiredtiger_ext.h" + +/******************************************* + * WiredTiger system include files. + *******************************************/ +#ifndef _WIN32 +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/uio.h> +#endif +#include <ctype.h> +#ifndef _WIN32 +#include <dlfcn.h> +#endif +#include <errno.h> +#include <fcntl.h> +#include <inttypes.h> +#ifdef _WIN32 +#include <io.h> +#endif +#include <limits.h> +#ifndef _WIN32 +#include <pthread.h> +#endif +#ifdef HAVE_PTHREAD_NP_H +#include <pthread_np.h> +#endif +#include <stddef.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#ifndef _WIN32 +#include <unistd.h> +#endif +#include <time.h> +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#endif + +/******************************************* + * WiredTiger externally maintained include files. + *******************************************/ +#include "queue.h" + +/* + * DO NOT EDIT: automatically built by dist/s_typedef. 
+ * Forward type declarations for internal types: BEGIN + */ +struct __wt_addr; + typedef struct __wt_addr WT_ADDR; +struct __wt_async; + typedef struct __wt_async WT_ASYNC; +struct __wt_async_cursor; + typedef struct __wt_async_cursor WT_ASYNC_CURSOR; +struct __wt_async_format; + typedef struct __wt_async_format WT_ASYNC_FORMAT; +struct __wt_async_op_impl; + typedef struct __wt_async_op_impl WT_ASYNC_OP_IMPL; +struct __wt_async_worker_state; + typedef struct __wt_async_worker_state WT_ASYNC_WORKER_STATE; +struct __wt_block; + typedef struct __wt_block WT_BLOCK; +struct __wt_block_ckpt; + typedef struct __wt_block_ckpt WT_BLOCK_CKPT; +struct __wt_block_desc; + typedef struct __wt_block_desc WT_BLOCK_DESC; +struct __wt_block_header; + typedef struct __wt_block_header WT_BLOCK_HEADER; +struct __wt_bloom; + typedef struct __wt_bloom WT_BLOOM; +struct __wt_bloom_hash; + typedef struct __wt_bloom_hash WT_BLOOM_HASH; +struct __wt_bm; + typedef struct __wt_bm WT_BM; +struct __wt_btree; + typedef struct __wt_btree WT_BTREE; +struct __wt_cache; + typedef struct __wt_cache WT_CACHE; +struct __wt_cache_pool; + typedef struct __wt_cache_pool WT_CACHE_POOL; +struct __wt_cell; + typedef struct __wt_cell WT_CELL; +struct __wt_cell_unpack; + typedef struct __wt_cell_unpack WT_CELL_UNPACK; +struct __wt_ckpt; + typedef struct __wt_ckpt WT_CKPT; +struct __wt_col; + typedef struct __wt_col WT_COL; +struct __wt_col_rle; + typedef struct __wt_col_rle WT_COL_RLE; +struct __wt_colgroup; + typedef struct __wt_colgroup WT_COLGROUP; +struct __wt_compact; + typedef struct __wt_compact WT_COMPACT; +struct __wt_condvar; + typedef struct __wt_condvar WT_CONDVAR; +struct __wt_config; + typedef struct __wt_config WT_CONFIG; +struct __wt_config_check; + typedef struct __wt_config_check WT_CONFIG_CHECK; +struct __wt_config_entry; + typedef struct __wt_config_entry WT_CONFIG_ENTRY; +struct __wt_config_parser_impl; + typedef struct __wt_config_parser_impl WT_CONFIG_PARSER_IMPL; +struct 
__wt_connection_impl; + typedef struct __wt_connection_impl WT_CONNECTION_IMPL; +struct __wt_connection_stats; + typedef struct __wt_connection_stats WT_CONNECTION_STATS; +struct __wt_connection_stats_spinlock; + typedef struct __wt_connection_stats_spinlock WT_CONNECTION_STATS_SPINLOCK; +struct __wt_cursor_backup; + typedef struct __wt_cursor_backup WT_CURSOR_BACKUP; +struct __wt_cursor_backup_entry; + typedef struct __wt_cursor_backup_entry WT_CURSOR_BACKUP_ENTRY; +struct __wt_cursor_btree; + typedef struct __wt_cursor_btree WT_CURSOR_BTREE; +struct __wt_cursor_bulk; + typedef struct __wt_cursor_bulk WT_CURSOR_BULK; +struct __wt_cursor_config; + typedef struct __wt_cursor_config WT_CURSOR_CONFIG; +struct __wt_cursor_data_source; + typedef struct __wt_cursor_data_source WT_CURSOR_DATA_SOURCE; +struct __wt_cursor_dump; + typedef struct __wt_cursor_dump WT_CURSOR_DUMP; +struct __wt_cursor_index; + typedef struct __wt_cursor_index WT_CURSOR_INDEX; +struct __wt_cursor_json; + typedef struct __wt_cursor_json WT_CURSOR_JSON; +struct __wt_cursor_log; + typedef struct __wt_cursor_log WT_CURSOR_LOG; +struct __wt_cursor_lsm; + typedef struct __wt_cursor_lsm WT_CURSOR_LSM; +struct __wt_cursor_metadata; + typedef struct __wt_cursor_metadata WT_CURSOR_METADATA; +struct __wt_cursor_stat; + typedef struct __wt_cursor_stat WT_CURSOR_STAT; +struct __wt_cursor_table; + typedef struct __wt_cursor_table WT_CURSOR_TABLE; +struct __wt_data_handle; + typedef struct __wt_data_handle WT_DATA_HANDLE; +struct __wt_data_handle_cache; + typedef struct __wt_data_handle_cache WT_DATA_HANDLE_CACHE; +struct __wt_dlh; + typedef struct __wt_dlh WT_DLH; +struct __wt_dsrc_stats; + typedef struct __wt_dsrc_stats WT_DSRC_STATS; +struct __wt_evict_entry; + typedef struct __wt_evict_entry WT_EVICT_ENTRY; +struct __wt_evict_worker; + typedef struct __wt_evict_worker WT_EVICT_WORKER; +struct __wt_ext; + typedef struct __wt_ext WT_EXT; +struct __wt_extlist; + typedef struct __wt_extlist WT_EXTLIST; +struct 
__wt_fh; + typedef struct __wt_fh WT_FH; +struct __wt_hazard; + typedef struct __wt_hazard WT_HAZARD; +struct __wt_ikey; + typedef struct __wt_ikey WT_IKEY; +struct __wt_index; + typedef struct __wt_index WT_INDEX; +struct __wt_insert; + typedef struct __wt_insert WT_INSERT; +struct __wt_insert_head; + typedef struct __wt_insert_head WT_INSERT_HEAD; +struct __wt_log_desc; + typedef struct __wt_log_desc WT_LOG_DESC; +struct __wt_log_op_desc; + typedef struct __wt_log_op_desc WT_LOG_OP_DESC; +struct __wt_log_rec_desc; + typedef struct __wt_log_rec_desc WT_LOG_REC_DESC; +struct __wt_lsm_chunk; + typedef struct __wt_lsm_chunk WT_LSM_CHUNK; +struct __wt_lsm_data_source; + typedef struct __wt_lsm_data_source WT_LSM_DATA_SOURCE; +struct __wt_lsm_manager; + typedef struct __wt_lsm_manager WT_LSM_MANAGER; +struct __wt_lsm_tree; + typedef struct __wt_lsm_tree WT_LSM_TREE; +struct __wt_lsm_work_unit; + typedef struct __wt_lsm_work_unit WT_LSM_WORK_UNIT; +struct __wt_lsm_worker_args; + typedef struct __wt_lsm_worker_args WT_LSM_WORKER_ARGS; +struct __wt_lsm_worker_cookie; + typedef struct __wt_lsm_worker_cookie WT_LSM_WORKER_COOKIE; +struct __wt_multi; + typedef struct __wt_multi WT_MULTI; +struct __wt_named_collator; + typedef struct __wt_named_collator WT_NAMED_COLLATOR; +struct __wt_named_compressor; + typedef struct __wt_named_compressor WT_NAMED_COMPRESSOR; +struct __wt_named_data_source; + typedef struct __wt_named_data_source WT_NAMED_DATA_SOURCE; +struct __wt_ovfl_reuse; + typedef struct __wt_ovfl_reuse WT_OVFL_REUSE; +struct __wt_ovfl_track; + typedef struct __wt_ovfl_track WT_OVFL_TRACK; +struct __wt_ovfl_txnc; + typedef struct __wt_ovfl_txnc WT_OVFL_TXNC; +struct __wt_page; + typedef struct __wt_page WT_PAGE; +struct __wt_page_deleted; + typedef struct __wt_page_deleted WT_PAGE_DELETED; +struct __wt_page_header; + typedef struct __wt_page_header WT_PAGE_HEADER; +struct __wt_page_index; + typedef struct __wt_page_index WT_PAGE_INDEX; +struct __wt_page_modify; + 
typedef struct __wt_page_modify WT_PAGE_MODIFY; +struct __wt_process; + typedef struct __wt_process WT_PROCESS; +struct __wt_ref; + typedef struct __wt_ref WT_REF; +struct __wt_row; + typedef struct __wt_row WT_ROW; +struct __wt_rwlock; + typedef struct __wt_rwlock WT_RWLOCK; +struct __wt_salvage_cookie; + typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE; +struct __wt_scratch_track; + typedef struct __wt_scratch_track WT_SCRATCH_TRACK; +struct __wt_session_impl; + typedef struct __wt_session_impl WT_SESSION_IMPL; +struct __wt_size; + typedef struct __wt_size WT_SIZE; +struct __wt_split_stash; + typedef struct __wt_split_stash WT_SPLIT_STASH; +struct __wt_stats; + typedef struct __wt_stats WT_STATS; +struct __wt_table; + typedef struct __wt_table WT_TABLE; +struct __wt_txn; + typedef struct __wt_txn WT_TXN; +struct __wt_txn_global; + typedef struct __wt_txn_global WT_TXN_GLOBAL; +struct __wt_txn_op; + typedef struct __wt_txn_op WT_TXN_OP; +struct __wt_txn_state; + typedef struct __wt_txn_state WT_TXN_STATE; +struct __wt_upd_skipped; + typedef struct __wt_upd_skipped WT_UPD_SKIPPED; +struct __wt_update; + typedef struct __wt_update WT_UPDATE; +/* + * Forward type declarations for internal types: END + * DO NOT EDIT: automatically built by dist/s_typedef. + */ + +/******************************************* + * WiredTiger internal include files. 
+ *******************************************/ +#if defined(_lint) +#include "lint.h" +#elif defined(__GNUC__) +#include "gcc.h" +#elif defined(_MSC_VER) +#include "msvc.h" +#endif +#include "hardware.h" + +#ifdef _WIN32 +#include "os_windows.h" +#else +#include "posix.h" +#endif + +#include "misc.h" +#include "mutex.h" + +#include "stat.h" /* required by dhandle.h */ +#include "dhandle.h" /* required by btree.h */ + +#include "api.h" +#include "async.h" +#include "block.h" +#include "bloom.h" +#include "btmem.h" +#include "btree.h" +#include "cache.h" +#include "config.h" +#include "compact.h" +#include "cursor.h" +#include "dlh.h" +#include "error.h" +#include "flags.h" +#include "log.h" +#include "lsm.h" +#include "meta.h" +#include "os.h" +#include "schema.h" +#include "txn.h" + +#include "session.h" /* required by connection.h */ +#include "connection.h" + +#include "extern.h" +#include "verify_build.h" + +#include "buf.i" +#include "misc.i" +#include "intpack.i" /* required by cell.i, packing.i */ +#include "packing.i" +#include "cell.i" /* required by btree.i */ + +#include "mutex.i" /* required by btree.i */ +#include "txn.i" /* required by btree.i */ + +#include "btree.i" /* required by cursor.i */ +#include "cache.i" /* required by cursor.i */ +#include "cursor.i" + +#include "bitstring.i" +#include "column.i" +#include "serial.i" + +#if defined(__cplusplus) +} +#endif diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c new file mode 100644 index 00000000000..d13002cdc5a --- /dev/null +++ b/src/third_party/wiredtiger/src/log/log.c @@ -0,0 +1,1243 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_log_ckpt -- + * Record the given LSN as the checkpoint LSN and signal the archive + * thread as needed. 
+ */ +int +__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + + conn = S2C(session); + log = conn->log; + log->ckpt_lsn = *ckp_lsn; + if (conn->arch_cond != NULL) + WT_RET(__wt_cond_signal(session, conn->arch_cond)); + return (0); +} + +/* + * __wt_log_written_reset -- + * Interface to reset the amount of log written during this + * during this checkpoint period. Called from the checkpoint code. + */ +void +__wt_log_written_reset(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + + conn = S2C(session); + if (!conn->logging) + return; + log = conn->log; + log->log_written = 0; + return; +} + +/* + * __wt_log_get_files -- + * Retrieve the list of all existing log files. + */ +int +__wt_log_get_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp) +{ + WT_CONNECTION_IMPL *conn; + const char *log_path; + + *countp = 0; + *filesp = NULL; + + conn = S2C(session); + log_path = conn->log_path; + if (log_path == NULL) + log_path = ""; + return (__wt_dirlist(session, log_path, WT_LOG_FILENAME, + WT_DIRLIST_INCLUDE, filesp, countp)); +} + +/* + * __wt_log_get_active_files -- + * Retrieve the list of active log files (those that are not candidates + * for archiving). + */ +int +__wt_log_get_active_files( + WT_SESSION_IMPL *session, char ***filesp, u_int *countp) +{ + WT_DECL_RET; + WT_LOG *log; + char **files; + uint32_t id; + u_int count, i; + + id = 0; + log = S2C(session)->log; + + WT_RET(__wt_log_get_files(session, &files, &count)); + + /* Filter out any files that are below the checkpoint LSN. 
*/ + for (i = 0; i < count; ) { + WT_ERR(__wt_log_extract_lognum(session, files[i], &id)); + if (id < log->ckpt_lsn.file) { + __wt_free(session, files[i]); + files[i] = files[count - 1]; + files[--count] = NULL; + } else + i++; + } + + *filesp = files; + *countp = count; + + if (0) { +err: __wt_log_files_free(session, files, count); + } + return (ret); +} + +/* + * __wt_log_files_free -- + * Free memory associated with a log file list. + */ +void +__wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count) +{ + u_int i; + + for (i = 0; i < count; i++) + __wt_free(session, files[i]); + __wt_free(session, files); +} + +/* + * __wt_log_filename -- + * Given a log number, return a WT_ITEM of a generated log file name. + */ +int +__wt_log_filename(WT_SESSION_IMPL *session, uint32_t id, WT_ITEM *buf) +{ + const char *log_path; + + log_path = S2C(session)->log_path; + + if (log_path != NULL && log_path[0] != '\0') + WT_RET(__wt_buf_fmt(session, buf, "%s/%s.%010" PRIu32, + log_path, WT_LOG_FILENAME, id)); + else + WT_RET(__wt_buf_fmt(session, buf, "%s.%010" PRIu32, + WT_LOG_FILENAME, id)); + + return (0); +} + +/* + * __wt_log_extract_lognum -- + * Given a log file name, extract out the log number. + */ +int +__wt_log_extract_lognum( + WT_SESSION_IMPL *session, const char *name, uint32_t *id) +{ + const char *p; + + WT_UNUSED(session); + + if (id == NULL || name == NULL) + return (WT_ERROR); + if ((p = strrchr(name, '.')) == NULL || + sscanf(++p, "%" PRIu32, id) != 1) + WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name); + return (0); +} + +/* + * __wt_log_remove -- + * Given a log number, remove that log file. 
+ */ +int +__wt_log_remove(WT_SESSION_IMPL *session, uint32_t lognum) +{ + WT_DECL_ITEM(path); + WT_DECL_RET; + + WT_ERR(__wt_scr_alloc(session, 0, &path)); + WT_ERR(__wt_log_filename(session, lognum, path)); + WT_ERR(__wt_verbose(session, WT_VERB_LOG, + "log_remove: remove log %s", (char *)path->data)); + WT_ERR(__wt_remove(session, path->data)); +err: __wt_scr_free(&path); + return (ret); +} + +/* + * __log_openfile -- + * Open a log file with the given log file number and return the WT_FH. + */ +static int +__log_openfile(WT_SESSION_IMPL *session, int ok_create, WT_FH **fh, uint32_t id) +{ + WT_DECL_ITEM(path); + WT_DECL_RET; + + WT_RET(__wt_scr_alloc(session, 0, &path)); + WT_ERR(__wt_log_filename(session, id, path)); + WT_ERR(__wt_verbose(session, WT_VERB_LOG, + "opening log %s", (const char *)path->data)); + WT_ERR(__wt_open( + session, path->data, ok_create, 0, WT_FILE_TYPE_LOG, fh)); +err: __wt_scr_free(&path); + return (ret); +} + +/* + * __wt_log_open -- + * Open the appropriate log file for the connection. The purpose is + * to find the last log file that exists, open it and set our initial + * LSNs to the end of that file. If none exist, call __wt_log_newfile + * to create it. 
+ */ +int +__wt_log_open(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + uint32_t firstlog, lastlog, lognum; + u_int i, logcount; + char **logfiles; + + conn = S2C(session); + log = conn->log; + lastlog = 0; + firstlog = UINT32_MAX; + + WT_RET(__wt_log_get_files(session, &logfiles, &logcount)); + for (i = 0; i < logcount; i++) { + WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); + lastlog = WT_MAX(lastlog, lognum); + firstlog = WT_MIN(firstlog, lognum); + } + log->fileid = lastlog; + WT_ERR(__wt_verbose(session, WT_VERB_LOG, + "log_open: first log %d last log %d", firstlog, lastlog)); + log->first_lsn.file = firstlog; + log->first_lsn.offset = 0; + + /* + * Start logging at the beginning of the next log file, no matter + * where the previous log file ends. + */ + WT_ERR(__wt_log_newfile(session, 1)); + + /* + * If there were log files, run recovery. + * XXX belongs at a higher level than this. + */ + if (logcount > 0) { + log->trunc_lsn = log->alloc_lsn; + WT_ERR(__wt_txn_recover(conn)); + } + +err: __wt_log_files_free(session, logfiles, logcount); + return (ret); +} + +/* + * __wt_log_close -- + * Close the log file. + */ +int +__wt_log_close(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + + conn = S2C(session); + log = conn->log; + + if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) { + WT_RET(__wt_verbose(session, WT_VERB_LOG, + "closing old log %s", log->log_close_fh->name)); + WT_RET(__wt_close(session, log->log_close_fh)); + } + if (log->log_fh != NULL) { + WT_RET(__wt_verbose(session, WT_VERB_LOG, + "closing log %s", log->log_fh->name)); + WT_RET(__wt_close(session, log->log_fh)); + log->log_fh = NULL; + } + return (0); +} + +/* + * __log_fill -- + * Copy a thread's log records into the assigned slot. 
 */
static int
__log_fill(WT_SESSION_IMPL *session,
    WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp)
{
    WT_DECL_RET;
    WT_LOG_RECORD *logrec;

    logrec = (WT_LOG_RECORD *)record->mem;
    /*
     * Call __wt_write.  For now the offset is the real byte offset.
     * If the offset becomes a unit of LOG_ALIGN this is where we would
     * multiply by LOG_ALIGN to get the real file byte offset for write().
     */
    if (direct)
        WT_ERR(__wt_write(session, myslot->slot->slot_fh,
            myslot->offset + myslot->slot->slot_start_offset,
            (size_t)logrec->len, (void *)logrec));
    else
        /* Buffered: copy into the slot's buffer at our offset. */
        memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
            logrec, logrec->len);

    WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len);
    /* Optionally report the LSN where this record begins. */
    if (lsnp != NULL) {
        *lsnp = myslot->slot->slot_start_lsn;
        lsnp->offset += (wt_off_t)myslot->offset;
    }
err:
    /* Record the first error in the slot so other waiters can see it. */
    if (ret != 0 && myslot->slot->slot_error == 0)
        myslot->slot->slot_error = ret;
    return (ret);
}

/*
 * __log_size_fit --
 *	Return whether or not recsize will fit in the log file.
 */
static int
__log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize)
{
    WT_CONNECTION_IMPL *conn;

    conn = S2C(session);
    return (lsn->offset + (wt_off_t)recsize < conn->log_file_max);
}

/*
 * __log_truncate --
 *	Truncate the log to the given LSN.  If this_log is set, it will only
 *	truncate the log file indicated in the given LSN.  If not set,
 *	it will truncate between the given LSN and the trunc_lsn.  That is,
 *	since we pre-allocate log files, it will free that space and allow the
 *	log to be traversed.  We use the trunc_lsn because logging has already
 *	opened the new/next log file before recovery ran.  This function assumes
 *	we are in recovery or other dedicated time and not during live running.
 */
static int
__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, uint32_t this_log)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *log_fh, *tmp_fh;
    WT_LOG *log;
    uint32_t lognum;
    u_int i, logcount;
    char **logfiles;

    conn = S2C(session);
    log = conn->log;
    log_fh = NULL;
    logcount = 0;
    logfiles = NULL;

    /*
     * Truncate the log file to the given LSN.
     */
    WT_ERR(__log_openfile(session, 0, &log_fh, lsn->file));
    WT_ERR(__wt_ftruncate(session, log_fh, lsn->offset));
    /* NULL out log_fh before close so the error path doesn't double-close. */
    tmp_fh = log_fh;
    log_fh = NULL;
    WT_ERR(__wt_close(session, tmp_fh));

    /*
     * If we just want to truncate the current log, return and skip
     * looking for intervening logs.
     */
    if (this_log)
        goto err;
    WT_ERR(__wt_log_get_files(session, &logfiles, &logcount));
    for (i = 0; i < logcount; i++) {
        WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
        if (lognum > lsn->file && lognum < log->trunc_lsn.file) {
            WT_ERR(__log_openfile(session, 0, &log_fh, lognum));
            /*
             * If there are intervening files pre-allocated,
             * truncate them to the end of the log file header.
             */
            WT_ERR(__wt_ftruncate(session,
                log_fh, LOG_FIRST_RECORD));
            tmp_fh = log_fh;
            log_fh = NULL;
            WT_ERR(__wt_close(session, tmp_fh));
        }
    }
err:    if (log_fh != NULL)
        WT_TRET(__wt_close(session, log_fh));
    if (logfiles != NULL)
        __wt_log_files_free(session, logfiles, logcount);
    return (ret);
}

/*
 * __log_filesize --
 *	Returns an estimate of the real end of log file.
 */
static int
__log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *eof)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    wt_off_t log_size, off, off1;
    uint32_t allocsize, bufsz;
    char *buf, *zerobuf;

    conn = S2C(session);
    log = conn->log;
    if (eof == NULL)
        return (0);
    *eof = 0;
    WT_RET(__wt_filesize(session, fh, &log_size));
    /* log may be NULL when called before the log system is set up. */
    if (log == NULL)
        allocsize = LOG_ALIGN;
    else
        allocsize = log->allocsize;

    /*
     * It can be very slow looking for the last real record in the log
     * in very small chunks.  Walk backward by a megabyte at a time.  When
     * we find a part of the log that is not just zeroes, walk to find
     * the last record.
     */
    buf = zerobuf = NULL;
    if (allocsize < WT_MEGABYTE && log_size > WT_MEGABYTE)
        bufsz = WT_MEGABYTE;
    else
        bufsz = allocsize;
    WT_RET(__wt_calloc_def(session, bufsz, &buf));
    WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf));

    /*
     * Read in a chunk starting at the end of the file.  Keep going until
     * we reach the beginning or we find a chunk that contains any non-zero
     * bytes.  Compare against a known zero byte chunk.
     */
    for (off = log_size - (wt_off_t)bufsz;
        off >= 0;
        off -= (wt_off_t)bufsz) {
        WT_ERR(__wt_read(session, fh, off, bufsz, buf));
        if (memcmp(buf, zerobuf, bufsz) != 0)
            break;
    }

    /*
     * If we're walking by large amounts, now walk by the real allocsize
     * to find the real end, if we found something.  Otherwise we reached
     * the beginning of the file.  Offset can go negative if the log file
     * size is not a multiple of a megabyte.  The first chunk of the log
     * file will always be non-zero.
     */
    if (off < 0)
        off = 0;

    /*
     * We know all log records are aligned at log->allocsize.  The first
     * item in a log record is always a 32-bit length.  Look for any
     * non-zero length at the allocsize boundary.  This may not be a true
     * log record since it could be the middle of a large record.  But we
     * know no log record starts after it.  Return an estimate of the log
     * file size.
     */
    for (off1 = bufsz - allocsize;
        off1 > 0; off1 -= (wt_off_t)allocsize)
        if (memcmp(buf + off1, zerobuf, sizeof(uint32_t)) != 0)
            break;
    off = off + off1;

    /*
     * Set EOF to just past the last allocation unit in which we saw a
     * non-zero length word (off is its start, so add one unit).
     */
    *eof = off + (wt_off_t)allocsize;
err:
    if (buf != NULL)
        __wt_free(session, buf);
    if (zerobuf != NULL)
        __wt_free(session, zerobuf);
    return (ret);
}

/*
 * __log_acquire --
 *	Called with the log slot lock held.  Can be called recursively
 *	from __wt_log_newfile when we change log files.
 */
static int
__log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;

    conn = S2C(session);
    log = conn->log;
    /*
     * Called locked.  Add recsize to alloc_lsn.  Save our starting LSN
     * where the previous allocation finished for the release LSN.
     * That way when log files switch, we're waiting for the correct LSN
     * from outstanding writes.
     */
    slot->slot_release_lsn = log->alloc_lsn;
    if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
        /* Record won't fit: switch to a new log file first. */
        WT_RET(__wt_log_newfile(session, 0));
        if (log->log_close_fh != NULL)
            F_SET(slot, SLOT_CLOSEFH);
    }
    /*
     * Checkpoints can be configured based on amount of log written.
     * Add in this log record to the sum and if needed, signal the
     * checkpoint condition.  The logging subsystem manages the
     * accumulated field.  There is a bit of layering violation
     * here checking the connection ckpt field and using its
     * condition.
     */
    if (WT_CKPT_LOGSIZE(conn)) {
        log->log_written += (wt_off_t)recsize;
        WT_RET(__wt_checkpoint_signal(session, log->log_written));
    }

    /*
     * Need to minimally fill in slot info here.  Our slot start LSN
     * comes after any potential new log file creations.
     */
    slot->slot_start_lsn = log->alloc_lsn;
    slot->slot_start_offset = log->alloc_lsn.offset;
    /*
     * Pre-allocate on the first real write into the log file.
     */
    if (log->alloc_lsn.offset == LOG_FIRST_RECORD) {
        /* Fall back to ftruncate if fallocate is missing/unsupported. */
        if (!log->log_fh->fallocate_available ||
            (ret = __wt_fallocate(session, log->log_fh,
            LOG_FIRST_RECORD, conn->log_file_max)) == ENOTSUP)
            ret = __wt_ftruncate(session, log->log_fh,
                LOG_FIRST_RECORD + conn->log_file_max);
        WT_RET(ret);
    }

    log->alloc_lsn.offset += (wt_off_t)recsize;
    slot->slot_end_lsn = log->alloc_lsn;
    slot->slot_error = 0;
    slot->slot_fh = log->log_fh;
    return (0);
}

/*
 * __log_release --
 *	Release a log slot.
 */
static int
__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN sync_lsn;
    size_t write_size;
    WT_DECL_SPINLOCK_ID(id);        /* Must appear last */

    conn = S2C(session);
    log = conn->log;
    /*
     * If we're going to have to close our log file, make a local copy
     * of the file handle structure.
     */
    close_fh = NULL;
    if (F_ISSET(slot, SLOT_CLOSEFH)) {
        close_fh = log->log_close_fh;
        log->log_close_fh = NULL;
        F_CLR(slot, SLOT_CLOSEFH);
    }

    /* Write the buffered records */
    if (F_ISSET(slot, SLOT_BUFFERED)) {
        write_size = (size_t)
            (slot->slot_end_lsn.offset - slot->slot_start_offset);
        WT_ERR(__wt_write(session, slot->slot_fh,
            slot->slot_start_offset, write_size, slot->slot_buf.mem));
    }

    /*
     * Wait for earlier groups to finish, otherwise there could be holes
     * in the log file.
     */
    while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0)
        __wt_yield();
    log->write_lsn = slot->slot_end_lsn;
    /*
     * Try to consolidate calls to fsync to wait less.  Acquire a spin lock
     * so that threads finishing writing to the log will wait while the
     * current fsync completes and advance log->write_lsn.
 */
    while (F_ISSET(slot, SLOT_SYNC) &&
        LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
        /* If someone else holds the sync lock, wait and re-check. */
        if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
            (void)__wt_cond_wait(
                session, log->log_sync_cond, 10000);
            continue;
        }
        /*
         * Record the current end of log after we grabbed the lock.
         * That is how far our fsync call will guarantee.
         */
        sync_lsn = log->write_lsn;
        if (LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
            WT_STAT_FAST_CONN_INCR(session, log_sync);
            ret = __wt_fsync(session, log->log_fh);
            if (ret == 0) {
                F_CLR(slot, SLOT_SYNC);
                log->sync_lsn = sync_lsn;
                /* Wake any threads waiting on the sync. */
                ret = __wt_cond_signal(
                    session, log->log_sync_cond);
            }
        }
        __wt_spin_unlock(session, &log->log_sync_lock);
        WT_ERR(ret);
    }
    if (F_ISSET(slot, SLOT_BUF_GROW)) {
        /* A writer asked for a bigger slot buffer: double it. */
        WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
        F_CLR(slot, SLOT_BUF_GROW);
        WT_STAT_FAST_CONN_INCRV(session,
            log_buffer_size, slot->slot_buf.memsize);
        WT_ERR(__wt_buf_grow(session,
            &slot->slot_buf, slot->slot_buf.memsize * 2));
    }
    /*
     * If we have a file to close, close it now.
     */
    if (close_fh)
        WT_ERR(__wt_close(session, close_fh));

err:    if (ret != 0 && slot->slot_error == 0)
        slot->slot_error = ret;
    return (ret);
}

/*
 * __wt_log_newfile --
 *	Create the next log file and write the file header record into it.
 */
int
__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_ITEM(buf);
    WT_DECL_RET;
    WT_LOG *log;
    WT_LOG_DESC *desc;
    WT_LOG_RECORD *logrec;
    WT_LOGSLOT tmp;
    WT_MYSLOT myslot;

    conn = S2C(session);
    log = conn->log;

    /*
     * Set aside the log file handle to be closed later.  Other threads
     * may still be using it to write to the log.  If the log file size
     * is small we could fill a log file before the previous one is closed.
     * Wait for that to close.
     */
    while (log->log_close_fh != NULL) {
        __wt_errx(session,
            "log_newfile: Log file size %" PRIuMAX " too small",
            (uintmax_t)conn->log_file_max);
        WT_STAT_FAST_CONN_INCR(session, log_close_yields);
        __wt_yield();
    }
    log->log_close_fh = log->log_fh;
    log->fileid++;
    WT_RET(__log_openfile(session, 1, &log->log_fh, log->fileid));
    log->alloc_lsn.file = log->fileid;
    log->alloc_lsn.offset = log->log_fh->size;

    /*
     * Set up the log descriptor record.  Use a scratch buffer to
     * get correct alignment for direct I/O.
     */
    WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize);
    WT_RET(__wt_scr_alloc(session, log->allocsize, &buf));
    memset(buf->mem, 0, log->allocsize);
    logrec = (WT_LOG_RECORD *)buf->mem;
    desc = (WT_LOG_DESC *)logrec->record;
    desc->log_magic = WT_LOG_MAGIC;
    desc->majorv = WT_LOG_MAJOR_VERSION;
    desc->minorv = WT_LOG_MINOR_VERSION;
    desc->log_size = (uint64_t)conn->log_file_max;

    /*
     * Now that the record is set up, initialize the record header.
     * Checksum is computed with the checksum field itself zeroed.
     */
    logrec->len = log->allocsize;
    logrec->checksum = 0;
    logrec->checksum = __wt_cksum(logrec, log->allocsize);
    WT_CLEAR(tmp);
    myslot.slot = &tmp;
    myslot.offset = 0;

    /*
     * Recursively call __log_acquire to allocate log space for the
     * log descriptor record.  Call __log_fill to write it, but we
     * do not need to call __log_release because we're not waiting for
     * earlier operations to complete.
     */
    WT_ERR(__log_acquire(session, logrec->len, &tmp));
    WT_ERR(__log_fill(session, &myslot, 1, buf, NULL));

    /*
     * If we're called from connection creation code, we need to update
     * the LSNs since we're the only write in progress.
     */
    if (conn_create) {
        WT_ERR(__wt_fsync(session, log->log_fh));
        log->sync_lsn = tmp.slot_end_lsn;
        log->write_lsn = tmp.slot_end_lsn;
    }

err:    __wt_scr_free(&buf);
    return (ret);
}

/*
 * __wt_log_read --
 *	Read the log record at the given LSN.  Return the record (including
 *	the log header) in the WT_ITEM.
Caller is responsible for freeing it. + */ +int +__wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, + uint32_t flags) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_FH *log_fh; + WT_LOG *log; + WT_LOG_RECORD *logrec; + uint32_t cksum, rdup_len, reclen; + + WT_UNUSED(flags); + /* + * If the caller didn't give us an LSN or something to return, + * there's nothing to do. + */ + if (lsnp == NULL || record == NULL) + return (0); + conn = S2C(session); + log = conn->log; + /* + * If the offset isn't on an allocation boundary it must be wrong. + */ + if (lsnp->offset % log->allocsize != 0 || lsnp->file > log->fileid) + return (WT_NOTFOUND); + + WT_RET(__log_openfile(session, 0, &log_fh, lsnp->file)); + /* + * Read the minimum allocation size a record could be. + */ + WT_ERR(__wt_buf_init(session, record, log->allocsize)); + WT_ERR(__wt_read(session, + log_fh, lsnp->offset, (size_t)log->allocsize, record->mem)); + /* + * First 4 bytes is the real record length. See if we + * need to read more than the allocation size. We expect + * that we rarely will have to read more. Most log records + * will be fairly small. + */ + reclen = *(uint32_t *)record->mem; + if (reclen == 0) { + ret = WT_NOTFOUND; + goto err; + } + if (reclen > log->allocsize) { + rdup_len = __wt_rduppo2(reclen, log->allocsize); + WT_ERR(__wt_buf_grow(session, record, rdup_len)); + WT_ERR(__wt_read(session, + log_fh, lsnp->offset, (size_t)rdup_len, record->mem)); + } + /* + * We read in the record, verify checksum. + */ + logrec = (WT_LOG_RECORD *)record->mem; + cksum = logrec->checksum; + logrec->checksum = 0; + logrec->checksum = __wt_cksum(logrec, logrec->len); + if (logrec->checksum != cksum) + WT_ERR_MSG(session, WT_ERROR, "log_read: Bad checksum"); + record->size = logrec->len; + WT_STAT_FAST_CONN_INCR(session, log_reads); +err: + WT_TRET(__wt_close(session, log_fh)); + return (ret); +} + +/* + * __wt_log_scan -- + * Scan the logs, calling a function on each record found. 
+ */ +int +__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, + int (*func)(WT_SESSION_IMPL *session, + WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_FH *log_fh; + WT_ITEM buf; + WT_LOG *log; + WT_LOG_RECORD *logrec; + WT_LSN end_lsn, rd_lsn, start_lsn; + wt_off_t log_size; + uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen; + u_int i, logcount; + int eol; + char **logfiles; + + conn = S2C(session); + log = conn->log; + log_fh = NULL; + logcount = 0; + logfiles = NULL; + eol = 0; + WT_CLEAR(buf); + + /* + * If the caller did not give us a callback function there is nothing + * to do. + */ + if (func == NULL) + return (0); + + if (LF_ISSET(WT_LOGSCAN_RECOVER)) + WT_RET(__wt_verbose(session, WT_VERB_LOG, + "__wt_log_scan truncating to %u/%" PRIuMAX, + log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset)); + + if (log != NULL) { + allocsize = log->allocsize; + + if (lsnp == NULL) { + if (LF_ISSET(WT_LOGSCAN_FIRST)) + start_lsn = log->first_lsn; + else if (LF_ISSET(WT_LOGSCAN_FROM_CKP)) + start_lsn = log->ckpt_lsn; + else + return (WT_ERROR); /* Illegal usage */ + } else { + if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP)) + WT_RET_MSG(session, WT_ERROR, + "choose either a start LSN or a start flag"); + + /* Offsets must be on allocation boundaries. */ + if (lsnp->offset % allocsize != 0 || + lsnp->file > log->fileid) + return (WT_NOTFOUND); + + /* + * Log cursors may not know the starting LSN. If an + * LSN pointer is passed in, but it is the INIT_LSN, + * start from the first_lsn. + */ + start_lsn = *lsnp; + if (IS_INIT_LSN(&start_lsn)) + start_lsn = log->first_lsn; + } + end_lsn = log->alloc_lsn; + } else { + /* + * If logging is not configured, we can still print out the log + * if log files exist. We just need to set the LSNs from what + * is in the files versus what is in the live connection. 
+		 */
+		/*
+		 * Set allocsize to the minimum alignment it could be.  Larger
+		 * records and larger allocation boundaries should always be
+		 * a multiple of this.
+		 */
+		allocsize = LOG_ALIGN;
+		lastlog = 0;
+		firstlog = UINT32_MAX;
+		WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
+		if (logcount == 0)
+			/*
+			 * Return ENOTSUP when no log files exist at all.
+			 */
+			return (ENOTSUP);
+		for (i = 0; i < logcount; i++) {
+			WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
+			    &lognum));
+			lastlog = WT_MAX(lastlog, lognum);
+			firstlog = WT_MIN(firstlog, lognum);
+		}
+		start_lsn.file = firstlog;
+		end_lsn.file = lastlog;
+		start_lsn.offset = end_lsn.offset = 0;
+		__wt_log_files_free(session, logfiles, logcount);
+		logfiles = NULL;
+	}
+	WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file));
+	WT_ERR(__log_filesize(session, log_fh, &log_size));
+	rd_lsn = start_lsn;
+	WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN));
+	for (;;) {
+		if (rd_lsn.offset + allocsize > log_size) {
+advance:
+			/*
+			 * If we read the last record, go to the next file.
+			 */
+			WT_ERR(__wt_close(session, log_fh));
+			log_fh = NULL;
+			eol = 1;
+			/*
+			 * Truncate this log file before we move to the next.
+			 */
+			if (LF_ISSET(WT_LOGSCAN_RECOVER))
+				WT_ERR(__log_truncate(session, &rd_lsn, 1));
+			rd_lsn.file++;
+			rd_lsn.offset = 0;
+			/*
+			 * Avoid an error message when we reach end of log
+			 * by checking here.
+			 */
+			if (rd_lsn.file > end_lsn.file)
+				break;
+			WT_ERR(__log_openfile(
+			    session, 0, &log_fh, rd_lsn.file));
+			WT_ERR(__log_filesize(session, log_fh, &log_size));
+			continue;
+		}
+		/*
+		 * Read the minimum allocation size a record could be.
+		 */
+		WT_ASSERT(session, buf.memsize >= allocsize);
+		WT_ERR(__wt_read(session,
+		    log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem));
+		/*
+		 * First 4 bytes is the real record length.  See if we
+		 * need to read more than the allocation size.  We expect
+		 * that we rarely will have to read more.  Most log records
+		 * will be fairly small.
+ */ + reclen = *(uint32_t *)buf.mem; + /* + * Log files are pre-allocated. We never expect a zero length + * unless we've reached the end of the log. The log can be + * written out of order, so when recovery finds the end of + * the log, truncate the file and remove any later log files + * that may exist. + */ + if (reclen == 0) { + /* This LSN is the end. */ + break; + } + rdup_len = __wt_rduppo2(reclen, allocsize); + if (reclen > allocsize) { + /* + * The log file end could be the middle of this + * log record. + */ + if (rd_lsn.offset + rdup_len > log_size) + goto advance; + /* + * We need to round up and read in the full padded + * record, especially for direct I/O. + */ + WT_ERR(__wt_buf_grow(session, &buf, rdup_len)); + WT_ERR(__wt_read(session, + log_fh, rd_lsn.offset, (size_t)rdup_len, buf.mem)); + WT_STAT_FAST_CONN_INCR(session, log_scan_rereads); + } + /* + * We read in the record, verify checksum. + */ + buf.size = reclen; + logrec = (WT_LOG_RECORD *)buf.mem; + cksum = logrec->checksum; + logrec->checksum = 0; + logrec->checksum = __wt_cksum(logrec, logrec->len); + if (logrec->checksum != cksum) { + /* + * A checksum mismatch means we have reached the end of + * the useful part of the log. This should be found on + * the first pass through recovery. In the second pass + * where we truncate the log, this is where it should + * end. + */ + if (log != NULL) + log->trunc_lsn = rd_lsn; + break; + } + + /* + * We have a valid log record. If it is not the log file + * header, invoke the callback. + */ + WT_STAT_FAST_CONN_INCR(session, log_scan_records); + if (rd_lsn.offset != 0) { + WT_ERR((*func)(session, &buf, &rd_lsn, cookie)); + if (LF_ISSET(WT_LOGSCAN_ONE)) + break; + } + rd_lsn.offset += (wt_off_t)rdup_len; + } + + /* Truncate if we're in recovery. 
*/
+	if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
+	    LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
+		WT_ERR(__log_truncate(session, &rd_lsn, 0));
+
+err:	WT_STAT_FAST_CONN_INCR(session, log_scans);
+	if (logfiles != NULL)
+		__wt_log_files_free(session, logfiles, logcount);
+	__wt_buf_free(session, &buf);
+	/*
+	 * If the caller wants one record and it is at the end of log,
+	 * return WT_NOTFOUND.
+	 */
+	if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
+		ret = WT_NOTFOUND;
+	if (ret == ENOENT)
+		ret = 0;
+	if (log_fh != NULL)
+		WT_TRET(__wt_close(session, log_fh));
+	return (ret);
+}
+
+/*
+ * __log_direct_write --
+ *	Write a log record without using the consolidation arrays.
+ */
+static int
+__log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+    uint32_t flags)
+{
+	WT_DECL_RET;
+	WT_LOG *log;
+	WT_LOGSLOT tmp;
+	WT_MYSLOT myslot;
+	int locked;
+	WT_DECL_SPINLOCK_ID(id);		/* Must appear last */
+
+	log = S2C(session)->log;
+	myslot.slot = &tmp;
+	myslot.offset = 0;
+	WT_CLEAR(tmp);
+
+	/* If the slot lock is contended, return EAGAIN to the caller. */
+	if (__wt_spin_trylock(session, &log->log_slot_lock, &id) != 0)
+		return (EAGAIN);
+	locked = 1;
+
+	if (LF_ISSET(WT_LOG_FSYNC))
+		F_SET(&tmp, SLOT_SYNC);
+	WT_ERR(__log_acquire(session, record->size, &tmp));
+	__wt_spin_unlock(session, &log->log_slot_lock);
+	locked = 0;
+	WT_ERR(__log_fill(session, &myslot, 1, record, lsnp));
+	WT_ERR(__log_release(session, &tmp));
+
+err:	if (locked)
+		__wt_spin_unlock(session, &log->log_slot_lock);
+	return (ret);
+}
+
+/*
+ * __wt_log_write --
+ *	Write a record into the log.
+ */ +int +__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, + uint32_t flags) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_LOG_RECORD *logrec; + WT_LSN lsn; + WT_MYSLOT myslot; + uint32_t rdup_len; + int locked; + + conn = S2C(session); + log = conn->log; + locked = 0; + INIT_LSN(&lsn); + myslot.slot = NULL; + /* + * Assume the WT_ITEM the user passed is a WT_LOG_RECORD, which has + * a header at the beginning for us to fill in. + * + * If using direct_io, the caller should pass us an aligned record. + * But we need to make sure it is big enough and zero-filled so + * that we can write the full amount. Do this whether or not + * direct_io is in use because it makes the reading code cleaner. + */ + WT_STAT_FAST_CONN_INCRV(session, log_bytes_user, record->size); + rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize); + WT_ERR(__wt_buf_grow(session, record, rdup_len)); + WT_ASSERT(session, record->data == record->mem); + /* + * If the caller's record only partially fills the necessary + * space, we need to zero-fill the remainder. + */ + if (record->size != rdup_len) { + memset((uint8_t *)record->mem + record->size, 0, + rdup_len - record->size); + record->size = rdup_len; + } + logrec = (WT_LOG_RECORD *)record->mem; + logrec->len = (uint32_t)record->size; + logrec->checksum = 0; + logrec->checksum = __wt_cksum(logrec, record->size); + + WT_STAT_FAST_CONN_INCR(session, log_writes); + + if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) { + ret = __log_direct_write(session, record, lsnp, flags); + if (ret == 0) + return (0); + if (ret != EAGAIN) + WT_ERR(ret); + /* + * An EAGAIN return means we failed to get the try lock - + * fall through to the consolidation code in that case. + */ + } + + /* + * As soon as we see contention for the log slot, disable direct + * log writes. We get better performance by forcing writes through + * the consolidation code. 
This is because individual writes flood + * the I/O system faster than they contend on the log slot lock. + */ + F_SET(log, WT_LOG_FORCE_CONSOLIDATE); + if ((ret = __wt_log_slot_join( + session, rdup_len, flags, &myslot)) == ENOMEM) { + /* + * If we couldn't find a consolidated slot for this record + * write the record directly. + */ + while ((ret = __log_direct_write( + session, record, lsnp, flags)) == EAGAIN) + ; + WT_ERR(ret); + /* + * Increase the buffer size of any slots we can get access + * to, so future consolidations are likely to succeed. + */ + WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len)); + return (0); + } + WT_ERR(ret); + if (myslot.offset == 0) { + __wt_spin_lock(session, &log->log_slot_lock); + locked = 1; + WT_ERR(__wt_log_slot_close(session, myslot.slot)); + WT_ERR(__log_acquire( + session, myslot.slot->slot_group_size, myslot.slot)); + __wt_spin_unlock(session, &log->log_slot_lock); + locked = 0; + WT_ERR(__wt_log_slot_notify(session, myslot.slot)); + } else + WT_ERR(__wt_log_slot_wait(session, myslot.slot)); + WT_ERR(__log_fill(session, &myslot, 0, record, &lsn)); + if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) { + WT_ERR(__log_release(session, myslot.slot)); + WT_ERR(__wt_log_slot_free(myslot.slot)); + } else if (LF_ISSET(WT_LOG_FSYNC)) { + /* Wait for our writes to reach disk */ + while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 && + myslot.slot->slot_error == 0) + (void)__wt_cond_wait( + session, log->log_sync_cond, 10000); + } +err: + if (locked) + __wt_spin_unlock(session, &log->log_slot_lock); + if (ret == 0 && lsnp != NULL) + *lsnp = lsn; + /* + * If we're synchronous and some thread had an error, we don't know + * if our write made it out to the file or not. The error could be + * before or after us. So, if anyone got an error, we report it. + * If we're not synchronous, only report if our own operation got + * an error. 
+ */ + if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 && + myslot.slot != NULL) + ret = myslot.slot->slot_error; + return (ret); +} + +/* + * __wt_log_vprintf -- + * Write a message into the log. + */ +int +__wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_ITEM(logrec); + WT_DECL_RET; + va_list ap_copy; + const char *rec_fmt = WT_UNCHECKED_STRING(I); + uint32_t rectype = WT_LOGREC_MESSAGE; + size_t header_size, len; + + conn = S2C(session); + + if (!conn->logging) + return (0); + + va_copy(ap_copy, ap); + len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + 1; + va_end(ap_copy); + + WT_RET( + __wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec)); + + /* + * We're writing a record with the type (an integer) followed by a + * string (NUL-terminated data). To avoid writing the string into + * a buffer before copying it, we write the header first, then the + * raw bytes of the string. + */ + WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype)); + WT_ERR(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, header_size, + rec_fmt, rectype)); + logrec->size += (uint32_t)header_size; + + (void)vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap); + + WT_ERR(__wt_verbose(session, WT_VERB_LOG, + "log_printf: %s", (char *)logrec->data + logrec->size)); + + logrec->size += len; + WT_ERR(__wt_log_write(session, logrec, NULL, 0)); +err: __wt_scr_free(&logrec); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c new file mode 100644 index 00000000000..f3db79f4daf --- /dev/null +++ b/src/third_party/wiredtiger/src/log/log_auto.c @@ -0,0 +1,437 @@ +/* DO NOT EDIT: automatically built by dist/log.py. 
*/ + +#include "wt_internal.h" + +int +__wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp) +{ + WT_ITEM *logrec; + + WT_RET(__wt_scr_alloc(session, WT_ALIGN(size + 1, LOG_ALIGN), &logrec)); + WT_CLEAR(*(WT_LOG_RECORD *)logrec->data); + logrec->size = offsetof(WT_LOG_RECORD, record); + + *logrecp = logrec; + return (0); +} + +void +__wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp) +{ + WT_UNUSED(session); + __wt_scr_free(logrecp); +} + +int +__wt_logrec_read(WT_SESSION_IMPL *session, + const uint8_t **pp, const uint8_t *end, uint32_t *rectypep) +{ + uint64_t rectype; + + WT_UNUSED(session); + WT_RET(__wt_vunpack_uint(pp, WT_PTRDIFF(end, *pp), &rectype)); + *rectypep = (uint32_t)rectype; + return (0); +} + +int +__wt_logop_read(WT_SESSION_IMPL *session, + const uint8_t **pp, const uint8_t *end, + uint32_t *optypep, uint32_t *opsizep) +{ + return (__wt_struct_unpack( + session, *pp, WT_PTRDIFF(end, *pp), "II", optypep, opsizep)); +} + +int +__wt_logop_col_put_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + uint32_t fileid, uint64_t recno, WT_ITEM *value) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIru); + size_t size; + uint32_t optype, recsize; + + optype = WT_LOGOP_COL_PUT; + WT_RET(__wt_struct_size(session, &size, fmt, + optype, 0, fileid, recno, value)); + + __wt_struct_size_adjust(session, &size); + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize, fileid, recno, value)); + + logrec->size += (uint32_t)size; + return (0); +} + +int +__wt_logop_col_put_unpack( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIru); + uint32_t optype, size; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size, fileidp, recnop, valuep)); + 
WT_ASSERT(session, optype == WT_LOGOP_COL_PUT); + + *pp += size; + return (0); +} + +int +__wt_logop_col_put_print( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t fileid; + uint64_t recno; + WT_ITEM value; + + WT_RET(__wt_logop_col_put_unpack( + session, pp, end, &fileid, &recno, &value)); + + fprintf(out, " \"optype\": \"col_put\",\n"); + fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); + fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno); + fprintf(out, " \"value\": \"%.*s\",\n", + (int)value.size, (const char *)value.data); + return (0); +} + +int +__wt_logop_col_remove_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + uint32_t fileid, uint64_t recno) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIr); + size_t size; + uint32_t optype, recsize; + + optype = WT_LOGOP_COL_REMOVE; + WT_RET(__wt_struct_size(session, &size, fmt, + optype, 0, fileid, recno)); + + __wt_struct_size_adjust(session, &size); + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize, fileid, recno)); + + logrec->size += (uint32_t)size; + return (0); +} + +int +__wt_logop_col_remove_unpack( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + uint32_t *fileidp, uint64_t *recnop) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIr); + uint32_t optype, size; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size, fileidp, recnop)); + WT_ASSERT(session, optype == WT_LOGOP_COL_REMOVE); + + *pp += size; + return (0); +} + +int +__wt_logop_col_remove_print( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t fileid; + uint64_t recno; + + WT_RET(__wt_logop_col_remove_unpack( + session, pp, end, &fileid, &recno)); + + fprintf(out, " \"optype\": \"col_remove\",\n"); + fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", 
fileid); + fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno); + return (0); +} + +int +__wt_logop_col_truncate_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + uint32_t fileid, uint64_t start, uint64_t stop) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIrr); + size_t size; + uint32_t optype, recsize; + + optype = WT_LOGOP_COL_TRUNCATE; + WT_RET(__wt_struct_size(session, &size, fmt, + optype, 0, fileid, start, stop)); + + __wt_struct_size_adjust(session, &size); + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize, fileid, start, stop)); + + logrec->size += (uint32_t)size; + return (0); +} + +int +__wt_logop_col_truncate_unpack( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + uint32_t *fileidp, uint64_t *startp, uint64_t *stopp) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIrr); + uint32_t optype, size; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size, fileidp, startp, stopp)); + WT_ASSERT(session, optype == WT_LOGOP_COL_TRUNCATE); + + *pp += size; + return (0); +} + +int +__wt_logop_col_truncate_print( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t fileid; + uint64_t start; + uint64_t stop; + + WT_RET(__wt_logop_col_truncate_unpack( + session, pp, end, &fileid, &start, &stop)); + + fprintf(out, " \"optype\": \"col_truncate\",\n"); + fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); + fprintf(out, " \"start\": \"%" PRIu64 "\",\n", start); + fprintf(out, " \"stop\": \"%" PRIu64 "\",\n", stop); + return (0); +} + +int +__wt_logop_row_put_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + uint32_t fileid, WT_ITEM *key, WT_ITEM *value) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIuu); + size_t size; + uint32_t optype, recsize; + + optype = WT_LOGOP_ROW_PUT; + WT_RET(__wt_struct_size(session, &size, fmt, 
+ optype, 0, fileid, key, value)); + + __wt_struct_size_adjust(session, &size); + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize, fileid, key, value)); + + logrec->size += (uint32_t)size; + return (0); +} + +int +__wt_logop_row_put_unpack( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIuu); + uint32_t optype, size; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size, fileidp, keyp, valuep)); + WT_ASSERT(session, optype == WT_LOGOP_ROW_PUT); + + *pp += size; + return (0); +} + +int +__wt_logop_row_put_print( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t fileid; + WT_ITEM key; + WT_ITEM value; + + WT_RET(__wt_logop_row_put_unpack( + session, pp, end, &fileid, &key, &value)); + + fprintf(out, " \"optype\": \"row_put\",\n"); + fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); + fprintf(out, " \"key\": \"%.*s\",\n", + (int)key.size, (const char *)key.data); + fprintf(out, " \"value\": \"%.*s\",\n", + (int)value.size, (const char *)value.data); + return (0); +} + +int +__wt_logop_row_remove_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + uint32_t fileid, WT_ITEM *key) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIu); + size_t size; + uint32_t optype, recsize; + + optype = WT_LOGOP_ROW_REMOVE; + WT_RET(__wt_struct_size(session, &size, fmt, + optype, 0, fileid, key)); + + __wt_struct_size_adjust(session, &size); + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize, fileid, key)); + + logrec->size += (uint32_t)size; + return (0); +} + +int +__wt_logop_row_remove_unpack( + 
WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + uint32_t *fileidp, WT_ITEM *keyp) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIu); + uint32_t optype, size; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size, fileidp, keyp)); + WT_ASSERT(session, optype == WT_LOGOP_ROW_REMOVE); + + *pp += size; + return (0); +} + +int +__wt_logop_row_remove_print( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t fileid; + WT_ITEM key; + + WT_RET(__wt_logop_row_remove_unpack( + session, pp, end, &fileid, &key)); + + fprintf(out, " \"optype\": \"row_remove\",\n"); + fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); + fprintf(out, " \"key\": \"%.*s\",\n", + (int)key.size, (const char *)key.data); + return (0); +} + +int +__wt_logop_row_truncate_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIuuI); + size_t size; + uint32_t optype, recsize; + + optype = WT_LOGOP_ROW_TRUNCATE; + WT_RET(__wt_struct_size(session, &size, fmt, + optype, 0, fileid, start, stop, mode)); + + __wt_struct_size_adjust(session, &size); + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize, fileid, start, stop, mode)); + + logrec->size += (uint32_t)size; + return (0); +} + +int +__wt_logop_row_truncate_unpack( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIuuI); + uint32_t optype, size; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size, fileidp, startp, stopp, modep)); + WT_ASSERT(session, optype == WT_LOGOP_ROW_TRUNCATE); + + *pp += size; + return (0); +} + +int 
+__wt_logop_row_truncate_print( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t fileid; + WT_ITEM start; + WT_ITEM stop; + uint32_t mode; + + WT_RET(__wt_logop_row_truncate_unpack( + session, pp, end, &fileid, &start, &stop, &mode)); + + fprintf(out, " \"optype\": \"row_truncate\",\n"); + fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); + fprintf(out, " \"start\": \"%.*s\",\n", + (int)start.size, (const char *)start.data); + fprintf(out, " \"stop\": \"%.*s\",\n", + (int)stop.size, (const char *)stop.data); + fprintf(out, " \"mode\": \"%" PRIu32 "\",\n", mode); + return (0); +} + +int +__wt_txn_op_printlog( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t optype, opsize; + + /* Peek at the size and the type. */ + WT_RET(__wt_logop_read(session, pp, end, &optype, &opsize)); + end = *pp + opsize; + + switch (optype) { + case WT_LOGOP_COL_PUT: + WT_RET(__wt_logop_col_put_print(session, pp, end, out)); + break; + + case WT_LOGOP_COL_REMOVE: + WT_RET(__wt_logop_col_remove_print(session, pp, end, out)); + break; + + case WT_LOGOP_COL_TRUNCATE: + WT_RET(__wt_logop_col_truncate_print(session, pp, end, out)); + break; + + case WT_LOGOP_ROW_PUT: + WT_RET(__wt_logop_row_put_print(session, pp, end, out)); + break; + + case WT_LOGOP_ROW_REMOVE: + WT_RET(__wt_logop_row_remove_print(session, pp, end, out)); + break; + + case WT_LOGOP_ROW_TRUNCATE: + WT_RET(__wt_logop_row_truncate_print(session, pp, end, out)); + break; + + WT_ILLEGAL_VALUE(session); + } + + return (0); +} diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c new file mode 100644 index 00000000000..c12f47d231b --- /dev/null +++ b/src/third_party/wiredtiger/src/log/log_slot.c @@ -0,0 +1,354 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "wt_internal.h" + +/* + * This file implements the consolidated array algorithm as described in + * the paper: + * Scalability of write-ahead logging on multicore and multisocket hardware + * by Ryan Johnson, Ippokratis Pandis, Radu Stoica, Manos Athanassoulis + * and Anastasia Ailamaki. + * + * It appeared in The VLDB Journal, DOI 10.1007/s00778-011-0260-8 and can + * be found at: + * http://infoscience.epfl.ch/record/170505/files/aether-smpfulltext.pdf + */ + +/* + * __wt_log_slot_init -- + * Initialize the slot array. + */ +int +__wt_log_slot_init(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_LOGSLOT *slot; + int32_t i; + + conn = S2C(session); + log = conn->log; + for (i = 0; i < SLOT_POOL; i++) { + log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE; + log->slot_pool[i].slot_index = SLOT_INVALID_INDEX; + } + + /* + * Set up the available slots from the pool the first time. + */ + for (i = 0; i < SLOT_ACTIVE; i++) { + slot = &log->slot_pool[i]; + slot->slot_index = (uint32_t)i; + slot->slot_state = WT_LOG_SLOT_READY; + log->slot_array[i] = slot; + } + + /* + * Allocate memory for buffers now that the arrays are setup. Split + * this out to make error handling simpler. + */ + for (i = 0; i < SLOT_POOL; i++) { + WT_ERR(__wt_buf_init(session, + &log->slot_pool[i].slot_buf, WT_LOG_SLOT_BUF_INIT_SIZE)); + F_SET(&log->slot_pool[i], SLOT_BUFFERED); + } + WT_STAT_FAST_CONN_INCRV(session, + log_buffer_size, WT_LOG_SLOT_BUF_INIT_SIZE * SLOT_POOL); + if (0) { +err: while (--i >= 0) + __wt_buf_free(session, &log->slot_pool[i].slot_buf); + } + return (ret); +} + +/* + * __wt_log_slot_destroy -- + * Clean up the slot array on shutdown. 
+ */ +int +__wt_log_slot_destroy(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + int i; + + conn = S2C(session); + log = conn->log; + + for (i = 0; i < SLOT_POOL; i++) + __wt_buf_free(session, &log->slot_pool[i].slot_buf); + return (0); +} + +/* + * __wt_log_slot_join -- + * Join a consolidated logging slot. Callers should be prepared to deal + * with a ENOMEM return - which indicates no slots could accommodate + * the log record. + */ +int +__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, + uint32_t flags, WT_MYSLOT *myslotp) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + WT_LOGSLOT *slot; + int64_t cur_state, new_state, old_state; + uint32_t allocated_slot, slot_grow_attempts; + + conn = S2C(session); + log = conn->log; + slot_grow_attempts = 0; +find_slot: + allocated_slot = __wt_random(session->rnd) % SLOT_ACTIVE; + slot = log->slot_array[allocated_slot]; + old_state = slot->slot_state; +join_slot: + /* + * WT_LOG_SLOT_READY and higher means the slot is available for + * joining. Any other state means it is in use and transitioning + * from the active array. + */ + if (old_state < WT_LOG_SLOT_READY) { + WT_STAT_FAST_CONN_INCR(session, log_slot_transitions); + goto find_slot; + } + /* + * Add in our size to the state and then atomically swap that + * into place if it is still the same value. + */ + new_state = old_state + (int64_t)mysize; + if (new_state < old_state) { + /* Our size doesn't fit here. */ + WT_STAT_FAST_CONN_INCR(session, log_slot_toobig); + goto find_slot; + } + /* + * If the slot buffer isn't big enough to hold this update, mark + * the slot for a buffer size increase and find another slot. 
+ */ + if (new_state > (int64_t)slot->slot_buf.memsize) { + F_SET(slot, SLOT_BUF_GROW); + if (++slot_grow_attempts > 5) { + WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall); + return (ENOMEM); + } + goto find_slot; + } + cur_state = WT_ATOMIC_CAS_VAL8(slot->slot_state, old_state, new_state); + /* + * We lost a race to add our size into this slot. Check the state + * and try again. + */ + if (cur_state != old_state) { + old_state = cur_state; + WT_STAT_FAST_CONN_INCR(session, log_slot_races); + goto join_slot; + } + WT_ASSERT(session, myslotp != NULL); + /* + * We joined this slot. Fill in our information to return to + * the caller. + */ + WT_STAT_FAST_CONN_INCR(session, log_slot_joins); + if (LF_ISSET(WT_LOG_FSYNC)) + F_SET(slot, SLOT_SYNC); + myslotp->slot = slot; + myslotp->offset = (wt_off_t)old_state - WT_LOG_SLOT_READY; + return (0); +} + +/* + * __wt_log_slot_close -- + * Close a slot and do not allow any other threads to join this slot. + * Remove this from the active slot array and move a new slot from + * the pool into its place. Set up the size of this group; + * Must be called with the logging spinlock held. + */ +int +__wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + WT_LOGSLOT *newslot; + int64_t old_state; + int32_t yields; + uint32_t pool_i, switch_fails; + + conn = S2C(session); + log = conn->log; + switch_fails = 0; +retry: + /* + * Find an unused slot in the pool. + */ + pool_i = log->pool_index; + newslot = &log->slot_pool[pool_i]; + if (++log->pool_index >= SLOT_POOL) + log->pool_index = 0; + if (newslot->slot_state != WT_LOG_SLOT_FREE) { + WT_STAT_FAST_CONN_INCR(session, log_slot_switch_fails); + /* + * If it takes a number of attempts to find an available slot + * it's likely all slots are waiting to be released. This + * churn is used to change how long we pause before closing + * the slot - which leads to more consolidation and less churn. 
+ */ + if (++switch_fails % SLOT_POOL == 0 && + switch_fails != 0 && slot->slot_churn < 5) + ++slot->slot_churn; + __wt_yield(); + goto retry; + } else if (slot->slot_churn > 0) { + --slot->slot_churn; + WT_ASSERT(session, slot->slot_churn >= 0); + } + + /* Pause to allow other threads a chance to consolidate. */ + for (yields = slot->slot_churn; yields >= 0; yields--) + __wt_yield(); + + /* + * Swap out the slot we're going to use and put a free one in the + * slot array in its place so that threads can use it right away. + */ + WT_STAT_FAST_CONN_INCR(session, log_slot_closes); + newslot->slot_state = WT_LOG_SLOT_READY; + newslot->slot_index = slot->slot_index; + log->slot_array[newslot->slot_index] = &log->slot_pool[pool_i]; + old_state = WT_ATOMIC_STORE8(slot->slot_state, WT_LOG_SLOT_PENDING); + slot->slot_group_size = (uint64_t)(old_state - WT_LOG_SLOT_READY); + /* + * Note that this statistic may be much bigger than in reality, + * especially when compared with the total bytes written in + * __log_fill. The reason is that this size reflects any + * rounding up that is needed and the total bytes in __log_fill + * is the amount of user bytes. + */ + WT_STAT_FAST_CONN_INCRV(session, + log_slot_consolidated, (uint64_t)slot->slot_group_size); + return (0); +} + +/* + * __wt_log_slot_notify -- + * Notify all threads waiting for the state to be < WT_LOG_SLOT_DONE. + */ +int +__wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) +{ + WT_UNUSED(session); + + slot->slot_state = + (int64_t)WT_LOG_SLOT_DONE - (int64_t)slot->slot_group_size; + return (0); +} + +/* + * __wt_log_slot_wait -- + * Wait for slot leader to allocate log area and tell us our log offset. 
+ */ +int +__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) +{ + WT_UNUSED(session); + + while (slot->slot_state > WT_LOG_SLOT_DONE) + __wt_yield(); + return (0); +} + +/* + * __wt_log_slot_release -- + * Each thread in a consolidated group releases its portion to + * signal it has completed writing its piece of the log. + */ +int64_t +__wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size) +{ + int64_t newsize; + + /* + * Add my size into the state. When it reaches WT_LOG_SLOT_DONE + * all participatory threads have completed copying their piece. + */ + newsize = WT_ATOMIC_ADD8(slot->slot_state, (int64_t)size); + return (newsize); +} + +/* + * __wt_log_slot_free -- + * Free a slot back into the pool. + */ +int +__wt_log_slot_free(WT_LOGSLOT *slot) +{ + slot->slot_state = WT_LOG_SLOT_FREE; + return (0); +} + +/* + * __wt_log_slot_grow_buffers -- + * Increase the buffer size of all available slots in the buffer pool. + * Go to some lengths to include active (but unused) slots to handle + * the case where all log write record sizes exceed the size of the + * active buffer. + */ +int +__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_LOGSLOT *slot; + int64_t orig_state; + uint64_t old_size, total_growth; + int i; + + conn = S2C(session); + log = conn->log; + total_growth = 0; + WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); + /* + * Take the log slot lock to prevent other threads growing buffers + * at the same time. Could tighten the scope of this lock, or have + * a separate lock if there is contention. + */ + __wt_spin_lock(session, &log->log_slot_lock); + for (i = 0; i < SLOT_POOL; i++) { + slot = &log->slot_pool[i]; + /* Avoid atomic operations if they won't succeed. */ + if (slot->slot_state != WT_LOG_SLOT_FREE && + slot->slot_state != WT_LOG_SLOT_READY) + continue; + /* Don't keep growing unrelated buffers. 
*/ + if (slot->slot_buf.memsize > (10 * newsize) && + !F_ISSET(slot, SLOT_BUF_GROW)) + continue; + orig_state = WT_ATOMIC_CAS_VAL8( + slot->slot_state, WT_LOG_SLOT_FREE, WT_LOG_SLOT_PENDING); + if (orig_state != WT_LOG_SLOT_FREE) { + orig_state = WT_ATOMIC_CAS_VAL8(slot->slot_state, + WT_LOG_SLOT_READY, WT_LOG_SLOT_PENDING); + if (orig_state != WT_LOG_SLOT_READY) + continue; + } + + /* We have a slot - now go ahead and grow the buffer. */ + old_size = slot->slot_buf.memsize; + F_CLR(slot, SLOT_BUF_GROW); + WT_ERR(__wt_buf_grow(session, &slot->slot_buf, + WT_MAX(slot->slot_buf.memsize * 2, newsize))); + slot->slot_state = orig_state; + total_growth += slot->slot_buf.memsize - old_size; + } +err: __wt_spin_unlock(session, &log->log_slot_lock); + WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, total_growth); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c new file mode 100644 index 00000000000..f50706fb2e9 --- /dev/null +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -0,0 +1,1519 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +#define WT_FORALL_CURSORS(clsm, c, i) \ + for ((i) = (clsm)->nchunks; (i) > 0;) \ + if (((c) = (clsm)->cursors[--i]) != NULL) + +#define WT_LSM_CURCMP(s, lsm_tree, c1, c2, cmp) \ + __wt_compare(s, (lsm_tree)->collator, &(c1)->key, &(c2)->key, &cmp) + +static int __clsm_lookup(WT_CURSOR_LSM *, WT_ITEM *); +static int __clsm_open_cursors(WT_CURSOR_LSM *, int, u_int, uint32_t); +static int __clsm_reset_cursors(WT_CURSOR_LSM *, WT_CURSOR *); + +/* + * __clsm_enter_update -- + * Make sure an LSM cursor is ready to perform an update. 
+ */ +static int +__clsm_enter_update(WT_CURSOR_LSM *clsm) +{ + WT_CURSOR *primary; + WT_DECL_RET; + WT_LSM_CHUNK *primary_chunk; + WT_LSM_TREE *lsm_tree; + WT_SESSION_IMPL *session; + int have_primary, ovfl, waited; + + lsm_tree = clsm->lsm_tree; + if (clsm->nchunks == 0 || + (primary = clsm->cursors[clsm->nchunks - 1]) == NULL) + return (0); + session = (WT_SESSION_IMPL *)primary->session; + primary_chunk = clsm->primary_chunk; + have_primary = (primary_chunk != NULL && + primary_chunk->switch_txn == WT_TXN_NONE); + ovfl = 0; + + /* + * In LSM there are multiple btrees active at one time. The tree + * switch code needs to use btree API methods, and it wants to + * operate on the btree for the primary chunk. Set that up now. + * + * If the primary chunk has grown too large, set a flag so the worker + * thread will switch when it gets a chance to avoid introducing high + * latency into application threads. Don't do this indefinitely: if a + * chunk grows twice as large as the configured size, block until it + * can be switched. + */ + if (!F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) { + if (have_primary) + WT_WITH_BTREE(session, + ((WT_CURSOR_BTREE *)primary)->btree, + ovfl = __wt_btree_size_overflow( + session, lsm_tree->chunk_size)); + + if (ovfl || !have_primary) { + /* + * Check that we are up-to-date: don't set the switch + * if the tree has changed since we last opened + * cursors: that can lead to switching multiple times + * when only one switch is required, creating very + * small chunks. 
+ */ + WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); + if (clsm->dsk_gen == lsm_tree->dsk_gen && + !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) { + ret = __wt_lsm_manager_push_entry( + session, WT_LSM_WORK_SWITCH, 0, lsm_tree); + F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH); + } + WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree)); + WT_RET(ret); + ovfl = 0; + } + } else if (have_primary) + WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree, + ovfl = __wt_btree_size_overflow( + session, 2 * lsm_tree->chunk_size)); + + /* + * If there is no primary chunk, or it has really overflowed, which + * either means a worker thread has fallen behind or there has just + * been a user-level checkpoint, wait until the tree changes. + * + * We used to switch chunks in the application thread if we got to + * here, but that is problematic because there is a transaction in + * progress and it could roll back, leaving the metadata inconsistent. + */ + if (ovfl || !have_primary) { + for (waited = 0; + clsm->dsk_gen == lsm_tree->dsk_gen; + ++waited) { + if (waited % 100 == 0) + WT_RET(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_SWITCH, 0, lsm_tree)); + __wt_sleep(0, 10); + } + } + + return (0); +} + +/* + * __clsm_enter -- + * Start an operation on an LSM cursor, update if the tree has changed. + */ +static inline int +__clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + uint64_t *switch_txnp; + uint64_t snap_min; + + session = (WT_SESSION_IMPL *)clsm->iface.session; + + /* Merge cursors never update. */ + if (F_ISSET(clsm, WT_CLSM_MERGE)) + return (0); + + if (reset) { + WT_ASSERT(session, !F_ISSET(&clsm->iface, + WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT)); + WT_RET(__clsm_reset_cursors(clsm, NULL)); + } + + for (;;) { + /* + * If the cursor looks up-to-date, check if the cache is full. + * In case this call blocks, the check will be repeated before + * proceeding. 
+ */ + if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen) + goto open; + + WT_RET(__wt_cache_full_check(session)); + + if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen) + goto open; + + /* Update the maximum transaction ID in the primary chunk. */ + if (update) { + WT_RET(__clsm_enter_update(clsm)); + if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen) + goto open; + + /* + * Ensure that there is a transaction snapshot active. + */ + WT_RET(__wt_txn_autocommit_check(session)); + + if (session->txn.isolation == TXN_ISO_SNAPSHOT) + __wt_txn_cursor_op(session); + + /* + * Figure out how many updates are required for + * snapshot isolation. + * + * This is not a normal visibility check on the maximum + * transaction ID in each chunk: any transaction ID + * that overlaps with our snapshot is a potential + * conflict. + */ + clsm->nupdates = 1; + if (session->txn.isolation == TXN_ISO_SNAPSHOT && + F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { + WT_ASSERT(session, + F_ISSET(&session->txn, TXN_HAS_SNAPSHOT)); + snap_min = session->txn.snap_min; + for (switch_txnp = + &clsm->switch_txn[clsm->nchunks - 2]; + clsm->nupdates < clsm->nchunks; + clsm->nupdates++, switch_txnp--) { + if (TXNID_LT(*switch_txnp, snap_min)) + break; + WT_ASSERT(session, + !__wt_txn_visible_all( + session, *switch_txnp)); + } + } + } + + /* + * Stop when we are up-to-date, as long as this is: + * - a snapshot isolation update and the cursor is set up for + * that; + * - an update operation with a primary chunk, or + * - a read operation and the cursor is open for reading. 
+ */ + if ((!update || + session->txn.isolation != TXN_ISO_SNAPSHOT || + F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) && + ((update && clsm->primary_chunk != NULL) || + (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ)))) + break; + +open: WT_WITH_SCHEMA_LOCK(session, + ret = __clsm_open_cursors(clsm, update, 0, 0)); + WT_RET(ret); + } + + if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) { + WT_RET(__cursor_enter(session)); + F_SET(clsm, WT_CLSM_ACTIVE); + } + + return (0); +} + +/* + * __clsm_leave -- + * Finish an operation on an LSM cursor. + */ +static int +__clsm_leave(WT_CURSOR_LSM *clsm) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)clsm->iface.session; + + if (F_ISSET(clsm, WT_CLSM_ACTIVE)) { + WT_RET(__cursor_leave(session)); + F_CLR(clsm, WT_CLSM_ACTIVE); + } + + return (0); +} + +/* + * We need a tombstone to mark deleted records, and we use the special + * value below for that purpose. We use two 0x14 (Device Control 4) bytes to + * minimize the likelihood of colliding with an application-chosen encoding + * byte, if the application uses two leading DC4 byte for some reason, we'll do + * a wasted data copy each time a new value is inserted into the object. + */ +static const WT_ITEM __tombstone = { "\x14\x14", 2, 0, NULL, 0 }; + +/* + * __clsm_deleted -- + * Check whether the current value is a tombstone. + */ +static inline int +__clsm_deleted(WT_CURSOR_LSM *clsm, const WT_ITEM *item) +{ + return (!F_ISSET(clsm, WT_CLSM_MINOR_MERGE) && + item->size == __tombstone.size && + memcmp(item->data, __tombstone.data, __tombstone.size) == 0); +} + +/* + * __clsm_deleted_encode -- + * Encode values that are in the encoded name space. + */ +static inline int +__clsm_deleted_encode(WT_SESSION_IMPL *session, + const WT_ITEM *value, WT_ITEM *final_value, WT_ITEM **tmpp) +{ + WT_ITEM *tmp; + + /* + * If value requires encoding, get a scratch buffer of the right size + * and create a copy of the data with the first byte of the tombstone + * appended. 
+ */ + if (value->size >= __tombstone.size && + memcmp(value->data, __tombstone.data, __tombstone.size) == 0) { + WT_RET(__wt_scr_alloc(session, value->size + 1, tmpp)); + tmp = *tmpp; + + memcpy(tmp->mem, value->data, value->size); + memcpy((uint8_t *)tmp->mem + value->size, __tombstone.data, 1); + final_value->data = tmp->mem; + final_value->size = value->size + 1; + } else { + final_value->data = value->data; + final_value->size = value->size; + } + + return (0); +} + +/* + * __clsm_deleted_decode -- + * Decode values that start with the tombstone. + */ +static inline void +__clsm_deleted_decode(WT_ITEM *value) +{ + /* + * Take care with this check: when an LSM cursor is used for a merge, + * and/or to create a Bloom filter, it is valid to return the tombstone + * value. + */ + if (value->size > __tombstone.size && + memcmp(value->data, __tombstone.data, __tombstone.size) == 0) + --value->size; +} + +/* + * __clsm_close_cursors -- + * Close any btree cursors that are not needed. + */ +static int +__clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end) +{ + WT_BLOOM *bloom; + WT_CURSOR *c; + u_int i; + + if (clsm->cursors == NULL || clsm->nchunks == 0) + return (0); + + /* + * Walk the cursors, closing any we don't need. Note that the exit + * condition here is special, don't use WT_FORALL_CURSORS, and be + * careful with unsigned integer wrapping. + */ + for (i = start; i < end; i++) { + if ((c = (clsm)->cursors[i]) != NULL) { + clsm->cursors[i] = NULL; + WT_RET(c->close(c)); + } + if ((bloom = clsm->blooms[i]) != NULL) { + clsm->blooms[i] = NULL; + WT_RET(__wt_bloom_close(bloom)); + } + } + + return (0); +} + +/* + * __clsm_open_cursors -- + * Open cursors for the current set of files. 
+ */ +static int +__clsm_open_cursors( + WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id) +{ + WT_BTREE *btree; + WT_CURSOR *c, **cp, *primary; + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_TREE *lsm_tree; + WT_SESSION_IMPL *session; + WT_TXN *txn; + const char *checkpoint, *ckpt_cfg[3]; + uint64_t saved_gen; + u_int i, nchunks, ngood, nupdates; + int locked; + + c = &clsm->iface; + session = (WT_SESSION_IMPL *)c->session; + txn = &session->txn; + lsm_tree = clsm->lsm_tree; + chunk = NULL; + + ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor); + ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw"; + ckpt_cfg[2] = NULL; + + /* Copy the key, so we don't lose the cursor position. */ + if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key)) + WT_RET(__wt_buf_set( + session, &c->key, c->key.data, c->key.size)); + + F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV); + + if (update) { + if (txn->isolation == TXN_ISO_SNAPSHOT) + F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT); + } else + F_SET(clsm, WT_CLSM_OPEN_READ); + + WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); + locked = 1; + + /* + * If there is no in-memory chunk in the tree for an update operation, + * create one. + * + * !!! + * It is exceeding unlikely that we get here at all, but if we were to + * switch chunks in this thread and our transaction roll back, it would + * leave the metadata inconsistent. Signal for the LSM worker thread + * to create the chunk instead to avoid the issue. + */ + if (update && (lsm_tree->nchunks == 0 || + (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL || + chunk->switch_txn != WT_TXN_NONE)) { + /* Release our lock because switch will get a write lock. */ + F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH); + locked = 0; + WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree)); + + /* + * Give the worker thread a chance to run before locking the + * tree again -- we will loop in __clsm_enter until there is an + * in-memory chunk in the tree. 
+ */ + __wt_sleep(0, 1000); + WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree)); + locked = 1; + } + + /* Merge cursors have already figured out how many chunks they need. */ +retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { + nchunks = clsm->nchunks; + ngood = 0; + + /* + * We may have raced with another merge completing. Check that + * we're starting at the right offset in the chunk array. + */ + if (start_chunk >= lsm_tree->nchunks || + lsm_tree->chunk[start_chunk]->id != start_id) { + for (start_chunk = 0; + start_chunk < lsm_tree->nchunks; + start_chunk++) { + chunk = lsm_tree->chunk[start_chunk]; + if (chunk->id == start_id) + break; + } + /* We have to find the start chunk: merge locked it. */ + WT_ASSERT(session, start_chunk < lsm_tree->nchunks); + } + + WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks); + } else { + nchunks = lsm_tree->nchunks; + + /* + * If we are only opening the cursor for updates, only open the + * primary chunk, plus any other chunks that might be required + * to detect snapshot isolation conflicts. + */ + if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) + WT_ERR(__wt_realloc_def(session, + &clsm->txnid_alloc, nchunks, + &clsm->switch_txn)); + if (F_ISSET(clsm, WT_CLSM_OPEN_READ)) + ngood = nupdates = 0; + else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { + /* + * Keep going until all updates in the next + * chunk are globally visible. Copy the maximum + * transaction IDs into the cursor as we go. + */ + for (ngood = nchunks - 1, nupdates = 1; + ngood > 0; + ngood--, nupdates++) { + chunk = lsm_tree->chunk[ngood - 1]; + clsm->switch_txn[ngood - 1] = chunk->switch_txn; + if (__wt_txn_visible_all( + session, chunk->switch_txn)) + break; + } + } else { + nupdates = 1; + ngood = nchunks - 1; + } + + /* Check how many cursors are already open. */ + for (cp = clsm->cursors + ngood; + ngood < clsm->nchunks && ngood < nchunks; + cp++, ngood++) { + chunk = lsm_tree->chunk[ngood]; + + /* If the cursor isn't open yet, we're done. 
*/ + if (*cp == NULL) + break; + + /* Easy case: the URIs don't match. */ + if (strcmp((*cp)->uri, chunk->uri) != 0) + break; + + /* Make sure the checkpoint config matches. */ + checkpoint = ((WT_CURSOR_BTREE *)*cp)-> + btree->dhandle->checkpoint; + if (checkpoint == NULL && + F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && + !chunk->empty) + break; + + /* Make sure the Bloom config matches. */ + if (clsm->blooms[ngood] == NULL && + F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) + break; + } + + /* Spurious generation bump? */ + if (ngood == clsm->nchunks && clsm->nchunks == nchunks) { + clsm->dsk_gen = lsm_tree->dsk_gen; + goto err; + } + + /* + * Close any cursors we no longer need. If the cursor is a + * pure update cursor, close everything -- we usually only need + * a single chunk open in that case and we haven't walked all + * of the other slots in the loop above. + * + * Drop the LSM tree lock while we do this: if the cache is + * full, we may block while closing a cursor. Save the + * generation number and retry if it has changed under us. + */ + if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0) + ngood = 0; + if (clsm->cursors != NULL && ngood < clsm->nchunks) { + saved_gen = lsm_tree->dsk_gen; + locked = 0; + WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree)); + WT_ERR(__clsm_close_cursors( + clsm, ngood, clsm->nchunks)); + WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree)); + locked = 1; + if (lsm_tree->dsk_gen != saved_gen) + goto retry; + } + + /* Detach from our old primary. */ + clsm->primary_chunk = NULL; + clsm->current = NULL; + } + + WT_ERR(__wt_realloc_def(session, + &clsm->bloom_alloc, nchunks, &clsm->blooms)); + WT_ERR(__wt_realloc_def(session, + &clsm->cursor_alloc, nchunks, &clsm->cursors)); + + clsm->nchunks = nchunks; + + /* Open the cursors for chunks that have changed. */ + for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) { + chunk = lsm_tree->chunk[i + start_chunk]; + /* Copy the maximum transaction ID. 
*/ + if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) + clsm->switch_txn[i] = chunk->switch_txn; + + /* + * Read from the checkpoint if the file has been written. + * Once all cursors switch, the in-memory tree can be evicted. + */ + WT_ASSERT(session, *cp == NULL); + ret = __wt_open_cursor(session, chunk->uri, c, + (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? + ckpt_cfg : NULL, cp); + + /* + * XXX kludge: we may have an empty chunk where no checkpoint + * was written. If so, try to open the ordinary handle on that + * chunk instead. + */ + if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { + ret = __wt_open_cursor( + session, chunk->uri, c, NULL, cp); + if (ret == 0) + chunk->empty = 1; + } + WT_ERR(ret); + + /* + * Setup all cursors other than the primary to only do conflict + * checks on insert operations. This allows us to execute + * inserts on non-primary chunks as a way of checking for + * write conflicts with concurrent updates. + */ + if (i != nchunks - 1) + (*cp)->insert = __wt_curfile_update_check; + + if (!F_ISSET(clsm, WT_CLSM_MERGE) && + F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) + WT_ERR(__wt_bloom_open(session, chunk->bloom_uri, + lsm_tree->bloom_bit_count, + lsm_tree->bloom_hash_count, + c, &clsm->blooms[i])); + + /* Child cursors always use overwrite and raw mode. */ + F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW); + } + + /* The last chunk is our new primary. */ + if (chunk != NULL && + !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && + chunk->switch_txn == WT_TXN_NONE) { + clsm->primary_chunk = chunk; + primary = clsm->cursors[clsm->nchunks - 1]; + /* + * Disable eviction for the in-memory chunk. Also clear the + * bulk load flag here, otherwise eviction will be enabled by + * the first update. 
+ */ + btree = ((WT_CURSOR_BTREE *)(primary))->btree; + if (btree->bulk_load_ok) { + btree->bulk_load_ok = 0; + WT_WITH_BTREE(session, btree, + __wt_btree_evictable(session, 0)); + } + } + + clsm->dsk_gen = lsm_tree->dsk_gen; + +err: +#ifdef HAVE_DIAGNOSTIC + /* Check that all cursors are open as expected. */ + if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) { + for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) { + chunk = lsm_tree->chunk[i + start_chunk]; + + /* Make sure the cursor is open. */ + WT_ASSERT(session, *cp != NULL); + + /* Easy case: the URIs should match. */ + WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0); + + /* Make sure the checkpoint config matches. */ + checkpoint = ((WT_CURSOR_BTREE *)*cp)-> + btree->dhandle->checkpoint; + WT_ASSERT(session, + (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && + !chunk->empty) ? + checkpoint != NULL : checkpoint == NULL); + + /* Make sure the Bloom config matches. */ + WT_ASSERT(session, + (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) && + !F_ISSET(clsm, WT_CLSM_MERGE)) ? + clsm->blooms[i] != NULL : clsm->blooms[i] == NULL); + } + } +#endif + if (locked) + WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree)); + return (ret); +} + +/* + * __wt_clsm_init_merge -- + * Initialize an LSM cursor for a merge. + */ +int +__wt_clsm_init_merge( + WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks) +{ + WT_CURSOR_LSM *clsm; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + clsm = (WT_CURSOR_LSM *)cursor; + session = (WT_SESSION_IMPL *)cursor->session; + + F_SET(clsm, WT_CLSM_MERGE); + if (start_chunk != 0) + F_SET(clsm, WT_CLSM_MINOR_MERGE); + clsm->nchunks = nchunks; + + WT_WITH_SCHEMA_LOCK(session, + ret = __clsm_open_cursors(clsm, 0, start_chunk, start_id)); + return (ret); +} + +/* + * __clsm_get_current -- + * Find the smallest / largest of the cursors and copy its key/value. 
+ */ +static int +__clsm_get_current( + WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, int smallest, int *deletedp) +{ + WT_CURSOR *c, *current; + int cmp, multiple; + u_int i; + + current = NULL; + multiple = 0; + + WT_FORALL_CURSORS(clsm, c, i) { + if (!F_ISSET(c, WT_CURSTD_KEY_INT)) + continue; + if (current == NULL) { + current = c; + continue; + } + WT_RET(WT_LSM_CURCMP(session, clsm->lsm_tree, c, current, cmp)); + if (smallest ? cmp < 0 : cmp > 0) { + current = c; + multiple = 0; + } else if (cmp == 0) + multiple = 1; + } + + c = &clsm->iface; + if ((clsm->current = current) == NULL) { + F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + return (WT_NOTFOUND); + } + + if (multiple) + F_SET(clsm, WT_CLSM_MULTIPLE); + else + F_CLR(clsm, WT_CLSM_MULTIPLE); + + WT_RET(current->get_key(current, &c->key)); + WT_RET(current->get_value(current, &c->value)); + + F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + if ((*deletedp = __clsm_deleted(clsm, &c->value)) == 0) + F_SET(c, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + + return (0); +} + +/* + * __clsm_compare -- + * WT_CURSOR->compare implementation for the LSM cursor type. + */ +static int +__clsm_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) +{ + WT_CURSOR_LSM *alsm; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + /* There's no need to sync with the LSM tree, avoid WT_LSM_ENTER. */ + alsm = (WT_CURSOR_LSM *)a; + CURSOR_API_CALL(a, session, compare, NULL); + + /* + * Confirm both cursors refer to the same source and have keys, then + * compare the keys. + */ + if (strcmp(a->uri, b->uri) != 0) + WT_ERR_MSG(session, EINVAL, + "comparison method cursors must reference the same object"); + + WT_CURSOR_NEEDKEY(a); + WT_CURSOR_NEEDKEY(b); + + WT_ERR(__wt_compare( + session, alsm->lsm_tree->collator, &a->key, &b->key, cmpp)); + +err: API_END_RET(session, ret); +} + +/* + * __clsm_next -- + * WT_CURSOR->next method for the LSM cursor type. 
+ */ +static int +__clsm_next(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_CURSOR *c; + WT_DECL_RET; + WT_SESSION_IMPL *session; + u_int i; + int check, cmp, deleted; + + clsm = (WT_CURSOR_LSM *)cursor; + + CURSOR_API_CALL(cursor, session, next, NULL); + WT_CURSOR_NOVALUE(cursor); + WT_ERR(__clsm_enter(clsm, 0, 0)); + + /* If we aren't positioned for a forward scan, get started. */ + if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT)) { + F_CLR(clsm, WT_CLSM_MULTIPLE); + WT_FORALL_CURSORS(clsm, c, i) { + if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) { + WT_ERR(c->reset(c)); + ret = c->next(c); + } else if (c != clsm->current) { + c->set_key(c, &cursor->key); + if ((ret = c->search_near(c, &cmp)) == 0) { + if (cmp < 0) + ret = c->next(c); + else if (cmp == 0) { + if (clsm->current == NULL) + clsm->current = c; + else + F_SET(clsm, + WT_CLSM_MULTIPLE); + } + } else + F_CLR(c, WT_CURSTD_KEY_SET); + } + WT_ERR_NOTFOUND_OK(ret); + } + F_SET(clsm, WT_CLSM_ITERATE_NEXT); + F_CLR(clsm, WT_CLSM_ITERATE_PREV); + + /* We just positioned *at* the key, now move. */ + if (clsm->current != NULL) + goto retry; + } else { +retry: /* + * If there are multiple cursors on that key, move them + * forward. + */ + if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) { + check = 0; + WT_FORALL_CURSORS(clsm, c, i) { + if (!F_ISSET(c, WT_CURSTD_KEY_INT)) + continue; + if (check) { + WT_ERR(WT_LSM_CURCMP(session, + clsm->lsm_tree, c, clsm->current, + cmp)); + if (cmp == 0) + WT_ERR_NOTFOUND_OK(c->next(c)); + } + if (c == clsm->current) + check = 1; + } + } + + /* Move the smallest cursor forward. */ + c = clsm->current; + WT_ERR_NOTFOUND_OK(c->next(c)); + } + + /* Find the cursor(s) with the smallest key. 
*/ + if ((ret = __clsm_get_current(session, clsm, 1, &deleted)) == 0 && + deleted) + goto retry; + +err: WT_TRET(__clsm_leave(clsm)); + API_END(session, ret); + if (ret == 0) + __clsm_deleted_decode(&cursor->value); + return (ret); +} + +/* + * __clsm_prev -- + * WT_CURSOR->prev method for the LSM cursor type. + */ +static int +__clsm_prev(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_CURSOR *c; + WT_DECL_RET; + WT_SESSION_IMPL *session; + u_int i; + int check, cmp, deleted; + + clsm = (WT_CURSOR_LSM *)cursor; + + CURSOR_API_CALL(cursor, session, prev, NULL); + WT_CURSOR_NOVALUE(cursor); + WT_ERR(__clsm_enter(clsm, 0, 0)); + + /* If we aren't positioned for a reverse scan, get started. */ + if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_PREV)) { + F_CLR(clsm, WT_CLSM_MULTIPLE); + WT_FORALL_CURSORS(clsm, c, i) { + if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) { + WT_ERR(c->reset(c)); + ret = c->prev(c); + } else if (c != clsm->current) { + c->set_key(c, &cursor->key); + if ((ret = c->search_near(c, &cmp)) == 0) { + if (cmp > 0) + ret = c->prev(c); + else if (cmp == 0) { + if (clsm->current == NULL) + clsm->current = c; + else + F_SET(clsm, + WT_CLSM_MULTIPLE); + } + } + } + WT_ERR_NOTFOUND_OK(ret); + } + F_SET(clsm, WT_CLSM_ITERATE_PREV); + F_CLR(clsm, WT_CLSM_ITERATE_NEXT); + + /* We just positioned *at* the key, now move. */ + if (clsm->current != NULL) + goto retry; + } else { +retry: /* + * If there are multiple cursors on that key, move them + * backwards. + */ + if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) { + check = 0; + WT_FORALL_CURSORS(clsm, c, i) { + if (!F_ISSET(c, WT_CURSTD_KEY_INT)) + continue; + if (check) { + WT_ERR(WT_LSM_CURCMP(session, + clsm->lsm_tree, c, clsm->current, + cmp)); + if (cmp == 0) + WT_ERR_NOTFOUND_OK(c->prev(c)); + } + if (c == clsm->current) + check = 1; + } + } + + /* Move the smallest cursor backwards. */ + c = clsm->current; + WT_ERR_NOTFOUND_OK(c->prev(c)); + } + + /* Find the cursor(s) with the largest key. 
*/ + if ((ret = __clsm_get_current(session, clsm, 0, &deleted)) == 0 && + deleted) + goto retry; + +err: WT_TRET(__clsm_leave(clsm)); + API_END(session, ret); + if (ret == 0) + __clsm_deleted_decode(&cursor->value); + return (ret); +} + +/* + * __clsm_reset_cursors -- + * Reset any positioned chunk cursors. + * + * If the skip parameter is non-NULL, that cursor is about to be used, so + * there is no need to reset it. + */ +static int +__clsm_reset_cursors(WT_CURSOR_LSM *clsm, WT_CURSOR *skip) +{ + WT_CURSOR *c; + WT_DECL_RET; + u_int i; + + /* Fast path if the cursor is not positioned. */ + if ((clsm->current == NULL || clsm->current == skip) && + !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV)) + return (0); + + WT_FORALL_CURSORS(clsm, c, i) { + if (c == skip) + continue; + if (F_ISSET(c, WT_CURSTD_KEY_INT)) + WT_TRET(c->reset(c)); + } + + clsm->current = NULL; + F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV); + + return (ret); +} + +/* + * __clsm_reset -- + * WT_CURSOR->reset method for the LSM cursor type. + */ +static int +__clsm_reset(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + /* + * Don't use the normal __clsm_enter path: that is wasted work when all + * we want to do is give up our position. + */ + clsm = (WT_CURSOR_LSM *)cursor; + CURSOR_API_CALL(cursor, session, reset, NULL); + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + + WT_TRET(__clsm_reset_cursors(clsm, NULL)); + + /* In case we were left positioned, clear that. */ + WT_TRET(__clsm_leave(clsm)); + +err: API_END_RET(session, ret); +} + +/* + * __clsm_lookup -- + * Position an LSM cursor. 
+ */ +static int +__clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value) +{ + WT_BLOOM *bloom; + WT_BLOOM_HASH bhash; + WT_CURSOR *c, *cursor; + WT_DECL_RET; + WT_SESSION_IMPL *session; + u_int i; + int have_hash; + + c = NULL; + cursor = &clsm->iface; + have_hash = 0; + session = (WT_SESSION_IMPL *)cursor->session; + + WT_FORALL_CURSORS(clsm, c, i) { + /* If there is a Bloom filter, see if we can skip the read. */ + bloom = NULL; + if ((bloom = clsm->blooms[i]) != NULL) { + if (!have_hash) { + WT_ERR(__wt_bloom_hash( + bloom, &cursor->key, &bhash)); + have_hash = 1; + } + + ret = __wt_bloom_hash_get(bloom, &bhash); + if (ret == WT_NOTFOUND) { + WT_STAT_FAST_INCR(session, + &clsm->lsm_tree->stats, bloom_miss); + continue; + } else if (ret == 0) + WT_STAT_FAST_INCR(session, + &clsm->lsm_tree->stats, bloom_hit); + WT_ERR(ret); + } + c->set_key(c, &cursor->key); + if ((ret = c->search(c)) == 0) { + WT_ERR(c->get_key(c, &cursor->key)); + WT_ERR(c->get_value(c, value)); + if (__clsm_deleted(clsm, value)) + ret = WT_NOTFOUND; + goto done; + } + WT_ERR_NOTFOUND_OK(ret); + F_CLR(c, WT_CURSTD_KEY_SET); + /* Update stats: the active chunk can't have a bloom filter. */ + if (bloom != NULL) + WT_STAT_FAST_INCR(session, + &clsm->lsm_tree->stats, bloom_false_positive); + else if (clsm->primary_chunk == NULL || i != clsm->nchunks) + WT_STAT_FAST_INCR(session, + &clsm->lsm_tree->stats, lsm_lookup_no_bloom); + } + WT_ERR(WT_NOTFOUND); + +done: +err: F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + if (ret == 0) { + clsm->current = c; + F_SET(cursor, WT_CURSTD_KEY_INT); + if (value == &cursor->value) + F_SET(cursor, WT_CURSTD_VALUE_INT); + } else if (c != NULL) + WT_TRET(c->reset(c)); + + return (ret); +} + +/* + * __clsm_search -- + * WT_CURSOR->search method for the LSM cursor type. 
 */
static int
__clsm_search(WT_CURSOR *cursor)
{
	WT_CURSOR_LSM *clsm;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	clsm = (WT_CURSOR_LSM *)cursor;

	CURSOR_API_CALL(cursor, session, search, NULL);
	WT_CURSOR_NEEDKEY(cursor);
	WT_CURSOR_NOVALUE(cursor);
	WT_ERR(__clsm_enter(clsm, 1, 0));

	ret = __clsm_lookup(clsm, &cursor->value);

err:	WT_TRET(__clsm_leave(clsm));
	API_END(session, ret);
	/* Strip the tombstone-escape encoding before returning the value. */
	if (ret == 0)
		__clsm_deleted_decode(&cursor->value);
	return (ret);
}

/*
 * __clsm_search_near --
 *	WT_CURSOR->search_near method for the LSM cursor type.
 */
static int
__clsm_search_near(WT_CURSOR *cursor, int *exactp)
{
	WT_CURSOR *c, *larger, *smaller;
	WT_CURSOR_LSM *clsm;
	WT_DECL_RET;
	WT_ITEM v;
	WT_SESSION_IMPL *session;
	u_int i;
	int cmp, deleted;

	larger = smaller = NULL;
	clsm = (WT_CURSOR_LSM *)cursor;

	CURSOR_API_CALL(cursor, session, search_near, NULL);
	WT_CURSOR_NEEDKEY(cursor);
	WT_CURSOR_NOVALUE(cursor);
	WT_ERR(__clsm_enter(clsm, 1, 0));
	F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

	/*
	 * search_near is somewhat fiddly: we can't just use a nearby key from
	 * the in-memory chunk because there could be a closer key on disk.
	 *
	 * As we search down the chunks, we stop as soon as we find an exact
	 * match. Otherwise, we maintain the smallest cursor larger than the
	 * search key and the largest cursor smaller than the search key. At
	 * the bottom, we prefer the larger cursor, but if no record is larger,
	 * use the smaller cursor, or if no record at all was found,
	 * WT_NOTFOUND.
	 */
	WT_FORALL_CURSORS(clsm, c, i) {
		c->set_key(c, &cursor->key);
		if ((ret = c->search_near(c, &cmp)) == WT_NOTFOUND) {
			F_CLR(c, WT_CURSTD_KEY_SET);
			ret = 0;
			continue;
		} else if (ret != 0)
			goto err;

		WT_ERR(c->get_value(c, &v));
		deleted = __clsm_deleted(clsm, &v);

		/* Exact, live match: this chunk wins outright. */
		if (cmp == 0 && !deleted) {
			clsm->current = c;
			*exactp = 0;
			goto done;
		}

		/*
		 * Prefer larger cursors.  There are two reasons: (1) we expect
		 * prefix searches to be a common case (as in our own indices);
		 * and (2) we need a way to unambiguously know we have the
		 * "closest" result.
		 */
		if (cmp < 0) {
			if ((ret = c->next(c)) == 0)
				cmp = 1;
			else if (ret == WT_NOTFOUND)
				ret = c->prev(c);
			if (ret != 0)
				goto err;
		}

		/*
		 * If we land on a deleted item, try going forwards or
		 * backwards to find one that isn't deleted.
		 */
		while (deleted && (ret = c->next(c)) == 0) {
			cmp = 1;
			WT_ERR(c->get_value(c, &v));
			deleted = __clsm_deleted(clsm, &v);
		}
		WT_ERR_NOTFOUND_OK(ret);
		while (deleted && (ret = c->prev(c)) == 0) {
			cmp = -1;
			WT_ERR(c->get_value(c, &v));
			deleted = __clsm_deleted(clsm, &v);
		}
		WT_ERR_NOTFOUND_OK(ret);
		if (deleted)
			continue;

		/*
		 * We are trying to find the smallest cursor greater than the
		 * search key, or, if there is no larger key, the largest
		 * cursor smaller than the search key.
		 *
		 * It could happen that one cursor contains both of the closest
		 * records. In that case, we will track it in "larger", and it
		 * will be the one we finally choose.
		 */
		if (cmp > 0) {
			if (larger == NULL)
				larger = c;
			else {
				WT_ERR(WT_LSM_CURCMP(session,
				    clsm->lsm_tree, c, larger, cmp));
				if (cmp < 0) {
					WT_ERR(larger->reset(larger));
					larger = c;
				}
			}
		} else {
			if (smaller == NULL)
				smaller = c;
			else {
				WT_ERR(WT_LSM_CURCMP(session,
				    clsm->lsm_tree, c, smaller, cmp));
				if (cmp > 0) {
					WT_ERR(smaller->reset(smaller));
					smaller = c;
				}
			}
		}

		/* Release any chunk cursor no longer in the running. */
		if (c != smaller && c != larger)
			WT_ERR(c->reset(c));
	}

	if (larger != NULL) {
		clsm->current = larger;
		larger = NULL;
		*exactp = 1;
	} else if (smaller != NULL) {
		clsm->current = smaller;
		smaller = NULL;
		*exactp = -1;
	} else
		ret = WT_NOTFOUND;

done:
err:	WT_TRET(__clsm_leave(clsm));
	API_END(session, ret);
	if (ret == 0) {
		c = clsm->current;
		WT_TRET(c->get_key(c, &cursor->key));
		WT_TRET(c->get_value(c, &cursor->value));
	}
	/* The chosen cursor was NULLed above; these are the losers. */
	if (smaller != NULL)
		WT_TRET(smaller->reset(smaller));
	if (larger != NULL)
		WT_TRET(larger->reset(larger));

	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
	if (ret == 0) {
		F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
		__clsm_deleted_decode(&cursor->value);
	} else
		clsm->current = NULL;

	return (ret);
}

/*
 * __clsm_put --
 *	Put an entry into the in-memory tree, trigger a file switch if
 *	necessary.
 */
static inline int
__clsm_put(WT_SESSION_IMPL *session,
    WT_CURSOR_LSM *clsm, const WT_ITEM *key, const WT_ITEM *value, int position)
{
	WT_CURSOR *c, *primary;
	WT_LSM_TREE *lsm_tree;
	u_int i;

	lsm_tree = clsm->lsm_tree;

	/* Writes require an open primary chunk our transaction can see. */
	WT_ASSERT(session,
	    clsm->primary_chunk != NULL &&
	    (clsm->primary_chunk->switch_txn == WT_TXN_NONE ||
	    TXNID_LE(session->txn.id, clsm->primary_chunk->switch_txn)));

	/*
	 * Clear the existing cursor position.  Don't clear the primary cursor:
	 * we're about to use it anyway.
	 */
	primary = clsm->cursors[clsm->nchunks - 1];
	WT_RET(__clsm_reset_cursors(clsm, primary));

	/* If necessary, set the position for future scans. */
	if (position)
		clsm->current = primary;

	/* Apply the write to the newest nupdates chunks. */
	for (i = 0; i < clsm->nupdates; i++) {
		c = clsm->cursors[(clsm->nchunks - i) - 1];
		c->set_key(c, key);
		c->set_value(c, value);
		WT_RET((position && i == 0) ? c->update(c) : c->insert(c));
	}

	/*
	 * Update the record count.  It is in a shared structure, but it's only
	 * approximate, so don't worry about protecting access.
	 *
	 * Throttle if necessary.  Every 100 update operations on each cursor,
	 * check if throttling is required.  Don't rely only on the shared
	 * counter because it can race, and because for some workloads, there
	 * may not be enough records per chunk to get effective throttling.
	 */
	if ((++clsm->primary_chunk->count % 100 == 0 ||
	    ++clsm->update_count >= 100) &&
	    lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) {
		clsm->update_count = 0;
		WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
		    lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle);
		WT_STAT_FAST_CONN_INCRV(session,
		    lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle);
		WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
		    lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle);
		WT_STAT_FAST_CONN_INCRV(session,
		    lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle);
		__wt_sleep(0,
		    lsm_tree->ckpt_throttle + lsm_tree->merge_throttle);
	}

	return (0);
}

/*
 * __clsm_insert --
 *	WT_CURSOR->insert method for the LSM cursor type.
 */
static int
__clsm_insert(WT_CURSOR *cursor)
{
	WT_CURSOR_LSM *clsm;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_ITEM value;
	WT_SESSION_IMPL *session;

	clsm = (WT_CURSOR_LSM *)cursor;

	CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);
	WT_CURSOR_NEEDKEY(cursor);
	WT_CURSOR_NEEDVALUE(cursor);
	WT_ERR(__clsm_enter(clsm, 0, 1));

	/*
	 * Without overwrite, an existing (undeleted) key is a duplicate:
	 * any lookup result other than WT_NOTFOUND aborts the insert.
	 */
	if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
	    (ret = __clsm_lookup(clsm, &value)) != WT_NOTFOUND) {
		if (ret == 0)
			ret = WT_DUPLICATE_KEY;
		goto err;
	}

	/* Escape values that collide with the tombstone encoding. */
	WT_ERR(__clsm_deleted_encode(session, &cursor->value, &value, &buf));
	ret = __clsm_put(session, clsm, &cursor->key, &value, 0);

err:	__wt_scr_free(&buf);
	WT_TRET(__clsm_leave(clsm));
	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}

/*
 * __clsm_update --
 *	WT_CURSOR->update method for the LSM cursor type.
 */
static int
__clsm_update(WT_CURSOR *cursor)
{
	WT_CURSOR_LSM *clsm;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_ITEM value;
	WT_SESSION_IMPL *session;

	clsm = (WT_CURSOR_LSM *)cursor;

	CURSOR_UPDATE_API_CALL(cursor, session, update, NULL);
	WT_CURSOR_NEEDKEY(cursor);
	WT_CURSOR_NEEDVALUE(cursor);
	WT_ERR(__clsm_enter(clsm, 0, 1));

	/*
	 * With overwrite, update unconditionally; otherwise the key must
	 * already exist (lookup returning 0).
	 */
	if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) ||
	    (ret = __clsm_lookup(clsm, &value)) == 0) {
		WT_ERR(__clsm_deleted_encode(
		    session, &cursor->value, &value, &buf));
		ret = __clsm_put(session, clsm, &cursor->key, &value, 1);
	}

err:	__wt_scr_free(&buf);
	WT_TRET(__clsm_leave(clsm));
	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}

/*
 * __clsm_remove --
 *	WT_CURSOR->remove method for the LSM cursor type.
 */
static int
__clsm_remove(WT_CURSOR *cursor)
{
	WT_CURSOR_LSM *clsm;
	WT_DECL_RET;
	WT_ITEM value;
	WT_SESSION_IMPL *session;

	clsm = (WT_CURSOR_LSM *)cursor;

	CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);
	WT_CURSOR_NEEDKEY(cursor);
	WT_CURSOR_NOVALUE(cursor);
	WT_ERR(__clsm_enter(clsm, 0, 1));

	/*
	 * LSM deletes are writes: insert a tombstone record rather than
	 * removing anything from older chunks.
	 */
	if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) ||
	    (ret = __clsm_lookup(clsm, &value)) == 0)
		ret = __clsm_put(session, clsm, &cursor->key, &__tombstone, 1);

err:	WT_TRET(__clsm_leave(clsm));
	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}

/*
 * __clsm_close --
 *	WT_CURSOR->close method for the LSM cursor type.
 */
static int
__clsm_close(WT_CURSOR *cursor)
{
	WT_CURSOR_LSM *clsm;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	/*
	 * Don't use the normal __clsm_enter path: that is wasted work when
	 * closing, and the cursor may never have been used.
	 */
	clsm = (WT_CURSOR_LSM *)cursor;
	CURSOR_API_CALL(cursor, session, close, NULL);
	WT_TRET(__clsm_close_cursors(clsm, 0, clsm->nchunks));
	__wt_free(session, clsm->blooms);
	__wt_free(session, clsm->cursors);
	__wt_free(session, clsm->switch_txn);

	/* In case we were somehow left positioned, clear that. */
	WT_TRET(__clsm_leave(clsm));

	/* The WT_LSM_TREE owns the URI. */
	cursor->uri = NULL;
	if (clsm->lsm_tree != NULL)
		__wt_lsm_tree_release(session, clsm->lsm_tree);
	WT_TRET(__wt_cursor_close(cursor));

err:	API_END_RET(session, ret);
}

/*
 * __wt_clsm_open --
 *	WT_SESSION->open_cursor method for LSM cursors.
+ */ +int +__wt_clsm_open(WT_SESSION_IMPL *session, + const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) +{ + WT_CONFIG_ITEM cval; + WT_CURSOR_STATIC_INIT(iface, + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __clsm_compare, /* compare */ + __clsm_next, /* next */ + __clsm_prev, /* prev */ + __clsm_reset, /* reset */ + __clsm_search, /* search */ + __clsm_search_near, /* search-near */ + __clsm_insert, /* insert */ + __clsm_update, /* update */ + __clsm_remove, /* remove */ + __clsm_close); /* close */ + WT_CURSOR *cursor; + WT_CURSOR_LSM *clsm; + WT_DECL_RET; + WT_LSM_TREE *lsm_tree; + + clsm = NULL; + cursor = NULL; + + if (!WT_PREFIX_MATCH(uri, "lsm:")) + return (EINVAL); + + WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval)); + if (cval.len != 0) + WT_RET_MSG(session, EINVAL, + "LSM does not support opening by checkpoint"); + + /* Get the LSM tree. */ + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)); + WT_RET(ret); + + WT_ERR(__wt_calloc_def(session, 1, &clsm)); + + cursor = &clsm->iface; + *cursor = iface; + cursor->session = &session->iface; + cursor->uri = lsm_tree->name; + cursor->key_format = lsm_tree->key_format; + cursor->value_format = lsm_tree->value_format; + + WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 0)); + + clsm->lsm_tree = lsm_tree; + + /* + * The tree's dsk_gen starts at one, so starting the cursor on zero + * will force a call into open_cursors on the first operation. 
+ */ + clsm->dsk_gen = 0; + + WT_STATIC_ASSERT(offsetof(WT_CURSOR_LSM, iface) == 0); + WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp)); + + if (0) { +err: __wt_lsm_tree_release(session, lsm_tree); + if (clsm != NULL) { + clsm->lsm_tree = NULL; + WT_TRET(__clsm_close(cursor)); + } + } + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c new file mode 100644 index 00000000000..8f4b3ba49ef --- /dev/null +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -0,0 +1,667 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +static int __lsm_manager_aggressive_update(WT_SESSION_IMPL *, WT_LSM_TREE *); +static int __lsm_manager_run_server(WT_SESSION_IMPL *); +static int __lsm_manager_worker_setup(WT_SESSION_IMPL *); + +static void * __lsm_worker_manager(void *); + +/* + * __wt_lsm_manager_config -- + * Configure the LSM manager. + */ +int +__wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg) +{ + WT_CONNECTION_IMPL *conn; + WT_CONFIG_ITEM cval; + + conn = S2C(session); + + WT_RET(__wt_config_gets(session, cfg, "lsm_manager.merge", &cval)); + if (cval.val) + F_SET(conn, WT_CONN_LSM_MERGE); + WT_RET(__wt_config_gets( + session, cfg, "lsm_manager.worker_thread_max", &cval)); + if (cval.val) + conn->lsm_manager.lsm_workers_max = (uint32_t)cval.val; + return (0); +} + +/* + * __lsm_general_worker_start -- + * Start up all of the general LSM worker threads. + */ +static int +__lsm_general_worker_start(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_LSM_MANAGER *manager; + WT_LSM_WORKER_ARGS *worker_args; + + conn = S2C(session); + manager = &conn->lsm_manager; + + /* + * Start the remaining worker threads. 
+ * This should get more sophisticated in the future - only launching + * as many worker threads as are required to keep up with demand. + */ + WT_ASSERT(session, manager->lsm_workers > 1); + for (; manager->lsm_workers < manager->lsm_workers_max; + manager->lsm_workers++) { + worker_args = + &manager->lsm_worker_cookies[manager->lsm_workers]; + worker_args->work_cond = manager->work_cond; + worker_args->id = manager->lsm_workers; + worker_args->type = + WT_LSM_WORK_BLOOM | + WT_LSM_WORK_DROP | + WT_LSM_WORK_FLUSH | + WT_LSM_WORK_SWITCH; + F_SET(worker_args, WT_LSM_WORKER_RUN); + /* + * Only allow half of the threads to run merges to avoid all + * all workers getting stuck in long-running merge operations. + * Make sure the first worker is allowed, so that there is at + * least one thread capable of running merges. We know the + * first worker is id 2, so set merges on even numbered workers. + */ + if (manager->lsm_workers % 2 == 0) + FLD_SET(worker_args->type, WT_LSM_WORK_MERGE); + WT_RET(__wt_lsm_worker_start(session, worker_args)); + } + return (0); +} + +/* + * __lsm_stop_workers -- + * Stop worker threads until the number reaches the configured amount. + */ +static int +__lsm_stop_workers(WT_SESSION_IMPL *session) +{ + WT_LSM_MANAGER *manager; + WT_LSM_WORKER_ARGS *worker_args; + uint32_t i; + + manager = &S2C(session)->lsm_manager; + /* + * Start at the end of the list of threads and stop them until we + * have the desired number. We want to keep all active threads + * packed at the front of the worker array. + */ + WT_ASSERT(session, manager->lsm_workers != 0); + for (i = manager->lsm_workers - 1; i >= manager->lsm_workers_max; i--) { + worker_args = &manager->lsm_worker_cookies[i]; + /* + * Clear this worker's flag so it stops. 
+ */ + F_CLR(worker_args, WT_LSM_WORKER_RUN); + WT_ASSERT(session, worker_args->tid != 0); + WT_RET(__wt_thread_join(session, worker_args->tid)); + worker_args->tid = 0; + worker_args->type = 0; + worker_args->flags = 0; + manager->lsm_workers--; + /* + * We do not clear the session because they are allocated + * statically when the connection was opened. + */ + } + return (0); +} + +/* + * __wt_lsm_manager_reconfig -- + * Re-configure the LSM manager. + */ +int +__wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg) +{ + WT_LSM_MANAGER *manager; + uint32_t orig_workers; + + manager = &S2C(session)->lsm_manager; + orig_workers = manager->lsm_workers_max; + + WT_RET(__wt_lsm_manager_config(session, cfg)); + /* + * If LSM hasn't started yet, we simply reconfigured the settings + * and we'll let the normal code path start the threads. + */ + if (manager->lsm_workers_max == 0) + return (0); + if (manager->lsm_workers == 0) + return (0); + /* + * If the number of workers has not changed, we're done. + */ + if (orig_workers == manager->lsm_workers_max) + return (0); + /* + * If we want more threads, start them. + */ + if (manager->lsm_workers_max > orig_workers) + return (__lsm_general_worker_start(session)); + + /* + * Otherwise we want to reduce the number of workers. + */ + WT_ASSERT(session, manager->lsm_workers_max < orig_workers); + WT_RET(__lsm_stop_workers(session)); + return (0); +} + +/* + * __wt_lsm_manager_start -- + * Start the LSM management infrastructure. Our queues and locks were + * initialized when the connection was initialized. + */ +int +__wt_lsm_manager_start(WT_SESSION_IMPL *session) +{ + WT_DECL_RET; + WT_LSM_MANAGER *manager; + WT_SESSION_IMPL *worker_session; + uint32_t i; + + manager = &S2C(session)->lsm_manager; + + /* + * We need at least a manager, a switch thread and a generic + * worker. 
+ */ + WT_ASSERT(session, manager->lsm_workers_max > 2); + + /* + * Open sessions for all potential worker threads here - it's not + * safe to have worker threads open/close sessions themselves. + * All the LSM worker threads do their operations on read-only + * files. Use read-uncommitted isolation to avoid keeping + * updates in cache unnecessarily. + */ + for (i = 0; i < WT_LSM_MAX_WORKERS; i++) { + WT_ERR(__wt_open_internal_session( + S2C(session), "lsm-worker", 1, 0, &worker_session)); + worker_session->isolation = TXN_ISO_READ_UNCOMMITTED; + manager->lsm_worker_cookies[i].session = worker_session; + } + + /* Start the LSM manager thread. */ + WT_ERR(__wt_thread_create(session, &manager->lsm_worker_cookies[0].tid, + __lsm_worker_manager, &manager->lsm_worker_cookies[0])); + + F_SET(S2C(session), WT_CONN_SERVER_LSM); + + if (0) { +err: for (i = 0; + (worker_session = + manager->lsm_worker_cookies[i].session) != NULL; + i++) + WT_TRET((&worker_session->iface)->close( + &worker_session->iface, NULL)); + } + return (ret); +} + +/* + * __wt_lsm_manager_free_work_unit -- + * Release an LSM tree work unit. + */ +void +__wt_lsm_manager_free_work_unit( + WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT *entry) +{ + if (entry != NULL) { + WT_ASSERT(session, entry->lsm_tree->queue_ref > 0); + + (void)WT_ATOMIC_SUB4(entry->lsm_tree->queue_ref, 1); + __wt_free(session, entry); + } +} + +/* + * __wt_lsm_manager_destroy -- + * Destroy the LSM manager threads and subsystem. + */ +int +__wt_lsm_manager_destroy(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LSM_MANAGER *manager; + WT_LSM_WORK_UNIT *current, *next; + WT_SESSION *wt_session; + uint32_t i; + uint64_t removed; + + conn = S2C(session); + manager = &conn->lsm_manager; + removed = 0; + + if (manager->lsm_workers > 0) { + /* + * Stop the main LSM manager thread first. + */ + while (F_ISSET(conn, WT_CONN_SERVER_LSM)) + __wt_yield(); + + /* Clean up open LSM handles. 
*/ + ret = __wt_lsm_tree_close_all(session); + + WT_TRET(__wt_thread_join( + session, manager->lsm_worker_cookies[0].tid)); + manager->lsm_worker_cookies[0].tid = 0; + + /* Release memory from any operations left on the queue. */ + for (current = TAILQ_FIRST(&manager->switchqh); + current != NULL; current = next) { + next = TAILQ_NEXT(current, q); + TAILQ_REMOVE(&manager->switchqh, current, q); + ++removed; + __wt_lsm_manager_free_work_unit(session, current); + } + for (current = TAILQ_FIRST(&manager->appqh); + current != NULL; current = next) { + next = TAILQ_NEXT(current, q); + TAILQ_REMOVE(&manager->appqh, current, q); + ++removed; + __wt_lsm_manager_free_work_unit(session, current); + } + for (current = TAILQ_FIRST(&manager->managerqh); + current != NULL; current = next) { + next = TAILQ_NEXT(current, q); + TAILQ_REMOVE(&manager->managerqh, current, q); + ++removed; + __wt_lsm_manager_free_work_unit(session, current); + } + + /* Close all LSM worker sessions. */ + for (i = 0; i < WT_LSM_MAX_WORKERS; i++) { + wt_session = + &manager->lsm_worker_cookies[i].session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + } + } + WT_STAT_FAST_CONN_INCRV(session, + lsm_work_units_discarded, removed); + + /* Free resources that are allocated in connection initialize */ + __wt_spin_destroy(session, &manager->switch_lock); + __wt_spin_destroy(session, &manager->app_lock); + __wt_spin_destroy(session, &manager->manager_lock); + WT_TRET(__wt_cond_destroy(session, &manager->work_cond)); + + return (ret); +} + +/* + * __lsm_manager_aggressive_update -- + * Update the merge aggressiveness for a single LSM tree. 
+ */ +static int +__lsm_manager_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + struct timespec now; + uint64_t chunk_wait, stallms; + u_int new_aggressive; + + WT_RET(__wt_epoch(session, &now)); + stallms = WT_TIMEDIFF(now, lsm_tree->last_flush_ts) / WT_MILLION; + /* + * Get aggressive if more than enough chunks for a merge should have + * been created by now. Use 10 seconds as a default if we don't have an + * estimate. + */ + if (lsm_tree->nchunks > 1) + chunk_wait = stallms / (lsm_tree->chunk_fill_ms == 0 ? + 10000 : lsm_tree->chunk_fill_ms); + else + chunk_wait = 0; + new_aggressive = (u_int)(chunk_wait / lsm_tree->merge_min); + + if (new_aggressive > lsm_tree->merge_aggressiveness) { + WT_RET(__wt_verbose(session, WT_VERB_LSM, + "LSM merge %s got aggressive (old %u new %u), " + "merge_min %d, %u / %" PRIu64, + lsm_tree->name, lsm_tree->merge_aggressiveness, + new_aggressive, lsm_tree->merge_min, stallms, + lsm_tree->chunk_fill_ms)); + lsm_tree->merge_aggressiveness = new_aggressive; + } + return (0); +} + +/* + * __lsm_manager_worker_setup -- + * Do setup owned by the LSM manager thread including starting the worker + * threads. + */ +static int +__lsm_manager_worker_setup(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_LSM_MANAGER *manager; + WT_LSM_WORKER_ARGS *worker_args; + + conn = S2C(session); + manager = &conn->lsm_manager; + + WT_ASSERT(session, manager->lsm_workers == 1); + /* + * The LSM manager is worker[0]. The switch thread is worker[1]. + * Setup and start the switch/drop worker explicitly. + */ + worker_args = &manager->lsm_worker_cookies[1]; + worker_args->work_cond = manager->work_cond; + worker_args->id = manager->lsm_workers++; + worker_args->type = WT_LSM_WORK_DROP | WT_LSM_WORK_SWITCH; + F_SET(worker_args, WT_LSM_WORKER_RUN); + /* Start the switch thread. 
*/ + WT_RET(__wt_lsm_worker_start(session, worker_args)); + WT_RET(__lsm_general_worker_start(session)); + + return (0); +} + +/* + * __lsm_manager_worker_shutdown -- + * Shutdown the LSM manager and worker threads. + */ +static int +__lsm_manager_worker_shutdown(WT_SESSION_IMPL *session) +{ + WT_DECL_RET; + WT_LSM_MANAGER *manager; + u_int i; + + manager = &S2C(session)->lsm_manager; + + /* + * Wait for the rest of the LSM workers to shutdown. Stop at index + * one - since we (the manager) are at index 0. + */ + for (i = 1; i < manager->lsm_workers; i++) { + WT_ASSERT(session, manager->lsm_worker_cookies[i].tid != 0); + WT_TRET(__wt_cond_signal(session, manager->work_cond)); + WT_TRET(__wt_thread_join( + session, manager->lsm_worker_cookies[i].tid)); + } + return (ret); +} + +/* + * __lsm_manager_run_server -- + * Run manager thread operations. + */ +static int +__lsm_manager_run_server(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_LSM_TREE *lsm_tree; + struct timespec now; + uint64_t fillms, pushms; + + conn = S2C(session); + while (F_ISSET(conn, WT_CONN_SERVER_RUN)) { + if (TAILQ_EMPTY(&conn->lsmqh)) { + __wt_sleep(0, 10000); + continue; + } + __wt_sleep(0, 10000); + TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) { + if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) + continue; + WT_RET(__lsm_manager_aggressive_update( + session, lsm_tree)); + WT_RET(__wt_epoch(session, &now)); + pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 0 : + WT_TIMEDIFF( + now, lsm_tree->work_push_ts) / WT_MILLION; + fillms = 3 * lsm_tree->chunk_fill_ms; + if (fillms == 0) + fillms = 10000; + /* + * If the tree appears to not be triggering enough + * LSM maintenance, help it out. Additional work units + * don't hurt, and can be necessary if some work + * units aren't completed for some reason. + * If the tree hasn't been modified, and there are + * more than 1 chunks - try to get the tree smaller + * so queries run faster. 
+ * If we are getting aggressive - ensure there are + * enough work units that we can get chunks merged. + * If we aren't pushing enough work units, compared + * to how often new chunks are being created add some + * more. + */ + if (lsm_tree->queue_ref >= LSM_TREE_MAX_QUEUE) + WT_STAT_FAST_CONN_INCR(session, + lsm_work_queue_max); + else if ((!lsm_tree->modified && + lsm_tree->nchunks > 1) || + (lsm_tree->queue_ref == 0 && + lsm_tree->nchunks > 1) || + (lsm_tree->merge_aggressiveness > 3 && + !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) || + pushms > fillms) { + WT_RET(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_SWITCH, 0, lsm_tree)); + WT_RET(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_DROP, 0, lsm_tree)); + WT_RET(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_FLUSH, 0, lsm_tree)); + WT_RET(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); + WT_RET(__wt_verbose(session, WT_VERB_LSM, + "MGR %s: queue %d mod %d nchunks %d" + " flags 0x%x aggressive %d pushms %" PRIu64 + " fillms %" PRIu64, + lsm_tree->name, lsm_tree->queue_ref, + lsm_tree->modified, lsm_tree->nchunks, + lsm_tree->flags, + lsm_tree->merge_aggressiveness, + pushms, fillms)); + WT_RET(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_MERGE, 0, lsm_tree)); + } + } + } + + return (0); +} + +/* + * __lsm_worker_manager -- + * A thread that manages all open LSM trees, and the shared LSM worker + * threads. 
 */
static void *
__lsm_worker_manager(void *arg)
{
	WT_DECL_RET;
	WT_LSM_WORKER_ARGS *cookie;
	WT_SESSION_IMPL *session;

	cookie = (WT_LSM_WORKER_ARGS *)arg;
	session = cookie->session;

	WT_ERR(__lsm_manager_worker_setup(session));
	WT_ERR(__lsm_manager_run_server(session));
	WT_ERR(__lsm_manager_worker_shutdown(session));

	if (ret != 0) {
err:		__wt_err(session, ret, "LSM worker manager thread error");
	}
	/* Signal __wt_lsm_manager_destroy that the manager has exited. */
	F_CLR(S2C(session), WT_CONN_SERVER_LSM);
	return (NULL);
}

/*
 * __wt_lsm_manager_clear_tree --
 *	Remove all entries for a tree from the LSM manager queues.  This
 *	introduces an inefficiency if LSM trees are being opened and closed
 *	regularly.
 */
int
__wt_lsm_manager_clear_tree(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_LSM_MANAGER *manager;
	WT_LSM_WORK_UNIT *current, *next;
	uint64_t removed;

	manager = &S2C(session)->lsm_manager;
	removed = 0;

	/* Clear out the tree from the switch queue */
	__wt_spin_lock(session, &manager->switch_lock);

	/* Structure the loop so that it's safe to free as we iterate */
	for (current = TAILQ_FIRST(&manager->switchqh);
	    current != NULL; current = next) {
		next = TAILQ_NEXT(current, q);
		if (current->lsm_tree != lsm_tree)
			continue;
		++removed;
		TAILQ_REMOVE(&manager->switchqh, current, q);
		__wt_lsm_manager_free_work_unit(session, current);
	}
	__wt_spin_unlock(session, &manager->switch_lock);
	/* Clear out the tree from the application queue */
	__wt_spin_lock(session, &manager->app_lock);
	for (current = TAILQ_FIRST(&manager->appqh);
	    current != NULL; current = next) {
		next = TAILQ_NEXT(current, q);
		if (current->lsm_tree != lsm_tree)
			continue;
		++removed;
		TAILQ_REMOVE(&manager->appqh, current, q);
		__wt_lsm_manager_free_work_unit(session, current);
	}
	__wt_spin_unlock(session, &manager->app_lock);
	/* Clear out the tree from the manager queue */
	__wt_spin_lock(session, &manager->manager_lock);
	for (current = TAILQ_FIRST(&manager->managerqh);
	    current != NULL; current = next) {
		next = TAILQ_NEXT(current, q);
		if (current->lsm_tree != lsm_tree)
			continue;
		++removed;
		TAILQ_REMOVE(&manager->managerqh, current, q);
		__wt_lsm_manager_free_work_unit(session, current);
	}
	__wt_spin_unlock(session, &manager->manager_lock);
	WT_STAT_FAST_CONN_INCRV(session, lsm_work_units_discarded, removed);
	return (0);
}

/*
 * We assume this is only called from __wt_lsm_manager_pop_entry and we
 * have session, entry and type available to use.  If the queue is empty
 * we may return from the macro.
 *
 * The unlocked TAILQ_EMPTY check is a racy fast path: a concurrent push
 * is simply picked up on a later call.
 */
#define	LSM_POP_ENTRY(qh, qlock, qlen) do {				\
	if (TAILQ_EMPTY(qh))						\
		return (0);						\
	__wt_spin_lock(session, qlock);					\
	TAILQ_FOREACH(entry, (qh), q) {					\
		if (FLD_ISSET(type, entry->type)) {			\
			TAILQ_REMOVE(qh, entry, q);			\
			WT_STAT_FAST_CONN_DECR(session, qlen);		\
			break;						\
		}							\
	}								\
	__wt_spin_unlock(session, (qlock));				\
} while (0)

/*
 * __wt_lsm_manager_pop_entry --
 *	Retrieve the head of the queue, if it matches the requested work
 *	unit type.
 */
int
__wt_lsm_manager_pop_entry(
    WT_SESSION_IMPL *session, uint32_t type, WT_LSM_WORK_UNIT **entryp)
{
	WT_LSM_MANAGER *manager;
	WT_LSM_WORK_UNIT *entry;

	manager = &S2C(session)->lsm_manager;
	*entryp = NULL;
	entry = NULL;

	/*
	 * Pop the entry off the correct queue based on our work type.
	 */
	if (type == WT_LSM_WORK_SWITCH)
		LSM_POP_ENTRY(&manager->switchqh,
		    &manager->switch_lock, lsm_work_queue_switch);
	else if (type == WT_LSM_WORK_MERGE)
		LSM_POP_ENTRY(&manager->managerqh,
		    &manager->manager_lock, lsm_work_queue_manager);
	else
		LSM_POP_ENTRY(&manager->appqh,
		    &manager->app_lock, lsm_work_queue_app);
	if (entry != NULL)
		WT_STAT_FAST_CONN_INCR(session, lsm_work_units_done);
	*entryp = entry;
	return (0);
}

/*
 * Push a work unit onto the appropriate queue.
 * This macro assumes we are
 * called from __wt_lsm_manager_push_entry and we have session and entry
 * available for use.
 */
#define	LSM_PUSH_ENTRY(qh, qlock, qlen) do {				\
	__wt_spin_lock(session, qlock);					\
	TAILQ_INSERT_TAIL((qh), entry, q);				\
	WT_STAT_FAST_CONN_INCR(session, qlen);				\
	__wt_spin_unlock(session, qlock);				\
} while (0)

/*
 * __wt_lsm_manager_push_entry --
 *	Add an entry to the end of the switch queue.
 */
int
__wt_lsm_manager_push_entry(WT_SESSION_IMPL *session,
    uint32_t type, uint32_t flags, WT_LSM_TREE *lsm_tree)
{
	WT_LSM_MANAGER *manager;
	WT_LSM_WORK_UNIT *entry;

	manager = &S2C(session)->lsm_manager;

	/* Record when work was last pushed for this tree. */
	WT_RET(__wt_epoch(session, &lsm_tree->work_push_ts));

	WT_RET(__wt_calloc_def(session, 1, &entry));
	entry->type = type;
	entry->flags = flags;
	entry->lsm_tree = lsm_tree;
	/*
	 * Hold a queue reference on the tree; dropped by
	 * __wt_lsm_manager_free_work_unit.
	 */
	(void)WT_ATOMIC_ADD4(lsm_tree->queue_ref, 1);
	WT_STAT_FAST_CONN_INCR(session, lsm_work_units_created);

	if (type == WT_LSM_WORK_SWITCH)
		LSM_PUSH_ENTRY(&manager->switchqh,
		    &manager->switch_lock, lsm_work_queue_switch);
	else if (type == WT_LSM_WORK_MERGE)
		LSM_PUSH_ENTRY(&manager->managerqh,
		    &manager->manager_lock, lsm_work_queue_manager);
	else
		LSM_PUSH_ENTRY(&manager->appqh,
		    &manager->app_lock, lsm_work_queue_app);

	/* Wake any worker waiting for work. */
	WT_RET(__wt_cond_signal(session, manager->work_cond));

	return (0);
}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_merge.c b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
new file mode 100644
index 00000000000..784837092cd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
@@ -0,0 +1,489 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_lsm_merge_update_tree --
 *	Merge a set of chunks and populate a new one.
 *	Must be called with the LSM lock held.
+ */ +int +__wt_lsm_merge_update_tree(WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree, u_int start_chunk, u_int nchunks, + WT_LSM_CHUNK *chunk) +{ + size_t chunks_after_merge; + u_int i; + + WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks); + + /* Setup the array of obsolete chunks. */ + WT_RET(__wt_realloc_def(session, &lsm_tree->old_alloc, + lsm_tree->nold_chunks + nchunks, &lsm_tree->old_chunks)); + + /* Copy entries one at a time, so we can reuse gaps in the list. */ + for (i = 0; i < nchunks; i++) + lsm_tree->old_chunks[lsm_tree->nold_chunks++] = + lsm_tree->chunk[start_chunk + i]; + + /* Update the current chunk list. */ + chunks_after_merge = lsm_tree->nchunks - (nchunks + start_chunk); + memmove(lsm_tree->chunk + start_chunk + 1, + lsm_tree->chunk + start_chunk + nchunks, + chunks_after_merge * sizeof(*lsm_tree->chunk)); + lsm_tree->nchunks -= nchunks - 1; + memset(lsm_tree->chunk + lsm_tree->nchunks, 0, + (nchunks - 1) * sizeof(*lsm_tree->chunk)); + lsm_tree->chunk[start_chunk] = chunk; + + return (0); +} + +/* + * __wt_lsm_merge -- + * Merge a set of chunks of an LSM tree. + */ +int +__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) +{ + WT_BLOOM *bloom; + WT_CURSOR *dest, *src; + WT_DECL_ITEM(bbuf); + WT_DECL_RET; + WT_ITEM key, value; + WT_LSM_CHUNK *chunk, *previous, *youngest; + uint32_t aggressive, generation, max_gap, max_gen, max_level, start_id; + uint64_t insert_count, record_count, chunk_size; + u_int dest_id, end_chunk, i, merge_max, merge_min, nchunks, start_chunk; + u_int verb; + int create_bloom, locked, in_sync, tret; + const char *cfg[3]; + const char *drop_cfg[] = + { WT_CONFIG_BASE(session, session_drop), "force", NULL }; + + bloom = NULL; + chunk_size = 0; + create_bloom = 0; + dest = src = NULL; + locked = 0; + start_id = 0; + in_sync = 0; + + /* + * If the tree is open read-only or we are compacting, be very + * aggressive. 
Otherwise, we can spend a long time waiting for merges + * to start in read-only applications. + */ + if (!lsm_tree->modified || + F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) + lsm_tree->merge_aggressiveness = 10; + + aggressive = lsm_tree->merge_aggressiveness; + merge_max = (aggressive > 5) ? 100 : lsm_tree->merge_min; + merge_min = (aggressive > 5) ? 2 : lsm_tree->merge_min; + max_gap = (aggressive + 4) / 5; + max_level = (lsm_tree->merge_throttle > 0) ? 0 : id + aggressive; + + /* + * If there aren't any chunks to merge, or some of the chunks aren't + * yet written, we're done. A non-zero error indicates that the worker + * should assume there is no work to do: if there are unwritten chunks, + * the worker should write them immediately. + */ + if (lsm_tree->nchunks < merge_min) + return (WT_NOTFOUND); + + /* + * Use the lsm_tree lock to read the chunks (so no switches occur), but + * avoid holding it while the merge is in progress: that may take a + * long time. + */ + WT_RET(__wt_lsm_tree_writelock(session, lsm_tree)); + + /* + * Only include chunks that already have a Bloom filter or are the + * result of a merge and not involved in a merge. + */ + for (end_chunk = lsm_tree->nchunks - 1; end_chunk > 0; --end_chunk) { + chunk = lsm_tree->chunk[end_chunk]; + WT_ASSERT(session, chunk != NULL); + if (F_ISSET(chunk, WT_LSM_CHUNK_MERGING)) + continue; + if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) || chunk->generation > 0) + break; + else if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && + F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) + break; + } + + /* + * Give up immediately if there aren't enough on disk chunks in the + * tree for a merge. + */ + if (end_chunk < merge_min - 1) { + WT_RET(__wt_lsm_tree_writeunlock(session, lsm_tree)); + return (WT_NOTFOUND); + } + + /* + * Look for the most efficient merge we can do. We define efficiency + * as collapsing as many levels as possible while processing the + * smallest number of rows. 
+ *
+ * We make a distinction between "major" and "minor" merges. The
+ * difference is whether the oldest chunk is involved: if it is, we can
+ * discard tombstones, because there can be no older record marked
+ * deleted.
+ *
+ * Respect the configured limit on the number of chunks to merge: start
+ * with the most recent set of chunks and work backwards until going
+ * further becomes significantly less efficient.
+ */
+	for (start_chunk = end_chunk + 1, record_count = 0;
+	    start_chunk > 0; ) {
+		chunk = lsm_tree->chunk[start_chunk - 1];
+		youngest = lsm_tree->chunk[end_chunk];
+		nchunks = (end_chunk + 1) - start_chunk;
+
+		/*
+		 * If the chunk is already involved in a merge or a Bloom
+		 * filter is being built for it, stop.
+		 */
+		if (F_ISSET(chunk, WT_LSM_CHUNK_MERGING) || chunk->bloom_busy)
+			break;
+
+		/*
+		 * Look for small merges before trying a big one: some threads
+		 * should stay in low levels until we get more aggressive.
+		 */
+		if (chunk->generation > max_level)
+			break;
+
+		/*
+		 * If the size of the chunks selected so far exceeds the
+		 * configured maximum chunk size, stop. Keep going if we can
+		 * slide the window further into the tree: we don't want to
+		 * leave small chunks in the middle.
+		 */
+		if ((chunk_size += chunk->size) > lsm_tree->chunk_max)
+			if (nchunks < merge_min ||
+			    (chunk->generation > youngest->generation &&
+			    chunk_size - youngest->size > lsm_tree->chunk_max))
+				break;
+
+		/*
+		 * If we have enough chunks for a merge and the next chunk is
+		 * in too high a generation, stop.
+		 */
+		if (nchunks >= merge_min) {
+			previous = lsm_tree->chunk[start_chunk];
+			max_gen = youngest->generation + max_gap;
+			if (previous->generation <= max_gen &&
+			    chunk->generation > max_gen)
+				break;
+		}
+
+		F_SET(chunk, WT_LSM_CHUNK_MERGING);
+		record_count += chunk->count;
+		--start_chunk;
+
+		/*
+		 * If we have a full window, or the merge would be too big,
+		 * remove the youngest chunk.
+		 */
+		if (nchunks == merge_max ||
+		    chunk_size > lsm_tree->chunk_max) {
+			WT_ASSERT(session,
+			    F_ISSET(youngest, WT_LSM_CHUNK_MERGING));
+			F_CLR(youngest, WT_LSM_CHUNK_MERGING);
+			record_count -= youngest->count;
+			chunk_size -= youngest->size;
+			--end_chunk;
+		}
+	}
+
+	nchunks = (end_chunk + 1) - start_chunk;
+	WT_ASSERT(session, nchunks <= merge_max);
+
+	if (nchunks > 0) {
+		WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
+		for (i = 0; i < nchunks; i++) {
+			chunk = lsm_tree->chunk[start_chunk + i];
+			WT_ASSERT(session,
+			    F_ISSET(chunk, WT_LSM_CHUNK_MERGING));
+		}
+
+		chunk = lsm_tree->chunk[start_chunk];
+		youngest = lsm_tree->chunk[end_chunk];
+		start_id = chunk->id;
+
+		/*
+		 * Don't do merges that are too small or across too many
+		 * generations.  Clear the MERGING flags we set above before
+		 * abandoning the window.
+		 */
+		if (nchunks < merge_min ||
+		    chunk->generation > youngest->generation + max_gap) {
+			for (i = 0; i < nchunks; i++) {
+				chunk = lsm_tree->chunk[start_chunk + i];
+				WT_ASSERT(session,
+				    F_ISSET(chunk, WT_LSM_CHUNK_MERGING));
+				F_CLR(chunk, WT_LSM_CHUNK_MERGING);
+			}
+			nchunks = 0;
+		}
+	}
+
+	/* Find the merge generation. */
+	for (generation = 0, i = 0; i < nchunks; i++)
+		generation = WT_MAX(generation,
+		    lsm_tree->chunk[start_chunk + i]->generation + 1);
+
+	WT_RET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+	if (nchunks == 0)
+		return (WT_NOTFOUND);
+
+	/* Allocate an ID for the merge. */
+	dest_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+
+	/*
+	 * We only want to do the chunk loop if we're running with verbose,
+	 * so we wrap these statements in the conditional. Avoid the loop
+	 * in the normal path.
+	 */
+	if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
+		WT_RET(__wt_verbose(session, WT_VERB_LSM,
+		    "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
+		    ", generation %" PRIu32,
+		    lsm_tree->name,
+		    start_chunk, end_chunk, dest_id, record_count, generation));
+		for (verb = start_chunk; verb <= end_chunk; verb++)
+			WT_RET(__wt_verbose(session, WT_VERB_LSM,
+			    "%s: Chunk[%u] id %u",
+			    lsm_tree->name, verb, lsm_tree->chunk[verb]->id));
+	}
+
+	WT_RET(__wt_calloc_def(session, 1, &chunk));
+	chunk->id = dest_id;
+
+	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
+	    (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
+	    start_chunk > 0) && record_count > 0)
+		create_bloom = 1;
+
+	/*
+	 * Special setup for the merge cursor:
+	 * first, reset to open the dependent cursors;
+	 * then restrict the cursor to a specific number of chunks;
+	 * then set MERGE so the cursor doesn't track updates to the tree.
+	 */
+	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
+	F_SET(src, WT_CURSTD_RAW);
+	WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));
+
+	WT_WITH_SCHEMA_LOCK(session,
+	    ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
+	WT_ERR(ret);
+	if (create_bloom) {
+		WT_ERR(__wt_lsm_tree_bloom_name(
+		    session, lsm_tree, chunk->id, &chunk->bloom_uri));
+
+		WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
+		    lsm_tree->bloom_config,
+		    record_count, lsm_tree->bloom_bit_count,
+		    lsm_tree->bloom_hash_count, &bloom));
+	}
+
+	/* Discard pages we read as soon as we're done with them.
 */
+	F_SET(session, WT_SESSION_NO_CACHE);
+
+	cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+	cfg[1] = "bulk,raw,skip_sort_check";
+	cfg[2] = NULL;
+	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
+
+/* How often (in records) to check for tree close/switch during the copy. */
+#define LSM_MERGE_CHECK_INTERVAL 1000
+	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
+		if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
+			if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+				WT_ERR(EINTR);
+			/*
+			 * Help out with switching chunks in case the
+			 * checkpoint worker is busy.
+			 */
+			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+				WT_WITH_SCHEMA_LOCK(session, ret =
+				    __wt_lsm_tree_switch(session, lsm_tree));
+				WT_ERR(ret);
+			}
+			WT_STAT_FAST_CONN_INCRV(session,
+			    lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
+			++lsm_tree->merge_progressing;
+		}
+
+		WT_ERR(src->get_key(src, &key));
+		dest->set_key(dest, &key);
+		WT_ERR(src->get_value(src, &value));
+		dest->set_value(dest, &value);
+		WT_ERR(dest->insert(dest));
+		if (create_bloom)
+			WT_ERR(__wt_bloom_insert(bloom, &key));
+	}
+	WT_ERR_NOTFOUND_OK(ret);
+
+	WT_STAT_FAST_CONN_INCRV(session,
+	    lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
+	++lsm_tree->merge_progressing;
+	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+	    "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
+	    record_count, insert_count));
+
+	/*
+	 * Closing and syncing the files can take a while. Set the
+	 * merge_syncing field so that compact knows it is still in
+	 * progress.
+	 */
+	(void)WT_ATOMIC_ADD4(lsm_tree->merge_syncing, 1);
+	in_sync = 1;
+	/*
+	 * We've successfully created the new chunk. Now install it. We need
+	 * to ensure that the NO_CACHE flag is cleared and the bloom filter
+	 * is closed (even if a step fails), so track errors but don't return
+	 * until we've cleaned up.
+	 */
+	WT_TRET(src->close(src));
+	WT_TRET(dest->close(dest));
+	src = dest = NULL;
+
+	F_CLR(session, WT_SESSION_NO_CACHE);
+
+	/*
+	 * We're doing advisory reads to fault the new trees into cache.
+	 * Don't block if the cache is full: our next unit of work may be to
+	 * discard some trees to free space.
+	 */
+	F_SET(session, WT_SESSION_NO_CACHE_CHECK);
+
+	if (create_bloom) {
+		if (ret == 0)
+			WT_TRET(__wt_bloom_finalize(bloom));
+
+		/*
+		 * Read in a key to make sure the Bloom filters btree handle is
+		 * open before it becomes visible to application threads.
+		 * Otherwise application threads will stall while it is opened
+		 * and internal pages are read into cache.
+		 */
+		if (ret == 0) {
+			WT_CLEAR(key);
+			WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
+		}
+
+		WT_TRET(__wt_bloom_close(bloom));
+		bloom = NULL;
+	}
+	WT_ERR(ret);
+
+	/*
+	 * Open a handle on the new chunk before application threads attempt
+	 * to access it, opening it pre-loads internal pages into the file
+	 * system cache.
+	 */
+	cfg[1] = "checkpoint=" WT_CHECKPOINT;
+	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
+	WT_TRET(dest->close(dest));
+	dest = NULL;
+	++lsm_tree->merge_progressing;
+	(void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
+	in_sync = 0;
+	WT_ERR_NOTFOUND_OK(ret);
+
+	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
+	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+	locked = 1;
+
+	/*
+	 * Check whether we raced with another merge, and adjust the chunk
+	 * array offset as necessary.
+	 */
+	if (start_chunk >= lsm_tree->nchunks ||
+	    lsm_tree->chunk[start_chunk]->id != start_id)
+		for (start_chunk = 0;
+		    start_chunk < lsm_tree->nchunks;
+		    start_chunk++)
+			if (lsm_tree->chunk[start_chunk]->id == start_id)
+				break;
+
+	/*
+	 * It is safe to error out here - since the update can only fail
+	 * prior to making updates to the tree.
+	 */
+	WT_ERR(__wt_lsm_merge_update_tree(
+	    session, lsm_tree, start_chunk, nchunks, chunk));
+
+	if (create_bloom)
+		F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+	chunk->count = insert_count;
+	chunk->generation = generation;
+	F_SET(chunk, WT_LSM_CHUNK_ONDISK);
+
+	/*
+	 * We have no current way of continuing if the metadata update fails,
+	 * so we will panic in that case. Put some effort into cleaning up
+	 * after ourselves here - so things have a chance of shutting down.
+	 *
+	 * Any errors that happened after the tree was locked are
+	 * fatal - we can't guarantee the state of the tree.
+	 */
+	if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
+		WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");
+
+	lsm_tree->dsk_gen++;
+
+	/* Update the throttling while holding the tree lock. */
+	__wt_lsm_tree_throttle(session, lsm_tree, 1);
+
+	/* Schedule a pass to discard old chunks */
+	WT_ERR(__wt_lsm_manager_push_entry(
+	    session, WT_LSM_WORK_DROP, 0, lsm_tree));
+
+err:	if (locked)
+		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+	if (in_sync)
+		(void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
+	if (src != NULL)
+		WT_TRET(src->close(src));
+	if (dest != NULL)
+		WT_TRET(dest->close(dest));
+	if (bloom != NULL)
+		WT_TRET(__wt_bloom_close(bloom));
+	__wt_scr_free(&bbuf);
+	if (ret != 0) {
+		/* Drop the newly-created files on error.
 */
+		WT_WITH_SCHEMA_LOCK(session,
+		    tret = __wt_schema_drop(session, chunk->uri, drop_cfg));
+		WT_TRET(tret);
+		if (create_bloom) {
+			WT_WITH_SCHEMA_LOCK(session, tret = __wt_schema_drop(
+			    session, chunk->bloom_uri, drop_cfg));
+			WT_TRET(tret);
+		}
+		__wt_free(session, chunk->bloom_uri);
+		__wt_free(session, chunk->uri);
+		__wt_free(session, chunk);
+
+		if (ret == EINTR)
+			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
+			    "Merge aborted due to close"));
+		else
+			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
+			    "Merge failed with %s", wiredtiger_strerror(ret)));
+	}
+	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_meta.c b/src/third_party/wiredtiger/src/lsm/lsm_meta.c
new file mode 100644
index 00000000000..fbb5a9958d5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_meta.c
@@ -0,0 +1,238 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_lsm_meta_read --
+ *	Read the metadata for an LSM tree.
+ */
+int
+__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+	WT_CONFIG cparser, lparser;
+	WT_CONFIG_ITEM ck, cv, lk, lv;
+	WT_DECL_RET;
+	WT_LSM_CHUNK *chunk;
+	WT_NAMED_COLLATOR *ncoll;
+	const char *lsmconfig;
+	u_int nchunks;
+
+	chunk = NULL;			/* -Wconditional-uninitialized */
+
+	WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig));
+	WT_ERR(__wt_config_init(session, &cparser, lsmconfig));
+	while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
+		if (WT_STRING_MATCH("key_format", ck.str, ck.len)) {
+			__wt_free(session, lsm_tree->key_format);
+			WT_ERR(__wt_strndup(session,
+			    cv.str, cv.len, &lsm_tree->key_format));
+		} else if (WT_STRING_MATCH("value_format", ck.str, ck.len)) {
+			__wt_free(session, lsm_tree->value_format);
+			WT_ERR(__wt_strndup(session,
+			    cv.str, cv.len, &lsm_tree->value_format));
+		} else if (WT_STRING_MATCH("collator", ck.str, ck.len)) {
+			if (cv.len == 0)
+				continue;
+			/* Resolve the collator name against registered ones. */
+			TAILQ_FOREACH(ncoll, &S2C(session)->collqh, q) {
+				if (WT_STRING_MATCH(
+				    ncoll->name, cv.str, cv.len)) {
+					lsm_tree->collator = ncoll->collator;
+					break;
+				}
+			}
+			if (lsm_tree->collator == NULL)
+				WT_ERR_MSG(session, EINVAL,
+				    "unknown collator '%.*s'",
+				    (int)cv.len, cv.str);
+			WT_ERR(__wt_strndup(session,
+			    cv.str, cv.len, &lsm_tree->collator_name));
+		} else if (WT_STRING_MATCH("bloom_config", ck.str, ck.len)) {
+			__wt_free(session, lsm_tree->bloom_config);
+			/* Don't include the brackets. */
+			WT_ERR(__wt_strndup(session,
+			    cv.str + 1, cv.len - 2, &lsm_tree->bloom_config));
+		} else if (WT_STRING_MATCH("file_config", ck.str, ck.len)) {
+			__wt_free(session, lsm_tree->file_config);
+			/* Don't include the brackets.
 */
+			WT_ERR(__wt_strndup(session,
+			    cv.str + 1, cv.len - 2, &lsm_tree->file_config));
+		} else if (WT_STRING_MATCH("auto_throttle", ck.str, ck.len)) {
+			if (cv.val)
+				F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
+			else
+				F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
+		} else if (WT_STRING_MATCH("bloom", ck.str, ck.len))
+			lsm_tree->bloom = (uint32_t)cv.val;
+		else if (WT_STRING_MATCH("bloom_bit_count", ck.str, ck.len))
+			lsm_tree->bloom_bit_count = (uint32_t)cv.val;
+		else if (WT_STRING_MATCH("bloom_hash_count", ck.str, ck.len))
+			lsm_tree->bloom_hash_count = (uint32_t)cv.val;
+		else if (WT_STRING_MATCH("chunk_max", ck.str, ck.len))
+			lsm_tree->chunk_max = (uint64_t)cv.val;
+		else if (WT_STRING_MATCH("chunk_size", ck.str, ck.len))
+			lsm_tree->chunk_size = (uint64_t)cv.val;
+		else if (WT_STRING_MATCH("merge_max", ck.str, ck.len))
+			lsm_tree->merge_max = (uint32_t)cv.val;
+		else if (WT_STRING_MATCH("merge_min", ck.str, ck.len))
+			lsm_tree->merge_min = (uint32_t)cv.val;
+		else if (WT_STRING_MATCH("last", ck.str, ck.len))
+			lsm_tree->last = (u_int)cv.val;
+		else if (WT_STRING_MATCH("chunks", ck.str, ck.len)) {
+			/*
+			 * NOTE(review): the sub-parse relies on "id" appearing
+			 * before any "bloom"/"chunk_size"/"count"/"generation"
+			 * key for the same chunk, since those assign through
+			 * the 'chunk' pointer allocated by the "id" branch —
+			 * a property of the writer's output order; confirm
+			 * against __wt_lsm_meta_write.
+			 */
+			WT_ERR(__wt_config_subinit(session, &lparser, &cv));
+			for (nchunks = 0; (ret =
+			    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
+				if (WT_STRING_MATCH("id", lk.str, lk.len)) {
+					WT_ERR(__wt_realloc_def(session,
+					    &lsm_tree->chunk_alloc,
+					    nchunks + 1, &lsm_tree->chunk));
+					WT_ERR(__wt_calloc_def(
+					    session, 1, &chunk));
+					lsm_tree->chunk[nchunks++] = chunk;
+					chunk->id = (uint32_t)lv.val;
+					WT_ERR(__wt_lsm_tree_chunk_name(session,
+					    lsm_tree, chunk->id, &chunk->uri));
+					F_SET(chunk,
+					    WT_LSM_CHUNK_ONDISK |
+					    WT_LSM_CHUNK_STABLE);
+				} else if (WT_STRING_MATCH(
+				    "bloom", lk.str, lk.len)) {
+					WT_ERR(__wt_lsm_tree_bloom_name(
+					    session, lsm_tree,
+					    chunk->id, &chunk->bloom_uri));
+					F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+					continue;
+				} else if (WT_STRING_MATCH(
+				    "chunk_size", lk.str, lk.len)) {
+					chunk->size = (uint64_t)lv.val;
+					continue;
+				} else if (WT_STRING_MATCH(
+				    "count",
lk.str, lk.len)) {
+					chunk->count = (uint64_t)lv.val;
+					continue;
+				} else if (WT_STRING_MATCH(
+				    "generation", lk.str, lk.len)) {
+					chunk->generation = (uint32_t)lv.val;
+					continue;
+				}
+			}
+			WT_ERR_NOTFOUND_OK(ret);
+			lsm_tree->nchunks = nchunks;
+		} else if (WT_STRING_MATCH("old_chunks", ck.str, ck.len)) {
+			WT_ERR(__wt_config_subinit(session, &lparser, &cv));
+			for (nchunks = 0; (ret =
+			    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
+				/*
+				 * NOTE(review): a "bloom" entry attaches to
+				 * the most recently allocated old chunk; the
+				 * writer emits it immediately after the chunk
+				 * URI, so 'chunk' is expected to be non-NULL
+				 * here — confirm against __wt_lsm_meta_write.
+				 */
+				if (WT_STRING_MATCH("bloom", lk.str, lk.len)) {
+					WT_ERR(__wt_strndup(session,
+					    lv.str, lv.len, &chunk->bloom_uri));
+					F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+					continue;
+				}
+				WT_ERR(__wt_realloc_def(session,
+				    &lsm_tree->old_alloc, nchunks + 1,
+				    &lsm_tree->old_chunks));
+				WT_ERR(__wt_calloc_def(session, 1, &chunk));
+				lsm_tree->old_chunks[nchunks++] = chunk;
+				WT_ERR(__wt_strndup(session,
+				    lk.str, lk.len, &chunk->uri));
+				F_SET(chunk, WT_LSM_CHUNK_ONDISK);
+			}
+			WT_ERR_NOTFOUND_OK(ret);
+			lsm_tree->nold_chunks = nchunks;
+			/* Values included for backward compatibility */
+		} else if (WT_STRING_MATCH("merge_threads", ck.str, ck.len)) {
+		} else
+			WT_ERR(__wt_illegal_value(session, "LSM metadata"));
+	}
+	WT_ERR_NOTFOUND_OK(ret);
+
+	/*
+	 * If the default merge_min was not overridden, calculate it now. We
+	 * do this here so that trees created before merge_min was added get a
+	 * sane value.
+	 */
+	if (lsm_tree->merge_min < 2)
+		lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2);
+
+err:	__wt_free(session, lsmconfig);
+	return (ret);
+}
+
+/*
+ * __wt_lsm_meta_write --
+ *	Write the metadata for an LSM tree.
+ */
+int
+__wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
+	WT_LSM_CHUNK *chunk;
+	u_int i;
+	int first;
+
+	/* Serialize the tree configuration into a scratch buffer. */
+	WT_RET(__wt_scr_alloc(session, 0, &buf));
+	WT_ERR(__wt_buf_fmt(session, buf,
+	    "key_format=%s,value_format=%s,bloom_config=(%s),file_config=(%s)",
+	    lsm_tree->key_format, lsm_tree->value_format,
+	    lsm_tree->bloom_config, lsm_tree->file_config));
+	if (lsm_tree->collator_name != NULL)
+		WT_ERR(__wt_buf_catfmt(
+		    session, buf, ",collator=%s", lsm_tree->collator_name));
+	WT_ERR(__wt_buf_catfmt(session, buf,
+	    ",last=%" PRIu32
+	    ",chunk_max=%" PRIu64
+	    ",chunk_size=%" PRIu64
+	    ",auto_throttle=%" PRIu32
+	    ",merge_max=%" PRIu32
+	    ",merge_min=%" PRIu32
+	    ",bloom=%" PRIu32
+	    ",bloom_bit_count=%" PRIu32
+	    ",bloom_hash_count=%" PRIu32,
+	    lsm_tree->last, lsm_tree->chunk_max, lsm_tree->chunk_size,
+	    F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) ? 1 : 0,
+	    lsm_tree->merge_max, lsm_tree->merge_min, lsm_tree->bloom,
+	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count));
+	/* Current chunks: "id" must come first, the reader keys on it. */
+	WT_ERR(__wt_buf_catfmt(session, buf, ",chunks=["));
+	for (i = 0; i < lsm_tree->nchunks; i++) {
+		chunk = lsm_tree->chunk[i];
+		if (i > 0)
+			WT_ERR(__wt_buf_catfmt(session, buf, ","));
+		WT_ERR(__wt_buf_catfmt(session, buf, "id=%" PRIu32, chunk->id));
+		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+			WT_ERR(__wt_buf_catfmt(session, buf, ",bloom"));
+		if (chunk->size != 0)
+			WT_ERR(__wt_buf_catfmt(session, buf,
+			    ",chunk_size=%" PRIu64, chunk->size));
+		if (chunk->count != 0)
+			WT_ERR(__wt_buf_catfmt(
+			    session, buf, ",count=%" PRIu64, chunk->count));
+		WT_ERR(__wt_buf_catfmt(
+		    session, buf, ",generation=%" PRIu32, chunk->generation));
+	}
+	WT_ERR(__wt_buf_catfmt(session, buf, "]"));
+	WT_ERR(__wt_buf_catfmt(session, buf, ",old_chunks=["));
+	first = 1;
+	for (i = 0; i < lsm_tree->nold_chunks; i++) {
+		chunk = lsm_tree->old_chunks[i];
+		WT_ASSERT(session, chunk != NULL);
+		if (first)
+			first = 0;
+		else
+			WT_ERR(__wt_buf_catfmt(session, buf, ","));
+
	WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri));
+		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+			WT_ERR(__wt_buf_catfmt(
+			    session, buf, ",bloom=\"%s\"", chunk->bloom_uri));
+	}
+	WT_ERR(__wt_buf_catfmt(session, buf, "]"));
+	ret = __wt_metadata_update(session, lsm_tree->name, buf->data);
+	WT_ERR(ret);
+
+err:	__wt_scr_free(&buf);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
new file mode 100644
index 00000000000..dc7d17e7a2c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __lsm_stat_init --
+ *	Initialize a LSM statistics structure.
+ */
+static int
+__lsm_stat_init(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cst)
+{
+	WT_CURSOR *stat_cursor;
+	WT_DECL_ITEM(uribuf);
+	WT_DECL_RET;
+	WT_DSRC_STATS *new, *stats;
+	WT_LSM_CHUNK *chunk;
+	WT_LSM_TREE *lsm_tree;
+	u_int i;
+	int locked;
+	char config[64];
+	const char *cfg[] = {
+	    WT_CONFIG_BASE(session, session_open_cursor), NULL, NULL };
+	const char *disk_cfg[] = {
+	    WT_CONFIG_BASE(session, session_open_cursor),
+	    "checkpoint=" WT_CHECKPOINT, NULL, NULL };
+
+	locked = 0;
+	WT_RET(__wt_lsm_tree_get(session, uri, 0, &lsm_tree));
+	WT_ERR(__wt_scr_alloc(session, 0, &uribuf));
+
+	/* Propagate all, fast and/or clear to the cursors we open. */
+	if (!F_ISSET(cst, WT_CONN_STAT_NONE)) {
+		(void)snprintf(config, sizeof(config),
+		    "statistics=(%s%s%s)",
+		    F_ISSET(cst, WT_CONN_STAT_CLEAR) ? "clear," : "",
+		    F_ISSET(cst, WT_CONN_STAT_ALL) ? "all," : "",
+		    !F_ISSET(cst, WT_CONN_STAT_ALL) &&
+		    F_ISSET(cst, WT_CONN_STAT_FAST) ?
"fast," : "");
+		cfg[1] = disk_cfg[1] = config;
+	}
+
+	/*
+	 * Set the cursor to reference the data source statistics; we don't
+	 * initialize it, instead we copy (rather than aggregate), the first
+	 * chunk's statistics, which has the same effect.
+	 */
+	stats = &cst->u.dsrc_stats;
+
+	/* Hold the LSM lock so that we can safely walk through the chunks. */
+	WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
+	locked = 1;
+
+	/*
+	 * For each chunk, aggregate its statistics, as well as any associated
+	 * bloom filter statistics, into the total statistics.
+	 */
+	for (i = 0; i < lsm_tree->nchunks; i++) {
+		chunk = lsm_tree->chunk[i];
+
+		/*
+		 * Get the statistics for the chunk's underlying object.
+		 *
+		 * XXX kludge: we may have an empty chunk where no checkpoint
+		 * was written. If so, try to open the ordinary handle on that
+		 * chunk instead.
+		 */
+		WT_ERR(__wt_buf_fmt(
+		    session, uribuf, "statistics:%s", chunk->uri));
+		ret = __wt_curstat_open(session, uribuf->data,
+		    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ? disk_cfg : cfg,
+		    &stat_cursor);
+		if (ret == WT_NOTFOUND &&
+		    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+			ret = __wt_curstat_open(
+			    session, uribuf->data, cfg, &stat_cursor);
+		WT_ERR(ret);
+
+		/*
+		 * The underlying statistics have now been initialized; fill in
+		 * values from the chunk's information, then aggregate into the
+		 * top-level.
+		 */
+		new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
+		WT_STAT_SET(new, lsm_generation_max, chunk->generation);
+
+		/*
+		 * We want to aggregate the table's statistics. Get a base set
+		 * of statistics from the first chunk, then aggregate statistics
+		 * from each new chunk.
+		 */
+		if (i == 0)
+			*stats = *new;
+		else
+			__wt_stat_aggregate_dsrc_stats(new, stats);
+		WT_ERR(stat_cursor->close(stat_cursor));
+
+		if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+			continue;
+
+		/* Maintain a count of bloom filters. */
+		WT_STAT_INCR(&lsm_tree->stats, bloom_count);
+
+		/* Get the bloom filter's underlying object.
*/ + WT_ERR(__wt_buf_fmt( + session, uribuf, "statistics:%s", chunk->bloom_uri)); + WT_ERR(__wt_curstat_open( + session, uribuf->data, cfg, &stat_cursor)); + + /* + * The underlying statistics have now been initialized; fill in + * values from the bloom filter's information, then aggregate + * into the top-level. + */ + new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor); + WT_STAT_SET(new, + bloom_size, (chunk->count * lsm_tree->bloom_bit_count) / 8); + WT_STAT_SET(new, bloom_page_evict, + WT_STAT(new, cache_eviction_clean) + + WT_STAT(new, cache_eviction_dirty)); + WT_STAT_SET(new, bloom_page_read, WT_STAT(new, cache_read)); + + __wt_stat_aggregate_dsrc_stats(new, stats); + WT_ERR(stat_cursor->close(stat_cursor)); + } + + /* Set statistics that aren't aggregated directly into the cursor */ + WT_STAT_SET(stats, lsm_chunk_count, lsm_tree->nchunks); + + /* Aggregate, and optionally clear, LSM-level specific information. */ + __wt_stat_aggregate_dsrc_stats(&lsm_tree->stats, stats); + if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + __wt_stat_refresh_dsrc_stats(&lsm_tree->stats); + + __wt_curstat_dsrc_final(cst); + +err: if (locked) + WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree)); + __wt_lsm_tree_release(session, lsm_tree); + __wt_scr_free(&uribuf); + + return (ret); +} + +/* + * __wt_curstat_lsm_init -- + * Initialize the statistics for a LSM tree. + */ +int +__wt_curstat_lsm_init( + WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cst) +{ + WT_DECL_RET; + + WT_WITH_SCHEMA_LOCK(session, ret = __lsm_stat_init(session, uri, cst)); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c new file mode 100644 index 00000000000..447a8eb60a6 --- /dev/null +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -0,0 +1,1266 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */
+
+#include "wt_internal.h"
+
+static int __lsm_tree_open_check(WT_SESSION_IMPL *, WT_LSM_TREE *);
+static int __lsm_tree_open(WT_SESSION_IMPL *, const char *, WT_LSM_TREE **);
+static int __lsm_tree_set_name(WT_SESSION_IMPL *, WT_LSM_TREE *, const char *);
+
+/*
+ * __lsm_tree_discard --
+ *	Free an LSM tree structure.
+ */
+static int
+__lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+	WT_DECL_RET;
+	WT_LSM_CHUNK *chunk;
+	u_int i;
+
+	/* We may be destroying an lsm_tree before it was added. */
+	if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN))
+		TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q);
+
+	__wt_free(session, lsm_tree->name);
+	__wt_free(session, lsm_tree->config);
+	__wt_free(session, lsm_tree->key_format);
+	__wt_free(session, lsm_tree->value_format);
+	__wt_free(session, lsm_tree->collator_name);
+	__wt_free(session, lsm_tree->bloom_config);
+	__wt_free(session, lsm_tree->file_config);
+
+	WT_TRET(__wt_rwlock_destroy(session, &lsm_tree->rwlock));
+
+	/* Current chunk slots may be NULL (gaps left by merges). */
+	for (i = 0; i < lsm_tree->nchunks; i++) {
+		if ((chunk = lsm_tree->chunk[i]) == NULL)
+			continue;
+
+		__wt_free(session, chunk->bloom_uri);
+		__wt_free(session, chunk->uri);
+		__wt_free(session, chunk);
+	}
+	__wt_free(session, lsm_tree->chunk);
+
+	for (i = 0; i < lsm_tree->nold_chunks; i++) {
+		chunk = lsm_tree->old_chunks[i];
+		WT_ASSERT(session, chunk != NULL);
+
+		__wt_free(session, chunk->bloom_uri);
+		__wt_free(session, chunk->uri);
+		__wt_free(session, chunk);
+	}
+	__wt_free(session, lsm_tree->old_chunks);
+	__wt_free(session, lsm_tree);
+
+	return (ret);
+}
+
+/*
+ * __lsm_tree_close --
+ *	Close an LSM tree structure.
+ */
+static int
+__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+	WT_DECL_RET;
+	int i;
+
+	/* Stop any active merges. */
+	F_CLR(lsm_tree, WT_LSM_TREE_ACTIVE);
+
+	/*
+	 * Wait for all LSM operations and work units that were in flight to
+	 * finish.
+ */
+	for (i = 0; lsm_tree->refcnt > 1 || lsm_tree->queue_ref > 0; ++i) {
+		/*
+		 * Remove any work units from the manager queues. Do this step
+		 * repeatedly in case a work unit was in the process of being
+		 * created when we cleared the active flag.
+		 * !! Drop the schema lock whilst completing this step so that
+		 * we don't block any operations that require the schema
+		 * lock to complete. This is safe because any operation that
+		 * is closing the tree should first have gotten exclusive
+		 * access to the LSM tree via __wt_lsm_tree_get, so other
+		 * schema level operations will return EBUSY, even though
+		 * we're dropping the schema lock here.
+		 */
+		if (i % 1000 == 0) {
+			WT_WITHOUT_SCHEMA_LOCK(session, ret =
+			    __wt_lsm_manager_clear_tree(session, lsm_tree));
+			WT_RET(ret);
+		}
+		__wt_yield();
+	}
+	return (0);
+}
+
+/*
+ * __wt_lsm_tree_close_all --
+ *	Close all LSM tree structures.
+ */
+int
+__wt_lsm_tree_close_all(WT_SESSION_IMPL *session)
+{
+	WT_DECL_RET;
+	WT_LSM_TREE *lsm_tree;
+
+	while ((lsm_tree = TAILQ_FIRST(&S2C(session)->lsmqh)) != NULL) {
+		/*
+		 * Tree close assumes that we have a reference to the tree
+		 * so it can tell when it's safe to do the close. We could
+		 * go through tree get here, but short circuit instead. There
+		 * is no need to decrement the reference count since destroy
+		 * is unconditional.
+		 */
+		(void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1);
+		WT_TRET(__lsm_tree_close(session, lsm_tree));
+		WT_TRET(__lsm_tree_discard(session, lsm_tree));
+	}
+
+	return (ret);
+}
+
+/*
+ * __lsm_tree_set_name --
+ *	Set or reset the name of an LSM tree
+ */
+static int
+__lsm_tree_set_name(WT_SESSION_IMPL *session,
+    WT_LSM_TREE *lsm_tree, const char *uri)
+{
+	if (lsm_tree->name != NULL)
+		__wt_free(session, lsm_tree->name);
+	WT_RET(__wt_strdup(session, uri, &lsm_tree->name));
+	/* filename aliases into name: skip the "lsm:" scheme prefix. */
+	lsm_tree->filename = lsm_tree->name + strlen("lsm:");
+	return (0);
+}
+
+/*
+ * __wt_lsm_tree_bloom_name --
+ *	Get the URI of the Bloom filter for a given chunk.
+ */
+int
+__wt_lsm_tree_bloom_name(WT_SESSION_IMPL *session,
+    WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp)
+{
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+
+	WT_RET(__wt_scr_alloc(session, 0, &tmp));
+	WT_ERR(__wt_buf_fmt(
+	    session, tmp, "file:%s-%06" PRIu32 ".bf", lsm_tree->filename, id));
+	/* Caller owns the returned string and must free it. */
+	WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __wt_lsm_tree_chunk_name --
+ *	Get the URI of the file for a given chunk.
+ */
+int
+__wt_lsm_tree_chunk_name(WT_SESSION_IMPL *session,
+    WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp)
+{
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+
+	WT_RET(__wt_scr_alloc(session, 0, &tmp));
+	WT_ERR(__wt_buf_fmt(
+	    session, tmp, "file:%s-%06" PRIu32 ".lsm", lsm_tree->filename, id));
+	/* Caller owns the returned string and must free it. */
+	WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __wt_lsm_tree_set_chunk_size --
+ *	Set the size of the chunk. Should only be called for chunks that are
+ *	on disk, or about to become on disk.
+ */
+int
+__wt_lsm_tree_set_chunk_size(
+    WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk)
+{
+	wt_off_t size;
+	const char *filename;
+
+	filename = chunk->uri;
+	if (!WT_PREFIX_SKIP(filename, "file:"))
+		WT_RET_MSG(session, EINVAL,
+		    "Expected a 'file:' URI: %s", chunk->uri);
+	WT_RET(__wt_filesize_name(session, filename, &size));
+
+	chunk->size = (uint64_t)size;
+
+	return (0);
+}
+
+/*
+ * __wt_lsm_tree_setup_chunk --
+ *	Initialize a chunk of an LSM tree.
+ */ +int +__wt_lsm_tree_setup_chunk( + WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) +{ + const char *cfg[] = + { WT_CONFIG_BASE(session, session_drop), "force", NULL }; + int exists; + + WT_RET(__wt_epoch(session, &chunk->create_ts)); + + WT_RET(__wt_lsm_tree_chunk_name( + session, lsm_tree, chunk->id, &chunk->uri)); + + /* + * If the underlying file exists, drop the chunk first - there may be + * some content hanging over from an aborted merge or checkpoint. + * + * Don't do this for the very first chunk: we are called during + * WT_SESSION::create, and doing a drop inside there does interesting + * things with handle locks and metadata tracking. It can never have + * been the result of an interrupted merge, anyway. + */ + if (chunk->id > 1) { + WT_RET(__wt_exist( + session, chunk->uri + strlen("file:"), &exists)); + if (exists) + WT_RET(__wt_schema_drop(session, chunk->uri, cfg)); + } + return (__wt_schema_create(session, chunk->uri, lsm_tree->file_config)); +} + +/* + * __wt_lsm_tree_create -- + * Create an LSM tree structure for the given name. + */ +int +__wt_lsm_tree_create(WT_SESSION_IMPL *session, + const char *uri, int exclusive, const char *config) +{ + WT_CONFIG_ITEM cval; + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_LSM_TREE *lsm_tree; + const char *cfg[] = + { WT_CONFIG_BASE(session, session_create), config, NULL }; + const char *tmpconfig; + + /* If the tree is open, it already exists. */ + if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) { + __wt_lsm_tree_release(session, lsm_tree); + return (exclusive ? EEXIST : 0); + } + WT_RET_NOTFOUND_OK(ret); + + /* + * If the tree has metadata, it already exists. + * + * !!! + * Use a local variable: we don't care what the existing configuration + * is, but we don't want to overwrite the real config. + */ + if (__wt_metadata_search(session, uri, &tmpconfig) == 0) { + __wt_free(session, tmpconfig); + return (exclusive ? 
EEXIST : 0); + } + WT_RET_NOTFOUND_OK(ret); + + WT_RET(__wt_config_gets(session, cfg, "key_format", &cval)); + if (WT_STRING_MATCH("r", cval.str, cval.len)) + WT_RET_MSG(session, EINVAL, + "LSM trees cannot be configured as column stores"); + + WT_RET(__wt_calloc_def(session, 1, &lsm_tree)); + + WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); + + WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval)); + WT_ERR(__wt_strndup( + session, cval.str, cval.len, &lsm_tree->key_format)); + WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval)); + WT_ERR(__wt_strndup( + session, cval.str, cval.len, &lsm_tree->value_format)); + + WT_ERR(__wt_config_gets(session, cfg, "collator", &cval)); + WT_ERR(__wt_strndup( + session, cval.str, cval.len, &lsm_tree->collator_name)); + + WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval)); + if (cval.val) + F_SET(lsm_tree, WT_LSM_TREE_THROTTLE); + else + F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE); + WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval)); + FLD_SET(lsm_tree->bloom, + (cval.val == 0 ? 
WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED)); + WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval)); + if (cval.val != 0) + FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST); + + if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && + FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST)) + WT_ERR_MSG(session, EINVAL, + "Bloom filters can only be created on newest and oldest " + "chunks if bloom filters are enabled"); + + WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval)); + if (cval.type == WT_CONFIG_ITEM_STRUCT) { + cval.str++; + cval.len -= 2; + } + WT_ERR(__wt_strndup( + session, cval.str, cval.len, &lsm_tree->bloom_config)); + + WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval)); + lsm_tree->bloom_bit_count = (uint32_t)cval.val; + WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval)); + lsm_tree->bloom_hash_count = (uint32_t)cval.val; + WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval)); + lsm_tree->chunk_max = (uint64_t)cval.val; + WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval)); + lsm_tree->chunk_size = (uint64_t)cval.val; + if (lsm_tree->chunk_size > lsm_tree->chunk_max) + WT_ERR_MSG(session, EINVAL, + "Chunk size (chunk_size) must be smaller than or equal to " + "the maximum chunk size (chunk_max)"); + WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval)); + lsm_tree->merge_max = (uint32_t)cval.val; + WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_min", &cval)); + lsm_tree->merge_min = (uint32_t)cval.val; + if (lsm_tree->merge_min > lsm_tree->merge_max) + WT_ERR_MSG(session, EINVAL, + "LSM merge_min must be less than or equal to merge_max"); + + /* + * Set up the config for each chunk. + * + * Make the memory_page_max double the chunk size, so application + * threads don't immediately try to force evict the chunk when the + * worker thread clears the NO_EVICTION flag. 
+ */ + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + WT_ERR(__wt_buf_fmt(session, buf, + "%s,key_format=u,value_format=u,memory_page_max=%" PRIu64, + config, 2 * lsm_tree->chunk_max)); + WT_ERR(__wt_strndup( + session, buf->data, buf->size, &lsm_tree->file_config)); + + /* Create the first chunk and flush the metadata. */ + WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); + + /* Discard our partially populated handle. */ + ret = __lsm_tree_discard(session, lsm_tree); + lsm_tree = NULL; + + /* + * Open our new tree and add it to the handle cache. Don't discard on + * error: the returned handle is NULL on error, and the metadata + * tracking macros handle cleaning up on failure. + */ + if (ret == 0) + ret = __lsm_tree_open(session, uri, &lsm_tree); + if (ret == 0) + __wt_lsm_tree_release(session, lsm_tree); + + if (0) { +err: WT_TRET(__lsm_tree_discard(session, lsm_tree)); + } + __wt_scr_free(&buf); + return (ret); +} + +/* + * __lsm_tree_open_check -- + * Validate the configuration of an LSM tree. + */ +static int +__lsm_tree_open_check(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_CONFIG_ITEM cval; + uint64_t maxleafpage, required; + const char *cfg[] = { WT_CONFIG_BASE( + session, session_create), lsm_tree->file_config, NULL }; + + WT_RET(__wt_config_gets(session, cfg, "leaf_page_max", &cval)); + maxleafpage = (uint64_t)cval.val; + + /* + * Three chunks, plus one page for each participant in up to three + * concurrent merges. + */ + required = 3 * lsm_tree->chunk_size + + 3 * (lsm_tree->merge_max * maxleafpage); + if (S2C(session)->cache_size < required) + WT_RET_MSG(session, EINVAL, + "LSM cache size %" PRIu64 " (%" PRIu64 "MB) too small, " + "must be at least %" PRIu64 " (%" PRIu64 "MB)", + S2C(session)->cache_size, + S2C(session)->cache_size / WT_MEGABYTE, + required, required / WT_MEGABYTE); + return (0); +} + +/* + * __lsm_tree_open -- + * Open an LSM tree structure. 
+ */ +static int +__lsm_tree_open( + WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LSM_TREE *lsm_tree; + + conn = S2C(session); + lsm_tree = NULL; + + WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)); + + /* Start the LSM manager thread if it isn't running. */ + if (WT_ATOMIC_CAS4(conn->lsm_manager.lsm_workers, 0, 1)) + WT_RET(__wt_lsm_manager_start(session)); + + /* Make sure no one beat us to it. */ + TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) + if (strcmp(uri, lsm_tree->name) == 0) { + *treep = lsm_tree; + return (0); + } + + /* Try to open the tree. */ + WT_RET(__wt_calloc_def(session, 1, &lsm_tree)); + WT_ERR(__wt_rwlock_alloc(session, &lsm_tree->rwlock, "lsm tree")); + + WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); + + WT_ERR(__wt_lsm_meta_read(session, lsm_tree)); + + /* + * Sanity check the configuration. Do it now since this is the first + * time we have the LSM tree configuration. + */ + WT_ERR(__lsm_tree_open_check(session, lsm_tree)); + + if (lsm_tree->nchunks == 0) { + F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH); + WT_ERR(__wt_lsm_tree_switch(session, lsm_tree)); + } + + /* Set the generation number so cursors are opened on first usage. */ + lsm_tree->dsk_gen = 1; + + /* + * Setup reference counting. Use separate reference counts for tree + * handles and queue entries, so that queue entries don't interfere + * with getting handles exclusive. + */ + lsm_tree->refcnt = 1; + lsm_tree->queue_ref = 0; + + /* Set a flush timestamp as a baseline. */ + WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts)); + + /* Now the tree is setup, make it visible to others. */ + TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q); + F_SET(lsm_tree, WT_LSM_TREE_ACTIVE | WT_LSM_TREE_OPEN); + + *treep = lsm_tree; + + if (0) { +err: WT_TRET(__lsm_tree_discard(session, lsm_tree)); + } + return (ret); +} + +/* + * __wt_lsm_tree_get -- + * Get an LSM tree structure for the given name. 
Optionally get exclusive + * access to the handle. Exclusive access works separately to the LSM + * tree lock - since operations that need exclusive access may also need + * to take the LSM tree lock for example outstanding work unit operations. + */ +int +__wt_lsm_tree_get(WT_SESSION_IMPL *session, + const char *uri, int exclusive, WT_LSM_TREE **treep) +{ + WT_LSM_TREE *lsm_tree; + + /* See if the tree is already open. */ + TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) + if (strcmp(uri, lsm_tree->name) == 0) { + /* + * Short circuit if the handle is already held + * exclusively or exclusive access is requested and + * there are references held. + */ + if ((exclusive && lsm_tree->refcnt > 0) || + F_ISSET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE)) + return (EBUSY); + + if (exclusive) { + F_SET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE); + if (!WT_ATOMIC_CAS4(lsm_tree->refcnt, 0, 1)) { + F_CLR(lsm_tree, WT_LSM_TREE_EXCLUSIVE); + return (EBUSY); + } + } else + (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1); + + /* + * If we got a reference, but an exclusive reference + * beat us to it, give our reference up. + */ + if (!exclusive && + F_ISSET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE)) { + (void)WT_ATOMIC_SUB4(lsm_tree->refcnt, 1); + return (EBUSY); + } + *treep = lsm_tree; + return (0); + } + + /* Open a new tree. */ + return (__lsm_tree_open(session, uri, treep)); +} + +/* + * __wt_lsm_tree_release -- + * Release an LSM tree structure. 
+ */ +void +__wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_ASSERT(session, lsm_tree->refcnt > 0); + (void)WT_ATOMIC_SUB4(lsm_tree->refcnt, 1); + F_CLR_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE); +} + +/* How aggressively to ramp up or down throttle due to level 0 merging */ +#define WT_LSM_MERGE_THROTTLE_BUMP_PCT (100 / lsm_tree->merge_max) +/* Number of level 0 chunks that need to be present to throttle inserts */ +#define WT_LSM_MERGE_THROTTLE_THRESHOLD \ + (2 * lsm_tree->merge_min) +/* Minimal throttling time */ +#define WT_LSM_THROTTLE_START 20 + +#define WT_LSM_MERGE_THROTTLE_INCREASE(val) do { \ + (val) += ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \ + if ((val) < WT_LSM_THROTTLE_START) \ + (val) = WT_LSM_THROTTLE_START; \ + } while (0) + +#define WT_LSM_MERGE_THROTTLE_DECREASE(val) do { \ + (val) -= ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \ + if ((val) < WT_LSM_THROTTLE_START) \ + (val) = 0; \ + } while (0) + +/* + * __wt_lsm_tree_throttle -- + * Calculate whether LSM updates need to be throttled. Must be called + * with the LSM tree lock held. + */ +void +__wt_lsm_tree_throttle( + WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only) +{ + WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk; + uint64_t cache_sz, cache_used, oldtime, record_count, timediff; + uint32_t in_memory, gen0_chunks; + + /* Never throttle in small trees. */ + if (lsm_tree->nchunks < 3) { + lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0; + return; + } + + cache_sz = S2C(session)->cache_size; + + /* + * In the steady state, we expect that the checkpoint worker thread + * will keep up with inserts. If not, throttle the insert rate to + * avoid filling the cache with in-memory chunks. Threads sleep every + * 100 operations, so take that into account in the calculation. + * + * Also throttle based on whether merge threads are keeping up. 
If + * there are enough chunks that have never been merged we slow down + * inserts so that merges have some chance of keeping up. + * + * Count the number of in-memory chunks, the number of unmerged chunk + * on disk, and find the most recent on-disk chunk (if any). + */ + record_count = 1; + gen0_chunks = in_memory = 0; + ondisk = NULL; + for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1; + cp >= lsm_tree->chunk; + --cp) + if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) { + record_count += (*cp)->count; + ++in_memory; + } else { + /* + * Assign ondisk to the last chunk that has been + * flushed since the tree was last opened (i.e it's on + * disk and stable is not set). + */ + if (ondisk == NULL && + ((*cp)->generation == 0 && + !F_ISSET(*cp, WT_LSM_CHUNK_STABLE))) + ondisk = *cp; + + if ((*cp)->generation == 0 && + !F_ISSET(*cp, WT_LSM_CHUNK_MERGING)) + ++gen0_chunks; + } + + last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]; + + /* Checkpoint throttling, based on the number of in-memory chunks. */ + if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3) + lsm_tree->ckpt_throttle = 0; + else if (decrease_only) + ; /* Nothing to do */ + else if (ondisk == NULL) { + /* + * No checkpoint has completed this run. Keep slowing down + * inserts until one does. + */ + lsm_tree->ckpt_throttle = + WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle); + } else { + WT_ASSERT(session, + WT_TIMECMP(last_chunk->create_ts, ondisk->create_ts) >= 0); + timediff = + WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts); + lsm_tree->ckpt_throttle = + (long)((in_memory - 2) * timediff / (20 * record_count)); + + /* + * Get more aggressive as the number of in memory chunks + * consumes a large proportion of the cache. In memory chunks + * are allowed to grow up to twice as large as the configured + * value when checkpoints aren't keeping up. That worst case + * is when this calculation is relevant. + * There is nothing particularly special about the chosen + * multipliers. 
+ */ + cache_used = in_memory * lsm_tree->chunk_size * 2; + if (cache_used > cache_sz * 0.8) + lsm_tree->ckpt_throttle *= 5; + } + + /* + * Merge throttling, based on the number of on-disk, level 0 chunks. + * + * Don't throttle if the tree has less than a single level's number + * of chunks. + */ + if (lsm_tree->nchunks < lsm_tree->merge_max) + lsm_tree->merge_throttle = 0; + else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD) + WT_LSM_MERGE_THROTTLE_DECREASE(lsm_tree->merge_throttle); + else if (!decrease_only) + WT_LSM_MERGE_THROTTLE_INCREASE(lsm_tree->merge_throttle); + + /* Put an upper bound of 1s on both throttle calculations. */ + lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle); + lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle); + + /* + * Update our estimate of how long each in-memory chunk stays active. + * Filter out some noise by keeping a weighted history of the + * calculated value. Wait until we have enough chunks that we can + * check that the new value is sane: otherwise, after a long idle + * period, we can calculate a crazy value. + */ + if (in_memory > 1 && ondisk != NULL) { + prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2]; + WT_ASSERT(session, prev_chunk->generation == 0); + WT_ASSERT(session, WT_TIMECMP( + last_chunk->create_ts, prev_chunk->create_ts) >= 0); + timediff = + WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts); + WT_ASSERT(session, + WT_TIMECMP(prev_chunk->create_ts, ondisk->create_ts) >= 0); + oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts); + if (timediff < 10 * oldtime) + lsm_tree->chunk_fill_ms = + (3 * lsm_tree->chunk_fill_ms + + timediff / 1000000) / 4; + } +} + +/* + * __wt_lsm_tree_switch -- + * Switch to a new in-memory tree. 
+ */ +int +__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + uint32_t nchunks, new_id; + int first_switch; + + WT_RET(__wt_lsm_tree_writelock(session, lsm_tree)); + + nchunks = lsm_tree->nchunks; + + first_switch = nchunks == 0 ? 1 : 0; + /* + * Check if a switch is still needed: we may have raced while waiting + * for a lock. + */ + chunk = NULL; + if (!first_switch && + (chunk = lsm_tree->chunk[nchunks - 1]) != NULL && + !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && + !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) + goto err; + + /* Set the switch transaction in the previous chunk, if necessary. */ + if (chunk != NULL && chunk->switch_txn == WT_TXN_NONE) + chunk->switch_txn = __wt_txn_new_id(session); + + /* Update the throttle time. */ + __wt_lsm_tree_throttle(session, lsm_tree, 0); + + new_id = WT_ATOMIC_ADD4(lsm_tree->last, 1); + + WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc, + nchunks + 1, &lsm_tree->chunk)); + + WT_ERR(__wt_verbose(session, WT_VERB_LSM, + "Tree %s switch to: %" PRIu32 ", checkpoint throttle %ld, " + "merge throttle %ld", lsm_tree->name, + new_id, lsm_tree->ckpt_throttle, lsm_tree->merge_throttle)); + + WT_ERR(__wt_calloc_def(session, 1, &chunk)); + chunk->id = new_id; + chunk->switch_txn = WT_TXN_NONE; + lsm_tree->chunk[lsm_tree->nchunks++] = chunk; + WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk)); + + WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); + F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH); + ++lsm_tree->dsk_gen; + + lsm_tree->modified = 1; + +err: WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); + /* + * Errors that happen during a tree switch leave the tree in a state + * where we can't make progress. Error out of WiredTiger. 
+ */ + if (ret != 0) + WT_PANIC_RET(session, ret, "Failed doing LSM switch"); + else if (!first_switch) + WT_RET(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_FLUSH, 0, lsm_tree)); + return (ret); +} + +/* + * __wt_lsm_tree_drop -- + * Drop an LSM tree. + */ +int +__wt_lsm_tree_drop( + WT_SESSION_IMPL *session, const char *name, const char *cfg[]) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_TREE *lsm_tree; + u_int i; + int locked; + + locked = 0; + + /* Get the LSM tree. */ + WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree)); + + /* Shut down the LSM worker. */ + WT_ERR(__lsm_tree_close(session, lsm_tree)); + + /* Prevent any new opens. */ + WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); + locked = 1; + + /* Drop the chunks. */ + for (i = 0; i < lsm_tree->nchunks; i++) { + chunk = lsm_tree->chunk[i]; + WT_ERR(__wt_schema_drop(session, chunk->uri, cfg)); + if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) + WT_ERR( + __wt_schema_drop(session, chunk->bloom_uri, cfg)); + } + + /* Drop any chunks on the obsolete list. */ + for (i = 0; i < lsm_tree->nold_chunks; i++) { + if ((chunk = lsm_tree->old_chunks[i]) == NULL) + continue; + WT_ERR(__wt_schema_drop(session, chunk->uri, cfg)); + if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) + WT_ERR( + __wt_schema_drop(session, chunk->bloom_uri, cfg)); + } + + locked = 0; + WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree)); + ret = __wt_metadata_remove(session, name); + +err: if (locked) + WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); + WT_TRET(__lsm_tree_discard(session, lsm_tree)); + return (ret); +} + +/* + * __wt_lsm_tree_rename -- + * Rename an LSM tree. + */ +int +__wt_lsm_tree_rename(WT_SESSION_IMPL *session, + const char *olduri, const char *newuri, const char *cfg[]) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_TREE *lsm_tree; + const char *old; + u_int i; + int locked; + + old = NULL; + locked = 0; + + /* Get the LSM tree. 
*/ + WT_RET(__wt_lsm_tree_get(session, olduri, 1, &lsm_tree)); + + /* Shut down the LSM worker. */ + WT_ERR(__lsm_tree_close(session, lsm_tree)); + + /* Prevent any new opens. */ + WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); + locked = 1; + + /* Set the new name. */ + WT_ERR(__lsm_tree_set_name(session, lsm_tree, newuri)); + + /* Rename the chunks. */ + for (i = 0; i < lsm_tree->nchunks; i++) { + chunk = lsm_tree->chunk[i]; + old = chunk->uri; + chunk->uri = NULL; + + WT_ERR(__wt_lsm_tree_chunk_name( + session, lsm_tree, chunk->id, &chunk->uri)); + WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg)); + __wt_free(session, old); + + if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) { + old = chunk->bloom_uri; + chunk->bloom_uri = NULL; + WT_ERR(__wt_lsm_tree_bloom_name( + session, lsm_tree, chunk->id, &chunk->bloom_uri)); + F_SET(chunk, WT_LSM_CHUNK_BLOOM); + WT_ERR(__wt_schema_rename( + session, old, chunk->uri, cfg)); + __wt_free(session, old); + } + } + + WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); + locked = 0; + WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree)); + WT_ERR(__wt_metadata_remove(session, olduri)); + +err: if (locked) + WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); + if (old != NULL) + __wt_free(session, old); + /* + * Discard this LSM tree structure. The first operation on the renamed + * tree will create a new one. + */ + WT_TRET(__lsm_tree_discard(session, lsm_tree)); + return (ret); +} + +/* + * __wt_lsm_tree_truncate -- + * Truncate an LSM tree. + */ +int +__wt_lsm_tree_truncate( + WT_SESSION_IMPL *session, const char *name, const char *cfg[]) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_TREE *lsm_tree; + int locked; + + WT_UNUSED(cfg); + chunk = NULL; + locked = 0; + + /* Get the LSM tree. */ + WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree)); + + /* Shut down the LSM worker. */ + WT_ERR(__lsm_tree_close(session, lsm_tree)); + + /* Prevent any new opens. 
*/ + WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); + locked = 1; + + /* Create the new chunk. */ + WT_ERR(__wt_calloc_def(session, 1, &chunk)); + chunk->id = WT_ATOMIC_ADD4(lsm_tree->last, 1); + WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk)); + + /* Mark all chunks old. */ + WT_ERR(__wt_lsm_merge_update_tree( + session, lsm_tree, 0, lsm_tree->nchunks, chunk)); + + WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); + + locked = 0; + WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree)); + __wt_lsm_tree_release(session, lsm_tree); + +err: if (locked) + WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); + if (ret != 0) { + if (chunk != NULL) { + (void)__wt_schema_drop(session, chunk->uri, NULL); + __wt_free(session, chunk); + } + /* + * Discard the LSM tree structure on error. This will force the + * LSM tree to be re-opened the next time it is accessed and + * the last good version of the metadata will be used, resulting + * in a valid (not truncated) tree. + */ + WT_TRET(__lsm_tree_discard(session, lsm_tree)); + } + return (ret); +} + +/* + * __wt_lsm_tree_readlock -- + * Acquire a shared lock on an LSM tree. + */ +int +__wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_RET(__wt_readlock(session, lsm_tree->rwlock)); + + /* + * Diagnostic: avoid deadlocks with the schema lock: if we need it for + * an operation, we should already have it. + */ + F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + return (0); +} + +/* + * __wt_lsm_tree_readunlock -- + * Release a shared lock on an LSM tree. + */ +int +__wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_DECL_RET; + + F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + + if ((ret = __wt_readunlock(session, lsm_tree->rwlock)) != 0) + WT_PANIC_RET(session, ret, "Unlocking an LSM tree"); + return (0); +} + +/* + * __wt_lsm_tree_writelock -- + * Acquire an exclusive lock on an LSM tree. 
+ */ +int +__wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_RET(__wt_writelock(session, lsm_tree->rwlock)); + + /* + * Diagnostic: avoid deadlocks with the schema lock: if we need it for + * an operation, we should already have it. + */ + F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + return (0); +} + +/* + * __wt_lsm_tree_writeunlock -- + * Release an exclusive lock on an LSM tree. + */ +int +__wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_DECL_RET; + + F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + + if ((ret = __wt_writeunlock(session, lsm_tree->rwlock)) != 0) + WT_PANIC_RET(session, ret, "Unlocking an LSM tree"); + return (0); +} + +/* + * __wt_lsm_compact -- + * Compact an LSM tree called via __wt_schema_worker. + */ +int +__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_TREE *lsm_tree; + time_t begin, end; + uint64_t progress; + int i, compacting, flushing, locked, ref; + + compacting = flushing = locked = ref = 0; + chunk = NULL; + /* + * This function is applied to all matching sources: ignore anything + * that is not an LSM tree. + */ + if (!WT_PREFIX_MATCH(name, "lsm:")) + return (0); + + /* Tell __wt_schema_worker not to look inside the LSM tree. */ + *skip = 1; + + WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree)); + + if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE)) + WT_ERR_MSG(session, EINVAL, + "LSM compaction requires active merge threads"); + + WT_ERR(__wt_seconds(session, &begin)); + + /* + * Compacting has two distinct phases. + * 1. All in-memory chunks up to and including the current + * current chunk must be flushed. Normally, the flush code + * does not flush the last, in-use chunk, so we set a force + * flag to include that last chunk. 
We monitor the state of the + * last chunk and periodically push another forced flush work + * unit until it is complete. + * 2. After all flushing is done, we move onto the merging + * phase for compaction. Again, we monitor the state and + * continue to push merge work units until all merging is done. + */ + + /* Lock the tree: single-thread compaction. */ + WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); + locked = 1; + + /* Clear any merge throttle: compact throws out that calculation. */ + lsm_tree->merge_throttle = 0; + lsm_tree->merge_aggressiveness = 0; + progress = lsm_tree->merge_progressing; + + /* If another thread started a compact on this tree, we're done. */ + if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) + goto err; + + /* + * Set the switch transaction on the current chunk, if it + * hasn't been set before. This prevents further writes, so it + * can be flushed by the checkpoint worker. + */ + if (lsm_tree->nchunks > 0 && + (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL) { + if (chunk->switch_txn == WT_TXN_NONE) + chunk->switch_txn = __wt_txn_new_id(session); + /* + * If we have a chunk, we want to look for it to be on-disk. + * So we need to add a reference to keep it available. + */ + (void)WT_ATOMIC_ADD4(chunk->refcnt, 1); + ref = 1; + } + + locked = 0; + WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree)); + + if (chunk != NULL) { + WT_ERR(__wt_verbose(session, WT_VERB_LSM, + "Compact force flush %s flags 0x%" PRIx32 + " chunk %u flags 0x%" + PRIx32, name, lsm_tree->flags, chunk->id, chunk->flags)); + flushing = 1; + /* + * Make sure the in-memory chunk gets flushed do not push a + * switch, because we don't want to create a new in-memory + * chunk if the tree is being used read-only now. + */ + WT_ERR(__wt_lsm_manager_push_entry(session, + WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree)); + } else { + /* + * If there is no chunk to flush, go straight to the + * compacting state. 
+ */ + compacting = 1; + progress = lsm_tree->merge_progressing; + F_SET(lsm_tree, WT_LSM_TREE_COMPACTING); + WT_ERR(__wt_verbose(session, WT_VERB_LSM, + "COMPACT: Start compacting %s", lsm_tree->name)); + } + + /* Wait for the work unit queues to drain. */ + while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) { + /* + * The flush flag is cleared when the chunk has been flushed. + * Continue to push forced flushes until the chunk is on disk. + * Once it is on disk move to the compacting phase. + */ + if (flushing) { + WT_ASSERT(session, chunk != NULL); + if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { + WT_ERR(__wt_verbose(session, + WT_VERB_LSM, + "Compact flush done %s chunk %u. " + "Start compacting progress %" PRIu64, + name, chunk->id, + lsm_tree->merge_progressing)); + (void)WT_ATOMIC_SUB4(chunk->refcnt, 1); + flushing = ref = 0; + compacting = 1; + F_SET(lsm_tree, WT_LSM_TREE_COMPACTING); + progress = lsm_tree->merge_progressing; + } else { + WT_ERR(__wt_verbose(session, WT_VERB_LSM, + "Compact flush retry %s chunk %u", + name, chunk->id)); + WT_ERR(__wt_lsm_manager_push_entry(session, + WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, + lsm_tree)); + } + } + + /* + * The compacting flag is cleared when no merges can be done. + * Ensure that we push through some aggressive merges before + * stopping otherwise we might not do merges that would + * span chunks with different generations. + */ + if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) { + if (lsm_tree->merge_aggressiveness < 10 || + (progress < lsm_tree->merge_progressing) || + lsm_tree->merge_syncing) { + progress = lsm_tree->merge_progressing; + F_SET(lsm_tree, WT_LSM_TREE_COMPACTING); + lsm_tree->merge_aggressiveness = 10; + } else + break; + } + __wt_sleep(1, 0); + WT_ERR(__wt_seconds(session, &end)); + if (session->compact->max_time > 0 && + session->compact->max_time < (uint64_t)(end - begin)) { + WT_ERR(ETIMEDOUT); + } + /* + * Push merge operations while they are still getting work + * done. 
If we are pushing merges, make sure they are + * aggressive, to avoid duplicating effort. + */ + if (compacting) +#define COMPACT_PARALLEL_MERGES 5 + for (i = lsm_tree->queue_ref; + i < COMPACT_PARALLEL_MERGES; i++) { + lsm_tree->merge_aggressiveness = 10; + WT_ERR(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_MERGE, 0, lsm_tree)); + } + } +err: + /* Ensure anything we set is cleared. */ + if (ref) + (void)WT_ATOMIC_SUB4(chunk->refcnt, 1); + if (compacting) { + F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING); + lsm_tree->merge_aggressiveness = 0; + } + if (locked) + WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); + + WT_TRET(__wt_verbose(session, WT_VERB_LSM, + "Compact %s complete, return %d", name, ret)); + + __wt_lsm_tree_release(session, lsm_tree); + return (ret); + +} + +/* + * __wt_lsm_tree_worker -- + * Run a schema worker operation on each level of a LSM tree. + */ +int +__wt_lsm_tree_worker(WT_SESSION_IMPL *session, + const char *uri, + int (*file_func)(WT_SESSION_IMPL *, const char *[]), + int (*name_func)(WT_SESSION_IMPL *, const char *, int *), + const char *cfg[], uint32_t open_flags) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_TREE *lsm_tree; + u_int i; + int exclusive, locked; + + locked = 0; + exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE) ? 1 : 0; + WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree)); + + /* + * We mark that we're busy using the tree to coordinate + * with merges so that merging doesn't change the chunk + * array out from underneath us. + */ + WT_ERR(exclusive ? 
+ __wt_lsm_tree_writelock(session, lsm_tree) : + __wt_lsm_tree_readlock(session, lsm_tree)); + locked = 1; + for (i = 0; i < lsm_tree->nchunks; i++) { + chunk = lsm_tree->chunk[i]; + if (file_func == __wt_checkpoint && + F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) + continue; + WT_ERR(__wt_schema_worker(session, chunk->uri, + file_func, name_func, cfg, open_flags)); + if (name_func == __wt_backup_list_uri_append && + F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) + WT_ERR(__wt_schema_worker(session, chunk->bloom_uri, + file_func, name_func, cfg, open_flags)); + } +err: if (locked) + WT_TRET(exclusive ? + __wt_lsm_tree_writeunlock(session, lsm_tree) : + __wt_lsm_tree_readunlock(session, lsm_tree)); + __wt_lsm_tree_release(session, lsm_tree); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c new file mode 100644 index 00000000000..278c400070f --- /dev/null +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -0,0 +1,625 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +static int __lsm_bloom_create( + WT_SESSION_IMPL *, WT_LSM_TREE *, WT_LSM_CHUNK *, u_int); +static int __lsm_discard_handle(WT_SESSION_IMPL *, const char *, const char *); + +/* + * __lsm_copy_chunks -- + * Take a copy of part of the LSM tree chunk array so that we can work on + * the contents without holding the LSM tree handle lock long term. + */ +static int +__lsm_copy_chunks(WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree, WT_LSM_WORKER_COOKIE *cookie, int old_chunks) +{ + WT_DECL_RET; + u_int i, nchunks; + size_t alloc; + + /* Always return zero chunks on error. */ + cookie->nchunks = 0; + + WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); + if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) + return (__wt_lsm_tree_readunlock(session, lsm_tree)); + + /* Take a copy of the current state of the LSM tree. 
*/ + nchunks = old_chunks ? lsm_tree->nold_chunks : lsm_tree->nchunks; + alloc = old_chunks ? lsm_tree->old_alloc : lsm_tree->chunk_alloc; + + /* + * If the tree array of active chunks is larger than our current buffer, + * increase the size of our current buffer to match. + */ + if (cookie->chunk_alloc < alloc) + WT_ERR(__wt_realloc(session, + &cookie->chunk_alloc, alloc, &cookie->chunk_array)); + if (nchunks > 0) + memcpy(cookie->chunk_array, + old_chunks ? lsm_tree->old_chunks : lsm_tree->chunk, + nchunks * sizeof(*cookie->chunk_array)); + + /* + * Mark each chunk as active, so we don't drop it until after we know + * it's safe. + */ + for (i = 0; i < nchunks; i++) + (void)WT_ATOMIC_ADD4(cookie->chunk_array[i]->refcnt, 1); + +err: WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree)); + + if (ret == 0) + cookie->nchunks = nchunks; + return (ret); +} + +/* + * __wt_lsm_get_chunk_to_flush -- + * Find and pin a chunk in the LSM tree that is likely to need flushing. + */ +int +__wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree, int force, WT_LSM_CHUNK **chunkp) +{ + u_int i, end; + + *chunkp = NULL; + + WT_ASSERT(session, lsm_tree->queue_ref > 0); + WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); + if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) + return (__wt_lsm_tree_readunlock(session, lsm_tree)); + + /* + * Normally we don't want to force out the last chunk. But if we're + * doing a forced flush, likely from a compact call, then we want + * to include the final chunk. + */ + end = force ? lsm_tree->nchunks : lsm_tree->nchunks - 1; + for (i = 0; i < end; i++) { + if (!F_ISSET(lsm_tree->chunk[i], WT_LSM_CHUNK_ONDISK)) { + (void)WT_ATOMIC_ADD4(lsm_tree->chunk[i]->refcnt, 1); + WT_RET(__wt_verbose(session, WT_VERB_LSM, + "Flush%s: return chunk %u of %u: %s", + force ? 
" w/ force" : "", i, end - 1, + lsm_tree->chunk[i]->uri)); + *chunkp = lsm_tree->chunk[i]; + break; + } + } + + WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree)); + + return (0); +} + +/* + * __lsm_unpin_chunks -- + * Decrement the reference count for a set of chunks. Allowing those + * chunks to be considered for deletion. + */ +static void +__lsm_unpin_chunks(WT_SESSION_IMPL *session, WT_LSM_WORKER_COOKIE *cookie) +{ + u_int i; + + for (i = 0; i < cookie->nchunks; i++) { + if (cookie->chunk_array[i] == NULL) + continue; + WT_ASSERT(session, cookie->chunk_array[i]->refcnt > 0); + (void)WT_ATOMIC_SUB4(cookie->chunk_array[i]->refcnt, 1); + } + /* Ensure subsequent calls don't double decrement. */ + cookie->nchunks = 0; +} + +/* + * __wt_lsm_work_switch -- + * Do a switch if the LSM tree needs one. + */ +int +__wt_lsm_work_switch( + WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, int *ran) +{ + WT_DECL_RET; + WT_LSM_WORK_UNIT *entry; + + /* We've become responsible for freeing the work unit. */ + entry = *entryp; + *ran = 0; + *entryp = NULL; + + if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH)) { + WT_WITH_SCHEMA_LOCK(session, ret = + __wt_lsm_tree_switch(session, entry->lsm_tree)); + /* Failing to complete the switch is fine */ + if (ret == EBUSY) { + if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH)) + WT_ERR(__wt_lsm_manager_push_entry(session, + WT_LSM_WORK_SWITCH, 0, entry->lsm_tree)); + ret = 0; + } else + *ran = 1; + } +err: __wt_lsm_manager_free_work_unit(session, entry); + return (ret); +} + +/* + * __wt_lsm_work_bloom -- + * Try to create a Bloom filter for the newest on-disk chunk that doesn't + * have one. + */ +int +__wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_WORKER_COOKIE cookie; + u_int i, merge; + + WT_CLEAR(cookie); + + WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 0)); + + /* Create bloom filters in all checkpointed chunks. 
*/ + merge = 0; + for (i = 0; i < cookie.nchunks; i++) { + chunk = cookie.chunk_array[i]; + + /* + * Skip if a thread is still active in the chunk or it + * isn't suitable. + */ + if (!F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) || + F_ISSET(chunk, WT_LSM_CHUNK_BLOOM | WT_LSM_CHUNK_MERGING) || + chunk->generation > 0 || + chunk->count == 0) + continue; + + /* + * See if we win the race to switch on the "busy" flag and + * recheck that the chunk still needs a Bloom filter. + */ + if (WT_ATOMIC_CAS4(chunk->bloom_busy, 0, 1)) { + if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) { + ret = __lsm_bloom_create( + session, lsm_tree, chunk, (u_int)i); + /* + * Record if we were successful so that we can + * later push a merge work unit. + */ + if (ret == 0) + merge = 1; + } + chunk->bloom_busy = 0; + break; + } + } + /* + * If we created any bloom filters, we push a merge work unit now. + */ + if (merge) + WT_ERR(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_MERGE, 0, lsm_tree)); + +err: + __lsm_unpin_chunks(session, &cookie); + __wt_free(session, cookie.chunk_array); + return (ret); +} + +/* + * __wt_lsm_checkpoint_chunk -- + * Flush a single LSM chunk to disk. + */ +int +__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) +{ + WT_DECL_RET; + WT_TXN_ISOLATION saved_isolation; + + /* + * If the chunk is already checkpointed, make sure it is also evicted. + * Either way, there is no point trying to checkpoint it again. + */ + if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && + !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && + !chunk->evicted) { + if ((ret = __lsm_discard_handle( + session, chunk->uri, NULL)) == 0) + chunk->evicted = 1; + else if (ret == EBUSY) + ret = 0; + else + WT_RET_MSG(session, ret, "discard handle"); + } + if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { + WT_RET(__wt_verbose(session, WT_VERB_LSM, + "LSM worker %s already on disk", + chunk->uri)); + return (0); + } + + /* Stop if a running transaction needs the chunk. 
*/ + __wt_txn_update_oldest(session); + if (chunk->switch_txn == WT_TXN_NONE || + !__wt_txn_visible_all(session, chunk->switch_txn)) { + WT_RET(__wt_verbose(session, WT_VERB_LSM, + "LSM worker %s: running transaction, return", + chunk->uri)); + return (0); + } + + WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s", + chunk->uri)); + + /* + * Flush the file before checkpointing: this is the expensive part in + * terms of I/O. + * + * Use the special eviction isolation level to avoid interfering with + * an application checkpoint: we have already checked that all of the + * updates in this chunk are globally visible. + * + * !!! We can wait here for checkpoints and fsyncs to complete, which + * can be a long time. + */ + if ((ret = __wt_session_get_btree( + session, chunk->uri, NULL, NULL, 0)) == 0) { + saved_isolation = session->txn.isolation; + session->txn.isolation = TXN_ISO_EVICTION; + ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES); + session->txn.isolation = saved_isolation; + WT_TRET(__wt_session_release_btree(session)); + } + WT_RET(ret); + + WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", + chunk->uri)); + + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_schema_worker(session, chunk->uri, + __wt_checkpoint, NULL, NULL, 0)); + + if (ret != 0) + WT_RET_MSG(session, ret, "LSM checkpoint"); + + /* Now the file is written, get the chunk size. */ + WT_RET(__wt_lsm_tree_set_chunk_size(session, chunk)); + + /* Update the flush timestamp to help track ongoing progress. */ + WT_RET(__wt_epoch(session, &lsm_tree->last_flush_ts)); + + /* Lock the tree, mark the chunk as on disk and update the metadata. */ + WT_RET(__wt_lsm_tree_writelock(session, lsm_tree)); + F_SET(chunk, WT_LSM_CHUNK_ONDISK); + ret = __wt_lsm_meta_write(session, lsm_tree); + ++lsm_tree->dsk_gen; + + /* Update the throttle time. 
*/ + __wt_lsm_tree_throttle(session, lsm_tree, 1); + WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); + + if (ret != 0) + WT_RET_MSG(session, ret, "LSM metadata write"); + + /* + * Clear the "cache resident" flag so the primary can be evicted and + * eventually closed. Only do this once the checkpoint has succeeded: + * otherwise, accessing the leaf page during the checkpoint can trigger + * forced eviction. + */ + WT_RET(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); + __wt_btree_evictable(session, 1); + WT_RET(__wt_session_release_btree(session)); + + /* Make sure we aren't pinning a transaction ID. */ + __wt_txn_release_snapshot(session); + + WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s", + chunk->uri)); + /* + * Schedule a bloom filter create for our newly flushed chunk */ + if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF)) + WT_RET(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); + else + WT_RET(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_MERGE, 0, lsm_tree)); + return (0); +} + +/* + * __lsm_bloom_create -- + * Create a bloom filter for a chunk of the LSM tree that has been + * checkpointed but not yet been merged. + */ +static int +__lsm_bloom_create(WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off) +{ + WT_BLOOM *bloom; + WT_CURSOR *src; + WT_DECL_RET; + WT_ITEM key; + WT_SESSION *wt_session; + uint64_t insert_count; + int exist; + + /* + * Normally, the Bloom URI is populated when the chunk struct is + * allocated. After an open, however, it may not have been. + * Deal with that here. + */ + if (chunk->bloom_uri == NULL) + WT_RET(__wt_lsm_tree_bloom_name( + session, lsm_tree, chunk->id, &chunk->bloom_uri)); + + /* + * Drop the bloom filter first - there may be some content hanging over + * from an aborted merge or checkpoint. 
+ */ + wt_session = &session->iface; + WT_RET(__wt_exist(session, chunk->bloom_uri + strlen("file:"), &exist)); + if (exist) + WT_RET(wt_session->drop(wt_session, chunk->bloom_uri, "force")); + + bloom = NULL; + /* + * This is merge-like activity, and we don't want compacts to give up + * because we are creating a bunch of bloom filters before merging. + */ + ++lsm_tree->merge_progressing; + WT_RET(__wt_bloom_create(session, chunk->bloom_uri, + lsm_tree->bloom_config, chunk->count, + lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom)); + + /* Open a special merge cursor just on this chunk. */ + WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src)); + F_SET(src, WT_CURSTD_RAW); + WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1)); + + F_SET(session, WT_SESSION_NO_CACHE); + for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { + WT_ERR(src->get_key(src, &key)); + WT_ERR(__wt_bloom_insert(bloom, &key)); + } + WT_ERR_NOTFOUND_OK(ret); + WT_TRET(src->close(src)); + + WT_TRET(__wt_bloom_finalize(bloom)); + WT_ERR(ret); + + F_CLR(session, WT_SESSION_NO_CACHE); + + /* + * Load the new Bloom filter into cache. + * + * We're doing advisory reads to fault the new trees into cache. + * Don't block if the cache is full: our next unit of work may be to + * discard some trees to free space. + */ + F_SET(session, WT_SESSION_NO_CACHE_CHECK); + + WT_CLEAR(key); + WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key)); + + WT_ERR(__wt_verbose(session, WT_VERB_LSM, + "LSM worker created bloom filter %s. " + "Expected %" PRIu64 " items, got %" PRIu64, + chunk->bloom_uri, chunk->count, insert_count)); + + /* Ensure the bloom filter is in the metadata. 
*/ + WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); + F_SET(chunk, WT_LSM_CHUNK_BLOOM); + ret = __wt_lsm_meta_write(session, lsm_tree); + ++lsm_tree->dsk_gen; + WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); + + if (ret != 0) + WT_ERR_MSG(session, ret, "LSM bloom worker metadata write"); + +err: if (bloom != NULL) + WT_TRET(__wt_bloom_close(bloom)); + F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); + return (ret); +} + +/* + * __lsm_discard_handle -- + * Try to discard a handle from cache. + */ +static int +__lsm_discard_handle( + WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) +{ + /* This will fail with EBUSY if the file is still in use. */ + WT_RET(__wt_session_get_btree(session, uri, checkpoint, NULL, + WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY)); + + F_SET(session->dhandle, WT_DHANDLE_DISCARD); + return (__wt_session_release_btree(session)); +} + +/* + * __lsm_drop_file -- + * Helper function to drop part of an LSM tree. + */ +static int +__lsm_drop_file(WT_SESSION_IMPL *session, const char *uri) +{ + WT_DECL_RET; + const char *drop_cfg[] = { + WT_CONFIG_BASE(session, session_drop), "remove_files=false", NULL + }; + + /* + * We need to grab the schema lock to drop the file, so first try to + * make sure there is minimal work to freeing space in the cache. Only + * bother trying to discard the checkpoint handle: the in-memory handle + * should have been closed already. + * + * This will fail with EBUSY if the file is still in use. + */ + WT_RET(__lsm_discard_handle(session, uri, WT_CHECKPOINT)); + + /* + * Take the schema lock for the drop operation. Since __wt_schema_drop + * results in the hot backup lock being taken when it updates the + * metadata (which would be too late to prevent our drop). 
+ */ + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_schema_drop(session, uri, drop_cfg)); + + if (ret == 0) + ret = __wt_remove(session, uri + strlen("file:")); + WT_RET(__wt_verbose(session, WT_VERB_LSM, "Dropped %s", uri)); + + if (ret == EBUSY || ret == ENOENT) + WT_RET(__wt_verbose(session, WT_VERB_LSM, + "LSM worker drop of %s failed with %d", uri, ret)); + + return (ret); +} + +/* + * __wt_lsm_free_chunks -- + * Try to drop chunks from the tree that are no longer required. + */ +int +__wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_WORKER_COOKIE cookie; + u_int i, skipped; + int flush_metadata, drop_ret; + + flush_metadata = 0; + + if (lsm_tree->nold_chunks == 0) + return (0); + + /* + * Make sure only a single thread is freeing the old chunk array + * at any time. + */ + if (!WT_ATOMIC_CAS4(lsm_tree->freeing_old_chunks, 0, 1)) + return (0); + /* + * Take a copy of the current state of the LSM tree and look for chunks + * to drop. We do it this way to avoid holding the LSM tree lock while + * doing I/O or waiting on the schema lock. + * + * This is safe because only one thread will be in this function at a + * time. Merges may complete concurrently, and the old_chunks array + * may be extended, but we shuffle down the pointers each time we free + * one to keep the non-NULL slots at the beginning of the array. + */ + WT_CLEAR(cookie); + WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 1)); + for (i = skipped = 0; i < cookie.nchunks; i++) { + chunk = cookie.chunk_array[i]; + WT_ASSERT(session, chunk != NULL); + /* Skip the chunk if another worker is using it. */ + if (chunk->refcnt > 1) { + ++skipped; + continue; + } + + /* + * Don't remove files if a hot backup is in progress. + * + * The schema lock protects the set of live files, this check + * prevents us from removing a file that hot backup already + * knows about. 
+ */ + if (S2C(session)->hot_backup != 0) + break; + + /* + * Drop any bloom filters and chunks we can. Don't try to drop + * a chunk if the bloom filter drop fails. + * An EBUSY return indicates that a cursor is still open in + * the tree - move to the next chunk in that case. + * An ENOENT return indicates that the LSM tree metadata was + * out of sync with the on disk state. Update the + * metadata to match in that case. + */ + if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) { + drop_ret = __lsm_drop_file(session, chunk->bloom_uri); + if (drop_ret == EBUSY) { + ++skipped; + continue; + } else if (drop_ret != ENOENT) + WT_ERR(drop_ret); + + flush_metadata = 1; + F_CLR(chunk, WT_LSM_CHUNK_BLOOM); + } + if (chunk->uri != NULL) { + drop_ret = __lsm_drop_file(session, chunk->uri); + if (drop_ret == EBUSY) { + ++skipped; + continue; + } else if (drop_ret != ENOENT) + WT_ERR(drop_ret); + flush_metadata = 1; + } + + /* Lock the tree to clear out the old chunk information. */ + WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); + + /* + * The chunk we are looking at should be the first one in the + * tree that we haven't already skipped over. + */ + WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk); + __wt_free(session, chunk->bloom_uri); + __wt_free(session, chunk->uri); + __wt_free(session, lsm_tree->old_chunks[skipped]); + + /* Shuffle down to keep all occupied slots at the beginning. */ + if (--lsm_tree->nold_chunks > skipped) { + memmove(lsm_tree->old_chunks + skipped, + lsm_tree->old_chunks + skipped + 1, + (lsm_tree->nold_chunks - skipped) * + sizeof(WT_LSM_CHUNK *)); + lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL; + } + + WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree)); + + /* + * Clear the chunk in the cookie so we don't attempt to + * decrement the reference count. 
+ */ + cookie.chunk_array[i] = NULL; + } + +err: /* Flush the metadata unless the system is in panic */ + if (flush_metadata && ret != WT_PANIC) { + WT_TRET(__wt_lsm_tree_writelock(session, lsm_tree)); + WT_TRET(__wt_lsm_meta_write(session, lsm_tree)); + WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); + } + __lsm_unpin_chunks(session, &cookie); + __wt_free(session, cookie.chunk_array); + lsm_tree->freeing_old_chunks = 0; + + /* Returning non-zero means there is no work to do. */ + if (!flush_metadata) + WT_TRET(WT_NOTFOUND); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/lsm/lsm_worker.c b/src/third_party/wiredtiger/src/lsm/lsm_worker.c new file mode 100644 index 00000000000..f24e58148b1 --- /dev/null +++ b/src/third_party/wiredtiger/src/lsm/lsm_worker.c @@ -0,0 +1,167 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +static int __lsm_worker_general_op( + WT_SESSION_IMPL *, WT_LSM_WORKER_ARGS *, int *); +static void * __lsm_worker(void *); + +/* + * __wt_lsm_worker_start -- + * A wrapper around the LSM worker thread start. + */ +int +__wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) +{ + WT_RET(__wt_verbose(session, WT_VERB_LSM, + "Start LSM worker %d type 0x%x", args->id, args->type)); + return (__wt_thread_create(session, &args->tid, __lsm_worker, args)); +} + +/* + * __lsm_worker_general_op -- + * Execute a single bloom, drop or flush work unit. + */ +static int +__lsm_worker_general_op( + WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *cookie, int *completed) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_WORK_UNIT *entry; + int force; + + *completed = 0; + /* + * Return if this thread cannot process a bloom, drop or flush. 
+ */
+	if (!FLD_ISSET(cookie->type,
+	    WT_LSM_WORK_BLOOM | WT_LSM_WORK_DROP | WT_LSM_WORK_FLUSH))
+		return (WT_NOTFOUND);
+
+	/* Pop a single queued unit of the type(s) this thread handles. */
+	if ((ret = __wt_lsm_manager_pop_entry(session,
+	    cookie->type, &entry)) != 0 || entry == NULL)
+		return (ret);
+
+	if (entry->type == WT_LSM_WORK_FLUSH) {
+		/*
+		 * Consume the force flag here so a requeued unit does not
+		 * force a second time.
+		 */
+		force = F_ISSET(entry, WT_LSM_WORK_FORCE);
+		F_CLR(entry, WT_LSM_WORK_FORCE);
+		WT_ERR(__wt_lsm_get_chunk_to_flush(session,
+		    entry->lsm_tree, force, &chunk));
+		/*
+		 * If we got a chunk to flush, checkpoint it.
+		 */
+		if (chunk != NULL) {
+			/*
+			 * NOTE(review): chunk->id is printed with %d here,
+			 * while chunk counts elsewhere in this patch print
+			 * with %u -- if the id field is unsigned, %d is the
+			 * wrong conversion; confirm the field's type.
+			 */
+			WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+			    "Flush%s chunk %d %s",
+			    force ? " w/ force" : "",
+			    chunk->id, chunk->uri));
+			ret = __wt_lsm_checkpoint_chunk(
+			    session, entry->lsm_tree, chunk);
+			/*
+			 * Drop the reference taken for us by
+			 * __wt_lsm_get_chunk_to_flush before checking the
+			 * checkpoint's return.
+			 */
+			WT_ASSERT(session, chunk->refcnt > 0);
+			(void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+			WT_ERR(ret);
+		}
+	} else if (entry->type == WT_LSM_WORK_DROP)
+		WT_ERR(__wt_lsm_free_chunks(session, entry->lsm_tree));
+	else if (entry->type == WT_LSM_WORK_BLOOM)
+		WT_ERR(__wt_lsm_work_bloom(session, entry->lsm_tree));
+	*completed = 1;
+
+	/* Success and failure paths both free the work unit exactly once. */
+err:	__wt_lsm_manager_free_work_unit(session, entry);
+	return (ret);
+}
+
+/*
+ * __lsm_worker --
+ *	A thread that executes work units for all open LSM trees.
+ */
+static void *
+__lsm_worker(void *arg)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_LSM_WORK_UNIT *entry;
+	WT_LSM_WORKER_ARGS *cookie;
+	WT_SESSION_IMPL *session;
+	int progress, ran;
+
+	cookie = (WT_LSM_WORKER_ARGS *)arg;
+	session = cookie->session;
+	conn = S2C(session);
+
+	entry = NULL;
+	/* Run until either the connection or this worker is told to stop. */
+	while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+	    F_ISSET(cookie, WT_LSM_WORKER_RUN)) {
+		progress = 0;
+
+		/*
+		 * Workers process the different LSM work queues.  Some workers
+		 * can handle several or all work unit types.  So the code is
+		 * prioritized so important operations happen first.
+		 * Switches are the highest priority.
+ */ + while (FLD_ISSET(cookie->type, WT_LSM_WORK_SWITCH) && + (ret = __wt_lsm_manager_pop_entry( + session, WT_LSM_WORK_SWITCH, &entry)) == 0 && + entry != NULL) + WT_ERR( + __wt_lsm_work_switch(session, &entry, &progress)); + /* Flag an error if the pop failed. */ + WT_ERR(ret); + + /* + * Next the general operations. + */ + ret = __lsm_worker_general_op(session, cookie, &ran); + if (ret == EBUSY || ret == WT_NOTFOUND) + ret = 0; + WT_ERR(ret); + progress = progress || ran; + + /* + * Finally see if there is any merge work we can do. This is + * last because the earlier operations may result in adding + * merge work to the queue. + */ + if (FLD_ISSET(cookie->type, WT_LSM_WORK_MERGE) && + (ret = __wt_lsm_manager_pop_entry( + session, WT_LSM_WORK_MERGE, &entry)) == 0 && + entry != NULL) { + WT_ASSERT(session, entry->type == WT_LSM_WORK_MERGE); + ret = __wt_lsm_merge(session, + entry->lsm_tree, cookie->id); + if (ret == WT_NOTFOUND) { + F_CLR(entry->lsm_tree, WT_LSM_TREE_COMPACTING); + ret = 0; + } else if (ret == EBUSY) + ret = 0; + /* Clear any state */ + WT_CLEAR_BTREE_IN_SESSION(session); + __wt_lsm_manager_free_work_unit(session, entry); + entry = NULL; + progress = 1; + } + /* Flag an error if the pop failed. */ + WT_ERR(ret); + + /* Don't busy wait if there was any work to do. */ + if (!progress) { + WT_ERR( + __wt_cond_wait(session, cookie->work_cond, 10000)); + continue; + } + } + + if (ret != 0) { +err: __wt_lsm_manager_free_work_unit(session, entry); + __wt_err(session, ret, + "Error in LSM worker thread %d", cookie->id); + } + return (NULL); +} diff --git a/src/third_party/wiredtiger/src/meta/meta_apply.c b/src/third_party/wiredtiger/src/meta/meta_apply.c new file mode 100644 index 00000000000..313516148c0 --- /dev/null +++ b/src/third_party/wiredtiger/src/meta/meta_apply.c @@ -0,0 +1,62 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_meta_btree_apply --
+ *	Apply a function to all files listed in the metadata, apart from the
+ *	metadata file.
+ */
+int
+__wt_meta_btree_apply(WT_SESSION_IMPL *session,
+    int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
+{
+	WT_CURSOR *cursor;
+	WT_DATA_HANDLE *saved_dhandle;
+	WT_DECL_RET;
+	const char *uri;
+	int cmp, tret;
+
+	/*
+	 * __wt_session_get_btree below overwrites session->dhandle; save the
+	 * caller's handle so it can be restored before returning.
+	 */
+	saved_dhandle = session->dhandle;
+	WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+	/*
+	 * Position at the first metadata key in the "file:" namespace;
+	 * search_near may land just before it, in which case step forward.
+	 */
+	cursor->set_key(cursor, "file:");
+	if ((tret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
+		tret = cursor->next(cursor);
+	for (; tret == 0; tret = cursor->next(cursor)) {
+		WT_ERR(cursor->get_key(cursor, &uri));
+		/* Stop at the end of the "file:" namespace. */
+		if (!WT_PREFIX_MATCH(uri, "file:"))
+			break;
+		else if (strcmp(uri, WT_METAFILE_URI) == 0)
+			/* Never apply the callback to the metadata file. */
+			continue;
+
+		/*
+		 * We need to pull the handle into the session handle cache
+		 * and make sure it's referenced to stop other internal code
+		 * dropping the handle (e.g in LSM when cleaning up obsolete
+		 * chunks).  Holding the metadata lock isn't enough.
+		 */
+		ret = __wt_session_get_btree(session, uri, NULL, NULL, 0);
+		if (ret == 0) {
+			ret = func(session, cfg);
+			if (WT_META_TRACKING(session))
+				WT_TRET(
+				    __wt_meta_track_handle_lock(session, 0));
+			else
+				WT_TRET(__wt_session_release_btree(session));
+		} else if (ret == EBUSY)
+			/* A busy handle is processed without acquiring it. */
+			ret = __wt_conn_btree_apply_single(
+			    session, uri, NULL, func, cfg);
+		WT_ERR(ret);
+	}
+
+	/* A cursor walk ending in WT_NOTFOUND is normal termination. */
+	if (tret != WT_NOTFOUND)
+		WT_TRET(tret);
+err:	WT_TRET(cursor->close(cursor));
+	session->dhandle = saved_dhandle;
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
new file mode 100644
index 00000000000..998ae7e0d02
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -0,0 +1,528 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */ + +#include "wt_internal.h" + +static int __ckpt_last(WT_SESSION_IMPL *, const char *, WT_CKPT *); +static int __ckpt_last_name(WT_SESSION_IMPL *, const char *, const char **); +static int __ckpt_load(WT_SESSION_IMPL *, + WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, WT_CKPT *); +static int __ckpt_named( + WT_SESSION_IMPL *, const char *, const char *, WT_CKPT *); +static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *); +static int __ckpt_version_chk(WT_SESSION_IMPL *, const char *, const char *); + +/* + * __wt_meta_checkpoint -- + * Return a file's checkpoint information. + */ +int +__wt_meta_checkpoint(WT_SESSION_IMPL *session, + const char *fname, const char *checkpoint, WT_CKPT *ckpt) +{ + WT_DECL_RET; + const char *config; + + config = NULL; + + /* Retrieve the metadata entry for the file. */ + WT_ERR(__wt_metadata_search(session, fname, &config)); + + /* Check the major/minor version numbers. */ + WT_ERR(__ckpt_version_chk(session, fname, config)); + + /* + * Retrieve the named checkpoint or the last checkpoint. + * + * If we don't find a named checkpoint, we're done, they're read-only. + * If we don't find a default checkpoint, it's creation, return "no + * data" and let our caller handle it. + */ + if (checkpoint == NULL) { + if ((ret = __ckpt_last(session, config, ckpt)) == WT_NOTFOUND) { + ret = 0; + ckpt->addr.data = ckpt->raw.data = NULL; + ckpt->addr.size = ckpt->raw.size = 0; + } + } else + WT_ERR(__ckpt_named(session, checkpoint, config, ckpt)); + +err: __wt_free(session, config); + return (ret); +} + +/* + * __wt_meta_checkpoint_last_name -- + * Return the last unnamed checkpoint's name. + */ +int +__wt_meta_checkpoint_last_name( + WT_SESSION_IMPL *session, const char *fname, const char **namep) +{ + WT_DECL_RET; + const char *config; + + config = NULL; + + /* Retrieve the metadata entry for the file. */ + WT_ERR(__wt_metadata_search(session, fname, &config)); + + /* Check the major/minor version numbers. 
*/ + WT_ERR(__ckpt_version_chk(session, fname, config)); + + /* Retrieve the name of the last unnamed checkpoint. */ + WT_ERR(__ckpt_last_name(session, config, namep)); + +err: __wt_free(session, config); + return (ret); +} + +/* + * __wt_meta_checkpoint_clear -- + * Clear a file's checkpoint. + */ +int +__wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname) +{ + /* + * If we are unrolling a failed create, we may have already removed the + * metadata entry. If no entry is found to update and we're trying to + * clear the checkpoint, just ignore it. + */ + WT_RET_NOTFOUND_OK(__ckpt_set(session, fname, NULL)); + + return (0); +} + +/* + * __ckpt_set -- + * Set a file's checkpoint. + */ +static int +__ckpt_set(WT_SESSION_IMPL *session, const char *fname, const char *v) +{ + WT_DECL_RET; + const char *config, *cfg[3], *newcfg; + + config = newcfg = NULL; + + /* Retrieve the metadata for this file. */ + WT_ERR(__wt_metadata_search(session, fname, &config)); + + /* Replace the checkpoint entry. */ + cfg[0] = config; + cfg[1] = v == NULL ? "checkpoint=()" : v; + cfg[2] = NULL; + WT_ERR(__wt_config_collapse(session, cfg, &newcfg)); + WT_ERR(__wt_metadata_update(session, fname, newcfg)); + +err: __wt_free(session, config); + __wt_free(session, newcfg); + return (ret); +} + +/* + * __ckpt_named -- + * Return the information associated with a file's named checkpoint. + */ +static int +__ckpt_named(WT_SESSION_IMPL *session, + const char *checkpoint, const char *config, WT_CKPT *ckpt) +{ + WT_CONFIG ckptconf; + WT_CONFIG_ITEM k, v; + + WT_RET(__wt_config_getones(session, config, "checkpoint", &v)); + WT_RET(__wt_config_subinit(session, &ckptconf, &v)); + + /* + * Take the first match: there should never be more than a single + * checkpoint of any name. 
+ */ + while (__wt_config_next(&ckptconf, &k, &v) == 0) + if (WT_STRING_MATCH(checkpoint, k.str, k.len)) + return (__ckpt_load(session, &k, &v, ckpt)); + + return (WT_NOTFOUND); +} + +/* + * __ckpt_last -- + * Return the information associated with the file's last checkpoint. + */ +static int +__ckpt_last(WT_SESSION_IMPL *session, const char *config, WT_CKPT *ckpt) +{ + WT_CONFIG ckptconf; + WT_CONFIG_ITEM a, k, v; + int64_t found; + + WT_RET(__wt_config_getones(session, config, "checkpoint", &v)); + WT_RET(__wt_config_subinit(session, &ckptconf, &v)); + for (found = 0; __wt_config_next(&ckptconf, &k, &v) == 0;) { + /* Ignore checkpoints before the ones we've already seen. */ + WT_RET(__wt_config_subgets(session, &v, "order", &a)); + if (found) { + if (a.val < found) + continue; + __wt_meta_checkpoint_free(session, ckpt); + } + found = a.val; + WT_RET(__ckpt_load(session, &k, &v, ckpt)); + } + + return (found ? 0 : WT_NOTFOUND); +} + +/* + * __ckpt_last_name -- + * Return the name associated with the file's last unnamed checkpoint. + */ +static int +__ckpt_last_name( + WT_SESSION_IMPL *session, const char *config, const char **namep) +{ + WT_CONFIG ckptconf; + WT_CONFIG_ITEM a, k, v; + WT_DECL_RET; + int64_t found; + + *namep = NULL; + + WT_ERR(__wt_config_getones(session, config, "checkpoint", &v)); + WT_ERR(__wt_config_subinit(session, &ckptconf, &v)); + for (found = 0; __wt_config_next(&ckptconf, &k, &v) == 0;) { + /* + * We only care about unnamed checkpoints; applications may not + * use any matching prefix as a checkpoint name, the comparison + * is pretty simple. + */ + if (k.len < strlen(WT_CHECKPOINT) || + strncmp(k.str, WT_CHECKPOINT, strlen(WT_CHECKPOINT)) != 0) + continue; + + /* Ignore checkpoints before the ones we've already seen. 
*/
+		WT_ERR(__wt_config_subgets(session, &v, "order", &a));
+		if (found && a.val < found)
+			continue;
+
+		/* Keep only the newest unnamed checkpoint's name. */
+		if (*namep != NULL)
+			__wt_free(session, *namep);
+		WT_ERR(__wt_strndup(session, k.str, k.len, namep));
+		found = a.val;
+	}
+	if (!found)
+		ret = WT_NOTFOUND;
+
+	if (0) {
+		/*
+		 * NOTE(review): this frees the caller's pointer variable
+		 * (namep), not the allocated name (*namep) -- the loop above
+		 * consistently frees *namep.  This looks like it should be
+		 * *namep (as written it frees a caller address and leaks the
+		 * duplicated string); confirm against upstream.
+		 */
+err:		__wt_free(session, namep);
+	}
+	return (ret);
+}
+
+/*
+ * __ckpt_compare_order --
+ *	Qsort comparison routine for the checkpoint list.
+ */
+static int
+__ckpt_compare_order(const void *a, const void *b)
+{
+	WT_CKPT *ackpt, *bckpt;
+
+	ackpt = (WT_CKPT *)a;
+	bckpt = (WT_CKPT *)b;
+
+	/* Never returns 0: equal orders compare as "less than". */
+	return (ackpt->order > bckpt->order ? 1 : -1);
+}
+
+/*
+ * __wt_meta_ckptlist_get --
+ *	Load all available checkpoint information for a file.
+ */
+int
+__wt_meta_ckptlist_get(
+    WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep)
+{
+	WT_CKPT *ckpt, *ckptbase;
+	WT_CONFIG ckptconf;
+	WT_CONFIG_ITEM k, v;
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
+	size_t allocated, slot;
+	const char *config;
+
+	*ckptbasep = NULL;
+
+	ckptbase = NULL;
+	allocated = slot = 0;
+	config = NULL;
+
+	/* Retrieve the metadata information for the file. */
+	WT_RET(__wt_metadata_search(session, fname, &config));
+
+	/* Load any existing checkpoints into the array. */
+	WT_ERR(__wt_scr_alloc(session, 0, &buf));
+	if (__wt_config_getones(session, config, "checkpoint", &v) == 0 &&
+	    __wt_config_subinit(session, &ckptconf, &v) == 0)
+		for (; __wt_config_next(&ckptconf, &k, &v) == 0; ++slot) {
+			/* Grow the array one slot at a time as we parse. */
+			WT_ERR(__wt_realloc_def(
+			    session, &allocated, slot + 1, &ckptbase));
+			ckpt = &ckptbase[slot];
+
+			WT_ERR(__ckpt_load(session, &k, &v, ckpt));
+		}
+
+	/*
+	 * Allocate an extra slot for a new value, plus a slot to mark the end.
+	 *
+	 * This isn't very clean, but there's necessary cooperation between the
+	 * schema layer (that maintains the list of checkpoints), the btree
+	 * layer (that knows when the root page is written, creating a new
+	 * checkpoint), and the block manager (which actually creates the
+	 * checkpoint). 
All of that cooperation is handled in the WT_CKPT + * structure referenced from the WT_BTREE structure. + */ + WT_ERR(__wt_realloc_def(session, &allocated, slot + 2, &ckptbase)); + + /* Sort in creation-order. */ + qsort(ckptbase, slot, sizeof(WT_CKPT), __ckpt_compare_order); + + /* Return the array to our caller. */ + *ckptbasep = ckptbase; + + if (0) { +err: __wt_meta_ckptlist_free(session, ckptbase); + } + __wt_free(session, config); + __wt_scr_free(&buf); + + return (ret); +} + +/* + * __ckpt_load -- + * Load a single checkpoint's information into a WT_CKPT structure. + */ +static int +__ckpt_load(WT_SESSION_IMPL *session, + WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v, WT_CKPT *ckpt) +{ + WT_CONFIG_ITEM a; + char timebuf[64]; + + /* + * Copy the name, address (raw and hex), order and time into the slot. + * If there's no address, it's a fake. + */ + WT_RET(__wt_strndup(session, k->str, k->len, &ckpt->name)); + + WT_RET(__wt_config_subgets(session, v, "addr", &a)); + WT_RET(__wt_buf_set(session, &ckpt->addr, a.str, a.len)); + if (a.len == 0) + F_SET(ckpt, WT_CKPT_FAKE); + else + WT_RET(__wt_nhex_to_raw(session, a.str, a.len, &ckpt->raw)); + + WT_RET(__wt_config_subgets(session, v, "order", &a)); + if (a.len == 0) + goto format; + ckpt->order = a.val; + + WT_RET(__wt_config_subgets(session, v, "time", &a)); + if (a.len == 0 || a.len > sizeof(timebuf) - 1) + goto format; + memcpy(timebuf, a.str, a.len); + timebuf[a.len] = '\0'; + if (sscanf(timebuf, "%" SCNuMAX, &ckpt->sec) != 1) + goto format; + + WT_RET(__wt_config_subgets(session, v, "size", &a)); + ckpt->ckpt_size = (uint64_t)a.val; + + WT_RET(__wt_config_subgets(session, v, "write_gen", &a)); + if (a.len == 0) + goto format; + /* + * The largest value a WT_CONFIG_ITEM can handle is signed: this value + * appears on disk and I don't want to sign it there, so I'm casting it + * here instead. 
+ */ + ckpt->write_gen = (uint64_t)a.val; + + return (0); + +format: + WT_RET_MSG(session, WT_ERROR, "corrupted checkpoint list"); +} + +/* + * __wt_meta_ckptlist_set -- + * Set a file's checkpoint value from the WT_CKPT list. + */ +int +__wt_meta_ckptlist_set(WT_SESSION_IMPL *session, + const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn) +{ + WT_CKPT *ckpt; + WT_DECL_ITEM(buf); + WT_DECL_RET; + time_t secs; + int64_t maxorder; + const char *sep; + + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + maxorder = 0; + sep = ""; + WT_ERR(__wt_buf_fmt(session, buf, "checkpoint=(")); + WT_CKPT_FOREACH(ckptbase, ckpt) { + /* + * Each internal checkpoint name is appended with a generation + * to make it a unique name. We're solving two problems: when + * two checkpoints are taken quickly, the timer may not be + * unique and/or we can even see time travel on the second + * checkpoint if we snapshot the time in-between nanoseconds + * rolling over. Second, if we reset the generational counter + * when new checkpoints arrive, we could logically re-create + * specific checkpoints, racing with cursors open on those + * checkpoints. I can't think of any way to return incorrect + * results by racing with those cursors, but it's simpler not + * to worry about it. + */ + if (ckpt->order > maxorder) + maxorder = ckpt->order; + + /* Skip deleted checkpoints. */ + if (F_ISSET(ckpt, WT_CKPT_DELETE)) + continue; + + if (F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_UPDATE)) { + /* + * We fake checkpoints for handles in the middle of a + * bulk load. If there is a checkpoint, convert the + * raw cookie to a hex string. + */ + if (ckpt->raw.size == 0) + ckpt->addr.size = 0; + else + WT_ERR(__wt_raw_to_hex(session, + ckpt->raw.data, + ckpt->raw.size, &ckpt->addr)); + + /* Set the order and timestamp. 
*/ + if (F_ISSET(ckpt, WT_CKPT_ADD)) + ckpt->order = ++maxorder; + + /* + * XXX + * Assumes a time_t fits into a uintmax_t, which isn't + * guaranteed, a time_t has to be an arithmetic type, + * but not an integral type. + */ + WT_ERR(__wt_seconds(session, &secs)); + ckpt->sec = (uintmax_t)secs; + } + if (strcmp(ckpt->name, WT_CHECKPOINT) == 0) + WT_ERR(__wt_buf_catfmt(session, buf, + "%s%s.%" PRId64 "=(addr=\"%.*s\",order=%" PRIu64 + ",time=%" PRIuMAX ",size=%" PRIu64 + ",write_gen=%" PRIu64 ")", + sep, ckpt->name, ckpt->order, + (int)ckpt->addr.size, (char *)ckpt->addr.data, + ckpt->order, ckpt->sec, ckpt->ckpt_size, + ckpt->write_gen)); + else + WT_ERR(__wt_buf_catfmt(session, buf, + "%s%s=(addr=\"%.*s\",order=%" PRIu64 + ",time=%" PRIuMAX ",size=%" PRIu64 + ",write_gen=%" PRIu64 ")", + sep, ckpt->name, + (int)ckpt->addr.size, (char *)ckpt->addr.data, + ckpt->order, ckpt->sec, ckpt->ckpt_size, + ckpt->write_gen)); + sep = ","; + } + WT_ERR(__wt_buf_catfmt(session, buf, ")")); + if (ckptlsn != NULL) + WT_ERR(__wt_buf_catfmt(session, buf, + ",checkpoint_lsn=(%" PRIu32 ",%" PRIuMAX ")", + ckptlsn->file, (uintmax_t)ckptlsn->offset)); + WT_ERR(__ckpt_set(session, fname, buf->mem)); + +err: __wt_scr_free(&buf); + return (ret); +} + +/* + * __wt_meta_ckptlist_free -- + * Discard the checkpoint array. + */ +void +__wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase) +{ + WT_CKPT *ckpt; + + if (ckptbase == NULL) + return; + + WT_CKPT_FOREACH(ckptbase, ckpt) + __wt_meta_checkpoint_free(session, ckpt); + __wt_free(session, ckptbase); +} + +/* + * __wt_meta_checkpoint_free -- + * Clean up a single checkpoint structure. + */ +void +__wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt) +{ + if (ckpt == NULL) + return; + + __wt_free(session, ckpt->name); + __wt_buf_free(session, &ckpt->addr); + __wt_buf_free(session, &ckpt->raw); + __wt_free(session, ckpt->bpriv); + + WT_CLEAR(*ckpt); /* Clear to prepare for re-use. 
*/ +} + +/* + * __ckpt_version_chk -- + * Check the version major/minor numbers. + */ +static int +__ckpt_version_chk( + WT_SESSION_IMPL *session, const char *fname, const char *config) +{ + WT_CONFIG_ITEM a, v; + int majorv, minorv; + + WT_RET(__wt_config_getones(session, config, "version", &v)); + WT_RET(__wt_config_subgets(session, &v, "major", &a)); + majorv = (int)a.val; + WT_RET(__wt_config_subgets(session, &v, "minor", &a)); + minorv = (int)a.val; + + if (majorv < WT_BTREE_MAJOR_VERSION_MIN || + majorv > WT_BTREE_MAJOR_VERSION_MAX || + (majorv == WT_BTREE_MAJOR_VERSION_MIN && + minorv < WT_BTREE_MINOR_VERSION_MIN) || + (majorv == WT_BTREE_MAJOR_VERSION_MAX && + minorv > WT_BTREE_MINOR_VERSION_MAX)) + WT_RET_MSG(session, EACCES, + "%s is an unsupported WiredTiger source file version %d.%d" + "; this WiredTiger build only supports versions from %d.%d " + "to %d.%d", + fname, + majorv, minorv, + WT_BTREE_MAJOR_VERSION_MIN, + WT_BTREE_MINOR_VERSION_MIN, + WT_BTREE_MAJOR_VERSION_MAX, + WT_BTREE_MINOR_VERSION_MAX); + return (0); +} diff --git a/src/third_party/wiredtiger/src/meta/meta_ext.c b/src/third_party/wiredtiger/src/meta/meta_ext.c new file mode 100644 index 00000000000..b68058a6e91 --- /dev/null +++ b/src/third_party/wiredtiger/src/meta/meta_ext.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_ext_metadata_insert -- + * Insert a row into the metadata (external API version). 
+ */ +int +__wt_ext_metadata_insert(WT_EXTENSION_API *wt_api, + WT_SESSION *wt_session, const char *key, const char *value) +{ + WT_CONNECTION_IMPL *conn; + WT_SESSION_IMPL *session; + + conn = (WT_CONNECTION_IMPL *)wt_api->conn; + if ((session = (WT_SESSION_IMPL *)wt_session) == NULL) + session = conn->default_session; + + return (__wt_metadata_insert(session, key, value)); +} + +/* + * __wt_ext_metadata_remove -- + * Remove a row from the metadata (external API version). + */ +int +__wt_ext_metadata_remove( + WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key) +{ + WT_CONNECTION_IMPL *conn; + WT_SESSION_IMPL *session; + + conn = (WT_CONNECTION_IMPL *)wt_api->conn; + if ((session = (WT_SESSION_IMPL *)wt_session) == NULL) + session = conn->default_session; + + return (__wt_metadata_remove(session, key)); +} + +/* + * __wt_ext_metadata_search -- + * Return a copied row from the metadata (external API version). + * The caller is responsible for freeing the allocated memory. + */ +int +__wt_ext_metadata_search(WT_EXTENSION_API *wt_api, + WT_SESSION *wt_session, const char *key, const char **valuep) +{ + WT_CONNECTION_IMPL *conn; + WT_SESSION_IMPL *session; + + conn = (WT_CONNECTION_IMPL *)wt_api->conn; + if ((session = (WT_SESSION_IMPL *)wt_session) == NULL) + session = conn->default_session; + + return (__wt_metadata_search(session, key, valuep)); +} + +/* + * __wt_ext_metadata_update -- + * Update a row in the metadata (external API version). + */ +int +__wt_ext_metadata_update(WT_EXTENSION_API *wt_api, + WT_SESSION *wt_session, const char *key, const char *value) +{ + WT_CONNECTION_IMPL *conn; + WT_SESSION_IMPL *session; + + conn = (WT_CONNECTION_IMPL *)wt_api->conn; + if ((session = (WT_SESSION_IMPL *)wt_session) == NULL) + session = conn->default_session; + + return (__wt_metadata_update(session, key, value)); +} + +/* + * __wt_metadata_get_ckptlist -- + * Public entry point to __wt_meta_ckptlist_get (for wt list). 
 */
int
__wt_metadata_get_ckptlist(
    WT_SESSION *session, const char *name, WT_CKPT **ckptbasep)
{
    return (__wt_meta_ckptlist_get(
        (WT_SESSION_IMPL *)session, name, ckptbasep));
}

/*
 * __wt_metadata_free_ckptlist --
 *    Public entry point to __wt_meta_ckptlist_free (for wt list).
 */
void
__wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase)
{
    __wt_meta_ckptlist_free((WT_SESSION_IMPL *)session, ckptbase);
}
diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c
new file mode 100644
index 00000000000..e66ed609952
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_table.c
@@ -0,0 +1,206 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __metadata_turtle --
 *    Return if a key's value should be taken from the turtle file.
 */
static int
__metadata_turtle(const char *key)
{
    /* Switch on the first byte to avoid strcmp calls for most keys. */
    switch (key[0]) {
    case 'f':
        if (strcmp(key, WT_METAFILE_URI) == 0)
            return (1);
        break;
    case 'W':
        if (strcmp(key, "WiredTiger version") == 0)
            return (1);
        if (strcmp(key, "WiredTiger version string") == 0)
            return (1);
        break;
    }
    return (0);
}

/*
 * __wt_metadata_open --
 *    Opens the metadata file, sets session->metafile.
 */
int
__wt_metadata_open(WT_SESSION_IMPL *session)
{
    /* Already open: nothing to do. */
    if (session->metafile != NULL)
        return (0);

    WT_RET(__wt_session_get_btree(session, WT_METAFILE_URI, NULL, NULL, 0));

    session->metafile = S2BT(session);
    WT_ASSERT(session, session->metafile != NULL);

    /* The metafile doesn't need to stay locked -- release it. */
    return (__wt_session_release_btree(session));
}

/*
 * __wt_metadata_cursor --
 *    Opens a cursor on the metadata.
 */
int
__wt_metadata_cursor(
    WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp)
{
    WT_DATA_HANDLE *saved_dhandle;
    WT_DECL_RET;
    const char *cfg[] =
        { WT_CONFIG_BASE(session, session_open_cursor), config, NULL };

    /*
     * Swapping to the metadata handle clobbers session->dhandle; remember
     * the caller's handle so it can be restored on every exit path.
     */
    saved_dhandle = session->dhandle;
    WT_ERR(__wt_metadata_open(session));

    WT_SET_BTREE_IN_SESSION(session, session->metafile);

    /*
     * We use the metadata a lot, so we have a handle cached; lock it and
     * increment the in-use counter.
     */
    WT_ERR(__wt_session_lock_btree(session, 0));
    __wt_session_dhandle_incr_use(session);

    ret = __wt_curfile_create(session, NULL, cfg, 0, 0, cursorp);

    /* Restore the caller's btree. */
err:    session->dhandle = saved_dhandle;
    return (ret);
}

/*
 * __wt_metadata_insert --
 *    Insert a row into the metadata.
 */
int
__wt_metadata_insert(
    WT_SESSION_IMPL *session, const char *key, const char *value)
{
    WT_CURSOR *cursor;
    WT_DECL_RET;

    WT_RET(__wt_verbose(session, WT_VERB_METADATA,
        "Insert: key: %s, value: %s, tracking: %s, %s" "turtle",
        key, value, WT_META_TRACKING(session) ? "true" : "false",
        __metadata_turtle(key) ? "" : "not "));

    /* The turtle file is plain text, not a btree: no cursor inserts. */
    if (__metadata_turtle(key))
        WT_RET_MSG(session, EINVAL,
            "%s: insert not supported on the turtle file", key);

    /* WT_RET (not WT_ERR): "cursor" is only valid past this point. */
    WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
    cursor->set_key(cursor, key);
    cursor->set_value(cursor, value);
    WT_ERR(cursor->insert(cursor));
    if (WT_META_TRACKING(session))
        WT_ERR(__wt_meta_track_insert(session, key));

err:    WT_TRET(cursor->close(cursor));
    return (ret);
}

/*
 * __wt_metadata_update --
 *    Update a row in the metadata.
 */
int
__wt_metadata_update(
    WT_SESSION_IMPL *session, const char *key, const char *value)
{
    WT_CURSOR *cursor;
    WT_DECL_RET;

    WT_RET(__wt_verbose(session, WT_VERB_METADATA,
        "Update: key: %s, value: %s, tracking: %s, %s" "turtle",
        key, value, WT_META_TRACKING(session) ? "true" : "false",
        __metadata_turtle(key) ? "" : "not "));

    if (__metadata_turtle(key))
        return (__wt_turtle_update(session, key, value));

    /* Track the previous value before overwriting it. */
    if (WT_META_TRACKING(session))
        WT_RET(__wt_meta_track_update(session, key));

    /* "overwrite" makes insert behave as an upsert. */
    WT_RET(__wt_metadata_cursor(session, "overwrite", &cursor));
    cursor->set_key(cursor, key);
    cursor->set_value(cursor, value);
    WT_ERR(cursor->insert(cursor));

err:    WT_TRET(cursor->close(cursor));
    return (ret);
}

/*
 * __wt_metadata_remove --
 *    Remove a row from the metadata.
 */
int
__wt_metadata_remove(WT_SESSION_IMPL *session, const char *key)
{
    WT_CURSOR *cursor;
    WT_DECL_RET;

    WT_RET(__wt_verbose(session, WT_VERB_METADATA,
        "Remove: key: %s, tracking: %s, %s" "turtle",
        key, WT_META_TRACKING(session) ? "true" : "false",
        __metadata_turtle(key) ? "" : "not "));

    if (__metadata_turtle(key))
        WT_RET_MSG(session, EINVAL,
            "%s: remove not supported on the turtle file", key);

    WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
    cursor->set_key(cursor, key);
    /* Confirm the key exists (and position) before tracking the removal. */
    WT_ERR(cursor->search(cursor));
    if (WT_META_TRACKING(session))
        WT_ERR(__wt_meta_track_update(session, key));
    WT_ERR(cursor->remove(cursor));

err:    WT_TRET(cursor->close(cursor));
    return (ret);
}

/*
 * __wt_metadata_search --
 *    Return a copied row from the metadata.
 *    The caller is responsible for freeing the allocated memory.
 */
int
__wt_metadata_search(
    WT_SESSION_IMPL *session, const char *key, const char **valuep)
{
    WT_CURSOR *cursor;
    WT_DECL_RET;
    const char *value;

    *valuep = NULL;

    WT_RET(__wt_verbose(session, WT_VERB_METADATA,
        "Search: key: %s, tracking: %s, %s" "turtle",
        key, WT_META_TRACKING(session) ? "true" : "false",
        __metadata_turtle(key) ? "" : "not "));

    if (__metadata_turtle(key))
        return (__wt_turtle_read(session, key, valuep));

    WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
    cursor->set_key(cursor, key);
    WT_ERR(cursor->search(cursor));
    WT_ERR(cursor->get_value(cursor, &value));
    /* Copy out: "value" only lives as long as the cursor. */
    WT_ERR(__wt_strdup(session, value, valuep));

err:    WT_TRET(cursor->close(cursor));
    return (ret);
}
diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c
new file mode 100644
index 00000000000..55e61f8d1bc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_track.c
@@ -0,0 +1,365 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * WT_META_TRACK -- A tracked metadata operation: a non-transactional log,
 * maintained to make it easy to unroll simple metadata and filesystem
 * operations.
 */
typedef struct __wt_meta_track {
    enum {
        WT_ST_EMPTY,        /* Unused slot */
        WT_ST_CHECKPOINT,    /* Complete a checkpoint */
        WT_ST_FILEOP,        /* File operation */
        WT_ST_LOCK,        /* Lock a handle */
        WT_ST_REMOVE,        /* Remove a metadata entry */
        WT_ST_SET        /* Reset a metadata entry */
    } op;
    const char *a, *b;        /* Strings */
    WT_BTREE *btree;        /* Locked handle */
    int created;            /* Handle on newly created file */
} WT_META_TRACK;

/*
 * __meta_track_next --
 *    Extend the list of operations we're tracking, as necessary, and
 * optionally return the next slot.
 */
static int
__meta_track_next(WT_SESSION_IMPL *session, WT_META_TRACK **trkp)
{
    size_t offset, sub_off;

    if (session->meta_track_next == NULL)
        session->meta_track_next = session->meta_track;

    /*
     * Compute byte offsets before any realloc: the base pointer may move,
     * so raw pointers into the array cannot survive the reallocation.
     */
    offset = WT_PTRDIFF(session->meta_track_next, session->meta_track);
    sub_off = WT_PTRDIFF(session->meta_track_sub, session->meta_track);
    if (offset == session->meta_track_alloc) {
        WT_RET(__wt_realloc(session, &session->meta_track_alloc,
            WT_MAX(2 * session->meta_track_alloc,
            20 * sizeof(WT_META_TRACK)), &session->meta_track));

        /* Maintain positions in the new chunk of memory. */
        session->meta_track_next =
            (uint8_t *)session->meta_track + offset;
        if (session->meta_track_sub != NULL)
            session->meta_track_sub =
                (uint8_t *)session->meta_track + sub_off;
    }

    WT_ASSERT(session, session->meta_track_next != NULL);

    if (trkp != NULL) {
        *trkp = session->meta_track_next;
        session->meta_track_next = *trkp + 1;
    }

    return (0);
}

/*
 * __wt_meta_track_discard --
 *    Cleanup metadata tracking when closing a session.
 */
void
__wt_meta_track_discard(WT_SESSION_IMPL *session)
{
    __wt_free(session, session->meta_track);
    session->meta_track_next = NULL;
    session->meta_track_alloc = 0;
}

/*
 * __wt_meta_track_on --
 *    Turn on metadata operation tracking.
 */
int
__wt_meta_track_on(WT_SESSION_IMPL *session)
{
    /* Only the outermost nesting level allocates/extends the log. */
    if (session->meta_track_nest++ == 0)
        WT_RET(__meta_track_next(session, NULL));

    return (0);
}

/*
 * __meta_track_apply --
 *    Apply the changes in a metadata tracking record.
 *
 * On commit (unroll == 0) only checkpoint-resolution and handle-unlock
 * records do work; on unroll every record is reversed.
 */
static int
__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll)
{
    WT_BM *bm;
    WT_DECL_RET;
    int tret;

    /*
     * Unlock handles and complete checkpoints regardless of whether we are
     * unrolling.
     */
    if (!unroll && trk->op != WT_ST_CHECKPOINT && trk->op != WT_ST_LOCK)
        goto free;

    switch (trk->op) {
    case WT_ST_EMPTY:    /* Unused slot */
        break;
    case WT_ST_CHECKPOINT:    /* Checkpoint, see above */
        if (!unroll) {
            bm = trk->btree->bm;
            WT_WITH_BTREE(session, trk->btree,
                WT_TRET(bm->checkpoint_resolve(bm, session)));
        }
        break;
    case WT_ST_LOCK:    /* Handle lock, see above */
        /* A newly created handle being unrolled is discarded outright. */
        if (unroll && trk->created)
            F_SET(trk->btree->dhandle, WT_DHANDLE_DISCARD);
        WT_WITH_BTREE(session, trk->btree,
            WT_TRET(__wt_session_release_btree(session)));
        break;
    case WT_ST_FILEOP:    /* File operation */
        /*
         * For renames, both a and b are set.
         * For creates, a is NULL.
         * For removes, b is NULL.
         */
        if (trk->a != NULL && trk->b != NULL &&
            (tret = __wt_rename(session,
            trk->b + strlen("file:"),
            trk->a + strlen("file:"))) != 0) {
            __wt_err(session, tret,
                "metadata unroll rename %s to %s",
                trk->b, trk->a);
            WT_TRET(tret);
        } else if (trk->a == NULL) {
            if ((tret = __wt_remove(session,
                trk->b + strlen("file:"))) != 0) {
                __wt_err(session, tret,
                    "metadata unroll create %s",
                    trk->b);
                WT_TRET(tret);
            }
        }
        /*
         * We can't undo removes yet: that would imply
         * some kind of temporary rename and remove in
         * roll forward.
         */
        break;
    case WT_ST_REMOVE:    /* Remove trk.a */
        if ((tret = __wt_metadata_remove(session, trk->a)) != 0) {
            __wt_err(session, tret,
                "metadata unroll remove: %s",
                trk->a);
            WT_TRET(tret);
        }
        break;
    case WT_ST_SET:    /* Set trk.a to trk.b */
        if ((tret = __wt_metadata_update(
            session, trk->a, trk->b)) != 0) {
            __wt_err(session, tret,
                "metadata unroll update %s to %s",
                trk->a, trk->b);
            WT_TRET(tret);
        }
        break;
    WT_ILLEGAL_VALUE(session);
    }

    /* Always release the slot's resources, even after an apply failure. */
free:    trk->op = WT_ST_EMPTY;
    __wt_free(session, trk->a);
    __wt_free(session, trk->b);
    trk->btree = NULL;

    return (ret);
}

/*
 * __wt_meta_track_off --
 *    Turn off metadata operation tracking, unrolling on error.
 */
int
__wt_meta_track_off(WT_SESSION_IMPL *session, int unroll)
{
    WT_DECL_RET;
    WT_META_TRACK *trk, *trk_orig;

    WT_ASSERT(session,
        WT_META_TRACKING(session) && session->meta_track_nest > 0);

    trk_orig = session->meta_track;
    trk = session->meta_track_next;

    /* If it was a nested transaction, there is nothing to do. */
    if (--session->meta_track_nest != 0)
        return (0);

    /* Turn off tracking for unroll. */
    session->meta_track_next = session->meta_track_sub = NULL;

    /*
     * If there were no operations logged, return now and avoid unnecessary
     * metadata checkpoints. For example, this happens if attempting to
     * create a data source that already exists (or drop one that doesn't).
     */
    if (trk == trk_orig)
        return (0);

    /* Apply in reverse (most-recent-first) order. */
    while (--trk >= trk_orig)
        WT_TRET(__meta_track_apply(session, trk, unroll));

    /*
     * If the operation succeeded and we aren't relying on the log for
     * durability, checkpoint the metadata.
     */
    if (!unroll && ret == 0 && session->metafile != NULL &&
        !S2C(session)->logging)
        WT_WITH_BTREE(session, session->metafile,
            ret = __wt_checkpoint(session, NULL));

    return (ret);
}

/*
 * __wt_meta_track_sub_on --
 *    Start a group of operations that can be committed independent of the
 *    main transaction.
 */
int
__wt_meta_track_sub_on(WT_SESSION_IMPL *session)
{
    WT_ASSERT(session, session->meta_track_sub == NULL);
    /* Remember where the sub-group starts within the tracking log. */
    session->meta_track_sub = session->meta_track_next;
    return (0);
}

/*
 * __wt_meta_track_sub_off --
 *    Commit a group of operations independent of the main transaction.
 */
int
__wt_meta_track_sub_off(WT_SESSION_IMPL *session)
{
    WT_DECL_RET;
    WT_META_TRACK *trk, *trk_orig;

    if (!WT_META_TRACKING(session) || session->meta_track_sub == NULL)
        return (0);

    trk_orig = session->meta_track_sub;
    trk = session->meta_track_next;

    /* Turn off tracking for unroll. */
    session->meta_track_next = session->meta_track_sub = NULL;

    /* Commit (unroll == 0) only the sub-group's records, in reverse. */
    while (--trk >= trk_orig)
        WT_TRET(__meta_track_apply(session, trk, 0));

    /* The committed slots become available for re-use. */
    session->meta_track_next = trk_orig;
    return (ret);
}

/*
 * __wt_meta_track_checkpoint --
 *    Track a handle involved in a checkpoint.
 */
int
__wt_meta_track_checkpoint(WT_SESSION_IMPL *session)
{
    WT_META_TRACK *trk;

    WT_ASSERT(session, session->dhandle != NULL);

    WT_RET(__meta_track_next(session, &trk));

    trk->op = WT_ST_CHECKPOINT;
    trk->btree = S2BT(session);
    return (0);
}
/*
 * __wt_meta_track_insert --
 *    Track an insert operation.
 */
int
__wt_meta_track_insert(WT_SESSION_IMPL *session, const char *key)
{
    WT_META_TRACK *trk;

    WT_RET(__meta_track_next(session, &trk));

    /* The inverse of an insert is a remove: that's what gets logged. */
    trk->op = WT_ST_REMOVE;
    WT_RET(__wt_strdup(session, key, &trk->a));

    return (0);
}

/*
 * __wt_meta_track_update --
 *    Track a metadata update operation.
 */
int
__wt_meta_track_update(WT_SESSION_IMPL *session, const char *key)
{
    WT_DECL_RET;
    WT_META_TRACK *trk;

    WT_RET(__meta_track_next(session, &trk));

    trk->op = WT_ST_SET;
    WT_RET(__wt_strdup(session, key, &trk->a));

    /*
     * If there was a previous value, keep it around -- if not, then this
     * "update" is really an insert.
     */
    if ((ret =
        __wt_metadata_search(session, key, &trk->b)) == WT_NOTFOUND) {
        trk->op = WT_ST_REMOVE;
        ret = 0;
    }
    return (ret);
}

/*
 * __wt_meta_track_fileop --
 *    Track a filesystem operation.
 */
int
__wt_meta_track_fileop(
    WT_SESSION_IMPL *session, const char *olduri, const char *newuri)
{
    WT_META_TRACK *trk;

    WT_RET(__meta_track_next(session, &trk));

    trk->op = WT_ST_FILEOP;
    if (olduri != NULL)
        WT_RET(__wt_strdup(session, olduri, &trk->a));
    if (newuri != NULL)
        WT_RET(__wt_strdup(session, newuri, &trk->b));
    return (0);
}

/*
 * __wt_meta_track_handle_lock --
 *    Track a locked handle.
 */
int
__wt_meta_track_handle_lock(WT_SESSION_IMPL *session, int created)
{
    WT_META_TRACK *trk;

    WT_ASSERT(session, session->dhandle != NULL);

    WT_RET(__meta_track_next(session, &trk));

    trk->op = WT_ST_LOCK;
    trk->btree = S2BT(session);
    trk->created = created;
    return (0);
}
diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c
new file mode 100644
index 00000000000..d6060ebf47b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c
@@ -0,0 +1,318 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __metadata_config --
 *    Return the default configuration information for the metadata file.
 */
static int
__metadata_config(WT_SESSION_IMPL *session, const char **metaconfp)
{
    WT_DECL_ITEM(buf);
    WT_DECL_RET;
    const char *cfg[] = { WT_CONFIG_BASE(session, file_meta), NULL, NULL };
    const char *metaconf;

    *metaconfp = NULL;

    metaconf = NULL;

    /* Create a turtle file with default values. */
    WT_RET(__wt_scr_alloc(session, 0, &buf));
    WT_ERR(__wt_buf_fmt(session, buf,
        "key_format=S,value_format=S,id=%d,version=(major=%d,minor=%d)",
        WT_METAFILE_ID,
        WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
    cfg[1] = buf->data;
    WT_ERR(__wt_config_collapse(session, cfg, &metaconf));

    *metaconfp = metaconf;

    if (0) {
err:        __wt_free(session, metaconf);
    }
    __wt_scr_free(&buf);
    return (ret);
}

/*
 * __metadata_init --
 *    Create the metadata file.
 */
static int
__metadata_init(WT_SESSION_IMPL *session)
{
    WT_DECL_RET;

    /*
     * We're single-threaded, but acquire the schema lock regardless: the
     * lower level code checks that it is appropriately synchronized.
+ */ + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_schema_create(session, WT_METAFILE_URI, NULL)); + + return (ret); +} + +/* + * __metadata_load_hot_backup -- + * Load the contents of any hot backup file. + */ +static int +__metadata_load_hot_backup(WT_SESSION_IMPL *session) +{ + FILE *fp; + WT_DECL_ITEM(key); + WT_DECL_ITEM(value); + WT_DECL_RET; + char *path; + + fp = NULL; + path = NULL; + + /* Look for a hot backup file: if we find it, load it. */ + WT_RET(__wt_filename(session, WT_METADATA_BACKUP, &path)); + fp = fopen(path, "r"); + __wt_free(session, path); + if (fp == NULL) + return (0); + + /* Read line pairs and load them into the metadata file. */ + WT_ERR(__wt_scr_alloc(session, 512, &key)); + WT_ERR(__wt_scr_alloc(session, 512, &value)); + for (;;) { + WT_ERR(__wt_getline(session, key, fp)); + if (key->size == 0) + break; + WT_ERR(__wt_getline(session, value, fp)); + if (value->size == 0) + WT_ERR(__wt_illegal_value(session, WT_METADATA_BACKUP)); + WT_ERR(__wt_metadata_update(session, key->data, value->data)); + } + + F_SET(S2C(session), WT_CONN_WAS_BACKUP); + +err: if (fp != NULL) + WT_TRET(fclose(fp) == 0 ? 0 : __wt_errno()); + __wt_scr_free(&key); + __wt_scr_free(&value); + return (ret); +} + +/* + * __metadata_load_bulk -- + * Create any bulk-loaded file stubs. + */ +static int +__metadata_load_bulk(WT_SESSION_IMPL *session) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + uint32_t allocsize; + int exist; + const char *filecfg[] = { WT_CONFIG_BASE(session, file_meta), NULL }; + const char *key; + + /* + * If a file was being bulk-loaded during the hot backup, it will appear + * in the metadata file, but the file won't exist. Create on demand. + */ + WT_ERR(__wt_metadata_cursor(session, NULL, &cursor)); + while ((ret = cursor->next(cursor)) == 0) { + WT_ERR(cursor->get_key(cursor, &key)); + if (!WT_PREFIX_SKIP(key, "file:")) + continue; + + /* If the file exists, it's all good. 
*/ + WT_ERR(__wt_exist(session, key, &exist)); + if (exist) + continue; + + /* + * If the file doesn't exist, assume it's a bulk-loaded file; + * retrieve the allocation size and re-create the file. + */ + WT_ERR(__wt_direct_io_size_check( + session, filecfg, "allocation_size", &allocsize)); + WT_ERR(__wt_block_manager_create(session, key, allocsize)); + } + WT_ERR_NOTFOUND_OK(ret); + +err: if (cursor != NULL) + WT_TRET(cursor->close(cursor)); + + return (ret); +} + +/* + * __wt_turtle_init -- + * Check the turtle file and create if necessary. + */ +int +__wt_turtle_init(WT_SESSION_IMPL *session) +{ + WT_DECL_RET; + int exist; + const char *metaconf; + + metaconf = NULL; + + /* + * Discard any turtle setup file left-over from previous runs. This + * doesn't matter for correctness, it's just cleaning up random files. + */ + WT_RET(__wt_exist(session, WT_METADATA_TURTLE_SET, &exist)); + if (exist) + WT_RET(__wt_remove(session, WT_METADATA_TURTLE_SET)); + + /* + * We could die after creating the turtle file and before creating the + * metadata file, or worse, the metadata file might be in some random + * state. Make sure that doesn't happen: if we don't find the turtle + * file, first create the metadata file, load any hot backup, and then + * create the turtle file. No matter what happens, if metadata file + * creation doesn't fully complete, we won't have a turtle file and we + * will repeat the process until we succeed. + * + * If there's already a turtle file, we're done. + */ + WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &exist)); + if (exist) + return (0); + + /* Create the metadata file. */ + WT_RET(__metadata_init(session)); + + /* Load any hot-backup information. */ + WT_RET(__metadata_load_hot_backup(session)); + + /* Create any bulk-loaded file stubs. */ + WT_RET(__metadata_load_bulk(session)); + + /* Create the turtle file. 
 */
    WT_RET(__metadata_config(session, &metaconf));
    WT_ERR(__wt_turtle_update(session, WT_METAFILE_URI, metaconf));

    /* Remove the backup file if it exists, we'll never read it again. */
    WT_ERR(__wt_exist(session, WT_METADATA_BACKUP, &exist));
    if (exist)
        WT_ERR(__wt_remove(session, WT_METADATA_BACKUP));

err:    __wt_free(session, metaconf);
    return (ret);
}

/*
 * __wt_turtle_read --
 *    Read the turtle file.
 *
 * The turtle file is plain text: keys and values alternate, one per line.
 */
int
__wt_turtle_read(WT_SESSION_IMPL *session, const char *key, const char **valuep)
{
    FILE *fp;
    WT_DECL_ITEM(buf);
    WT_DECL_RET;
    int match;
    char *path;

    *valuep = NULL;

    fp = NULL;
    path = NULL;

    /*
     * Open the turtle file; there's one case where we won't find the turtle
     * file, yet still succeed. We create the metadata file before creating
     * the turtle file, and that means returning the default configuration
     * string for the metadata file.
     */
    WT_RET(__wt_filename(session, WT_METADATA_TURTLE, &path));
    if ((fp = fopen(path, "r")) == NULL)
        ret = __wt_errno();
    __wt_free(session, path);
    if (fp == NULL)
        return (strcmp(key, WT_METAFILE_URI) == 0 ?
            __metadata_config(session, valuep) : ret);

    /* Search for the key. */
    WT_ERR(__wt_scr_alloc(session, 512, &buf));
    for (match = 0;;) {
        /* An empty line means end-of-file: the key wasn't found. */
        WT_ERR(__wt_getline(session, buf, fp));
        if (buf->size == 0)
            WT_ERR(WT_NOTFOUND);
        if (strcmp(key, buf->data) == 0)
            match = 1;

        /* Key matched: read the subsequent line for the value. */
        WT_ERR(__wt_getline(session, buf, fp));
        if (buf->size == 0)
            WT_ERR(__wt_illegal_value(session, WT_METADATA_TURTLE));
        if (match)
            break;
    }

    /* Copy the value for the caller. */
    WT_ERR(__wt_strdup(session, buf->data, valuep));

err:    if (fp != NULL)
        WT_TRET(fclose(fp) == 0 ? 0 : __wt_errno());
    __wt_scr_free(&buf);
    return (ret);
}

/*
 * __wt_turtle_update --
 *    Update the turtle file.
 */
int
__wt_turtle_update(
    WT_SESSION_IMPL *session, const char *key, const char *value)
{
    FILE *fp;
    WT_DECL_RET;
    int vmajor, vminor, vpatch;
    const char *version;
    char *path;

    fp = NULL;
    path = NULL;

    /*
     * Create the turtle setup file: we currently re-write it from scratch
     * every time.
     */
    WT_RET(__wt_filename(session, WT_METADATA_TURTLE_SET, &path));
    if ((fp = fopen(path, "w")) == NULL)
        ret = __wt_errno();
    __wt_free(session, path);
    if (fp == NULL)
        return (ret);

    version = wiredtiger_version(&vmajor, &vminor, &vpatch);
    WT_ERR_TEST((fprintf(fp,
        "%s\n%s\n%s\n" "major=%d,minor=%d,patch=%d\n%s\n%s\n",
        WT_METADATA_VERSION_STR, version,
        WT_METADATA_VERSION, vmajor, vminor, vpatch,
        key, value) < 0), __wt_errno());

    /* Close (and flush) before the rename makes the new file visible. */
    ret = fclose(fp);
    fp = NULL;
    WT_ERR_TEST(ret == EOF, __wt_errno());

    /* Rename over the old turtle file: an atomic replacement on POSIX. */
    WT_ERR(
        __wt_rename(session, WT_METADATA_TURTLE_SET, WT_METADATA_TURTLE));

    if (0) {
err:        WT_TRET(__wt_remove(session, WT_METADATA_TURTLE_SET));
    }

    if (fp != NULL)
        WT_TRET(fclose(fp) == 0 ? 0 : __wt_errno());
    return (ret);
}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_abort.c b/src/third_party/wiredtiger/src/os_posix/os_abort.c
new file mode 100644
index 00000000000..3d99ffe20b2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_abort.c
@@ -0,0 +1,26 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_abort --
 *    Abort the process, dropping core.
 */
void
__wt_abort(WT_SESSION_IMPL *session)
    WT_GCC_FUNC_ATTRIBUTE((noreturn))
{
    __wt_errx(session, "aborting WiredTiger library");

#ifdef HAVE_DIAGNOSTIC
    /* Hang waiting for a debugger to attach before dropping core. */
    __wt_attach(session);
#endif

    abort();
    /* NOTREACHED */
}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_alloc.c b/src/third_party/wiredtiger/src/os_posix/os_alloc.c
new file mode 100644
index 00000000000..f7344032a15
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_alloc.c
@@ -0,0 +1,238 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * There's no malloc interface, WiredTiger never calls malloc.
 *
 * The problem is an application might allocate memory, write secret stuff in
 * it, free the memory, then WiredTiger allocates the memory and uses it for a
 * file page or log record, then writes it to disk, without having overwritten
 * it fully. That results in the secret stuff being protected by WiredTiger's
 * permission mechanisms, potentially inappropriate for the secret stuff.
 */

/*
 * __wt_calloc --
 *    ANSI calloc function.
 *
 * calloc also guarantees the returned memory is zeroed, which supports the
 * no-malloc policy described above.
 */
int
__wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp)
{
    void *p;

    /*
     * !!!
     * This function MUST handle a NULL WT_SESSION_IMPL handle.
     */
    WT_ASSERT(session, number != 0 && size != 0);

    if (session != NULL)
        WT_STAT_FAST_CONN_INCR(session, memory_allocation);

    if ((p = calloc(number, size)) == NULL)
        WT_RET_MSG(session, __wt_errno(), "memory allocation");

    *(void **)retp = p;
    return (0);
}

/*
 * __wt_realloc --
 *    ANSI realloc function.
 */
int
__wt_realloc(WT_SESSION_IMPL *session,
    size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
{
    void *p;
    size_t bytes_allocated;

    /*
     * !!!
     * This function MUST handle a NULL WT_SESSION_IMPL handle.
     *
     * Sometimes we're allocating memory and we don't care about the
     * final length -- bytes_allocated_ret may be NULL.
     */
    p = *(void **)retp;
    bytes_allocated =
        (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
    WT_ASSERT(session,
        (p == NULL && bytes_allocated == 0) ||
        (p != NULL &&
        (bytes_allocated_ret == NULL || bytes_allocated != 0)));
    WT_ASSERT(session, bytes_to_allocate != 0);
    WT_ASSERT(session, bytes_allocated < bytes_to_allocate);

    if (session != NULL) {
        if (p == NULL)
            WT_STAT_FAST_CONN_INCR(session, memory_allocation);
        else
            WT_STAT_FAST_CONN_INCR(session, memory_grow);
    }

    /* On failure the caller's pointer (*retp) is left unchanged. */
    if ((p = realloc(p, bytes_to_allocate)) == NULL)
        WT_RET_MSG(session, __wt_errno(), "memory allocation");

    /*
     * Clear the allocated memory -- an application might: allocate memory,
     * write secret stuff into it, free the memory, then we re-allocate the
     * memory and use it for a file page or log record, and then write it to
     * disk. That would result in the secret stuff being protected by the
     * WiredTiger permission mechanisms, potentially inappropriate for the
     * secret stuff.
     */
    memset((uint8_t *)
        p + bytes_allocated, 0, bytes_to_allocate - bytes_allocated);

    /* Update caller's bytes allocated value. */
    if (bytes_allocated_ret != NULL)
        *bytes_allocated_ret = bytes_to_allocate;

    *(void **)retp = p;
    return (0);
}

/*
 * __wt_realloc_aligned --
 *    ANSI realloc function that aligns to buffer boundaries, configured with
 *    the "buffer_alignment" key to wiredtiger_open.
 */
int
__wt_realloc_aligned(WT_SESSION_IMPL *session,
    size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
{
#if defined(HAVE_POSIX_MEMALIGN)
    WT_DECL_RET;

    /*
     * !!!
     * This function MUST handle a NULL WT_SESSION_IMPL handle.
     */
    if (session != NULL && S2C(session)->buffer_alignment > 0) {
        void *p, *newp;
        size_t bytes_allocated;

        /*
         * Sometimes we're allocating memory and we don't care about the
         * final length -- bytes_allocated_ret may be NULL.
         */
        p = *(void **)retp;
        bytes_allocated =
            (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
        WT_ASSERT(session,
            (p == NULL && bytes_allocated == 0) ||
            (p != NULL &&
            (bytes_allocated_ret == NULL || bytes_allocated != 0)));
        WT_ASSERT(session, bytes_to_allocate != 0);
        WT_ASSERT(session, bytes_allocated < bytes_to_allocate);

        if (session != NULL)
            WT_STAT_FAST_CONN_INCR(session, memory_allocation);

        /* posix_memalign can't grow in place: allocate, copy, free. */
        if ((ret = posix_memalign(&newp,
            S2C(session)->buffer_alignment,
            bytes_to_allocate)) != 0)
            WT_RET_MSG(session, ret, "memory allocation");

        if (p != NULL)
            memcpy(newp, p, bytes_allocated);
        __wt_free(session, p);
        p = newp;

        /* Clear the allocated memory (see above). */
        memset((uint8_t *)p + bytes_allocated, 0,
            bytes_to_allocate - bytes_allocated);

        /* Update caller's bytes allocated value. */
        if (bytes_allocated_ret != NULL)
            *bytes_allocated_ret = bytes_to_allocate;

        *(void **)retp = p;
        return (0);
    }
#endif
    /*
     * If there is no posix_memalign function, or no alignment configured,
     * fall back to realloc.
     *
     * Windows note: Visual C CRT memalign does not match Posix behavior
     * and would also double each allocation so it is bad for memory use
     */
    return (__wt_realloc(
        session, bytes_allocated_ret, bytes_to_allocate, retp));
}

/*
 * __wt_strndup --
 *    Duplicate a byte string of a given length (and NUL-terminate).
 */
int
__wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp)
{
    void *p;

    if (str == NULL) {
        *(void **)retp = NULL;
        return (0);
    }

    /* calloc(len + 1) guarantees the trailing NUL terminator. */
    WT_RET(__wt_calloc(session, len + 1, 1, &p));

    /*
     * Don't change this to strncpy, we rely on this function to duplicate
     * "strings" that contain nul bytes.
     */
    memcpy(p, str, len);

    *(void **)retp = p;
    return (0);
}

/*
 * __wt_strdup --
 *    ANSI strdup function.
 */
int
__wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp)
{
    return (__wt_strndup(
        session, str, (str == NULL) ? 0 : strlen(str), retp));
}

/*
 * __wt_free_int --
 *    ANSI free function.
 */
void
__wt_free_int(WT_SESSION_IMPL *session, const void *p_arg)
{
    void *p;

    p = *(void **)p_arg;
    if (p == NULL)            /* ANSI C free semantics */
        return;

    /*
     * If there's a serialization bug we might race with another thread.
     * We can't avoid the race (and we aren't willing to flush memory),
     * but we minimize the window by clearing the free address, hoping a
     * racing thread will see, and won't free, a NULL pointer.
     */
    *(void **)p_arg = NULL;

    /*
     * !!!
     * This function MUST handle a NULL WT_SESSION_IMPL handle.
     */
    if (session != NULL)
        WT_STAT_FAST_CONN_INCR(session, memory_free);

    free(p);
}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_dir.c b/src/third_party/wiredtiger/src/os_posix/os_dir.c
new file mode 100644
index 00000000000..98b2d4926cd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_dir.c
@@ -0,0 +1,94 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"
/* I'm sure we need to config this */
#include <dirent.h>

/*
 * __wt_dirlist --
 *    Get a list of files from a directory, optionally filtered by
 * a given prefix.
+ */ +int +__wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix, + uint32_t flags, char ***dirlist, u_int *countp) +{ + struct dirent *dp; + DIR *dirp; + WT_DECL_RET; + size_t dirallocsz; + u_int count, dirsz; + int match; + char **entries, *path; + + *dirlist = NULL; + *countp = 0; + + WT_RET(__wt_filename(session, dir, &path)); + + dirp = NULL; + dirallocsz = 0; + dirsz = 0; + entries = NULL; + if (flags == 0) + LF_SET(WT_DIRLIST_INCLUDE); + + WT_ERR(__wt_verbose(session, WT_VERB_FILEOPS, + "wt_dirlist of %s %s prefix %s", + path, LF_ISSET(WT_DIRLIST_INCLUDE) ? "include" : "exclude", + prefix == NULL ? "all" : prefix)); + + WT_SYSCALL_RETRY(((dirp = opendir(path)) == NULL ? 1 : 0), ret); + if (ret != 0) + WT_ERR_MSG(session, ret, "%s: opendir", path); + for (dirsz = 0, count = 0; (dp = readdir(dirp)) != NULL;) { + /* + * Skip . and .. + */ + if (strcmp(dp->d_name, ".") == 0 || + strcmp(dp->d_name, "..") == 0) + continue; + match = 0; + if (prefix != NULL && + ((LF_ISSET(WT_DIRLIST_INCLUDE) && + WT_PREFIX_MATCH(dp->d_name, prefix)) || + (LF_ISSET(WT_DIRLIST_EXCLUDE) && + !WT_PREFIX_MATCH(dp->d_name, prefix)))) + match = 1; + if (prefix == NULL || match) { + /* + * We have a file name we want to return. 
+ */ + count++; + if (count > dirsz) { + dirsz += WT_DIR_ENTRY; + WT_ERR(__wt_realloc_def( + session, &dirallocsz, dirsz, &entries)); + } + WT_ERR(__wt_strdup( + session, dp->d_name, &entries[count-1])); + } + } + if (count > 0) + *dirlist = entries; + *countp = count; +err: + if (dirp != NULL) + (void)closedir(dirp); + __wt_free(session, path); + + if (ret == 0) + return (0); + + if (*dirlist != NULL) { + for (count = dirsz; count > 0; count--) + __wt_free(session, entries[count]); + __wt_free(session, entries); + } + WT_RET_MSG(session, ret, "dirlist %s prefix %s", dir, prefix); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_dlopen.c b/src/third_party/wiredtiger/src/os_posix/os_dlopen.c new file mode 100644 index 00000000000..91410c54c04 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_dlopen.c @@ -0,0 +1,83 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_dlopen -- + * Open a dynamic library. + */ +int +__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp) +{ + WT_DECL_RET; + WT_DLH *dlh; + + WT_RET(__wt_calloc_def(session, 1, &dlh)); + WT_ERR(__wt_strdup(session, path, &dlh->name)); + + if ((dlh->handle = dlopen(path, RTLD_LAZY)) == NULL) + WT_ERR_MSG( + session, __wt_errno(), "dlopen(%s): %s", path, dlerror()); + + *dlhp = dlh; + if (0) { +err: __wt_free(session, dlh->name); + __wt_free(session, dlh); + } + return (ret); +} + +/* + * __wt_dlsym -- + * Lookup a symbol in a dynamic library. 
+ */ +int +__wt_dlsym(WT_SESSION_IMPL *session, + WT_DLH *dlh, const char *name, int fail, void *sym_ret) +{ + void *sym; + + *(void **)sym_ret = NULL; + if ((sym = dlsym(dlh->handle, name)) == NULL) { + if (fail) + WT_RET_MSG(session, __wt_errno(), + "dlsym(%s in %s): %s", name, dlh->name, dlerror()); + return (0); + } + + *(void **)sym_ret = sym; + return (0); +} + +/* + * __wt_dlclose -- + * Close a dynamic library + */ +int +__wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh) +{ + WT_DECL_RET; + + /* + * FreeBSD dies inside __cxa_finalize when closing handles. + * + * For now, just skip the dlclose: this may leak some resources until + * the process exits, but that is preferable to hard-to-debug crashes + * during exit. + */ +#ifndef __FreeBSD__ + if (dlclose(dlh->handle) != 0) { + ret = __wt_errno(); + __wt_err(session, ret, "dlclose: %s", dlerror()); + } +#endif + + __wt_free(session, dlh->name); + __wt_free(session, dlh); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_errno.c b/src/third_party/wiredtiger/src/os_posix/os_errno.c new file mode 100644 index 00000000000..9290f7d651f --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_errno.c @@ -0,0 +1,22 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_errno -- + * Return errno, or WT_ERROR if errno not set. + */ +int +__wt_errno(void) +{ + /* + * Called when we know an error occurred, and we want the system + * error code, but there's some chance it's not set. + */ + return (errno == 0 ? WT_ERROR : errno); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_exist.c b/src/third_party/wiredtiger/src/os_posix/os_exist.c new file mode 100644 index 00000000000..723f07026e1 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_exist.c @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. 
+ * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_exist -- + * Return if the file exists. + */ +int +__wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp) +{ + struct stat sb; + WT_DECL_RET; + char *path; + + WT_RET(__wt_filename(session, filename, &path)); + + WT_SYSCALL_RETRY(stat(path, &sb), ret); + + __wt_free(session, path); + + if (ret == 0) { + *existp = 1; + return (0); + } + if (ret == ENOENT) { + *existp = 0; + return (0); + } + + WT_RET_MSG(session, ret, "%s: fstat", filename); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c new file mode 100644 index 00000000000..28cd1979c77 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c @@ -0,0 +1,97 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +#if defined(HAVE_FALLOCATE) +#include <linux/falloc.h> +#endif + +/* + * __wt_fallocate_config -- + * Configure fallocate behavior for a file handle. + */ +void +__wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh) +{ + WT_UNUSED(session); + + fh->fallocate_available = 0; + fh->fallocate_requires_locking = 0; + +#ifdef __linux__ + /* + * We've seen Linux systems where posix_fallocate corrupts existing data + * (even though that is explicitly disallowed by POSIX). We've not seen + * problems with fallocate, it's unlocked for now. + */ +#if defined(HAVE_FALLOCATE) + fh->fallocate_available = 1; + fh->fallocate_requires_locking = 0; +#elif defined(HAVE_POSIX_FALLOCATE) + fh->fallocate_available = 1; + fh->fallocate_requires_locking = 1; +#endif +#elif defined(HAVE_POSIX_FALLOCATE) + /* + * FreeBSD and Solaris support posix_fallocate, and so far we've seen + * no problems leaving it unlocked. 
+ */ + fh->fallocate_available = 1; + fh->fallocate_requires_locking = 0; +#endif +} + +/* + * __wt_fallocate -- + * Allocate space for a file handle. + */ +int +__wt_fallocate( + WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len) +{ + WT_DECL_RET; + +#if defined(HAVE_FALLOCATE) + WT_RET(__wt_verbose( + session, WT_VERB_FILEOPS, "%s: fallocate", fh->name)); + WT_SYSCALL_RETRY( + fallocate(fh->fd, FALLOC_FL_KEEP_SIZE, offset, len), ret); + if (ret == 0) + return (0); + + /* + * Linux returns ENOTSUP for fallocate on some file systems; we return + * ENOTSUP, and our caller should avoid calling us again. + */ + if (ret != ENOTSUP) + WT_RET_MSG(session, ret, "%s: fallocate", fh->name); +#elif defined(HAVE_POSIX_FALLOCATE) + WT_RET(__wt_verbose( + session, WT_VERB_FILEOPS, "%s: posix_fallocate", fh->name)); + WT_SYSCALL_RETRY(posix_fallocate(fh->fd, offset, len), ret); + if (ret == 0) + return (0); + + /* + * Solaris returns EINVAL for posix_fallocate on some file systems; we + * return ENOTSUP, and our caller should avoid calling us again. + */ + if (ret != EINVAL) + WT_RET_MSG(session, ret, "%s: posix_fallocate", fh->name); +#else + WT_UNUSED(session); + WT_UNUSED(fh); + WT_UNUSED(offset); + WT_UNUSED(len); + WT_UNUSED(ret); +#endif + + fh->fallocate_available = 0; + fh->fallocate_requires_locking = 0; + return (ENOTSUP); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_filesize.c b/src/third_party/wiredtiger/src/os_posix/os_filesize.c new file mode 100644 index 00000000000..3692b135d73 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_filesize.c @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_filesize -- + * Get the size of a file in bytes. 
+ */ +int +__wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep) +{ + struct stat sb; + WT_DECL_RET; + + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: fstat", fh->name)); + + WT_SYSCALL_RETRY(fstat(fh->fd, &sb), ret); + if (ret == 0) { + *sizep = sb.st_size; + return (0); + } + + WT_RET_MSG(session, ret, "%s: fstat", fh->name); +} + +/* + * __wt_filesize_name -- + * Return the size of a file in bytes, given a file name. + */ +int +__wt_filesize_name( + WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep) +{ + struct stat sb; + WT_DECL_RET; + char *path; + + WT_RET(__wt_filename(session, filename, &path)); + + WT_SYSCALL_RETRY(stat(path, &sb), ret); + + __wt_free(session, path); + + if (ret == 0) { + *sizep = sb.st_size; + return (0); + } + + WT_RET_MSG(session, ret, "%s: fstat", filename); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_flock.c b/src/third_party/wiredtiger/src/os_posix/os_flock.c new file mode 100644 index 00000000000..e9e653d73e6 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_flock.c @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_bytelock -- + * Lock/unlock a byte in a file. + */ +int +__wt_bytelock(WT_FH *fhp, wt_off_t byte, int lock) +{ + struct flock fl; + WT_DECL_RET; + + /* + * WiredTiger requires this function be able to acquire locks past + * the end of file. + * + * Note we're using fcntl(2) locking: all fcntl locks associated with a + * file for a given process are removed when any file descriptor for the + * file is closed by the process, even if a lock was never requested for + * that file descriptor. + */ + fl.l_start = byte; + fl.l_len = 1; + fl.l_type = lock ? 
F_WRLCK : F_UNLCK; + fl.l_whence = SEEK_SET; + + WT_SYSCALL_RETRY(fcntl(fhp->fd, F_SETLK, &fl), ret); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_fsync.c b/src/third_party/wiredtiger/src/os_posix/os_fsync.c new file mode 100644 index 00000000000..c181809df95 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_fsync.c @@ -0,0 +1,54 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_fsync -- + * Flush a file handle. + */ +int +__wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh) +{ + WT_DECL_RET; + + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: fsync", fh->name)); + +#ifdef HAVE_FDATASYNC + WT_SYSCALL_RETRY(fdatasync(fh->fd), ret); +#else + WT_SYSCALL_RETRY(fsync(fh->fd), ret); +#endif + if (ret != 0) + WT_RET_MSG(session, ret, "%s fsync error", fh->name); + + return (0); +} + +/* + * __wt_fsync_async -- + * Flush a file handle and don't wait for the result. + */ +int +__wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh) +{ +#ifdef HAVE_SYNC_FILE_RANGE + WT_DECL_RET; + + WT_RET(__wt_verbose( + session, WT_VERB_FILEOPS, "%s: sync_file_range", fh->name)); + + if ((ret = sync_file_range(fh->fd, + (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) == 0) + return (0); + WT_RET_MSG(session, ret, "%s: sync_file_range", fh->name); +#else + WT_UNUSED(session); + WT_UNUSED(fh); + return (0); +#endif +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_ftruncate.c b/src/third_party/wiredtiger/src/os_posix/os_ftruncate.c new file mode 100644 index 00000000000..3f3034de551 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_ftruncate.c @@ -0,0 +1,26 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_ftruncate -- + * Truncate a file. 
+ */ +int +__wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len) +{ + WT_DECL_RET; + + WT_SYSCALL_RETRY(ftruncate(fh->fd, len), ret); + if (ret == 0) { + fh->size = fh->extend_size = len; + return (0); + } + + WT_RET_MSG(session, ret, "%s ftruncate error", fh->name); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_getline.c b/src/third_party/wiredtiger/src/os_posix/os_getline.c new file mode 100644 index 00000000000..7ef4065ac3b --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_getline.c @@ -0,0 +1,48 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_getline -- + * Get a line from a stream. + * + * Implementation of the POSIX getline or BSD fgetln functions (finding the + * function in a portable way is hard, it's simple enough to write it instead). + * + * Note: Unlike the standard getline calls, this function doesn't include the + * trailing newline character in the returned buffer and discards empty lines + * (so the caller's EOF marker is a returned line length of 0). + */ +int +__wt_getline(WT_SESSION_IMPL *session, WT_ITEM *buf, FILE *fp) +{ + int c; + + /* + * We always NUL-terminate the returned string (even if it's empty), + * make sure there's buffer space for a trailing NUL in all cases. + */ + WT_RET(__wt_buf_init(session, buf, 100)); + + while ((c = fgetc(fp)) != EOF) { + /* Leave space for a trailing NUL. 
*/ + WT_RET(__wt_buf_extend(session, buf, buf->size + 2)); + if (c == '\n') { + if (buf->size == 0) + continue; + break; + } + ((char *)buf->mem)[buf->size++] = (char)c; + } + if (c == EOF && ferror(fp)) + WT_RET_MSG(session, __wt_errno(), "file read"); + + ((char *)buf->mem)[buf->size] = '\0'; + + return (0); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_getopt.c b/src/third_party/wiredtiger/src/os_posix/os_getopt.c new file mode 100644 index 00000000000..1c25521dacd --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_getopt.c @@ -0,0 +1,150 @@ +/*- + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +/* $NetBSD: getopt.c,v 1.26 2003/08/07 16:43:40 agc Exp $ */ + +/* + * Copyright (c) 1987, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "wt_internal.h" + +extern int __wt_opterr, __wt_optind, __wt_optopt, __wt_optreset; +int __wt_opterr = 1, /* if error message should be printed */ + __wt_optind = 1, /* index into parent argv vector */ + __wt_optopt, /* character checked for validity */ + __wt_optreset; /* reset getopt */ + +extern char *__wt_optarg; +char *__wt_optarg; /* argument associated with option */ + +#define BADCH (int)'?' +#define BADARG (int)':' +#define EMSG "" + +/* + * __wt_getopt -- + * Parse argc/argv argument vector. + */ +int +__wt_getopt( + const char *progname, int nargc, char * const *nargv, const char *ostr) +{ + static const char *place = EMSG; /* option letter processing */ + const char *oli; /* option letter list index */ + + if (__wt_optreset || *place == 0) { /* update scanning pointer */ + __wt_optreset = 0; + place = nargv[__wt_optind]; + if (__wt_optind >= nargc || *place++ != '-') { + /* Argument is absent or is not an option */ + place = EMSG; + return (-1); + } + __wt_optopt = *place++; + if (__wt_optopt == '-' && *place == 0) { + /* "--" => end of options */ + ++__wt_optind; + place = EMSG; + return (-1); + } + if (__wt_optopt == 0) { + /* Solitary '-', treat as a '-' option + if the program (eg su) is looking for it. */ + place = EMSG; + if (strchr(ostr, '-') == NULL) + return (-1); + __wt_optopt = '-'; + } + } else + __wt_optopt = *place++; + + /* See if option letter is one the caller wanted... */ + if (__wt_optopt == ':' || (oli = strchr(ostr, __wt_optopt)) == NULL) { + if (*place == 0) + ++__wt_optind; + if (__wt_opterr && *ostr != ':') + (void)fprintf(stderr, + "%s: illegal option -- %c\n", progname, + __wt_optopt); + return (BADCH); + } + + /* Does this option need an argument? */ + if (oli[1] != ':') { + /* don't need argument */ + __wt_optarg = NULL; + if (*place == 0) + ++__wt_optind; + } else { + /* Option-argument is either the rest of this argument or the + entire next argument. 
*/ + if (*place) + __wt_optarg = (char *)place; + else if (nargc > ++__wt_optind) + __wt_optarg = nargv[__wt_optind]; + else { + /* option-argument absent */ + place = EMSG; + if (*ostr == ':') + return (BADARG); + if (__wt_opterr) + (void)fprintf(stderr, + "%s: option requires an argument -- %c\n", + progname, __wt_optopt); + return (BADCH); + } + place = EMSG; + ++__wt_optind; + } + return (__wt_optopt); /* return option letter */ +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_map.c b/src/third_party/wiredtiger/src/os_posix/os_map.c new file mode 100644 index 00000000000..be4d27e96a3 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_map.c @@ -0,0 +1,136 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_mmap -- + * Map a file into memory. + */ +int +__wt_mmap(WT_SESSION_IMPL *session, + WT_FH *fh, void *mapp, size_t *lenp, void **mappingcookie) +{ + void *map; + size_t orig_size; + + WT_UNUSED(mappingcookie); + + /* + * Record the current size and only map and set that as the length, it + * could change between the map call and when we set the return length. + * For the same reason we could actually map past the end of the file; + * we don't read bytes past the end of the file though, so as long as + * the map call succeeds, it's all OK. 
+ */ + orig_size = (size_t)fh->size; + if ((map = mmap(NULL, orig_size, + PROT_READ, +#ifdef MAP_NOCORE + MAP_NOCORE | +#endif + MAP_PRIVATE, + fh->fd, (wt_off_t)0)) == MAP_FAILED) { + WT_RET_MSG(session, __wt_errno(), + "%s map error: failed to map %" WT_SIZET_FMT " bytes", + fh->name, orig_size); + } + (void)__wt_verbose(session, WT_VERB_FILEOPS, + "%s: map %p: %" WT_SIZET_FMT " bytes", fh->name, map, orig_size); + + *(void **)mapp = map; + *lenp = orig_size; + return (0); +} + +#define WT_VM_PAGESIZE 4096 + +/* + * __wt_mmap_preload -- + * Cause a section of a memory map to be faulted in. + */ +int +__wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size) +{ +#ifdef HAVE_POSIX_MADVISE + /* Linux requires the address be aligned to a 4KB boundary. */ + WT_BM *bm = S2BT(session)->bm; + WT_DECL_RET; + void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1)); + size += WT_PTRDIFF(p, blk); + + /* XXX proxy for "am I doing a scan?" -- manual read-ahead */ + if (F_ISSET(session, WT_SESSION_NO_CACHE)) { + /* Read in 2MB blocks every 1MB of data. */ + if (((uintptr_t)((uint8_t *)blk + size) & + (uintptr_t)((1<<20) - 1)) < (uintptr_t)blk) + return (0); + size = WT_MIN(WT_MAX(20 * size, 2 << 20), + WT_PTRDIFF((uint8_t *)bm->map + bm->maplen, blk)); + } + + /* + * Manual pages aren't clear on whether alignment is required for the + * size, so we will be conservative. + */ + size &= ~(size_t)(WT_VM_PAGESIZE - 1); + + if (size > WT_VM_PAGESIZE && + (ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) != 0) + WT_RET_MSG(session, ret, "posix_madvise will need"); +#else + WT_UNUSED(session); + WT_UNUSED(p); + WT_UNUSED(size); +#endif + + return (0); +} + +/* + * __wt_mmap_discard -- + * Discard a chunk of the memory map. + */ +int +__wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size) +{ +#ifdef HAVE_POSIX_MADVISE + /* Linux requires the address be aligned to a 4KB boundary. 
*/ + WT_DECL_RET; + void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1)); + size += WT_PTRDIFF(p, blk); + + if ((ret = posix_madvise(blk, size, POSIX_MADV_DONTNEED)) != 0) + WT_RET_MSG(session, ret, "posix_madvise don't need"); +#else + WT_UNUSED(session); + WT_UNUSED(p); + WT_UNUSED(size); +#endif + return (0); +} + +/* + * __wt_munmap -- + * Remove a memory mapping. + */ +int +__wt_munmap(WT_SESSION_IMPL *session, + WT_FH *fh, void *map, size_t len, void **mappingcookie) +{ + WT_UNUSED(mappingcookie); + + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, + "%s: unmap %p: %" WT_SIZET_FMT " bytes", fh->name, map, len)); + + if (munmap(map, len) == 0) + return (0); + + WT_RET_MSG(session, __wt_errno(), + "%s unmap error: failed to unmap %" WT_SIZET_FMT " bytes", + fh->name, len); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c new file mode 100644 index 00000000000..3a76cceb3f0 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c @@ -0,0 +1,157 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_cond_alloc -- + * Allocate and initialize a condition variable. + */ +int +__wt_cond_alloc(WT_SESSION_IMPL *session, + const char *name, int is_signalled, WT_CONDVAR **condp) +{ + WT_CONDVAR *cond; + WT_DECL_RET; + + /* + * !!! + * This function MUST handle a NULL session handle. + */ + WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond)); + + WT_ERR(pthread_mutex_init(&cond->mtx, NULL)); + + /* Initialize the condition variable to permit self-blocking. */ + WT_ERR(pthread_cond_init(&cond->cond, NULL)); + + cond->name = name; + cond->waiters = is_signalled ? -1 : 0; + + *condp = cond; + return (0); + +err: __wt_free(session, cond); + return (ret); +} + +/* + * __wt_cond_wait -- + * Wait on a mutex, optionally timing out. 
+ */ +int +__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs) +{ + struct timespec ts; + WT_DECL_RET; + int locked; + + locked = 0; + WT_ASSERT(session, usecs >= 0); + + /* Fast path if already signalled. */ + if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0) + return (0); + + /* + * !!! + * This function MUST handle a NULL session handle. + */ + if (session != NULL) { + WT_RET(__wt_verbose(session, WT_VERB_MUTEX, + "wait %s cond (%p)", cond->name, cond)); + WT_STAT_FAST_CONN_INCR(session, cond_wait); + } + + WT_ERR(pthread_mutex_lock(&cond->mtx)); + locked = 1; + + if (usecs > 0) { + WT_ERR(__wt_epoch(session, &ts)); + ts.tv_sec += (ts.tv_nsec + 1000 * usecs) / WT_BILLION; + ts.tv_nsec = (ts.tv_nsec + 1000 * usecs) % WT_BILLION; + ret = pthread_cond_timedwait(&cond->cond, &cond->mtx, &ts); + } else + ret = pthread_cond_wait(&cond->cond, &cond->mtx); + + /* + * Check pthread_cond_wait() return for EINTR, ETIME and + * ETIMEDOUT, some systems return these errors. + */ + if (ret == EINTR || +#ifdef ETIME + ret == ETIME || +#endif + ret == ETIMEDOUT) + ret = 0; + + (void)WT_ATOMIC_SUB4(cond->waiters, 1); + +err: if (locked) + WT_TRET(pthread_mutex_unlock(&cond->mtx)); + if (ret == 0) + return (0); + WT_RET_MSG(session, ret, "pthread_cond_wait"); +} + +/* + * __wt_cond_signal -- + * Signal a waiting thread. + */ +int +__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) +{ + WT_DECL_RET; + int locked; + + locked = 0; + + /* + * !!! + * This function MUST handle a NULL session handle. + */ + if (session != NULL) + WT_RET(__wt_verbose(session, WT_VERB_MUTEX, + "signal %s cond (%p)", cond->name, cond)); + + /* Fast path if already signalled. 
*/ + if (cond->waiters == -1) + return (0); + + if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) { + WT_ERR(pthread_mutex_lock(&cond->mtx)); + locked = 1; + WT_ERR(pthread_cond_broadcast(&cond->cond)); + } + +err: if (locked) + WT_TRET(pthread_mutex_unlock(&cond->mtx)); + if (ret == 0) + return (0); + WT_RET_MSG(session, ret, "pthread_cond_broadcast"); +} + +/* + * __wt_cond_destroy -- + * Destroy a condition variable. + */ +int +__wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) +{ + WT_CONDVAR *cond; + WT_DECL_RET; + + cond = *condp; + if (cond == NULL) + return (0); + + ret = pthread_cond_destroy(&cond->cond); + WT_TRET(pthread_mutex_destroy(&cond->mtx)); + __wt_free(session, *condp); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c new file mode 100644 index 00000000000..1a692f71dce --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c @@ -0,0 +1,227 @@ +/*- + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * Based on "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst: + * http://locklessinc.com/articles/locks/ + * + * Dr. Fuerst further credits: + * There exists a form of the ticket lock that is designed for read-write + * locks. An example written in assembly was posted to the Linux kernel mailing + * list in 2002 by David Howells from RedHat. This was a highly optimized + * version of a read-write ticket lock developed at IBM in the early 90's by + * Joseph Seigh. Note that a similar (but not identical) algorithm was published + * by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable + * Reader-Writer Synchronization for Shared-Memory Multiprocessors". + */ + +#include "wt_internal.h" + +/* + * __wt_rwlock_alloc -- + * Allocate and initialize a read/write lock. + */ +int +__wt_rwlock_alloc( + WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name) +{ + WT_RWLOCK *rwlock; + + WT_RET(__wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name)); + + WT_RET(__wt_calloc_def(session, 1, &rwlock)); + + rwlock->name = name; + + *rwlockp = rwlock; + return (0); +} + +/* + * __wt_try_readlock -- + * Try to get a shared lock, fail immediately if unavailable. 
+ */ +int +__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +{ + wt_rwlock_t *l; + uint64_t old, new, pad, users, writers; + + WT_RET(__wt_verbose( + session, WT_VERB_MUTEX, "rwlock: try_readlock %s", rwlock->name)); + WT_STAT_FAST_CONN_INCR(session, rwlock_read); + + l = &rwlock->rwlock; + pad = l->s.pad; + users = l->s.users; + writers = l->s.writers; + old = (pad << 48) + (users << 32) + (users << 16) + writers; + new = (pad << 48) + ((users + 1) << 32) + ((users + 1) << 16) + writers; + return (WT_ATOMIC_CAS_VAL8(l->u, old, new) == old ? 0 : EBUSY); +} + +/* + * __wt_readlock -- + * Get a shared lock. + */ +int +__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +{ + wt_rwlock_t *l; + uint64_t me; + uint16_t val; + + WT_RET(__wt_verbose( + session, WT_VERB_MUTEX, "rwlock: readlock %s", rwlock->name)); + WT_STAT_FAST_CONN_INCR(session, rwlock_read); + + l = &rwlock->rwlock; + me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32); + val = (uint16_t)(me >> 32); + while (val != l->s.readers) + WT_PAUSE(); + + ++l->s.readers; + + return (0); +} + +/* + * __wt_readunlock -- + * Release a shared lock. + */ +int +__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +{ + wt_rwlock_t *l; + + WT_RET(__wt_verbose( + session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name)); + + l = &rwlock->rwlock; + WT_ATOMIC_ADD2(l->s.writers, 1); + + return (0); +} + +/* + * __wt_try_writelock -- + * Try to get an exclusive lock, fail immediately if unavailable. 
+ */ +int +__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +{ + wt_rwlock_t *l; + uint64_t old, new, pad, readers, users; + + WT_RET(__wt_verbose( + session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name)); + WT_STAT_FAST_CONN_INCR(session, rwlock_write); + + l = &rwlock->rwlock; + pad = l->s.pad; + readers = l->s.readers; + users = l->s.users; + old = (pad << 48) + (users << 32) + (readers << 16) + users; + new = (pad << 48) + ((users + 1) << 32) + (readers << 16) + users; + return (WT_ATOMIC_CAS_VAL8(l->u, old, new) == old ? 0 : EBUSY); +} + +/* + * __wt_writelock -- + * Wait to get an exclusive lock. + */ +int +__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +{ + wt_rwlock_t *l; + uint64_t me; + uint16_t val; + + WT_RET(__wt_verbose( + session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name)); + WT_STAT_FAST_CONN_INCR(session, rwlock_write); + + /* + * Possibly wrap: if we have more than 64K lockers waiting, the count + * of writers will wrap and two lockers will simultaneously be granted + * the write lock. + */ + l = &rwlock->rwlock; + me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32); + val = (uint16_t)(me >> 32); + while (val != l->s.writers) + WT_PAUSE(); + + return (0); +} + +/* + * __wt_writeunlock -- + * Release an exclusive lock. + */ +int +__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +{ + wt_rwlock_t *l, copy; + + WT_RET(__wt_verbose( + session, WT_VERB_MUTEX, "rwlock: writeunlock %s", rwlock->name)); + + l = &rwlock->rwlock; + + copy = *l; + + WT_BARRIER(); + + ++copy.s.writers; + ++copy.s.readers; + + l->us = copy.us; + return (0); +} + +/* + * __wt_rwlock_destroy -- + * Destroy a read/write lock. + */ +int +__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp) +{ + WT_RWLOCK *rwlock; + + rwlock = *rwlockp; /* Clear our caller's reference. 
*/ + if (rwlock == NULL) + return (0); + *rwlockp = NULL; + + WT_RET(__wt_verbose( + session, WT_VERB_MUTEX, "rwlock: destroy %s", rwlock->name)); + + __wt_free(session, rwlock); + return (0); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_once.c b/src/third_party/wiredtiger/src/os_posix/os_once.c new file mode 100644 index 00000000000..22eaf5f0ee5 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_once.c @@ -0,0 +1,20 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_once -- + * One-time initialization per process. + */ +int +__wt_once(void (*init_routine)(void)) +{ + static pthread_once_t once_control = PTHREAD_ONCE_INIT; + + return (pthread_once(&once_control, init_routine)); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_open.c b/src/third_party/wiredtiger/src/os_posix/os_open.c new file mode 100644 index 00000000000..a1bc3feb7d2 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_open.c @@ -0,0 +1,253 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __open_directory_sync -- + * Fsync the directory in which we created the file. + */ +static int +__open_directory_sync(WT_SESSION_IMPL *session, char *path) +{ +#ifdef __linux__ + WT_DECL_RET; + int fd; + char *dir; + + /* + * According to the Linux fsync man page: + * Calling fsync() does not necessarily ensure that the entry in + * the directory containing the file has also reached disk. For + * that an explicit fsync() on a file descriptor for the directory + * is also needed. + * + * Open the WiredTiger home directory and sync it, I don't want the rest + * of the system to have to wonder if opening a file creates it. 
+ */ + if ((dir = strrchr(path, '/')) == NULL) + path = (char *)"."; + else + *dir = '\0'; + WT_SYSCALL_RETRY(((fd = + open(path, O_RDONLY, 0444)) == -1 ? 1 : 0), ret); + if (dir != NULL) + *dir = '/'; + if (ret != 0) + WT_RET_MSG(session, ret, "%s: open", path); + + WT_SYSCALL_RETRY(fsync(fd), ret); + if (ret != 0) + WT_ERR_MSG(session, ret, "%s: fsync", path); + +err: WT_SYSCALL_RETRY(close(fd), ret); + if (ret != 0) + __wt_err(session, ret, "%s: close", path); + return (ret); +#else + WT_UNUSED(session); + WT_UNUSED(path); + return (0); +#endif +} + +/* + * __wt_open -- + * Open a file handle. + */ +int +__wt_open(WT_SESSION_IMPL *session, + const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_FH *fh, *tfh; + mode_t mode; + int direct_io, f, fd, matched; + char *path; + + conn = S2C(session); + fh = NULL; + fd = -1; + path = NULL; + + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name)); + + /* Increment the reference count if we already have the file open. */ + matched = 0; + __wt_spin_lock(session, &conn->fh_lock); + TAILQ_FOREACH(tfh, &conn->fhqh, q) + if (strcmp(name, tfh->name) == 0) { + ++tfh->ref; + *fhp = tfh; + matched = 1; + break; + } + __wt_spin_unlock(session, &conn->fh_lock); + if (matched) + return (0); + + WT_RET(__wt_filename(session, name, &path)); + + f = O_RDWR; +#ifdef O_BINARY + /* Windows clones: we always want to treat the file as a binary. */ + f |= O_BINARY; +#endif +#ifdef O_CLOEXEC + /* + * Security: + * The application may spawn a new process, and we don't want another + * process to have access to our file handles. + */ + f |= O_CLOEXEC; +#endif +#ifdef O_NOATIME + /* Avoid updating metadata for read-only workloads. 
*/ + if (dio_type == WT_FILE_TYPE_DATA || + dio_type == WT_FILE_TYPE_CHECKPOINT) + f |= O_NOATIME; +#endif + + if (ok_create) { + f |= O_CREAT; + if (exclusive) + f |= O_EXCL; + mode = 0666; + } else + mode = 0; + + direct_io = 0; +#ifdef O_DIRECT + if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) { + f |= O_DIRECT; + direct_io = 1; + } +#endif + if (dio_type == WT_FILE_TYPE_LOG && + FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) +#ifdef O_DSYNC + f |= O_DSYNC; +#elif defined(O_SYNC) + f |= O_SYNC; +#else + WT_ERR_MSG(session, ENOTSUP, + "Unsupported log sync mode requested"); +#endif + WT_SYSCALL_RETRY(((fd = open(path, f, mode)) == -1 ? 1 : 0), ret); + if (ret != 0) + WT_ERR_MSG(session, ret, + direct_io ? + "%s: open failed with direct I/O configured, some " + "filesystem types do not support direct I/O" : "%s", path); + +#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) && !defined(O_CLOEXEC) + /* + * Security: + * The application may spawn a new process, and we don't want another + * process to have access to our file handles. There's an obvious + * race here, so we prefer the flag to open if available. + */ + if ((f = fcntl(fd, F_GETFD)) == -1 || + fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1) + WT_ERR_MSG(session, __wt_errno(), "%s: fcntl", name); +#endif + +#if defined(HAVE_POSIX_FADVISE) + /* Disable read-ahead on trees: it slows down random read workloads. */ + if (dio_type == WT_FILE_TYPE_DATA || + dio_type == WT_FILE_TYPE_CHECKPOINT) + WT_ERR(posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM)); +#endif + + if (F_ISSET(conn, WT_CONN_CKPT_SYNC)) + WT_ERR(__open_directory_sync(session, path)); + + WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh)); + WT_ERR(__wt_strdup(session, name, &fh->name)); + fh->fd = fd; + fh->ref = 1; + fh->direct_io = direct_io; + + /* Set the file's size. */ + WT_ERR(__wt_filesize(session, fh, &fh->size)); + + /* Configure file extension. 
*/ + if (dio_type == WT_FILE_TYPE_DATA || + dio_type == WT_FILE_TYPE_CHECKPOINT) + fh->extend_len = conn->data_extend_len; + + /* Configure fallocate/posix_fallocate calls. */ + __wt_fallocate_config(session, fh); + + /* + * Repeat the check for a match, but then link onto the database's list + * of files. + */ + matched = 0; + __wt_spin_lock(session, &conn->fh_lock); + TAILQ_FOREACH(tfh, &conn->fhqh, q) + if (strcmp(name, tfh->name) == 0) { + ++tfh->ref; + *fhp = tfh; + matched = 1; + break; + } + if (!matched) { + TAILQ_INSERT_TAIL(&conn->fhqh, fh, q); + WT_STAT_FAST_CONN_INCR(session, file_open); + + *fhp = fh; + } + __wt_spin_unlock(session, &conn->fh_lock); + if (matched) { +err: if (fh != NULL) { + __wt_free(session, fh->name); + __wt_free(session, fh); + } + if (fd != -1) + (void)close(fd); + } + + __wt_free(session, path); + return (ret); +} + +/* + * __wt_close -- + * Close a file handle. + */ +int +__wt_close(WT_SESSION_IMPL *session, WT_FH *fh) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + conn = S2C(session); + + __wt_spin_lock(session, &conn->fh_lock); + if (fh == NULL || fh->ref == 0 || --fh->ref > 0) { + __wt_spin_unlock(session, &conn->fh_lock); + return (0); + } + + /* Remove from the list. */ + TAILQ_REMOVE(&conn->fhqh, fh, q); + WT_STAT_FAST_CONN_DECR(session, file_open); + + __wt_spin_unlock(session, &conn->fh_lock); + + /* Discard the memory. */ + if (close(fh->fd) != 0) { + ret = __wt_errno(); + __wt_err(session, ret, "close: %s", fh->name); + } + + __wt_free(session, fh->name); + __wt_free(session, fh); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_path.c b/src/third_party/wiredtiger/src/os_posix/os_path.c new file mode 100644 index 00000000000..aed99d1d027 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_path.c @@ -0,0 +1,28 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "wt_internal.h" + +/* + * __wt_absolute_path -- + * Return if a filename is an absolute path. + */ +int +__wt_absolute_path(const char *path) +{ + return (path[0] == '/' ? 1 : 0); +} + +/* + * __wt_path_separator -- + * Return the path separator string. + */ +const char * +__wt_path_separator(void) +{ + return ("/"); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_priv.c b/src/third_party/wiredtiger/src/os_posix/os_priv.c new file mode 100644 index 00000000000..7d56359da4f --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_priv.c @@ -0,0 +1,19 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_has_priv -- + * Return if the process has special privileges, defined as having + * different effective and read UIDs or GIDs. + */ +int +__wt_has_priv(void) +{ + return (getuid() != geteuid() || getgid() != getegid()); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_remove.c b/src/third_party/wiredtiger/src/os_posix/os_remove.c new file mode 100644 index 00000000000..a52a4db6bc7 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_remove.c @@ -0,0 +1,66 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __remove_file_check -- + * Check if the file is currently open before removing it. + */ +static void +__remove_file_check(WT_SESSION_IMPL *session, const char *name) +{ +#ifdef HAVE_DIAGNOSTIC + WT_CONNECTION_IMPL *conn; + WT_FH *fh; + + conn = S2C(session); + fh = NULL; + + /* + * Check if the file is open: it's an error if it is, since a higher + * level should have closed it before removing. 
+ */ + __wt_spin_lock(session, &conn->fh_lock); + TAILQ_FOREACH(fh, &conn->fhqh, q) { + if (strcmp(name, fh->name) == 0) + break; + } + __wt_spin_unlock(session, &conn->fh_lock); + + WT_ASSERT(session, fh == NULL); +#else + WT_UNUSED(session); + WT_UNUSED(name); +#endif +} + +/* + * __wt_remove -- + * Remove a file. + */ +int +__wt_remove(WT_SESSION_IMPL *session, const char *name) +{ + WT_DECL_RET; + char *path; + + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: remove", name)); + + __remove_file_check(session, name); + + WT_RET(__wt_filename(session, name, &path)); + + WT_SYSCALL_RETRY(remove(path), ret); + + __wt_free(session, path); + + if (ret == 0 || ret == ENOENT) + return (0); + + WT_RET_MSG(session, ret, "%s: remove", name); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_rename.c b/src/third_party/wiredtiger/src/os_posix/os_rename.c new file mode 100644 index 00000000000..ddbb59aaf37 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_rename.c @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_rename -- + * Rename a file. 
+ */ +int +__wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to) +{ + WT_DECL_RET; + char *from_path, *to_path; + + WT_RET(__wt_verbose( + session, WT_VERB_FILEOPS, "rename %s to %s", from, to)); + + from_path = to_path = NULL; + + WT_RET(__wt_filename(session, from, &from_path)); + WT_TRET(__wt_filename(session, to, &to_path)); + + if (ret == 0) + WT_SYSCALL_RETRY(rename(from_path, to_path), ret); + + __wt_free(session, from_path); + __wt_free(session, to_path); + + if (ret == 0) + return (0); + + WT_RET_MSG(session, ret, "rename %s to %s", from, to); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_rw.c b/src/third_party/wiredtiger/src/os_posix/os_rw.c new file mode 100644 index 00000000000..4247fb30fd1 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_rw.c @@ -0,0 +1,86 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_read -- + * Read a chunk. + */ +int +__wt_read( + WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf) +{ + size_t chunk; + ssize_t nr; + uint8_t *addr; + + WT_STAT_FAST_CONN_INCR(session, read_io); + + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, + "%s: read %" WT_SIZET_FMT " bytes at offset %" PRIuMAX, + fh->name, len, (uintmax_t)offset)); + + /* Assert direct I/O is aligned and a multiple of the alignment. */ + WT_ASSERT(session, + !fh->direct_io || + S2C(session)->buffer_alignment == 0 || + (!((uintptr_t)buf & + (uintptr_t)(S2C(session)->buffer_alignment - 1)) && + len >= S2C(session)->buffer_alignment && + len % S2C(session)->buffer_alignment == 0)); + + /* Break reads larger than 1GB into 1GB chunks. */ + for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { + chunk = WT_MIN(len, WT_GIGABYTE); + if ((nr = pread(fh->fd, addr, chunk, offset)) <= 0) + WT_RET_MSG(session, nr == 0 ? 
WT_ERROR : __wt_errno(), + "%s read error: failed to read %" WT_SIZET_FMT + " bytes at offset %" PRIuMAX, + fh->name, chunk, (uintmax_t)offset); + } + return (0); +} + +/* + * __wt_write -- + * Write a chunk. + */ +int +__wt_write(WT_SESSION_IMPL *session, + WT_FH *fh, wt_off_t offset, size_t len, const void *buf) +{ + size_t chunk; + ssize_t nw; + const uint8_t *addr; + + WT_STAT_FAST_CONN_INCR(session, write_io); + + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, + "%s: write %" WT_SIZET_FMT " bytes at offset %" PRIuMAX, + fh->name, len, (uintmax_t)offset)); + + /* Assert direct I/O is aligned and a multiple of the alignment. */ + WT_ASSERT(session, + !fh->direct_io || + S2C(session)->buffer_alignment == 0 || + (!((uintptr_t)buf & + (uintptr_t)(S2C(session)->buffer_alignment - 1)) && + len >= S2C(session)->buffer_alignment && + len % S2C(session)->buffer_alignment == 0)); + + /* Break writes larger than 1GB into 1GB chunks. */ + for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) { + chunk = WT_MIN(len, WT_GIGABYTE); + if ((nw = pwrite(fh->fd, addr, chunk, offset)) < 0) + WT_RET_MSG(session, __wt_errno(), + "%s write error: failed to write %" WT_SIZET_FMT + " bytes at offset %" PRIuMAX, + fh->name, chunk, (uintmax_t)offset); + } + return (0); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_sleep.c b/src/third_party/wiredtiger/src/os_posix/os_sleep.c new file mode 100644 index 00000000000..665330a26e7 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_sleep.c @@ -0,0 +1,23 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_sleep -- + * Pause the thread of control. 
+ */ +void +__wt_sleep(long seconds, long micro_seconds) +{ + struct timeval t; + + t.tv_sec = seconds + micro_seconds / 1000000; + t.tv_usec = (suseconds_t)(micro_seconds % 1000000); + + (void)select(0, NULL, NULL, NULL, &t); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_strtouq.c b/src/third_party/wiredtiger/src/os_posix/os_strtouq.c new file mode 100644 index 00000000000..97f9759f76f --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_strtouq.c @@ -0,0 +1,24 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_strtouq -- + * Convert a string to an unsigned quad integer. + */ +uint64_t +__wt_strtouq(const char *nptr, char **endptr, int base) +{ +#if defined(HAVE_STRTOUQ) + return (strtouq(nptr, endptr, base)); +#else + WT_STATIC_ASSERT(sizeof(uint64_t) == sizeof(unsigned long long)); + + return (strtoull(nptr, endptr, base)); +#endif +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_thread.c b/src/third_party/wiredtiger/src/os_posix/os_thread.c new file mode 100644 index 00000000000..7c447710b46 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_thread.c @@ -0,0 +1,59 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_thread_create -- + * Create a new thread of control. + */ +int +__wt_thread_create(WT_SESSION_IMPL *session, + wt_thread_t *tidret, void *(*func)(void *), void *arg) +{ + WT_DECL_RET; + + /* Spawn a new thread of control. */ + if ((ret = pthread_create(tidret, NULL, func, arg)) == 0) + return (0); + WT_RET_MSG(session, ret, "pthread_create"); +} + +/* + * __wt_thread_join -- + * Wait for a thread of control to exit. 
+ */ +int +__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) +{ + WT_DECL_RET; + + if ((ret = pthread_join(tid, NULL)) == 0) + return (0); + + WT_RET_MSG(session, ret, "pthread_join"); +} + +/* + * __wt_thread_id -- + * Fill in a printable version of the process and thread IDs. + */ +void +__wt_thread_id(char *buf, size_t buflen) +{ + pthread_t self; + + /* + * POSIX 1003.1 allows pthread_t to be an opaque type, but on systems + * where it's a pointer, we'd rather print out the pointer and match + * gdb output. Since we don't yet run on any systems where pthread_t + * is not a pointer, do it that way for now. + */ + self = pthread_self(); + (void)snprintf(buf, buflen, + "%" PRIu64 ":%p", (uint64_t)getpid(), (void *)self); +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c new file mode 100644 index 00000000000..56f688a1e14 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_time.c @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_seconds -- + * Return the seconds since the Epoch. + */ +int +__wt_seconds(WT_SESSION_IMPL *session, time_t *timep) +{ + struct timespec t; + + WT_RET(__wt_epoch(session, &t)); + + *timep = t.tv_sec; + + return (0); +} + +/* + * __wt_epoch -- + * Return the time since the Epoch. 
+ */ +int +__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) +{ + WT_DECL_RET; + +#if defined(HAVE_CLOCK_GETTIME) + WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, tsp), ret); + if (ret == 0) + return (0); + WT_RET_MSG(session, ret, "clock_gettime"); +#elif defined(HAVE_GETTIMEOFDAY) + struct timeval v; + + WT_SYSCALL_RETRY(gettimeofday(&v, NULL), ret); + if (ret == 0) { + tsp->tv_sec = v.tv_sec; + tsp->tv_nsec = v.tv_usec * 1000; + return (0); + } + WT_RET_MSG(session, ret, "gettimeofday"); +#else + NO TIME-OF-DAY IMPLEMENTATION: see src/os_posix/os_time.c +#endif +} diff --git a/src/third_party/wiredtiger/src/os_posix/os_yield.c b/src/third_party/wiredtiger/src/os_posix/os_yield.c new file mode 100644 index 00000000000..6af30803e81 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_yield.c @@ -0,0 +1,18 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_yield -- + * Yield the thread of control. + */ +void +__wt_yield(void) +{ + sched_yield(); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_dir.c b/src/third_party/wiredtiger/src/os_win/os_dir.c new file mode 100644 index 00000000000..076c64670d4 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_dir.c @@ -0,0 +1,111 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_dirlist -- + * Get a list of files from a directory, optionally filtered by + * a given prefix. 
+ */ +int +__wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix, + uint32_t flags, char ***dirlist, u_int *countp) +{ + HANDLE findhandle; + WIN32_FIND_DATA finddata; + WT_DECL_ITEM(pathbuf); + WT_DECL_RET; + size_t dirallocsz, pathlen; + u_int count, dirsz; + int match; + char **entries, *path; + + *dirlist = NULL; + *countp = 0; + + findhandle = INVALID_HANDLE_VALUE; + count = 0; + + WT_RET(__wt_filename(session, dir, &path)); + + pathlen = strlen(path); + if (path[pathlen - 1] == '\\') { + path[pathlen - 1] = '\0'; + } + + WT_ERR(__wt_scr_alloc(session, 0, &pathbuf)); + WT_ERR(__wt_buf_fmt(session, pathbuf, "%s\\*", path)); + + dirallocsz = 0; + dirsz = 0; + entries = NULL; + if (flags == 0) + LF_SET(WT_DIRLIST_INCLUDE); + + WT_ERR(__wt_verbose(session, WT_VERB_FILEOPS, + "wt_dirlist of %s %s prefix %s", + pathbuf->data, LF_ISSET(WT_DIRLIST_INCLUDE) ? "include" : "exclude", + prefix == NULL ? "all" : prefix)); + + findhandle = FindFirstFileA(pathbuf->data, &finddata); + + if (INVALID_HANDLE_VALUE == findhandle) + WT_ERR_MSG(session, __wt_errno(), "%s: FindFirstFile", + pathbuf->data); + else { + do { + /* + * Skip . and .. + */ + if (strcmp(finddata.cFileName, ".") == 0 || + strcmp(finddata.cFileName, "..") == 0) + continue; + match = 0; + if (prefix != NULL && + ((LF_ISSET(WT_DIRLIST_INCLUDE) && + WT_PREFIX_MATCH(finddata.cFileName, prefix)) || + (LF_ISSET(WT_DIRLIST_EXCLUDE) && + !WT_PREFIX_MATCH(finddata.cFileName, prefix)))) + match = 1; + if (prefix == NULL || match) { + /* + * We have a file name we want to return. 
+ */ + count++; + if (count > dirsz) { + dirsz += WT_DIR_ENTRY; + WT_ERR(__wt_realloc_def(session, + &dirallocsz, dirsz, &entries)); + } + WT_ERR(__wt_strdup(session, + finddata.cFileName, &entries[count - 1])); + } + } while (FindNextFileA(findhandle, &finddata) != 0); + } + + if (count > 0) + *dirlist = entries; + *countp = count; + +err: + if (findhandle != INVALID_HANDLE_VALUE) + (void)FindClose(findhandle); + __wt_free(session, path); + __wt_buf_free(session, pathbuf); + + if (ret == 0) + return (0); + + if (*dirlist != NULL) { + for (count = dirsz; count > 0; count--) + __wt_free(session, entries[count]); + __wt_free(session, entries); + } + + WT_RET_MSG(session, ret, "dirlist %s prefix %s", dir, prefix); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_dlopen.c b/src/third_party/wiredtiger/src/os_win/os_dlopen.c new file mode 100644 index 00000000000..ebc90edd2b2 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_dlopen.c @@ -0,0 +1,86 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_dlopen -- + * Open a dynamic library. + */ +int +__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp) +{ + WT_DECL_RET; + WT_DLH *dlh; + + WT_RET(__wt_calloc_def(session, 1, &dlh)); + WT_ERR(__wt_strdup(session, path, &dlh->name)); + + /* NULL means load from the current binary */ + if (path == NULL) { + ret = GetModuleHandleExA(0, NULL, &dlh->handle); + if (ret == FALSE) + WT_ERR_MSG(session, + __wt_errno(), "GetModuleHandleEx(%s): %s", path, 0); + } else { + // TODO: load dll here + DebugBreak(); + } + + /* Windows returns 0 on failure, WT expects 0 on success */ + ret = !ret; + + *dlhp = dlh; + if (0) { +err: __wt_free(session, dlh->name); + __wt_free(session, dlh); + } + return (ret); +} + +/* + * __wt_dlsym -- + * Lookup a symbol in a dynamic library. 
+ */ +int +__wt_dlsym(WT_SESSION_IMPL *session, + WT_DLH *dlh, const char *name, int fail, void *sym_ret) +{ + void *sym; + + *(void **)sym_ret = NULL; + + sym = GetProcAddress(dlh->handle, name); + if (sym == NULL && fail) { + WT_RET_MSG(session, __wt_errno(), + "GetProcAddress(%s in %s): %s", name, dlh->name, 0); + } + + *(void **)sym_ret = sym; + return (0); +} + +/* + * __wt_dlclose -- + * Close a dynamic library + */ +int +__wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh) +{ + WT_DECL_RET; + + if ((ret = FreeLibrary(dlh->handle)) == FALSE) { + __wt_err(session, __wt_errno(), "FreeLibrary"); + } + + /* Windows returns 0 on failure, WT expects 0 on success */ + ret = !ret; + + __wt_free(session, dlh->name); + __wt_free(session, dlh); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_errno.c b/src/third_party/wiredtiger/src/os_win/os_errno.c new file mode 100644 index 00000000000..ce50106b0cc --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_errno.c @@ -0,0 +1,27 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_errno -- + * Return errno, or WT_ERROR if errno not set. + */ +int +__wt_errno(void) +{ + /* + * Called when we know an error occurred, and we want the system + * error code, but there's some chance it's not set. + */ + DWORD err = GetLastError(); + + /* GetLastError should only be called if we hit an actual error */ + WT_ASSERT(NULL, err != ERROR_SUCCESS); + + return (err == ERROR_SUCCESS ? WT_ERROR : err); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_exist.c b/src/third_party/wiredtiger/src/os_win/os_exist.c new file mode 100644 index 00000000000..ab3805f19df --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_exist.c @@ -0,0 +1,32 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. 
+ * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_exist -- + * Return if the file exists. + */ +int +__wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp) +{ + WT_DECL_RET; + char *path; + + WT_RET(__wt_filename(session, filename, &path)); + + ret = GetFileAttributesA(path); + + __wt_free(session, path); + + if (ret != INVALID_FILE_ATTRIBUTES) + *existp = 1; + else + *existp = 0; + + return (0); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_fallocate.c b/src/third_party/wiredtiger/src/os_win/os_fallocate.c new file mode 100644 index 00000000000..bd71c780dc5 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_fallocate.c @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_fallocate_config -- + * Configure fallocate behavior for a file handle. + */ +void +__wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh) +{ + fh->fallocate_available = 1; + + /* + * We use a separate handle for file size changes, so there's no need + * for locking. + */ + fh->fallocate_requires_locking = 0; +} + +/* + * __wt_fallocate -- + * Allocate space for a file handle. 
+ */ +int +__wt_fallocate( + WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len) +{ + WT_DECL_RET; + LARGE_INTEGER largeint; + + WT_RET(__wt_verbose( + session, WT_VERB_FILEOPS, "%s: fallocate", fh->name)); + + largeint.QuadPart = offset + len; + + if ((ret = SetFilePointerEx( + fh->filehandle_secondary, largeint, NULL, FILE_BEGIN)) == FALSE) + WT_RET_MSG(session, + __wt_errno(), "%s SetFilePointerEx error", fh->name); + + if ((ret = SetEndOfFile(fh->filehandle_secondary)) != FALSE) { + fh->size = fh->extend_size = len; + return (0); + } + + WT_RET_MSG(session, __wt_errno(), "%s SetEndOfFile error", fh->name); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_filesize.c b/src/third_party/wiredtiger/src/os_win/os_filesize.c new file mode 100644 index 00000000000..309ee1db40b --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_filesize.c @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_filesize -- + * Get the size of a file in bytes. + */ +int +__wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep) +{ + WT_DECL_RET; + LARGE_INTEGER size; + + WT_RET(__wt_verbose( + session, WT_VERB_FILEOPS, "%s: GetFileSizeEx", fh->name)); + + if ((ret = GetFileSizeEx(fh->filehandle, &size)) != 0) { + *sizep = size.QuadPart; + return (0); + } + + WT_RET_MSG(session, __wt_errno(), "%s: GetFileSizeEx", fh->name); +} + +/* + * __wt_filesize_name -- + * Return the size of a file in bytes, given a file name. 
+ */ +int +__wt_filesize_name( + WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep) +{ + WT_DECL_RET; + WIN32_FILE_ATTRIBUTE_DATA data; + char *path; + + WT_RET(__wt_filename(session, filename, &path)); + + ret = GetFileAttributesExA(path, GetFileExInfoStandard, &data); + + __wt_free(session, path); + + if (ret != 0) { + *sizep = + ((int64_t)data.nFileSizeHigh << 32) | data.nFileSizeLow; + return (0); + } + + WT_RET_MSG(session, __wt_errno(), "%s: GetFileAttributesEx", filename); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_flock.c b/src/third_party/wiredtiger/src/os_win/os_flock.c new file mode 100644 index 00000000000..4b3ca34d65f --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_flock.c @@ -0,0 +1,46 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_bytelock -- + * Lock/unlock a byte in a file. + */ +int +__wt_bytelock(WT_FH *fhp, wt_off_t byte, int lock) +{ + WT_DECL_RET; + + /* + * WiredTiger requires this function be able to acquire locks past + * the end of file. + * + * Note we're using fcntl(2) locking: all fcntl locks associated with a + * file for a given process are removed when any file descriptor for the + * file is closed by the process, even if a lock was never requested for + * that file descriptor. + * + * http://msdn.microsoft.com/ + * en-us/library/windows/desktop/aa365202%28v=vs.85%29.aspx + * + * You can lock bytes that are beyond the end of the current file. + * This is useful to coordinate adding records to the end of a file. 
+ */ + if (lock) { + ret = LockFile(fhp->filehandle, UINT32_MAX & byte, + UINT32_MAX & (byte >> 32), 1, 0); + } else { + ret = UnlockFile(fhp->filehandle, UINT32_MAX & byte, + UINT32_MAX & (byte >> 32), 1, 0); + } + + if (ret == FALSE) + WT_RET_MSG(NULL, __wt_errno(), "%s: LockFile", fhp->name); + + return (0); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_fsync.c b/src/third_party/wiredtiger/src/os_win/os_fsync.c new file mode 100644 index 00000000000..cd509131649 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_fsync.c @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_fsync -- + * Flush a file handle. + */ +int +__wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh) +{ + WT_DECL_RET; + + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: FlushFileBuffers", + fh->name)); + + if ((ret = FlushFileBuffers(fh->filehandle)) == FALSE) + WT_RET_MSG(session, + __wt_errno(), "%s FlushFileBuffers error", fh->name); + + return (0); +} + +/* + * __wt_fsync_async -- + * Flush a file handle and don't wait for the result. + */ +int +__wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh) +{ + WT_UNUSED(session); + WT_UNUSED(fh); + + return (0); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_ftruncate.c b/src/third_party/wiredtiger/src/os_win/os_ftruncate.c new file mode 100644 index 00000000000..5d87f1ce06a --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_ftruncate.c @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_ftruncate -- + * Truncate a file. 
+ */ +int +__wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len) +{ + WT_DECL_RET; + LARGE_INTEGER largeint; + uint32_t lasterror; + + largeint.QuadPart = len; + + if ((ret = SetFilePointerEx( + fh->filehandle_secondary, largeint, NULL, FILE_BEGIN)) == FALSE) + WT_RET_MSG(session, __wt_errno(), "%s SetFilePointerEx error", + fh->name); + + ret = SetEndOfFile(fh->filehandle_secondary); + if (ret != FALSE) { + fh->size = fh->extend_size = len; + return (0); + } + + lasterror = GetLastError(); + + if (lasterror = ERROR_USER_MAPPED_FILE) + return (EBUSY); + + WT_RET_MSG(session, lasterror, "%s SetEndOfFile error", fh->name); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_map.c b/src/third_party/wiredtiger/src/os_win/os_map.c new file mode 100644 index 00000000000..b3b4f0f7501 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_map.c @@ -0,0 +1,106 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_mmap -- + * Map a file into memory. + */ +int +__wt_mmap(WT_SESSION_IMPL *session, WT_FH *fh, void *mapp, size_t *lenp, + void** mappingcookie) +{ + void *map; + size_t orig_size; + + /* + * Record the current size and only map and set that as the length, it + * could change between the map call and when we set the return length. + * For the same reason we could actually map past the end of the file; + * we don't read bytes past the end of the file though, so as long as + * the map call succeeds, it's all OK. 
+ */ + orig_size = (size_t)fh->size; + *mappingcookie = + CreateFileMappingA(fh->filehandle, NULL, PAGE_READONLY, 0, 0, NULL); + if (*mappingcookie == NULL) + WT_RET_MSG(session, __wt_errno(), + "%s CreateFileMapping error: failed to map %" + WT_SIZET_FMT " bytes", + fh->name, orig_size); + + if ((map = MapViewOfFile( + *mappingcookie, FILE_MAP_READ, 0, 0, orig_size)) == NULL) { + CloseHandle(*mappingcookie); + *mappingcookie = NULL; + + WT_RET_MSG(session, __wt_errno(), + "%s map error: failed to map %" WT_SIZET_FMT " bytes", + fh->name, orig_size); + } + (void)__wt_verbose(session, WT_VERB_FILEOPS, + "%s: MapViewOfFile %p: %" WT_SIZET_FMT " bytes", + fh->name, map, orig_size); + + *(void **)mapp = map; + *lenp = orig_size; + return (0); +} + +/* + * __wt_mmap_preload -- + * Cause a section of a memory map to be faulted in. + */ +int +__wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size) +{ + WT_UNUSED(session); + WT_UNUSED(p); + WT_UNUSED(size); + + return (0); +} + +/* + * __wt_mmap_discard -- + * Discard a chunk of the memory map. + */ +int +__wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size) +{ + WT_UNUSED(session); + WT_UNUSED(p); + WT_UNUSED(size); + return (0); +} + +/* + * __wt_munmap -- + * Remove a memory mapping. 
+ */ +int +__wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len, + void** mappingcookie) +{ + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, + "%s: UnmapViewOfFile %p: %" WT_SIZET_FMT " bytes", + fh->name, map, len)); + + if (UnmapViewOfFile(map) == 0) { + WT_RET_MSG(session, __wt_errno(), + "%s UnmapViewOfFile error: failed to unmap %" WT_SIZET_FMT + " bytes", + fh->name, len); + } + + CloseHandle(*mappingcookie); + + *mappingcookie = 0; + + return (0); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c new file mode 100644 index 00000000000..9c9907bd8be --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c @@ -0,0 +1,155 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_cond_alloc -- + * Allocate and initialize a condition variable. + */ +int +__wt_cond_alloc(WT_SESSION_IMPL *session, + const char *name, int is_signalled, WT_CONDVAR **condp) +{ + WT_CONDVAR *cond; + + /* + * !!! + * This function MUST handle a NULL session handle. + */ + WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond)); + + InitializeCriticalSection(&cond->mtx); + + /* Initialize the condition variable to permit self-blocking. */ + InitializeConditionVariable(&cond->cond); + + cond->name = name; + cond->waiters = is_signalled ? -1 : 0; + + *condp = cond; + return (0); +} + +/* + * __wt_cond_wait -- + * Wait on a mutex, optionally timing out. + */ +int +__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs) +{ + WT_DECL_RET; + int locked; + int lasterror; + int milliseconds; + locked = 0; + WT_ASSERT(session, usecs >= 0); + + /* Fast path if already signalled. */ + if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0) + return (0); + + /* + * !!! + * This function MUST handle a NULL session handle. 
+ */ + if (session != NULL) { + WT_RET(__wt_verbose(session, WT_VERB_MUTEX, + "wait %s cond (%p)", cond->name, cond)); + WT_STAT_FAST_CONN_INCR(session, cond_wait); + } + + EnterCriticalSection(&cond->mtx); + locked = 1; + + if (usecs > 0) { + milliseconds = usecs / 1000; + /* + * 0 would mean the CV sleep becomes a TryCV which we do not + * want + */ + if (milliseconds == 0) + milliseconds = 1; + ret = SleepConditionVariableCS( + &cond->cond, &cond->mtx, milliseconds); + } else + ret = SleepConditionVariableCS( + &cond->cond, &cond->mtx, INFINITE); + + if (ret == 0) { + lasterror = GetLastError(); + if (lasterror == ERROR_TIMEOUT) { + ret = 1; + } + } + + (void)WT_ATOMIC_SUB4(cond->waiters, 1); + + if (locked) + LeaveCriticalSection(&cond->mtx); + if (ret != 0) + return (0); + WT_RET_MSG(session, ret, "SleepConditionVariableCS"); +} + +/* + * __wt_cond_signal -- + * Signal a waiting thread. + */ +int +__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) +{ + WT_DECL_RET; + int locked; + + locked = 0; + + /* + * !!! + * This function MUST handle a NULL session handle. + */ + if (session != NULL) + WT_RET(__wt_verbose(session, WT_VERB_MUTEX, + "signal %s cond (%p)", cond->name, cond)); + + /* Fast path if already signalled. */ + if (cond->waiters == -1) + return (0); + + if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) { + EnterCriticalSection(&cond->mtx); + locked = 1; + WakeAllConditionVariable(&cond->cond); + } + + if (locked) + LeaveCriticalSection(&cond->mtx); + if (ret == 0) + return (0); + WT_RET_MSG(session, ret, "WakeAllConditionVariable"); +} + +/* + * __wt_cond_destroy -- + * Destroy a condition variable. 
+ */ +int +__wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) +{ + WT_CONDVAR *cond; + WT_DECL_RET; + + cond = *condp; + if (cond == NULL) + return (0); + + /* Do nothing to delete Condition Variable */ + DeleteCriticalSection(&cond->mtx); + __wt_free(session, *condp); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_win/os_mtx_rw.c new file mode 100644 index 00000000000..ec0894a2f29 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_mtx_rw.c @@ -0,0 +1,126 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_rwlock_alloc -- + * Allocate and initialize a read/write lock. + */ +int +__wt_rwlock_alloc( + WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name) +{ + WT_RWLOCK *rwlock; + + WT_RET(__wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name)); + + WT_RET(__wt_calloc_def(session, 1, &rwlock)); + + rwlock->name = name; + InitializeSRWLock(&rwlock->rwlock); + + *rwlockp = rwlock; + return (0); +} + +/* + * __wt_readlock -- + * Get a shared lock. + */ +int +__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +{ + WT_RET(__wt_verbose( + session, WT_VERB_MUTEX, "rwlock: readlock %s", rwlock->name)); + WT_STAT_FAST_CONN_INCR(session, rwlock_read); + + AcquireSRWLockShared(&rwlock->rwlock); + + return (0); +} + +/* + * __wt_readunlock -- + * Release a shared lock. + */ +int +__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +{ + WT_RET(__wt_verbose( + session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name)); + + ReleaseSRWLockShared(&rwlock->rwlock); + return (0); +} + +/* + * __wt_try_writelock -- + * Try to get an exclusive lock, fail immediately if unavailable. 
+ */ +int +__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +{ + WT_RET(__wt_verbose( + session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name)); + WT_STAT_FAST_CONN_INCR(session, rwlock_write); + + return (TryAcquireSRWLockExclusive(&rwlock->rwlock) == 0 ? EBUSY : 0); +} + +/* + * __wt_writelock -- + * Wait to get an exclusive lock. + */ +int +__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +{ + WT_RET(__wt_verbose( + session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name)); + WT_STAT_FAST_CONN_INCR(session, rwlock_write); + + AcquireSRWLockExclusive(&rwlock->rwlock); + + return (0); +} + +/* + * __wt_writeunlock -- + * Release an exclusive lock. + */ +int +__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +{ + WT_RET(__wt_verbose( + session, WT_VERB_MUTEX, "rwlock: writeunlock %s", rwlock->name)); + + ReleaseSRWLockExclusive(&rwlock->rwlock); + return (0); +} + +/* + * __wt_rwlock_destroy -- + * Destroy a read/write lock. + */ +int +__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp) +{ + WT_RWLOCK *rwlock; + + rwlock = *rwlockp; /* Clear our caller's reference. */ + if (rwlock == NULL) + return (0); + *rwlockp = NULL; + + WT_RET(__wt_verbose( + session, WT_VERB_MUTEX, "rwlock: destroy %s", rwlock->name)); + + /* Nothing to delete for Slim Reader Writer lock */ + + __wt_free(session, rwlock); + return (0); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_once.c b/src/third_party/wiredtiger/src/os_win/os_once.c new file mode 100644 index 00000000000..40640acf129 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_once.c @@ -0,0 +1,39 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_init_once_callback -- + * Global initialization, run once. 
+ */ +BOOL CALLBACK _wt_init_once_callback( + _Inout_ PINIT_ONCE InitOnce, + _Inout_opt_ PVOID Parameter, + _Out_opt_ PVOID *Context + ) +{ + void(*init_routine)(void) = Parameter; + + init_routine(); + + return (TRUE); +} + +/* + * __wt_library_init -- + * Some things to do, before we do anything else. + */ +int +__wt_once(void(*init_routine)(void)) +{ + INIT_ONCE once_control = INIT_ONCE_STATIC_INIT; + PVOID lpContext = NULL; + + return !InitOnceExecuteOnce(&once_control, &_wt_init_once_callback, + init_routine, lpContext); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_open.c b/src/third_party/wiredtiger/src/os_win/os_open.c new file mode 100644 index 00000000000..7be98b604ec --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_open.c @@ -0,0 +1,219 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_open -- + * Open a file handle. + */ +int +__wt_open(WT_SESSION_IMPL *session, + const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp) +{ + DWORD dwCreationDisposition; + HANDLE filehandle, filehandle_secondary; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_FH *fh, *tfh; + int direct_io, f, matched, share_mode; + char *path; + + conn = S2C(session); + fh = NULL; + path = NULL; + filehandle = INVALID_HANDLE_VALUE; + filehandle_secondary = INVALID_HANDLE_VALUE; + + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name)); + + /* Increment the reference count if we already have the file open. 
*/ + matched = 0; + __wt_spin_lock(session, &conn->fh_lock); + TAILQ_FOREACH(tfh, &conn->fhqh, q) + if (strcmp(name, tfh->name) == 0) { + ++tfh->ref; + *fhp = tfh; + matched = 1; + break; + } + __wt_spin_unlock(session, &conn->fh_lock); + if (matched) + return (0); + + WT_RET(__wt_filename(session, name, &path)); + + share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE; + /* + * Security: + * The application may spawn a new process, and we don't want another + * process to have access to our file handles. + * + * TODO: Set tighter file permissions but set bInheritHandle to false + * to prevent inheritance + */ + + f = FILE_ATTRIBUTE_NORMAL; + + dwCreationDisposition = 0; + if (ok_create) { + dwCreationDisposition = CREATE_NEW; + if (exclusive) + dwCreationDisposition = CREATE_ALWAYS; + } else + dwCreationDisposition = OPEN_EXISTING; + + direct_io = 0; + + if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) { + f |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH; + direct_io = 1; + } + + if (dio_type == WT_FILE_TYPE_LOG && + FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) { + f |= FILE_FLAG_WRITE_THROUGH; + } + + /* Disable read-ahead on trees: it slows down random read workloads. */ + if (dio_type == WT_FILE_TYPE_DATA || + dio_type == WT_FILE_TYPE_CHECKPOINT) + f |= FILE_FLAG_RANDOM_ACCESS; + + filehandle = CreateFileA(path, + (GENERIC_READ | GENERIC_WRITE), + share_mode, + NULL, + dwCreationDisposition, + f, + NULL); + if (filehandle == INVALID_HANDLE_VALUE) { + if (GetLastError() == ERROR_FILE_EXISTS && ok_create) + filehandle = CreateFileA(path, + (GENERIC_READ | GENERIC_WRITE), + share_mode, + NULL, + OPEN_EXISTING, + f, + NULL); + + if (filehandle == INVALID_HANDLE_VALUE) + WT_ERR_MSG(session, __wt_errno(), + direct_io ? + "%s: open failed with direct I/O configured, some " + "filesystem types do not support direct I/O" : + "%s", path); + } + + /* + * Open a second handle to file to support allocation/truncation + * concurrently with reads on the file. 
Writes would also move the file + * pointer. + */ + filehandle_secondary = CreateFileA(path, + (GENERIC_READ | GENERIC_WRITE), + share_mode, + NULL, + OPEN_EXISTING, + f, + NULL); + if (filehandle == INVALID_HANDLE_VALUE) + WT_ERR_MSG(session, __wt_errno(), + "open failed for secondary handle: %s", path); + + WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh)); + WT_ERR(__wt_strdup(session, name, &fh->name)); + fh->filehandle = filehandle; + fh->filehandle_secondary = filehandle_secondary; + fh->ref = 1; + fh->direct_io = direct_io; + + /* Set the file's size. */ + WT_ERR(__wt_filesize(session, fh, &fh->size)); + + /* Configure file extension. */ + if (dio_type == WT_FILE_TYPE_DATA || + dio_type == WT_FILE_TYPE_CHECKPOINT) + fh->extend_len = conn->data_extend_len; + + /* Configure fallocate/posix_fallocate calls. */ + __wt_fallocate_config(session, fh); + + /* + * Repeat the check for a match, but then link onto the database's list + * of files. + */ + matched = 0; + __wt_spin_lock(session, &conn->fh_lock); + TAILQ_FOREACH(tfh, &conn->fhqh, q) + if (strcmp(name, tfh->name) == 0) { + ++tfh->ref; + *fhp = tfh; + matched = 1; + break; + } + if (!matched) { + TAILQ_INSERT_TAIL(&conn->fhqh, fh, q); + WT_STAT_FAST_CONN_INCR(session, file_open); + + *fhp = fh; + } + __wt_spin_unlock(session, &conn->fh_lock); + if (matched) { +err: if (fh != NULL) { + __wt_free(session, fh->name); + __wt_free(session, fh); + } + if (filehandle != INVALID_HANDLE_VALUE) + (void)CloseHandle(filehandle); + if (filehandle_secondary != INVALID_HANDLE_VALUE) + (void)CloseHandle(filehandle_secondary); + } + + __wt_free(session, path); + return (ret); +} + +/* + * __wt_close -- + * Close a file handle. 
+ */ +int +__wt_close(WT_SESSION_IMPL *session, WT_FH *fh) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + conn = S2C(session); + + __wt_spin_lock(session, &conn->fh_lock); + if (fh == NULL || fh->ref == 0 || --fh->ref > 0) { + __wt_spin_unlock(session, &conn->fh_lock); + return (0); + } + + /* Remove from the list. */ + TAILQ_REMOVE(&conn->fhqh, fh, q); + WT_STAT_FAST_CONN_DECR(session, file_open); + + __wt_spin_unlock(session, &conn->fh_lock); + + /* Discard the memory. */ + if (!CloseHandle(fh->filehandle) != 0) { + ret = __wt_errno(); + __wt_err(session, ret, "CloseHandle: %s", fh->name); + } + + if (!CloseHandle(fh->filehandle_secondary) != 0) { + ret = __wt_errno(); + __wt_err(session, ret, "CloseHandle: secondary: %s", fh->name); + } + + __wt_free(session, fh->name); + __wt_free(session, fh); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_path.c b/src/third_party/wiredtiger/src/os_win/os_path.c new file mode 100644 index 00000000000..9f6b79c565c --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_path.c @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_absolute_path -- + * Return if a filename is an absolute path. + */ +int +__wt_absolute_path(const char *path) +{ + /* + * Check for a drive name (for example, "D:"), allow both forward and + * backward slashes. + */ + if (strlen(path) >= 3 && isalpha(path[0]) && path[1] == ':') + path += 2; + return (path[0] == '/' || path[0] == '\\' ? 1 : 0); +} + +/* + * __wt_path_separator -- + * Return the path separator string. 
+ */
+const char *
+__wt_path_separator(void)
+{
+	return ("\\");
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_priv.c b/src/third_party/wiredtiger/src/os_win/os_priv.c
new file mode 100644
index 00000000000..7b5152b4652
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_priv.c
@@ -0,0 +1,19 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_has_priv --
+ *	Return if the process has special privileges, defined as having
+ * different effective and real UIDs or GIDs.  Windows has no such
+ * UID/GID distinction, so the process is never considered privileged.
+ */
+int
+__wt_has_priv(void)
+{
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_remove.c b/src/third_party/wiredtiger/src/os_win/os_remove.c
new file mode 100644
index 00000000000..d15ee929c00
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_remove.c
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __remove_file_check --
+ *	Check if the file is currently open before removing it.
+ *	Diagnostic builds only; a no-op otherwise.
+ */
+static inline void
+__remove_file_check(WT_SESSION_IMPL *session, const char *name)
+{
+#ifdef HAVE_DIAGNOSTIC
+	WT_CONNECTION_IMPL *conn;
+	WT_FH *fh;
+
+	conn = S2C(session);
+	fh = NULL;
+
+	/*
+	 * Check if the file is open: it's an error if it is, since a higher
+	 * level should have closed it before removing.
+	 */
+	__wt_spin_lock(session, &conn->fh_lock);
+	TAILQ_FOREACH(fh, &conn->fhqh, q) {
+		if (strcmp(name, fh->name) == 0)
+			break;
+	}
+	__wt_spin_unlock(session, &conn->fh_lock);
+
+	/* fh is NULL only if the walk found no match. */
+	WT_ASSERT(session, fh == NULL);
+#else
+	WT_UNUSED(session);
+	WT_UNUSED(name);
+#endif
+}
+
+/*
+ * __wt_remove --
+ *	Remove a file.
+ */ +int +__wt_remove(WT_SESSION_IMPL *session, const char *name) +{ + WT_DECL_RET; + char *path; + uint32_t lasterror; + + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: remove", name)); + + __remove_file_check(session, name); + + WT_RET(__wt_filename(session, name, &path)); + + if ((ret = DeleteFileA(path)) == FALSE) + lasterror = __wt_errno(); + + __wt_free(session, path); + + if (ret != FALSE) + return (0); + + WT_RET_MSG(session, lasterror, "%s: remove", name); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_rename.c b/src/third_party/wiredtiger/src/os_win/os_rename.c new file mode 100644 index 00000000000..092f5d62a40 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_rename.c @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_rename -- + * Rename a file. + */ +int +__wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to) +{ + WT_DECL_RET; + uint32_t lasterror; + char *from_path, *to_path; + + WT_RET(__wt_verbose( + session, WT_VERB_FILEOPS, "rename %s to %s", from, to)); + + from_path = to_path = NULL; + + WT_RET(__wt_filename(session, from, &from_path)); + WT_TRET(__wt_filename(session, to, &to_path)); + + /* + * Check if file exists since Windows does not override the file if + * it exists. 
+ */ + if ((ret = GetFileAttributesA(to_path)) != INVALID_FILE_ATTRIBUTES) { + if ((ret = DeleteFileA(to_path)) == FALSE) { + lasterror = GetLastError(); + goto err; + } + } + + if ((MoveFileA(from_path, to_path)) == FALSE) + lasterror = GetLastError(); + +err: + __wt_free(session, from_path); + __wt_free(session, to_path); + + if (ret != FALSE) + return (0); + + WT_RET_MSG(session, lasterror, "MoveFile %s to %s", from, to); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_rw.c b/src/third_party/wiredtiger/src/os_win/os_rw.c new file mode 100644 index 00000000000..291533bc6bc --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_rw.c @@ -0,0 +1,98 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_read -- + * Read a chunk. + */ +int +__wt_read( + WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf) +{ + DWORD chunk; + DWORD nr; + uint8_t *addr; + OVERLAPPED overlapped = { 0 }; + + nr = 0; + + WT_STAT_FAST_CONN_INCR(session, read_io); + + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, + "%s: read %" WT_SIZET_FMT " bytes at offset %" PRIuMAX, + fh->name, len, (uintmax_t)offset)); + + /* Assert direct I/O is aligned and a multiple of the alignment. */ + WT_ASSERT(session, + !fh->direct_io || + S2C(session)->buffer_alignment == 0 || + (!((uintptr_t)buf & + (uintptr_t)(S2C(session)->buffer_alignment - 1)) && + len >= S2C(session)->buffer_alignment && + len % S2C(session)->buffer_alignment == 0)); + + /* Break reads larger than 1GB into 1GB chunks. */ + for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { + chunk = (DWORD)WT_MIN(len, WT_GIGABYTE); + overlapped.Offset = UINT32_MAX & offset; + overlapped.OffsetHigh = UINT32_MAX & (offset >> 32); + + if (!ReadFile(fh->filehandle, addr, chunk, &nr, &overlapped)) + WT_RET_MSG(session, nr == 0 ? 
WT_ERROR : __wt_errno(), + "%s read error: failed to read %" WT_SIZET_FMT + " bytes at offset %" PRIuMAX, + fh->name, chunk, (uintmax_t)offset); + } + return (0); +} + +/* + * __wt_write -- + * Write a chunk. + */ +int +__wt_write(WT_SESSION_IMPL *session, + WT_FH *fh, wt_off_t offset, size_t len, const void *buf) +{ + DWORD chunk; + DWORD nw; + const uint8_t *addr; + OVERLAPPED overlapped = { 0 }; + + nw = 0; + + WT_STAT_FAST_CONN_INCR(session, write_io); + + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, + "%s: write %" WT_SIZET_FMT " bytes at offset %" PRIuMAX, + fh->name, len, (uintmax_t)offset)); + + /* Assert direct I/O is aligned and a multiple of the alignment. */ + WT_ASSERT(session, + !fh->direct_io || + S2C(session)->buffer_alignment == 0 || + (!((uintptr_t)buf & + (uintptr_t)(S2C(session)->buffer_alignment - 1)) && + len >= S2C(session)->buffer_alignment && + len % S2C(session)->buffer_alignment == 0)); + + /* Break writes larger than 1GB into 1GB chunks. */ + for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) { + chunk = (DWORD)WT_MIN(len, WT_GIGABYTE); + overlapped.Offset = UINT32_MAX & offset; + overlapped.OffsetHigh = UINT32_MAX & (offset >> 32); + + if (!WriteFile(fh->filehandle, addr, chunk, &nw, &overlapped)) + WT_RET_MSG(session, __wt_errno(), + "%s write error: failed to write %" WT_SIZET_FMT + " bytes at offset %" PRIuMAX, + fh->name, chunk, (uintmax_t)offset); + } + return (0); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_sleep.c b/src/third_party/wiredtiger/src/os_win/os_sleep.c new file mode 100644 index 00000000000..b9a8cc2e545 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_sleep.c @@ -0,0 +1,18 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_sleep -- + * Pause the thread of control. 
+ */ +void +__wt_sleep(long seconds, long micro_seconds) +{ + Sleep(seconds * 1000 + micro_seconds / 1000); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_thread.c b/src/third_party/wiredtiger/src/os_win/os_thread.c new file mode 100644 index 00000000000..4d8cf89f264 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_thread.c @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_thread_create -- + * Create a new thread of control. + */ +int +__wt_thread_create(WT_SESSION_IMPL *session, + wt_thread_t *tidret, void *(*func)(void *), void *arg) +{ + /* Spawn a new thread of control. */ + *tidret = CreateThread(NULL, 0, func, arg, 0, NULL); + if (*tidret != NULL) + return (0); + + WT_RET_MSG(session, __wt_errno(), "CreateThread"); +} + +/* + * __wt_thread_join -- + * Wait for a thread of control to exit. + */ +int +__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) +{ + WT_DECL_RET; + + if ((ret = WaitForSingleObject(tid, INFINITE)) == WAIT_OBJECT_0) + return (0); + + WT_RET_MSG(session, ret, "WaitForSingleObject"); +} + +/* + * __wt_thread_id -- + * Fill in a printable version of the process and thread IDs. + */ +void +__wt_thread_id(char* buf, size_t buflen) +{ + (void)snprintf(buf, buflen, + "%" PRIu64 ":%" PRIu64, + (uint64_t)GetCurrentProcessId(), (uint64_t)GetCurrentThreadId); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_time.c b/src/third_party/wiredtiger/src/os_win/os_time.c new file mode 100644 index 00000000000..b49b738fe54 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_time.c @@ -0,0 +1,62 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_seconds -- + * Return the seconds since the Epoch. 
+ */ +int +__wt_seconds(WT_SESSION_IMPL *session, time_t *timep) +{ + struct timespec t; + + WT_RET(__wt_epoch(session, &t)); + + *timep = t.tv_sec; + + return (0); +} + +/* + * __wt_epoch -- + * Return the time since the Epoch. + */ +int +__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) +{ + uint64_t ns100; + + FILETIME time; + GetSystemTimeAsFileTime(&time); + + ns100 = (((int64_t)time.dwHighDateTime << 32) + time.dwLowDateTime) + - 116444736000000000LL; + tsp->tv_sec = ns100 / 10000000; + tsp->tv_nsec = (long)((ns100 % 10000000) * 100); + + return (0); +} + +/* + * localtime_r -- + * Return the current local time. + */ +struct tm * +localtime_r(const time_t *timer, struct tm *result) +{ + errno_t err; + + err = localtime_s(result, timer); + if (err != 0) { + __wt_err(NULL, err, "localtime_s"); + return (NULL); + } + + return (result); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_vsnprintf.c b/src/third_party/wiredtiger/src/os_win/os_vsnprintf.c new file mode 100644 index 00000000000..1058203e326 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_vsnprintf.c @@ -0,0 +1,31 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +#undef vsnprintf + +_Check_return_opt_ int __cdecl _wt_vsnprintf( + _Out_writes_(_MaxCount) char * _DstBuf, + _In_ size_t _MaxCount, + _In_z_ _Printf_format_string_ const char * _Format, + va_list _ArgList) +{ + int len; + + len = (size_t)vsnprintf(_DstBuf, _MaxCount, _Format, _ArgList); + + /* + * The MSVC implementation returns -1 on truncation instead of what + * it would have written. We could iteratively grow the buffer, or + * just ask us how big a buffer they would like. 
+ */ + if (len == -1) + len = _vscprintf(_Format, _ArgList) + 1; + + return (len); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_yield.c b/src/third_party/wiredtiger/src/os_win/os_yield.c new file mode 100644 index 00000000000..970bfa139d0 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_yield.c @@ -0,0 +1,18 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_yield -- + * Yield the thread of control. + */ +void +__wt_yield(void) +{ + SwitchToThread(); +} diff --git a/src/third_party/wiredtiger/src/packing/pack_api.c b/src/third_party/wiredtiger/src/packing/pack_api.c new file mode 100644 index 00000000000..c0c1e53c8ca --- /dev/null +++ b/src/third_party/wiredtiger/src/packing/pack_api.c @@ -0,0 +1,137 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * wiredtiger_struct_pack -- + * Pack a byte string (extension API). + */ +int +wiredtiger_struct_pack(WT_SESSION *wt_session, + void *buffer, size_t size, const char *fmt, ...) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + va_list ap; + + session = (WT_SESSION_IMPL *)wt_session; + + va_start(ap, fmt); + ret = __wt_struct_packv(session, buffer, size, fmt, ap); + va_end(ap); + + return (ret); +} + +/* + * wiredtiger_struct_size -- + * Calculate the size of a packed byte string (extension API). + */ +int +wiredtiger_struct_size(WT_SESSION *wt_session, + size_t *sizep, const char *fmt, ...) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + va_list ap; + + session = (WT_SESSION_IMPL *)wt_session; + + va_start(ap, fmt); + ret = __wt_struct_sizev(session, sizep, fmt, ap); + va_end(ap); + + return (ret); +} + +/* + * wiredtiger_struct_unpack -- + * Unpack a byte string (extension API). 
+ */ +int +wiredtiger_struct_unpack(WT_SESSION *wt_session, + const void *buffer, size_t size, const char *fmt, ...) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + va_list ap; + + session = (WT_SESSION_IMPL *)wt_session; + + va_start(ap, fmt); + ret = __wt_struct_unpackv(session, buffer, size, fmt, ap); + va_end(ap); + + return (ret); +} + +/* + * __wt_ext_struct_pack -- + * Pack a byte string (extension API). + */ +int +__wt_ext_struct_pack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, + void *buffer, size_t size, const char *fmt, ...) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + va_list ap; + + session = (wt_session != NULL) ? (WT_SESSION_IMPL *)wt_session : + ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session; + + va_start(ap, fmt); + ret = __wt_struct_packv(session, buffer, size, fmt, ap); + va_end(ap); + + return (ret); +} + +/* + * __wt_ext_struct_size -- + * Calculate the size of a packed byte string (extension API). + */ +int +__wt_ext_struct_size(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, + size_t *sizep, const char *fmt, ...) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + va_list ap; + + session = (wt_session != NULL) ? (WT_SESSION_IMPL *)wt_session : + ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session; + + va_start(ap, fmt); + ret = __wt_struct_sizev(session, sizep, fmt, ap); + va_end(ap); + + return (ret); +} + +/* + * __wt_ext_struct_unpack -- + * Unpack a byte string (extension API). + */ +int +__wt_ext_struct_unpack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, + const void *buffer, size_t size, const char *fmt, ...) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + va_list ap; + + session = (wt_session != NULL) ? 
(WT_SESSION_IMPL *)wt_session : + ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session; + + va_start(ap, fmt); + ret = __wt_struct_unpackv(session, buffer, size, fmt, ap); + va_end(ap); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/packing/pack_impl.c b/src/third_party/wiredtiger/src/packing/pack_impl.c new file mode 100644 index 00000000000..12b1582e6d0 --- /dev/null +++ b/src/third_party/wiredtiger/src/packing/pack_impl.c @@ -0,0 +1,96 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_struct_check -- + * Check that the specified packing format is valid, and whether it fits + * into a fixed-sized bitfield. + */ +int +__wt_struct_check(WT_SESSION_IMPL *session, + const char *fmt, size_t len, int *fixedp, uint32_t *fixed_lenp) +{ + WT_DECL_PACK_VALUE(pv); + WT_DECL_RET; + WT_PACK pack; + int fields; + + WT_RET(__pack_initn(session, &pack, fmt, len)); + for (fields = 0; (ret = __pack_next(&pack, &pv)) == 0; fields++) + ; + + if (ret != WT_NOTFOUND) + return (ret); + + if (fixedp != NULL && fixed_lenp != NULL) { + if (fields == 0) { + *fixedp = 1; + *fixed_lenp = 0; + } else if (fields == 1 && pv.type == 't') { + *fixedp = 1; + *fixed_lenp = pv.size; + } else + *fixedp = 0; + } + + return (0); +} + +/* + * __wt_struct_size -- + * Calculate the size of a packed byte string. + */ +int +__wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, ...) +{ + WT_DECL_RET; + va_list ap; + + va_start(ap, fmt); + ret = __wt_struct_sizev(session, sizep, fmt, ap); + va_end(ap); + + return (ret); +} + +/* + * __wt_struct_pack -- + * Pack a byte string. + */ +int +__wt_struct_pack(WT_SESSION_IMPL *session, + void *buffer, size_t size, const char *fmt, ...) 
+{ + WT_DECL_RET; + va_list ap; + + va_start(ap, fmt); + ret = __wt_struct_packv(session, buffer, size, fmt, ap); + va_end(ap); + + return (ret); +} + +/* + * __wt_struct_unpack -- + * Unpack a byte string. + */ +int +__wt_struct_unpack(WT_SESSION_IMPL *session, + const void *buffer, size_t size, const char *fmt, ...) +{ + WT_DECL_RET; + va_list ap; + + va_start(ap, fmt); + ret = __wt_struct_unpackv(session, buffer, size, fmt, ap); + va_end(ap); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/packing/pack_stream.c b/src/third_party/wiredtiger/src/packing/pack_stream.c new file mode 100644 index 00000000000..efbbd5d9adb --- /dev/null +++ b/src/third_party/wiredtiger/src/packing/pack_stream.c @@ -0,0 +1,296 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * Streaming interface to packing. + * + * This allows applications to pack or unpack records one field at a time. + */ +struct __wt_pack_stream { + WT_PACK pack; + uint8_t *end, *p, *start; +}; + +/* + * wiredtiger_pack_start -- + * Open a stream for packing. + */ +int +wiredtiger_pack_start(WT_SESSION *wt_session, + const char *format, void *buffer, size_t len, WT_PACK_STREAM **psp) +{ + WT_DECL_RET; + WT_PACK_STREAM *ps; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + WT_RET(__wt_calloc_def(session, 1, &ps)); + WT_ERR(__pack_init(session, &ps->pack, format)); + ps->p = ps->start = buffer; + ps->end = ps->p + len; + *psp = ps; + + if (0) { +err: (void)wiredtiger_pack_close(ps, NULL); + } + return (ret); +} + +/* + * wiredtiger_unpack_start -- + * Open a stream for unpacking. 
+ */ +int +wiredtiger_unpack_start(WT_SESSION *wt_session, const char *format, + const void *buffer, size_t size, WT_PACK_STREAM **psp) +{ + return (wiredtiger_pack_start( + wt_session, format, (void *)buffer, size, psp)); +} + +/* + * wiredtiger_pack_close -- + * Close a packing stream. + */ +int +wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp) +{ + if (usedp != NULL) + *usedp = WT_PTRDIFF(ps->p, ps->start); + + if (ps != NULL) + __wt_free(ps->pack.session, ps); + + return (0); +} + +/* + * wiredtiger_pack_item -- + * Pack an item. + */ +int +wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item) +{ + WT_DECL_PACK_VALUE(pv); + WT_SESSION_IMPL *session; + + session = ps->pack.session; + + WT_RET(__pack_next(&ps->pack, &pv)); + switch (pv.type) { + case 'U': + case 'u': + pv.u.item.data = item->data; + pv.u.item.size = item->size; + WT_RET(__pack_write( + session, &pv, &ps->p, (size_t)(ps->end - ps->p))); + break; + WT_ILLEGAL_VALUE(session); + } + + return (0); +} + +/* + * wiredtiger_pack_int -- + * Pack a signed integer. + */ +int +wiredtiger_pack_int(WT_PACK_STREAM *ps, int64_t i) +{ + WT_DECL_PACK_VALUE(pv); + WT_SESSION_IMPL *session; + + session = ps->pack.session; + + WT_RET(__pack_next(&ps->pack, &pv)); + switch (pv.type) { + case 'b': + case 'h': + case 'i': + case 'l': + case 'q': + pv.u.i = i; + WT_RET(__pack_write( + session, &pv, &ps->p, (size_t)(ps->end - ps->p))); + break; + WT_ILLEGAL_VALUE(session); + } + + return (0); +} + +/* + * wiredtiger_pack_str -- + * Pack a string. + */ +int +wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s) +{ + WT_DECL_PACK_VALUE(pv); + WT_SESSION_IMPL *session; + + session = ps->pack.session; + + WT_RET(__pack_next(&ps->pack, &pv)); + switch (pv.type) { + case 'S': + case 's': + pv.u.s = s; + WT_RET(__pack_write( + session, &pv, &ps->p, (size_t)(ps->end - ps->p))); + break; + WT_ILLEGAL_VALUE(session); + } + + return (0); +} + +/* + * wiredtiger_pack_uint -- + * Pack an unsigned int. 
+ */ +int +wiredtiger_pack_uint(WT_PACK_STREAM *ps, uint64_t u) +{ + WT_DECL_PACK_VALUE(pv); + WT_SESSION_IMPL *session; + + session = ps->pack.session; + + WT_RET(__pack_next(&ps->pack, &pv)); + switch (pv.type) { + case 'B': + case 'H': + case 'I': + case 'L': + case 'Q': + case 'R': + case 'r': + case 't': + pv.u.u = u; + WT_RET(__pack_write( + session, &pv, &ps->p, (size_t)(ps->end - ps->p))); + break; + WT_ILLEGAL_VALUE(session); + } + + return (0); +} + +/* + * wiredtiger_unpack_item -- + * Unpack an item. + */ +int +wiredtiger_unpack_item(WT_PACK_STREAM *ps, WT_ITEM *item) +{ + WT_DECL_PACK_VALUE(pv); + WT_SESSION_IMPL *session; + + session = ps->pack.session; + + WT_RET(__pack_next(&ps->pack, &pv)); + switch (pv.type) { + case 'U': + case 'u': + WT_RET(__unpack_read(session, + &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p))); + item->data = pv.u.item.data; + item->size = pv.u.item.size; + break; + WT_ILLEGAL_VALUE(session); + } + + return (0); +} + +/* + * wiredtiger_unpack_int -- + * Unpack a signed integer. + */ +int +wiredtiger_unpack_int(WT_PACK_STREAM *ps, int64_t *ip) +{ + WT_DECL_PACK_VALUE(pv); + WT_SESSION_IMPL *session; + + session = ps->pack.session; + + WT_RET(__pack_next(&ps->pack, &pv)); + switch (pv.type) { + case 'b': + case 'h': + case 'i': + case 'l': + case 'q': + WT_RET(__unpack_read(session, + &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p))); + *ip = pv.u.i; + break; + WT_ILLEGAL_VALUE(session); + } + return (0); +} + +/* + * wiredtiger_unpack_str -- + * Unpack a string. 
+ */ +int +wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp) +{ + WT_DECL_PACK_VALUE(pv); + WT_SESSION_IMPL *session; + + session = ps->pack.session; + + WT_RET(__pack_next(&ps->pack, &pv)); + switch (pv.type) { + case 'S': + case 's': + WT_RET(__unpack_read(session, + &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p))); + *sp = pv.u.s; + break; + WT_ILLEGAL_VALUE(session); + } + return (0); +} + +/* + * wiredtiger_unpack_uint -- + * Unpack an unsigned integer. + */ +int +wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up) +{ + WT_DECL_PACK_VALUE(pv); + WT_SESSION_IMPL *session; + + session = ps->pack.session; + + WT_RET(__pack_next(&ps->pack, &pv)); + switch (pv.type) { + case 'B': + case 'H': + case 'I': + case 'L': + case 'Q': + case 'R': + case 'r': + case 't': + WT_RET(__unpack_read(session, + &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p))); + *up = pv.u.u; + break; + WT_ILLEGAL_VALUE(session); + } + return (0); +} diff --git a/src/third_party/wiredtiger/src/schema/schema_create.c b/src/third_party/wiredtiger/src/schema/schema_create.c new file mode 100644 index 00000000000..398fea4476f --- /dev/null +++ b/src/third_party/wiredtiger/src/schema/schema_create.c @@ -0,0 +1,595 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_direct_io_size_check -- + * Return a size from the configuration, complaining if it's insufficient + * for direct I/O. 
+ */ +int +__wt_direct_io_size_check(WT_SESSION_IMPL *session, + const char **cfg, const char *config_name, uint32_t *allocsizep) +{ + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + int64_t align; + + *allocsizep = 0; + + conn = S2C(session); + + WT_RET(__wt_config_gets(session, cfg, config_name, &cval)); + + /* + * This function exists as a place to hang this comment: if direct I/O + * is configured, page sizes must be at least as large as any buffer + * alignment as well as a multiple of the alignment. Linux gets unhappy + * if you configure direct I/O and then don't do I/O in alignments and + * units of its happy place. + */ + if (FLD_ISSET(conn->direct_io, + WT_FILE_TYPE_CHECKPOINT | WT_FILE_TYPE_DATA)) { + align = (int64_t)conn->buffer_alignment; + if (align != 0 && (cval.val < align || cval.val % align != 0)) + WT_RET_MSG(session, EINVAL, + "when direct I/O is configured, the %s size must " + "be at least as large as the buffer alignment as " + "well as a multiple of the buffer alignment", + config_name); + } + *allocsizep = (uint32_t)cval.val; + return (0); +} + +/* + * __create_file -- + * Create a new 'file:' object. + */ +static int +__create_file(WT_SESSION_IMPL *session, + const char *uri, int exclusive, const char *config) +{ + WT_DECL_ITEM(val); + WT_DECL_RET; + uint32_t allocsize; + int is_metadata; + const char *fileconf, *filename; + const char **p, *filecfg[] = + { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL }; + + fileconf = NULL; + + is_metadata = strcmp(uri, WT_METAFILE_URI) == 0; + + filename = uri; + if (!WT_PREFIX_SKIP(filename, "file:")) + WT_RET_MSG(session, EINVAL, "Expected a 'file:' URI: %s", uri); + + /* Check if the file already exists. */ + if (!is_metadata && (ret = + __wt_metadata_search(session, uri, &fileconf)) != WT_NOTFOUND) { + if (exclusive) + WT_TRET(EEXIST); + goto err; + } + + /* Sanity check the allocation size. 
*/ + WT_RET(__wt_direct_io_size_check( + session, filecfg, "allocation_size", &allocsize)); + + /* Create the file. */ + WT_ERR(__wt_block_manager_create(session, filename, allocsize)); + if (WT_META_TRACKING(session)) + WT_ERR(__wt_meta_track_fileop(session, NULL, uri)); + + /* + * If creating an ordinary file, append the file ID and current version + * numbers to the passed-in configuration and insert the resulting + * configuration into the metadata. + */ + if (!is_metadata) { + WT_ERR(__wt_scr_alloc(session, 0, &val)); + WT_ERR(__wt_buf_fmt(session, val, + "id=%" PRIu32 ",version=(major=%d,minor=%d)", + ++S2C(session)->next_file_id, + WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX)); + for (p = filecfg; *p != NULL; ++p) + ; + *p = val->data; + WT_ERR(__wt_config_collapse(session, filecfg, &fileconf)); + WT_ERR(__wt_metadata_insert(session, uri, fileconf)); + } + + /* + * Open the file to check that it was setup correctly. We don't need + * to pass the configuration, we just wrote the collapsed configuration + * into the metadata file, and it's going to be read/used by underlying + * functions. + * + * Keep the handle exclusive until it is released at the end of the + * call, otherwise we could race with a drop. + */ + WT_ERR(__wt_session_get_btree( + session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE)); + if (WT_META_TRACKING(session)) + WT_ERR(__wt_meta_track_handle_lock(session, 1)); + else + WT_ERR(__wt_session_release_btree(session)); + +err: __wt_scr_free(&val); + __wt_free(session, fileconf); + return (ret); +} + +/* + * __wt_schema_colgroup_source -- + * Get the URI of the data source for a column group. 
+ */ +int +__wt_schema_colgroup_source(WT_SESSION_IMPL *session, + WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf) +{ + WT_CONFIG_ITEM cval; + WT_DECL_RET; + size_t len; + const char *prefix, *suffix, *tablename; + + tablename = table->name + strlen("table:"); + if ((ret = __wt_config_getones(session, config, "type", &cval)) == 0 && + !WT_STRING_MATCH("file", cval.str, cval.len)) { + prefix = cval.str; + len = cval.len; + suffix = ""; + } else { + prefix = "file"; + len = strlen(prefix); + suffix = ".wt"; + } + WT_RET_NOTFOUND_OK(ret); + + if (cgname == NULL) + WT_RET(__wt_buf_fmt(session, buf, "%.*s:%s%s", + (int)len, prefix, tablename, suffix)); + else + WT_RET(__wt_buf_fmt(session, buf, "%.*s:%s_%s%s", + (int)len, prefix, tablename, cgname, suffix)); + + return (0); +} + +/* + * __create_colgroup -- + * Create a column group. + */ +static int +__create_colgroup(WT_SESSION_IMPL *session, + const char *name, int exclusive, const char *config) +{ + WT_CONFIG_ITEM cval; + WT_DECL_RET; + WT_ITEM confbuf, fmt, namebuf; + WT_TABLE *table; + size_t tlen; + const char *cfg[4] = + { WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL }; + const char *sourcecfg[] = { config, NULL, NULL }; + const char **cfgp; + const char *cgconf, *cgname, *sourceconf, *oldconf; + const char *source, *tablename; + + cgconf = sourceconf = oldconf = NULL; + WT_CLEAR(fmt); + WT_CLEAR(confbuf); + WT_CLEAR(namebuf); + + tablename = name; + if (!WT_PREFIX_SKIP(tablename, "colgroup:")) + return (EINVAL); + cgname = strchr(tablename, ':'); + if (cgname != NULL) { + tlen = (size_t)(cgname - tablename); + ++cgname; + } else + tlen = strlen(tablename); + + if ((ret = + __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0) + WT_RET_MSG(session, (ret == WT_NOTFOUND) ? ENOENT : ret, + "Can't create '%s' for non-existent table '%.*s'", + name, (int)tlen, tablename); + + /* Make sure the column group is referenced from the table. 
*/ + if (cgname != NULL && (ret = + __wt_config_subgets(session, &table->cgconf, cgname, &cval)) != 0) + WT_ERR_MSG(session, EINVAL, + "Column group '%s' not found in table '%.*s'", + cgname, (int)tlen, tablename); + + /* Find the first NULL entry in the cfg stack. */ + for (cfgp = &cfg[1]; *cfgp; cfgp++) + ; + + /* Add the source to the colgroup config before collapsing. */ + if (__wt_config_getones( + session, config, "source", &cval) == 0 && cval.len != 0) { + WT_ERR(__wt_buf_fmt( + session, &namebuf, "%.*s", (int)cval.len, cval.str)); + source = namebuf.data; + } else { + WT_ERR(__wt_schema_colgroup_source( + session, table, cgname, config, &namebuf)); + source = namebuf.data; + WT_ERR(__wt_buf_fmt( + session, &confbuf, "source=\"%s\"", source)); + *cfgp++ = confbuf.data; + } + + /* Calculate the key/value formats: these go into the source config. */ + WT_ERR(__wt_buf_fmt(session, &fmt, "key_format=%s", table->key_format)); + if (cgname == NULL) + WT_ERR(__wt_buf_catfmt + (session, &fmt, ",value_format=%s", table->value_format)); + else { + if (__wt_config_getones(session, config, "columns", &cval) != 0) + WT_ERR_MSG(session, EINVAL, + "No 'columns' configuration for '%s'", name); + WT_ERR(__wt_buf_catfmt(session, &fmt, ",value_format=")); + WT_ERR(__wt_struct_reformat(session, + table, cval.str, cval.len, NULL, 1, &fmt)); + } + sourcecfg[1] = fmt.data; + WT_ERR(__wt_config_concat(session, sourcecfg, &sourceconf)); + + WT_ERR(__wt_schema_create(session, source, sourceconf)); + + WT_ERR(__wt_config_collapse(session, cfg, &cgconf)); + if ((ret = __wt_metadata_insert(session, name, cgconf)) != 0) { + /* + * If the entry already exists in the metadata, we're done. + * This is an error for exclusive creates but okay otherwise. + */ + if (ret == WT_DUPLICATE_KEY) + ret = exclusive ? 
EEXIST : 0; + goto err; + } + + WT_ERR(__wt_schema_open_colgroups(session, table)); + +err: __wt_free(session, cgconf); + __wt_free(session, sourceconf); + __wt_free(session, oldconf); + __wt_buf_free(session, &confbuf); + __wt_buf_free(session, &fmt); + __wt_buf_free(session, &namebuf); + + __wt_schema_release_table(session, table); + return (ret); +} + +/* + * __wt_schema_index_source -- + * Get the URI of the data source for an index. + */ +int +__wt_schema_index_source(WT_SESSION_IMPL *session, + WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf) +{ + WT_CONFIG_ITEM cval; + WT_DECL_RET; + size_t len; + const char *prefix, *suffix, *tablename; + + tablename = table->name + strlen("table:"); + if ((ret = __wt_config_getones(session, config, "type", &cval)) == 0 && + !WT_STRING_MATCH("file", cval.str, cval.len)) { + prefix = cval.str; + len = cval.len; + suffix = "_idx"; + } else { + prefix = "file"; + len = strlen(prefix); + suffix = ".wti"; + } + WT_RET_NOTFOUND_OK(ret); + + WT_RET(__wt_buf_fmt(session, buf, "%.*s:%s_%s%s", + (int)len, prefix, tablename, idxname, suffix)); + + return (0); +} + +/* + * __create_index -- + * Create an index. 
+ */ +static int +__create_index(WT_SESSION_IMPL *session, + const char *name, int exclusive, const char *config) +{ + WT_CONFIG pkcols; + WT_CONFIG_ITEM ckey, cval, icols; + WT_DECL_RET; + WT_ITEM confbuf, extra_cols, fmt, namebuf; + WT_TABLE *table; + const char *cfg[4] = + { WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL }; + const char *sourcecfg[] = { config, NULL, NULL }; + const char *sourceconf, *source, *idxconf, *idxname; + const char *tablename; + size_t tlen; + u_int i; + + idxconf = sourceconf = NULL; + WT_CLEAR(confbuf); + WT_CLEAR(fmt); + WT_CLEAR(extra_cols); + WT_CLEAR(namebuf); + + tablename = name; + if (!WT_PREFIX_SKIP(tablename, "index:")) + return (EINVAL); + idxname = strchr(tablename, ':'); + if (idxname == NULL) + WT_RET_MSG(session, EINVAL, "Invalid index name, " + "should be <table name>:<index name>: %s", name); + + tlen = (size_t)(idxname++ - tablename); + if ((ret = + __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0) + WT_RET_MSG(session, ret, + "Can't create an index for a non-existent table: %.*s", + (int)tlen, tablename); + + if (__wt_config_getones(session, config, "source", &cval) == 0) { + WT_ERR(__wt_buf_fmt(session, &namebuf, + "%.*s", (int)cval.len, cval.str)); + source = namebuf.data; + } else { + WT_ERR(__wt_schema_index_source( + session, table, idxname, config, &namebuf)); + source = namebuf.data; + + /* Add the source name to the index config before collapsing. */ + WT_ERR(__wt_buf_catfmt(session, &confbuf, + ",source=\"%s\"", source)); + } + + /* Calculate the key/value formats. */ + if (__wt_config_getones(session, config, "columns", &icols) != 0) + WT_ERR_MSG(session, EINVAL, + "No 'columns' configuration for '%s'", name); + + /* + * The key format for an index is somewhat subtle: the application + * specifies a set of columns that it will use for the key, but the + * engine usually adds some hidden columns in order to derive the + * primary key. 
These hidden columns are part of the source's + * key_format, which we are calculating now, but not part of an index + * cursor's key_format. + */ + WT_ERR(__wt_config_subinit(session, &pkcols, &table->colconf)); + for (i = 0; i < table->nkey_columns && + (ret = __wt_config_next(&pkcols, &ckey, &cval)) == 0; + i++) { + /* + * If the primary key column is already in the secondary key, + * don't add it again. + */ + if (__wt_config_subgetraw(session, &icols, &ckey, &cval) == 0) + continue; + WT_ERR(__wt_buf_catfmt( + session, &extra_cols, "%.*s,", (int)ckey.len, ckey.str)); + } + if (ret != 0 && ret != WT_NOTFOUND) + goto err; + + /* + * Index values are normally empty: all columns are packed into the + * index key. The exception is LSM, which (currently) reserves empty + * values as tombstones. Use a single padding byte in that case. + */ + if (WT_PREFIX_MATCH(source, "lsm:")) + WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=x,")); + else + WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,")); + WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,key_format=")); + WT_ERR(__wt_struct_reformat(session, table, + icols.str, icols.len, (const char *)extra_cols.data, 0, &fmt)); + + /* Check for a record number index key, which makes no sense. */ + WT_ERR(__wt_config_getones(session, fmt.data, "key_format", &cval)); + if (cval.len == 1 && cval.str[0] == 'r') + WT_ERR_MSG(session, EINVAL, + "column-store index may not use the record number as its " + "index key"); + + sourcecfg[1] = fmt.data; + WT_ERR(__wt_config_concat(session, sourcecfg, &sourceconf)); + + WT_ERR(__wt_schema_create(session, source, sourceconf)); + + cfg[1] = sourceconf; + cfg[2] = confbuf.data; + WT_ERR(__wt_config_collapse(session, cfg, &idxconf)); + if ((ret = __wt_metadata_insert(session, name, idxconf)) != 0) { + /* + * If the entry already exists in the metadata, we're done. + * This is an error for exclusive creates but okay otherwise. + */ + if (ret == WT_DUPLICATE_KEY) + ret = exclusive ? 
EEXIST : 0; + goto err; + } + +err: __wt_free(session, idxconf); + __wt_free(session, sourceconf); + __wt_buf_free(session, &confbuf); + __wt_buf_free(session, &extra_cols); + __wt_buf_free(session, &fmt); + __wt_buf_free(session, &namebuf); + + __wt_schema_release_table(session, table); + return (ret); +} + +/* + * __create_table -- + * Create a table. + */ +static int +__create_table(WT_SESSION_IMPL *session, + const char *name, int exclusive, const char *config) +{ + WT_CONFIG conf; + WT_CONFIG_ITEM cgkey, cgval, cval; + WT_DECL_RET; + WT_TABLE *table; + size_t cgsize; + int ncolgroups; + char *cgname; + const char *cfg[4] = + { WT_CONFIG_BASE(session, table_meta), config, NULL, NULL }; + const char *tableconf, *tablename; + + cgname = NULL; + table = NULL; + tableconf = NULL; + + tablename = name; + if (!WT_PREFIX_SKIP(tablename, "table:")) + return (EINVAL); + + if ((ret = __wt_schema_get_table(session, + tablename, strlen(tablename), 0, &table)) == 0) { + __wt_schema_release_table(session, table); + return (exclusive ? EEXIST : 0); + } + WT_RET_NOTFOUND_OK(ret); + + WT_RET(__wt_config_gets(session, cfg, "colgroups", &cval)); + WT_RET(__wt_config_subinit(session, &conf, &cval)); + for (ncolgroups = 0; + (ret = __wt_config_next(&conf, &cgkey, &cgval)) == 0; + ncolgroups++) + ; + WT_RET_NOTFOUND_OK(ret); + + WT_RET(__wt_config_collapse(session, cfg, &tableconf)); + if ((ret = __wt_metadata_insert(session, name, tableconf)) != 0) { + /* + * If the entry already exists in the metadata, we're done. + * This is an error for exclusive creates but okay otherwise. + */ + if (ret == WT_DUPLICATE_KEY) + ret = exclusive ? EEXIST : 0; + goto err; + } + + /* Attempt to open the table now to catch any errors. 
*/ + WT_ERR(__wt_schema_get_table( + session, tablename, strlen(tablename), 1, &table)); + + if (ncolgroups == 0) { + cgsize = strlen("colgroup:") + strlen(tablename) + 1; + WT_ERR(__wt_calloc_def(session, cgsize, &cgname)); + snprintf(cgname, cgsize, "colgroup:%s", tablename); + WT_ERR(__create_colgroup(session, cgname, exclusive, config)); + } + + if (0) { +err: if (table != NULL) { + __wt_schema_remove_table(session, table); + table = NULL; + } + } + if (table != NULL) + __wt_schema_release_table(session, table); + __wt_free(session, cgname); + __wt_free(session, tableconf); + return (ret); +} + +/* + * __create_data_source -- + * Create a custom data source. + */ +static int +__create_data_source(WT_SESSION_IMPL *session, + const char *uri, const char *config, WT_DATA_SOURCE *dsrc) +{ + WT_CONFIG_ITEM cval; + const char *cfg[] = { + WT_CONFIG_BASE(session, session_create), config, NULL }; + + /* + * Check to be sure the key/value formats are legal: the underlying + * data source doesn't have access to the functions that check. + */ + WT_RET(__wt_config_gets(session, cfg, "key_format", &cval)); + WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL)); + WT_RET(__wt_config_gets(session, cfg, "value_format", &cval)); + WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL)); + + /* + * User-specified collators aren't supported for data-source objects. + */ + if (__wt_config_getones( + session, config, "collator", &cval) != WT_NOTFOUND) + WT_RET_MSG(session, EINVAL, + "WT_DATA_SOURCE objects do not support WT_COLLATOR " + "ordering"); + + return (dsrc->create(dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg)); +} + +/* + * __wt_schema_create -- + * Process a WT_SESSION::create operation for all supported types. 
+ */ +int +__wt_schema_create( + WT_SESSION_IMPL *session, const char *uri, const char *config) +{ + WT_CONFIG_ITEM cval; + WT_DATA_SOURCE *dsrc; + WT_DECL_RET; + int exclusive; + + exclusive = ( + __wt_config_getones(session, config, "exclusive", &cval) == 0 && + cval.val != 0); + + /* + * We track create operations: if we fail in the middle of creating a + * complex object, we want to back it all out. + */ + WT_RET(__wt_meta_track_on(session)); + + if (WT_PREFIX_MATCH(uri, "colgroup:")) + ret = __create_colgroup(session, uri, exclusive, config); + else if (WT_PREFIX_MATCH(uri, "file:")) + ret = __create_file(session, uri, exclusive, config); + else if (WT_PREFIX_MATCH(uri, "lsm:")) + ret = __wt_lsm_tree_create(session, uri, exclusive, config); + else if (WT_PREFIX_MATCH(uri, "index:")) + ret = __create_index(session, uri, exclusive, config); + else if (WT_PREFIX_MATCH(uri, "table:")) + ret = __create_table(session, uri, exclusive, config); + else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) + ret = dsrc->create == NULL ? + __wt_object_unsupported(session, uri) : + __create_data_source(session, uri, config, dsrc); + else + ret = __wt_bad_object_type(session, uri); + + session->dhandle = NULL; + WT_TRET(__wt_meta_track_off(session, ret != 0)); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/schema/schema_drop.c b/src/third_party/wiredtiger/src/schema/schema_drop.c new file mode 100644 index 00000000000..6df7e6930c9 --- /dev/null +++ b/src/third_party/wiredtiger/src/schema/schema_drop.c @@ -0,0 +1,204 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __drop_file -- + * Drop a file. 
+ */ +static int +__drop_file( + WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[]) +{ + WT_CONFIG_ITEM cval; + WT_DECL_RET; + int exist, remove_files; + const char *filename; + + WT_RET(__wt_config_gets(session, cfg, "remove_files", &cval)); + remove_files = (cval.val != 0); + + filename = uri; + if (!WT_PREFIX_SKIP(filename, "file:")) + return (EINVAL); + + /* Close all btree handles associated with this file. */ + WT_RET(__wt_conn_dhandle_close_all(session, uri, force)); + + /* Remove the metadata entry (ignore missing items). */ + WT_TRET(__wt_metadata_remove(session, uri)); + if (force && ret == WT_NOTFOUND) + ret = 0; + + if (!remove_files) + return (ret); + + /* Remove the underlying physical file. */ + exist = 0; + WT_TRET(__wt_exist(session, filename, &exist)); + if (exist) { + /* + * There is no point tracking this operation: there is no going + * back from here. + */ + WT_TRET(__wt_remove(session, filename)); + } + + return (ret); +} + +/* + * __drop_colgroup -- + * WT_SESSION::drop for a colgroup. + */ +static int +__drop_colgroup( + WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) +{ + WT_COLGROUP *colgroup; + WT_DECL_RET; + WT_TABLE *table; + + /* If we can get the colgroup, detach it from the table. */ + if ((ret = __wt_schema_get_colgroup( + session, uri, &table, &colgroup)) == 0) { + table->cg_complete = 0; + WT_TRET(__wt_schema_drop(session, colgroup->source, cfg)); + } + + WT_TRET(__wt_metadata_remove(session, uri)); + return (ret); +} + +/* + * __drop_index -- + * WT_SESSION::drop for a colgroup. + */ +static int +__drop_index( + WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) +{ + WT_INDEX *idx; + WT_DECL_RET; + WT_TABLE *table; + + /* If we can get the colgroup, detach it from the table. 
*/ + if ((ret = __wt_schema_get_index(session, uri, &table, &idx)) == 0) { + table->idx_complete = 0; + WT_TRET(__wt_schema_drop(session, idx->source, cfg)); + } + + WT_TRET(__wt_metadata_remove(session, uri)); + return (ret); +} + +/* + * __drop_table -- + * WT_SESSION::drop for a table. + */ +static int +__drop_table( + WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[]) +{ + WT_COLGROUP *colgroup; + WT_DECL_RET; + WT_INDEX *idx; + WT_TABLE *table; + const char *name; + u_int i; + + name = uri; + (void)WT_PREFIX_SKIP(name, "table:"); + + table = NULL; + WT_ERR(__wt_schema_get_table(session, name, strlen(name), 1, &table)); + + /* Drop the column groups. */ + for (i = 0; i < WT_COLGROUPS(table); i++) { + if ((colgroup = table->cgroups[i]) == NULL) + continue; + WT_ERR(__wt_metadata_remove(session, colgroup->name)); + WT_ERR(__wt_schema_drop(session, colgroup->source, cfg)); + } + + /* Drop the indices. */ + WT_ERR(__wt_schema_open_indices(session, table)); + for (i = 0; i < table->nindices; i++) { + if ((idx = table->indices[i]) == NULL) + continue; + WT_ERR(__wt_metadata_remove(session, idx->name)); + WT_ERR(__wt_schema_drop(session, idx->source, cfg)); + } + + __wt_schema_remove_table(session, table); + table = NULL; + + /* Remove the metadata entry (ignore missing items). */ + WT_ERR(__wt_metadata_remove(session, uri)); + +err: if (force && ret == WT_NOTFOUND) + ret = 0; + if (table != NULL) + __wt_schema_release_table(session, table); + return (ret); +} + +/* + * __wt_schema_drop -- + * Process a WT_SESSION::drop operation for all supported types. + */ +int +__wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) +{ + WT_CONFIG_ITEM cval; + WT_DATA_SOURCE *dsrc; + WT_DECL_RET; + int force; + + WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); + force = (cval.val != 0); + + WT_RET(__wt_meta_track_on(session)); + + /* Be careful to ignore any btree handle in our caller. 
*/ + WT_CLEAR_BTREE_IN_SESSION(session); + + if (WT_PREFIX_MATCH(uri, "colgroup:")) + ret = __drop_colgroup(session, uri, cfg); + else if (WT_PREFIX_MATCH(uri, "file:")) + ret = __drop_file(session, uri, force, cfg); + else if (WT_PREFIX_MATCH(uri, "index:")) + ret = __drop_index(session, uri, cfg); + else if (WT_PREFIX_MATCH(uri, "lsm:")) + ret = __wt_lsm_tree_drop(session, uri, cfg); + else if (WT_PREFIX_MATCH(uri, "table:")) + ret = __drop_table(session, uri, force, cfg); + else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) + ret = dsrc->drop == NULL ? + __wt_object_unsupported(session, uri) : + dsrc->drop( + dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg); + else + ret = __wt_bad_object_type(session, uri); + + /* + * Map WT_NOTFOUND to ENOENT (or to 0 if "force" is set), based on the + * assumption WT_NOTFOUND means there was no metadata entry. The + * underlying drop functions should handle this case (we passed them + * the "force" value), but better safe than sorry. + */ + if (ret == WT_NOTFOUND) + ret = force ? 0 : ENOENT; + + /* Bump the schema generation so that stale data is ignored. */ + ++S2C(session)->schema_gen; + + WT_TRET(__wt_meta_track_off(session, ret != 0)); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/schema/schema_list.c b/src/third_party/wiredtiger/src/schema/schema_list.c new file mode 100644 index 00000000000..05421283bf6 --- /dev/null +++ b/src/third_party/wiredtiger/src/schema/schema_list.c @@ -0,0 +1,204 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __schema_add_table -- + * Add a table handle to the session's cache. 
+ */ +static int +__schema_add_table(WT_SESSION_IMPL *session, + const char *name, size_t namelen, WT_TABLE **tablep) +{ + WT_TABLE *table; + + WT_RET(__wt_schema_open_table(session, name, namelen, &table)); + + /* Copy the schema generation into the new table. */ + table->schema_gen = S2C(session)->schema_gen; + + TAILQ_INSERT_HEAD(&session->tables, table, q); + *tablep = table; + + return (0); +} + +/* + * __schema_find_table -- + * Find the table handle for the named table in the session cache. + */ +static int +__schema_find_table(WT_SESSION_IMPL *session, + const char *name, size_t namelen, WT_TABLE **tablep) +{ + WT_TABLE *table; + const char *tablename; + +restart: + TAILQ_FOREACH(table, &session->tables, q) { + tablename = table->name; + (void)WT_PREFIX_SKIP(tablename, "table:"); + if (WT_STRING_MATCH(tablename, name, namelen)) { + /* + * Ignore stale tables. + * + * XXX: should be managed the same as btree handles, + * with a local cache in each session and a shared list + * in the connection. There is still a race here + * between checking the generation and opening the + * first column group. + */ + if (table->schema_gen != S2C(session)->schema_gen) { + if (table->refcnt == 0) { + __wt_schema_remove_table( + session, table); + goto restart; + } + continue; + } + *tablep = table; + return (0); + } + } + + return (WT_NOTFOUND); +} + +/* + * __wt_schema_get_table -- + * Get the table handle for the named table. 
+ */ +int +__wt_schema_get_table(WT_SESSION_IMPL *session, + const char *name, size_t namelen, int ok_incomplete, WT_TABLE **tablep) +{ + WT_DECL_RET; + WT_TABLE *table; + + *tablep = table = NULL; + ret = __schema_find_table(session, name, namelen, &table); + + if (ret == WT_NOTFOUND) + WT_WITH_SCHEMA_LOCK(session, + ret = __schema_add_table(session, name, namelen, &table)); + + if (ret == 0) { + if (!ok_incomplete && !table->cg_complete) + WT_RET_MSG(session, EINVAL, "'%s' cannot be used " + "until all column groups are created", + table->name); + + ++table->refcnt; + *tablep = table; + } + + return (ret); +} + +/* + * __wt_schema_release_table -- + * Release a table handle. + */ +void +__wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE *table) +{ + WT_ASSERT(session, table->refcnt > 0); + --table->refcnt; +} + +/* + * __wt_schema_destroy_colgroup -- + * Free a column group handle. + */ +void +__wt_schema_destroy_colgroup(WT_SESSION_IMPL *session, WT_COLGROUP *colgroup) +{ + __wt_free(session, colgroup->name); + __wt_free(session, colgroup->source); + __wt_free(session, colgroup->config); + __wt_free(session, colgroup); +} + +/* + * __wt_schema_destroy_index -- + * Free an index handle. + */ +void +__wt_schema_destroy_index(WT_SESSION_IMPL *session, WT_INDEX *idx) +{ + __wt_free(session, idx->name); + __wt_free(session, idx->source); + __wt_free(session, idx->config); + __wt_free(session, idx->key_format); + __wt_free(session, idx->key_plan); + __wt_free(session, idx->value_plan); + __wt_free(session, idx->idxkey_format); + __wt_free(session, idx); +} + +/* + * __wt_schema_destroy_table -- + * Free a table handle. 
 */
void
__wt_schema_destroy_table(WT_SESSION_IMPL *session, WT_TABLE *table)
{
    WT_COLGROUP *colgroup;
    WT_INDEX *idx;
    u_int i;

    __wt_free(session, table->name);
    __wt_free(session, table->config);
    __wt_free(session, table->plan);
    __wt_free(session, table->key_format);
    __wt_free(session, table->value_format);
    /* Column group slots may be sparse: opens can fail part way. */
    if (table->cgroups != NULL) {
        for (i = 0; i < WT_COLGROUPS(table); i++) {
            if ((colgroup = table->cgroups[i]) == NULL)
                continue;
            __wt_schema_destroy_colgroup(session, colgroup);
        }
        __wt_free(session, table->cgroups);
    }
    /* Index slots may also be sparse (placeholders for unopened indices). */
    if (table->indices != NULL) {
        for (i = 0; i < table->nindices; i++) {
            if ((idx = table->indices[i]) == NULL)
                continue;
            __wt_schema_destroy_index(session, idx);
        }
        __wt_free(session, table->indices);
    }
    __wt_free(session, table);
}

/*
 * __wt_schema_remove_table --
 *	Remove the table handle from the session, closing if necessary.
 *	The caller must hold at most the single reference being dropped.
 */
void
__wt_schema_remove_table(
    WT_SESSION_IMPL *session, WT_TABLE *table)
{
    WT_ASSERT(session, table->refcnt <= 1);

    TAILQ_REMOVE(&session->tables, table, q);
    __wt_schema_destroy_table(session, table);
}

/*
 * __wt_schema_close_tables --
 *	Close all of the tables in a session's cache.
 */
void
__wt_schema_close_tables(WT_SESSION_IMPL *session)
{
    WT_TABLE *table;

    while ((table = TAILQ_FIRST(&session->tables)) != NULL)
        __wt_schema_remove_table(session, table);
}
diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c
new file mode 100644
index 00000000000..0332569a8e3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_open.c
@@ -0,0 +1,510 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_schema_colgroup_name --
 *	Get the URI for a column group. This is used for metadata lookups.
+ * The only complexity here is that simple tables (with a single column + * group) use a simpler naming scheme. + */ +int +__wt_schema_colgroup_name(WT_SESSION_IMPL *session, + WT_TABLE *table, const char *cgname, size_t len, WT_ITEM *buf) +{ + const char *tablename; + + tablename = table->name; + (void)WT_PREFIX_SKIP(tablename, "table:"); + + return ((table->ncolgroups == 0) ? + __wt_buf_fmt(session, buf, "colgroup:%s", tablename) : + __wt_buf_fmt(session, buf, "colgroup:%s:%.*s", + tablename, (int)len, cgname)); +} + +/* + * __wt_schema_open_colgroups -- + * Open the column groups for a table. + */ +int +__wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table) +{ + WT_COLGROUP *colgroup; + WT_CONFIG cparser; + WT_CONFIG_ITEM ckey, cval; + WT_DECL_RET; + WT_DECL_ITEM(buf); + const char *cgconfig; + u_int i; + + if (table->cg_complete) + return (0); + + colgroup = NULL; + cgconfig = NULL; + + WT_RET(__wt_scr_alloc(session, 0, &buf)); + + WT_ERR(__wt_config_subinit(session, &cparser, &table->cgconf)); + + /* Open each column group. */ + for (i = 0; i < WT_COLGROUPS(table); i++) { + if (table->ncolgroups > 0) + WT_ERR(__wt_config_next(&cparser, &ckey, &cval)); + else + WT_CLEAR(ckey); + + /* + * Always open from scratch: we may have failed part of the way + * through opening a table, or column groups may have changed. + */ + if (table->cgroups[i] != NULL) { + __wt_schema_destroy_colgroup( + session, table->cgroups[i]); + table->cgroups[i] = NULL; + } + + WT_ERR(__wt_buf_init(session, buf, 0)); + WT_ERR(__wt_schema_colgroup_name(session, table, + ckey.str, ckey.len, buf)); + if ((ret = __wt_metadata_search( + session, buf->data, &cgconfig)) != 0) { + /* It is okay if the table is incomplete. 
*/ + if (ret == WT_NOTFOUND) + ret = 0; + goto err; + } + + WT_ERR(__wt_calloc_def(session, 1, &colgroup)); + WT_ERR(__wt_strndup( + session, buf->data, buf->size, &colgroup->name)); + colgroup->config = cgconfig; + cgconfig = NULL; + WT_ERR(__wt_config_getones(session, + colgroup->config, "columns", &colgroup->colconf)); + WT_ERR(__wt_config_getones( + session, colgroup->config, "source", &cval)); + WT_ERR(__wt_buf_init(session, buf, 0)); + WT_ERR(__wt_buf_fmt( + session, buf, "%.*s", (int)cval.len, cval.str)); + WT_ERR(__wt_strndup( + session, buf->data, buf->size, &colgroup->source)); + table->cgroups[i] = colgroup; + colgroup = NULL; + } + + if (!table->is_simple) { + WT_ERR(__wt_table_check(session, table)); + + WT_ERR(__wt_buf_init(session, buf, 0)); + WT_ERR(__wt_struct_plan(session, + table, table->colconf.str, table->colconf.len, 1, buf)); + WT_ERR(__wt_strndup( + session, buf->data, buf->size, &table->plan)); + } + + table->cg_complete = 1; + +err: __wt_scr_free(&buf); + if (colgroup != NULL) + __wt_schema_destroy_colgroup(session, colgroup); + if (cgconfig != NULL) + __wt_free(session, cgconfig); + return (ret); +} + +/* + * __open_index -- + * Open an index. + */ +static int +__open_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx) +{ + WT_CONFIG colconf; + WT_CONFIG_ITEM ckey, cval; + WT_DECL_ITEM(buf); + WT_DECL_ITEM(plan); + WT_DECL_RET; + u_int cursor_key_cols, i; + + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + + /* Get the data source from the index config. 
*/ + WT_ERR(__wt_config_getones(session, idx->config, "source", &cval)); + WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)cval.len, cval.str)); + WT_ERR(__wt_strndup(session, buf->data, buf->size, &idx->source)); + + WT_ERR(__wt_buf_init(session, buf, 0)); + WT_ERR(__wt_config_getones(session, idx->config, "key_format", &cval)); + WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)cval.len, cval.str)); + WT_ERR(__wt_strndup(session, buf->data, buf->size, &idx->key_format)); + + /* + * The key format for an index is somewhat subtle: the application + * specifies a set of columns that it will use for the key, but the + * engine usually adds some hidden columns in order to derive the + * primary key. These hidden columns are part of the file's key. + * + * The file's key_format is stored persistently, we need to calculate + * the index cursor key format (which will usually omit some of those + * keys). + */ + WT_ERR(__wt_buf_init(session, buf, 0)); + WT_ERR(__wt_config_getones( + session, idx->config, "columns", &idx->colconf)); + + /* Start with the declared index columns. */ + WT_ERR(__wt_config_subinit(session, &colconf, &idx->colconf)); + cursor_key_cols = 0; + while ((ret = __wt_config_next(&colconf, &ckey, &cval)) == 0) { + WT_ERR(__wt_buf_catfmt( + session, buf, "%.*s,", (int)ckey.len, ckey.str)); + ++cursor_key_cols; + } + if (ret != 0 && ret != WT_NOTFOUND) + goto err; + + /* + * Now add any primary key columns from the table that are not + * already part of the index key. + */ + WT_ERR(__wt_config_subinit(session, &colconf, &table->colconf)); + for (i = 0; i < table->nkey_columns && + (ret = __wt_config_next(&colconf, &ckey, &cval)) == 0; + i++) { + /* + * If the primary key column is already in the secondary key, + * don't add it again. 
+ */ + if (__wt_config_subgetraw( + session, &idx->colconf, &ckey, &cval) == 0) + continue; + WT_ERR(__wt_buf_catfmt( + session, buf, "%.*s,", (int)ckey.len, ckey.str)); + } + if (ret != 0 && ret != WT_NOTFOUND) + goto err; + + WT_ERR(__wt_scr_alloc(session, 0, &plan)); + WT_ERR(__wt_struct_plan(session, table, buf->data, buf->size, 0, plan)); + WT_ERR(__wt_strndup(session, plan->data, plan->size, &idx->key_plan)); + + /* Set up the cursor key format (the visible columns). */ + WT_ERR(__wt_buf_init(session, buf, 0)); + WT_ERR(__wt_struct_truncate(session, + idx->key_format, cursor_key_cols, buf)); + WT_ERR(__wt_strndup( + session, buf->data, buf->size, &idx->idxkey_format)); + + /* By default, index cursor values are the table value columns. */ + /* TODO Optimize to use index columns in preference to table lookups. */ + WT_ERR(__wt_buf_init(session, plan, 0)); + WT_ERR(__wt_struct_plan(session, + table, table->colconf.str, table->colconf.len, 1, plan)); + WT_ERR(__wt_strndup(session, plan->data, plan->size, &idx->value_plan)); + +err: __wt_scr_free(&buf); + __wt_scr_free(&plan); + return (ret); +} + +/* + * __wt_schema_open_index -- + * Open one or more indices for a table. + */ +int +__wt_schema_open_index(WT_SESSION_IMPL *session, + WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp) +{ + WT_CURSOR *cursor; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_INDEX *idx; + u_int i; + int cmp, match; + const char *idxconf, *name, *tablename, *uri; + + /* Check if we've already done the work. */ + if (idxname == NULL && table->idx_complete) + return (0); + + cursor = NULL; + idx = NULL; + + /* Build a search key. */ + tablename = table->name; + (void)WT_PREFIX_SKIP(tablename, "table:"); + WT_ERR(__wt_scr_alloc(session, 512, &tmp)); + WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename)); + + /* Find matching indices. 
*/ + WT_ERR(__wt_metadata_cursor(session, NULL, &cursor)); + cursor->set_key(cursor, tmp->data); + if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0) + ret = cursor->next(cursor); + for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) { + WT_ERR(cursor->get_key(cursor, &uri)); + name = uri; + if (!WT_PREFIX_SKIP(name, tmp->data)) + break; + + /* Is this the index we are looking for? */ + match = idxname == NULL || WT_STRING_MATCH(name, idxname, len); + + /* + * Ensure there is space, including if we have to make room for + * a new entry in the middle of the list. + */ + WT_ERR(__wt_realloc_def(session, &table->idx_alloc, + WT_MAX(i, table->nindices) + 1, &table->indices)); + + /* Keep the in-memory list in sync with the metadata. */ + cmp = 0; + while (table->indices[i] != NULL && + (cmp = strcmp(uri, table->indices[i]->name)) > 0) { + /* Index no longer exists, remove it. */ + __wt_free(session, table->indices[i]); + memmove(&table->indices[i], &table->indices[i + 1], + (table->nindices - i) * sizeof(WT_INDEX *)); + table->indices[--table->nindices] = NULL; + } + if (cmp < 0) { + /* Make room for a new index. */ + memmove(&table->indices[i + 1], &table->indices[i], + (table->nindices - i) * sizeof(WT_INDEX *)); + table->indices[i] = NULL; + ++table->nindices; + } + + if (!match) + continue; + + if (table->indices[i] == NULL) { + WT_ERR(cursor->get_value(cursor, &idxconf)); + WT_ERR(__wt_calloc_def(session, 1, &idx)); + WT_ERR(__wt_strdup(session, uri, &idx->name)); + WT_ERR(__wt_strdup(session, idxconf, &idx->config)); + WT_ERR(__open_index(session, table, idx)); + + table->indices[i] = idx; + idx = NULL; + } + + /* If we were looking for a single index, we're done. */ + if (indexp != NULL) + *indexp = table->indices[i]; + if (idxname != NULL) + break; + } + WT_ERR_NOTFOUND_OK(ret); + + /* If we did a full pass, we won't need to do it again. 
*/ + if (idxname == NULL) { + table->nindices = i; + table->idx_complete = 1; + } + +err: __wt_scr_free(&tmp); + if (idx != NULL) + __wt_schema_destroy_index(session, idx); + if (cursor != NULL) + WT_TRET(cursor->close(cursor)); + return (ret); +} + +/* + * __wt_schema_open_indices -- + * Open the indices for a table. + */ +int +__wt_schema_open_indices(WT_SESSION_IMPL *session, WT_TABLE *table) +{ + return (__wt_schema_open_index(session, table, NULL, 0, NULL)); +} + +/* + * __wt_schema_open_table -- + * Open a named table. + */ +int +__wt_schema_open_table(WT_SESSION_IMPL *session, + const char *name, size_t namelen, WT_TABLE **tablep) +{ + WT_CONFIG cparser; + WT_CONFIG_ITEM ckey, cval; + WT_CURSOR *cursor; + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_TABLE *table; + const char *tconfig; + char *tablename; + + cursor = NULL; + table = NULL; + tablename = NULL; + + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + WT_ERR(__wt_buf_fmt(session, buf, "table:%.*s", (int)namelen, name)); + WT_ERR(__wt_strndup(session, buf->data, buf->size, &tablename)); + + WT_ERR(__wt_metadata_cursor(session, NULL, &cursor)); + cursor->set_key(cursor, tablename); + WT_ERR(cursor->search(cursor)); + WT_ERR(cursor->get_value(cursor, &tconfig)); + + WT_ERR(__wt_calloc_def(session, 1, &table)); + table->name = tablename; + tablename = NULL; + + WT_ERR(__wt_config_getones(session, tconfig, "columns", &cval)); + + WT_ERR(__wt_config_getones(session, tconfig, "key_format", &cval)); + WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->key_format)); + WT_ERR(__wt_config_getones(session, tconfig, "value_format", &cval)); + WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->value_format)); + WT_ERR(__wt_strdup(session, tconfig, &table->config)); + + /* Point to some items in the copy to save re-parsing. */ + WT_ERR(__wt_config_getones(session, table->config, + "columns", &table->colconf)); + + /* + * Count the number of columns: tables are "simple" if the columns + * are not named. 
+ */ + WT_ERR(__wt_config_subinit(session, &cparser, &table->colconf)); + table->is_simple = 1; + while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0) + table->is_simple = 0; + if (ret != WT_NOTFOUND) + goto err; + + /* Check that the columns match the key and value formats. */ + if (!table->is_simple) + WT_ERR(__wt_schema_colcheck(session, + table->key_format, table->value_format, &table->colconf, + &table->nkey_columns, NULL)); + + WT_ERR(__wt_config_getones(session, table->config, + "colgroups", &table->cgconf)); + + /* Count the number of column groups. */ + WT_ERR(__wt_config_subinit(session, &cparser, &table->cgconf)); + table->ncolgroups = 0; + while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0) + ++table->ncolgroups; + if (ret != WT_NOTFOUND) + goto err; + + WT_ERR(__wt_calloc_def(session, WT_COLGROUPS(table), &table->cgroups)); + WT_ERR(__wt_schema_open_colgroups(session, table)); + *tablep = table; + + if (0) { +err: if (table != NULL) + __wt_schema_destroy_table(session, table); + } + if (cursor != NULL) + WT_TRET(cursor->close(cursor)); + + __wt_free(session, tablename); + __wt_scr_free(&buf); + return (ret); +} + +/* + * __wt_schema_get_colgroup -- + * Find a column group by URI. 
 */
int
__wt_schema_get_colgroup(WT_SESSION_IMPL *session,
    const char *uri, WT_TABLE **tablep, WT_COLGROUP **colgroupp)
{
    WT_COLGROUP *colgroup;
    WT_TABLE *table;
    const char *tablename, *tend;
    u_int i;

    *colgroupp = NULL;

    tablename = uri;
    if (!WT_PREFIX_SKIP(tablename, "colgroup:"))
        return (__wt_bad_object_type(session, uri));

    /* "colgroup:table" names a simple table's single column group. */
    if ((tend = strchr(tablename, ':')) == NULL)
        tend = tablename + strlen(tablename);

    WT_RET(__wt_schema_get_table(session,
        tablename, WT_PTRDIFF(tend, tablename), 0, &table));

    for (i = 0; i < WT_COLGROUPS(table); i++) {
        colgroup = table->cgroups[i];
        if (strcmp(colgroup->name, uri) == 0) {
            *colgroupp = colgroup;
            /*
             * On success, the table reference transfers to the
             * caller only if they asked for the table handle.
             */
            if (tablep != NULL)
                *tablep = table;
            else
                __wt_schema_release_table(session, table);
            return (0);
        }
    }

    __wt_schema_release_table(session, table);
    WT_RET_MSG(session, ENOENT, "%s not found in table", uri);
}

/*
 * __wt_schema_get_index --
 *	Find an index by URI, opening it if it isn't already cached in the
 *	table handle.
 */
int
__wt_schema_get_index(WT_SESSION_IMPL *session,
    const char *uri, WT_TABLE **tablep, WT_INDEX **indexp)
{
    WT_DECL_RET;
    WT_INDEX *idx;
    WT_TABLE *table;
    const char *tablename, *tend;
    u_int i;

    *indexp = NULL;

    tablename = uri;
    if (!WT_PREFIX_SKIP(tablename, "index:") ||
        (tend = strchr(tablename, ':')) == NULL)
        return (__wt_bad_object_type(session, uri));

    WT_RET(__wt_schema_get_table(session,
        tablename, WT_PTRDIFF(tend, tablename), 0, &table));

    /* Try to find the index in the table. */
    for (i = 0; i < table->nindices; i++) {
        idx = table->indices[i];
        if (strcmp(idx->name, uri) == 0) {
            if (tablep != NULL)
                *tablep = table;
            else
                __wt_schema_release_table(session, table);
            *indexp = idx;
            return (0);
        }
    }

    /*
     * Otherwise, open it.
     *
     * NOTE(review): on this path the table reference is always released
     * and *tablep is never set, even when tablep is non-NULL — confirm
     * callers only pass tablep when the index is expected to be cached.
     */
    WT_ERR(__wt_schema_open_index(
        session, table, tend + 1, strlen(tend + 1), indexp));

err:	__wt_schema_release_table(session, table);
    WT_RET(ret);

    if (*indexp != NULL)
        return (0);

    WT_RET_MSG(session, ENOENT, "%s not found in table", uri);
}
diff --git a/src/third_party/wiredtiger/src/schema/schema_plan.c b/src/third_party/wiredtiger/src/schema/schema_plan.c
new file mode 100644
index 00000000000..5abe0dd67d4
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_plan.c
@@ -0,0 +1,394 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __find_next_col --
 *	Find the next column to use for a plan.  On input, *cgnump/*colnump/
 *	*coltype identify the current position; on success they are updated
 *	to the next occurrence of the named column.
 */
static int
__find_next_col(WT_SESSION_IMPL *session, WT_TABLE *table,
    WT_CONFIG_ITEM *colname, u_int *cgnump, u_int *colnump, char *coltype)
{
    WT_COLGROUP *colgroup;
    WT_CONFIG conf;
    WT_CONFIG_ITEM cval, k, v;
    WT_DECL_RET;
    u_int cg, col, foundcg, foundcol, matchcg, matchcol;
    int getnext;

    foundcg = foundcol = UINT_MAX;
    matchcg = *cgnump;
    /* Normalize the position to a table-wide column number. */
    matchcol = (*coltype == WT_PROJ_KEY) ?
        *colnump : *colnump + table->nkey_columns;

    getnext = 1;
    for (colgroup = NULL, cg = 0; cg < WT_COLGROUPS(table); cg++) {
        colgroup = table->cgroups[cg];

        /*
         * If there is only one column group, we just scan through all
         * of the columns.  For tables with multiple column groups, we
         * look at the key columns once, then go through the value
         * columns for each group.
         */
        if (cg == 0) {
            cval = table->colconf;
            col = 0;
        } else {
cgcols:			cval = colgroup->colconf;
            col = table->nkey_columns;
        }
        WT_RET(__wt_config_subinit(session, &conf, &cval));
        for (; (ret = __wt_config_next(&conf, &k, &v)) == 0; col++) {
            if (k.len == colname->len &&
                strncmp(colname->str, k.str, k.len) == 0) {
                /*
                 * Record the first occurrence after the match
                 * position; getnext is re-armed when the scan
                 * passes the current position.
                 */
                if (getnext) {
                    foundcg = cg;
                    foundcol = col;
                }
                getnext = (cg == matchcg && col == matchcol);
            }
            if (cg == 0 && table->ncolgroups > 0 &&
                col == table->nkey_columns - 1)
                goto cgcols;
        }
        WT_RET_TEST(ret != WT_NOTFOUND, ret);

        colgroup = NULL;
    }

    if (foundcg == UINT_MAX)
        return (WT_NOTFOUND);

    *cgnump = foundcg;
    if (foundcol < table->nkey_columns) {
        *coltype = WT_PROJ_KEY;
        *colnump = foundcol;
    } else {
        *coltype = WT_PROJ_VALUE;
        *colnump = foundcol - table->nkey_columns;
    }
    return (0);
}

/*
 * __wt_schema_colcheck --
 *	Check that a list of columns matches a (key,value) format pair.
 *	Optionally returns the key and value column counts.
 */
int
__wt_schema_colcheck(WT_SESSION_IMPL *session,
    const char *key_format, const char *value_format, WT_CONFIG_ITEM *colconf,
    u_int *kcolsp, u_int *vcolsp)
{
    WT_CONFIG conf;
    WT_CONFIG_ITEM k, v;
    WT_DECL_PACK_VALUE(pv);
    WT_DECL_RET;
    WT_PACK pack;
    u_int kcols, ncols, vcols;

    /* Count the key columns by walking the key format. */
    WT_RET(__pack_init(session, &pack, key_format));
    for (kcols = 0; (ret = __pack_next(&pack, &pv)) == 0; kcols++)
        ;
    WT_RET_TEST(ret != WT_NOTFOUND, ret);

    /* Count the value columns by walking the value format. */
    WT_RET(__pack_init(session, &pack, value_format));
    for (vcols = 0; (ret = __pack_next(&pack, &pv)) == 0; vcols++)
        ;
    WT_RET_TEST(ret != WT_NOTFOUND, ret);

    /* Walk through the named columns. */
    WT_RET(__wt_config_subinit(session, &conf, colconf));
    for (ncols = 0; (ret = __wt_config_next(&conf, &k, &v)) == 0; ncols++)
        ;
    WT_RET_TEST(ret != WT_NOTFOUND, ret);

    /* An empty column list is allowed (simple tables). */
    if (ncols != 0 && ncols != kcols + vcols)
        WT_RET_MSG(session, EINVAL, "Number of columns in '%.*s' "
            "does not match key format '%s' plus value format '%s'",
            (int)colconf->len, colconf->str, key_format, value_format);

    if (kcolsp != NULL)
        *kcolsp = kcols;
    if (vcolsp != NULL)
        *vcolsp = vcols;

    return (0);
}

/*
 * __wt_table_check --
 *	Make sure all value columns appear in some column group.
 */
int
__wt_table_check(WT_SESSION_IMPL *session, WT_TABLE *table)
{
    WT_CONFIG conf;
    WT_CONFIG_ITEM k, v;
    WT_DECL_RET;
    u_int cg, col, i;
    char coltype;

    if (table->is_simple)
        return (0);

    /* Walk through the columns. */
    WT_RET(__wt_config_subinit(session, &conf, &table->colconf));

    /* Skip over the key columns. */
    for (i = 0; i < table->nkey_columns; i++)
        WT_RET(__wt_config_next(&conf, &k, &v));
    cg = col = 0;
    coltype = 0;
    while ((ret = __wt_config_next(&conf, &k, &v)) == 0) {
        if (__find_next_col(
            session, table, &k, &cg, &col, &coltype) != 0)
            WT_RET_MSG(session, EINVAL,
                "Column '%.*s' in '%s' does not appear in a "
                "column group",
                (int)k.len, k.str, table->name);
        /*
         * Column groups can't store key columns in their value:
         * __wt_struct_reformat should have already detected this case.
         */
        WT_ASSERT(session, coltype == WT_PROJ_VALUE);

    }
    WT_RET_TEST(ret != WT_NOTFOUND, ret);

    return (0);
}

/*
 * __wt_struct_plan --
 *	Given a table cursor containing a complete table, build the "projection
 *	plan" to distribute the columns to dependent stores.  A string
 *	representing the plan will be appended to the plan buffer.
 */
int
__wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table,
    const char *columns, size_t len, int value_only, WT_ITEM *plan)
{
    WT_CONFIG conf;
    WT_CONFIG_ITEM k, v;
    WT_DECL_RET;
    u_int cg, col, current_cg, current_col, i, start_cg, start_col;
    int have_it;
    char coltype, current_coltype;

    start_cg = start_col = UINT_MAX;	/* -Wuninitialized */

    /* Work through the value columns by skipping over the key columns. */
    WT_RET(__wt_config_initn(session, &conf, columns, len));
    if (value_only)
        for (i = 0; i < table->nkey_columns; i++)
            WT_RET(__wt_config_next(&conf, &k, &v));

    current_cg = cg = 0;
    current_col = col = INT_MAX;
    current_coltype = coltype = WT_PROJ_KEY;	/* Keep lint quiet. */
    for (i = 0; (ret = __wt_config_next(&conf, &k, &v)) == 0; i++) {
        have_it = 0;

        /*
         * Visit every occurrence of this column across the column
         * groups; the loop stops when the scan wraps back to the
         * first occurrence (start_cg/start_col).
         */
        while (__find_next_col(session, table,
            &k, &cg, &col, &coltype) == 0 &&
            (!have_it || cg != start_cg || col != start_col)) {
            /*
             * First we move to the column.  If that is in a
             * different column group to the last column we
             * accessed, or before the last column in the same
             * column group, or moving from the key to the value,
             * we need to switch column groups or rewind.
             */
            if (current_cg != cg || current_col > col ||
                current_coltype != coltype) {
                WT_ASSERT(session, !value_only ||
                    coltype == WT_PROJ_VALUE);
                WT_RET(__wt_buf_catfmt(
                    session, plan, "%d%c", cg, coltype));

                /*
                 * Set the current column group and column
                 * within the table.
                 */
                current_cg = cg;
                current_col = 0;
                current_coltype = coltype;
            }
            /* Now move to the column we want. */
            if (current_col < col) {
                if (col - current_col > 1)
                    WT_RET(__wt_buf_catfmt(session,
                        plan, "%d", col - current_col));
                WT_RET(__wt_buf_catfmt(session,
                    plan, "%c", WT_PROJ_SKIP));
            }
            /*
             * Now copy the value in / out.  In the common case,
             * where each value is used in one column, we do a
             * "next" operation.  If the value is used again, we do
             * a "reuse" operation to avoid making another copy.
             */
            if (!have_it) {
                WT_RET(__wt_buf_catfmt(session,
                    plan, "%c", WT_PROJ_NEXT));

                start_cg = cg;
                start_col = col;
                have_it = 1;
            } else
                WT_RET(__wt_buf_catfmt(session,
                    plan, "%c", WT_PROJ_REUSE));
            current_col = col + 1;
        }
    }
    WT_RET_TEST(ret != WT_NOTFOUND, ret);

    /* Special case empty plans. */
    if (i == 0 && plan->size == 0)
        WT_RET(__wt_buf_set(session, plan, "", 1));

    return (0);
}

/*
 * __find_column_format --
 *	Find the format of the named column, walking the key format then the
 *	value format in parallel with the column names.  Returns EINVAL via
 *	the caller's check when a key column is requested with value_only.
 */
static int
__find_column_format(WT_SESSION_IMPL *session,
    WT_TABLE *table, WT_CONFIG_ITEM *colname, int value_only, WT_PACK_VALUE *pv)
{
    WT_CONFIG conf;
    WT_CONFIG_ITEM k, v;
    WT_DECL_RET;
    WT_PACK pack;
    int inkey;

    WT_RET(__wt_config_subinit(session, &conf, &table->colconf));
    WT_RET(__pack_init(session, &pack, table->key_format));
    inkey = 1;

    while ((ret = __wt_config_next(&conf, &k, &v)) == 0) {
        /* Exhausted the key format: switch to the value format. */
        if ((ret = __pack_next(&pack, pv)) == WT_NOTFOUND && inkey) {
            ret = __pack_init(session, &pack, table->value_format);
            if (ret == 0)
                ret = __pack_next(&pack, pv);
            inkey = 0;
        }
        if (ret != 0)
            return (ret);

        if (k.len == colname->len &&
            strncmp(colname->str, k.str, k.len) == 0) {
            if (value_only && inkey)
                return (EINVAL);
            return (0);
        }
    }

    return (ret);
}

/*
 * __wt_struct_reformat --
 *	Given a table and a list of columns (which could be values in a column
 *	group or index keys), calculate the resulting new format string.
 *	The result will be appended to the format buffer.
 */
int
__wt_struct_reformat(WT_SESSION_IMPL *session, WT_TABLE *table,
    const char *columns, size_t len, const char *extra_cols, int value_only,
    WT_ITEM *format)
{
    WT_CONFIG config;
    WT_CONFIG_ITEM k, next_k, next_v;
    WT_DECL_PACK_VALUE(pv);
    WT_DECL_RET;
    int have_next;

    WT_RET(__wt_config_initn(session, &config, columns, len));
    /*
     * If an empty column list is specified, this will fail with
     * WT_NOTFOUND, that's okay.
     */
    WT_RET_NOTFOUND_OK(ret = __wt_config_next(&config, &next_k, &next_v));
    if (ret == WT_NOTFOUND) {
        if (format->size == 0)
            WT_RET(__wt_buf_set(session, format, "", 1));
        return (0);
    }
    /* Look one column ahead so we know when we're on the last one. */
    do {
        k = next_k;
        ret = __wt_config_next(&config, &next_k, &next_v);
        if (ret != 0 && ret != WT_NOTFOUND)
            return (ret);
        have_next = (ret == 0);

        /* Append the extra columns after the named list runs out. */
        if (!have_next && extra_cols != NULL) {
            WT_RET(__wt_config_init(session, &config, extra_cols));
            WT_RET(__wt_config_next(&config, &next_k, &next_v));
            have_next = 1;
            extra_cols = NULL;
        }

        if ((ret = __find_column_format(session,
            table, &k, value_only, &pv)) != 0) {
            if (value_only && ret == EINVAL)
                WT_RET_MSG(session, EINVAL,
                    "A column group cannot store key column "
                    "'%.*s' in its value", (int)k.len, k.str);
            WT_RET_MSG(session, EINVAL,
                "Column '%.*s' not found", (int)k.len, k.str);
        }

        /*
         * Check whether we're moving an unsized WT_ITEM from the end
         * to the middle, or vice-versa.  This determines whether the
         * size needs to be prepended.  This is the only case where the
         * destination size can be larger than the source size.
         */
        if (pv.type == 'u' && !pv.havesize && have_next)
            pv.type = 'U';
        else if (pv.type == 'U' && !have_next)
            pv.type = 'u';

        if (pv.havesize)
            WT_RET(__wt_buf_catfmt(
                session, format, "%d%c", (int)pv.size, pv.type));
        else
            WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type));
    } while (have_next);

    return (0);
}

/*
 * __wt_struct_truncate --
 *	Return a packing string for the first N columns in a value.
 */
int
__wt_struct_truncate(WT_SESSION_IMPL *session,
    const char *input_fmt, u_int ncols, WT_ITEM *format)
{
    WT_DECL_PACK_VALUE(pv);
    WT_PACK pack;

    WT_RET(__pack_init(session, &pack, input_fmt));
    while (ncols-- > 0) {
        WT_RET(__pack_next(&pack, &pv));
        if (pv.havesize)
            WT_RET(__wt_buf_catfmt(
                session, format, "%d%c", (int)pv.size, pv.type));
        else
            WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type));
    }

    return (0);
}
diff --git a/src/third_party/wiredtiger/src/schema/schema_project.c b/src/third_party/wiredtiger/src/schema/schema_project.c
new file mode 100644
index 00000000000..9aff4c8dded
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_project.c
@@ -0,0 +1,474 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_schema_project_in --
 *	Given list of cursors and a projection, read columns from the
 *	application into the dependent cursors.
 */
int
__wt_schema_project_in(WT_SESSION_IMPL *session,
    WT_CURSOR **cp, const char *proj_arg, va_list ap)
{
    WT_CURSOR *c;
    WT_DECL_ITEM(buf);
    WT_DECL_PACK_VALUE(pv);
    WT_DECL_PACK(pack);
    WT_PACK_VALUE old_pv;
    size_t len, offset, old_len;
    u_long arg;
    char *proj;
    uint8_t *p, *end;
    const uint8_t *next;

    p = end = NULL;	/* -Wuninitialized */

    /* Reset any of the buffers we will be setting. */
    for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
        arg = strtoul(proj, &proj, 10);
        if (*proj == WT_PROJ_KEY) {
            c = cp[arg];
            WT_RET(__wt_buf_init(session, &c->key, 0));
        } else if (*proj == WT_PROJ_VALUE) {
            c = cp[arg];
            WT_RET(__wt_buf_init(session, &c->value, 0));
        }
    }

    /*
     * Walk the plan: each operation is an optional numeric argument
     * followed by a single-character opcode.
     */
    for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
        arg = strtoul(proj, &proj, 10);

        switch (*proj) {
        case WT_PROJ_KEY:
            c = cp[arg];
            if (WT_CURSOR_RECNO(c)) {
                c->key.data = &c->recno;
                c->key.size = sizeof(c->recno);
                WT_RET(__pack_init(session, &pack, "R"));
            } else
                WT_RET(__pack_init(
                    session, &pack, c->key_format));
            buf = &c->key;
            p = (uint8_t *)buf->data;
            end = p + buf->size;
            continue;

        case WT_PROJ_VALUE:
            c = cp[arg];
            WT_RET(__pack_init(session, &pack, c->value_format));
            buf = &c->value;
            p = (uint8_t *)buf->data;
            end = p + buf->size;
            continue;
        }

        /* We have to get a key or value before any operations. */
        WT_ASSERT(session, buf != NULL);

        /*
         * Otherwise, the argument is a count, where a missing
         * count means a count of 1.
         */
        for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
            switch (*proj) {
            case WT_PROJ_SKIP:
                WT_RET(__pack_next(&pack, &pv));
                /*
                 * A nasty case: if we are inserting
                 * out-of-order, we may reach the end of the
                 * data.  That's okay: we want to append in
                 * that case, and we're positioned to do that.
                 */
                if (p == end) {
                    /* Set up an empty value. */
                    WT_CLEAR(pv.u);
                    if (pv.type == 'S' || pv.type == 's')
                        pv.u.s = "";

                    len = __pack_size(session, &pv);
                    WT_RET(__wt_buf_grow(session,
                        buf, buf->size + len));
                    p = (uint8_t *)buf->mem + buf->size;
                    WT_RET(__pack_write(
                        session, &pv, &p, len));
                    buf->size += len;
                    end = (uint8_t *)buf->mem + buf->size;
                } else if (*proj == WT_PROJ_SKIP)
                    WT_RET(__unpack_read(session,
                        &pv, (const uint8_t **)&p,
                        (size_t)(end - p)));
                break;

            case WT_PROJ_NEXT:
                WT_RET(__pack_next(&pack, &pv));
                WT_PACK_GET(session, pv, ap);
                /* FALLTHROUGH */

            case WT_PROJ_REUSE:
                /* Read the item we're about to overwrite. */
                next = p;
                if (p < end) {
                    old_pv = pv;
                    WT_RET(__unpack_read(session, &old_pv,
                        &next, (size_t)(end - p)));
                }
                old_len = (size_t)(next - p);

                /*
                 * Grow the buffer, then splice the new item
                 * in over the old one, shifting any trailing
                 * data to make room.
                 */
                len = __pack_size(session, &pv);
                offset = WT_PTRDIFF(p, buf->mem);
                WT_RET(__wt_buf_grow(session,
                    buf, buf->size + len));
                p = (uint8_t *)buf->mem + offset;
                end = (uint8_t *)buf->mem + buf->size + len;
                /* Make room if we're inserting out-of-order. */
                if (offset + old_len < buf->size)
                    memmove(p + len, p + old_len,
                        buf->size - (offset + old_len));
                WT_RET(__pack_write(session, &pv, &p, len));
                buf->size += len;
                break;

            default:
                WT_RET_MSG(session, EINVAL,
                    "unexpected projection plan: %c",
                    (int)*proj);
            }
        }
    }

    return (0);
}

/*
 * __wt_schema_project_out --
 *	Given list of cursors and a projection, read columns from the
 *	dependent cursors and return them to the application.
 */
int
__wt_schema_project_out(WT_SESSION_IMPL *session,
    WT_CURSOR **cp, const char *proj_arg, va_list ap)
{
	WT_CURSOR *c;
	WT_DECL_PACK(pack);
	WT_DECL_PACK_VALUE(pv);
	u_long arg;
	char *proj;
	uint8_t *p, *end;

	p = end = NULL;	/* -Wuninitialized */

	/*
	 * Walk the projection plan: each entry is a decimal cursor-slot
	 * number followed by an operation character (WT_PROJ_*).  strtoul
	 * leaves "proj" pointing at the operation character.
	 */
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			/* Switch the unpack stream to this cursor's key. */
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				/*
				 * Record-number cursors store the key in
				 * cursor->recno; expose it through key.data
				 * and unpack with the "R" format.
				 */
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			p = (uint8_t *)c->key.data;
			end = p + c->key.size;
			continue;

		case WT_PROJ_VALUE:
			/* Switch the unpack stream to this cursor's value. */
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			p = (uint8_t *)c->value.data;
			end = p + c->value.size;
			continue;
		}

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_NEXT:
			case WT_PROJ_SKIP:
			case WT_PROJ_REUSE:
				/*
				 * All three operations advance through the
				 * packed stream; only WT_PROJ_NEXT hands the
				 * column to the application's varargs.
				 */
				WT_RET(__pack_next(&pack, &pv));
				WT_RET(__unpack_read(session, &pv,
				    (const uint8_t **)&p, (size_t)(end - p)));
				/* Only copy the value out once. */
				if (*proj != WT_PROJ_NEXT)
					break;
				WT_UNPACK_PUT(session, pv, ap);
				break;
			}
		}
	}

	return (0);
}

/*
 * __wt_schema_project_slice --
 *	Given list of cursors and a projection, read columns from the
 *	a raw buffer.
 */
int
__wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp,
    const char *proj_arg, int key_only, const char *vformat, WT_ITEM *value)
{
	WT_CURSOR *c;
	WT_DECL_ITEM(buf);
	WT_DECL_PACK(pack);
	WT_DECL_PACK_VALUE(pv);
	WT_DECL_PACK_VALUE(vpv);
	WT_PACK vpack;
	u_long arg;
	char *proj;
	uint8_t *end, *p;
	const uint8_t *next, *vp, *vend;
	size_t len, offset, old_len;
	int skip;

	p = end = NULL;	/* -Wuninitialized */

	/* "vpack" iterates the raw source buffer using the caller's format. */
	WT_RET(__pack_init(session, &vpack, vformat));
	vp = value->data;
	vend = vp + value->size;

	/* Reset any of the buffers we will be setting. */
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);
		if (*proj == WT_PROJ_KEY) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->key, 0));
		} else if (*proj == WT_PROJ_VALUE && !key_only) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->value, 0));
		}
	}

	/*
	 * When key_only is set, value columns are scanned but not written;
	 * "skip" tracks whether the current target buffer is being skipped.
	 */
	skip = key_only;
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			skip = 0;
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			buf = &c->key;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;

		case WT_PROJ_VALUE:
			if ((skip = key_only) != 0)
				continue;
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			buf = &c->value;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;
		}

		/* We have to get a key or value before any operations. */
		WT_ASSERT(session, skip || buf != NULL);

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_SKIP:
				if (skip)
					break;
				WT_RET(__pack_next(&pack, &pv));

				/*
				 * A nasty case: if we are inserting
				 * out-of-order, append a zero value to keep
				 * the buffer in the correct format.
				 */
				if (p == end) {
					/* Set up an empty value. */
					WT_CLEAR(pv.u);
					if (pv.type == 'S' || pv.type == 's')
						pv.u.s = "";

					len = __pack_size(session, &pv);
					WT_RET(__wt_buf_grow(session,
					    buf, buf->size + len));
					p = (uint8_t *)buf->data + buf->size;
					WT_RET(__pack_write(
					    session, &pv, &p, len));
					end = p;
					buf->size += len;
				} else
					WT_RET(__unpack_read(session,
					    &pv, (const uint8_t **)&p,
					    (size_t)(end - p)));
				break;

			case WT_PROJ_NEXT:
				/* Pull the next column from the raw source. */
				WT_RET(__pack_next(&vpack, &vpv));
				WT_RET(__unpack_read(session, &vpv,
				    &vp, (size_t)(vend - vp)));
				/* FALLTHROUGH */

			case WT_PROJ_REUSE:
				if (skip)
					break;

				/*
				 * Read the item we're about to overwrite.
				 *
				 * There is subtlety here: the value format
				 * may not exactly match the cursor's format.
				 * In particular, we need lengths with raw
				 * columns in the middle of a packed struct,
				 * but not if they are at the end of a struct.
				 */
				WT_RET(__pack_next(&pack, &pv));

				next = p;
				if (p < end)
					WT_RET(__unpack_read(session, &pv,
					    &next, (size_t)(end - p)));
				old_len = (size_t)(next - p);

				/* Make sure the types are compatible. */
				WT_ASSERT(session,
				    tolower(pv.type) == tolower(vpv.type));
				pv.u = vpv.u;

				len = __pack_size(session, &pv);
				offset = WT_PTRDIFF(p, buf->data);
				/*
				 * Avoid growing the buffer if the value fits.
				 * This is not just a performance issue: it
				 * covers the case of record number keys, which
				 * have to be written to cursor->recno.
				 */
				if (len > old_len)
					WT_RET(__wt_buf_grow(session,
					    buf, buf->size + len - old_len));
				/* Re-derive "p": the grow may reallocate. */
				p = (uint8_t *)buf->data + offset;
				/* Make room if we're inserting out-of-order. */
				if (offset + old_len < buf->size)
					memmove(p + len, p + old_len,
					    buf->size - (offset + old_len));
				WT_RET(__pack_write(session, &pv, &p, len));
				buf->size += len - old_len;
				end = (uint8_t *)buf->data + buf->size;
				break;
			default:
				WT_RET_MSG(session, EINVAL,
				    "unexpected projection plan: %c",
				    (int)*proj);
			}
		}
	}

	return (0);
}

/*
 * __wt_schema_project_merge --
 *	Given list of cursors and a projection, build a buffer containing the
 *	column values read from the cursors.
 */
int
__wt_schema_project_merge(WT_SESSION_IMPL *session,
    WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value)
{
	WT_CURSOR *c;
	WT_ITEM *buf;
	WT_DECL_PACK(pack);
	WT_DECL_PACK_VALUE(pv);
	WT_DECL_PACK_VALUE(vpv);
	WT_PACK vpack;
	u_long arg;
	char *proj;
	const uint8_t *p, *end;
	uint8_t *vp;
	size_t len;

	p = end = NULL;	/* -Wuninitialized */

	/* The output buffer is rebuilt from scratch in the target format. */
	WT_RET(__wt_buf_init(session, value, 0));
	WT_RET(__pack_init(session, &vpack, vformat));

	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			buf = &c->key;
			p = buf->data;
			end = p + buf->size;
			continue;

		case WT_PROJ_VALUE:
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			buf = &c->value;
			p = buf->data;
			end = p + buf->size;
			continue;
		}

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_NEXT:
			case WT_PROJ_SKIP:
			case WT_PROJ_REUSE:
				WT_RET(__pack_next(&pack, &pv));
				WT_RET(__unpack_read(session, &pv,
				    &p, (size_t)(end - p)));
				/* Only copy the value out once. */
				if (*proj != WT_PROJ_NEXT)
					break;

				WT_RET(__pack_next(&vpack, &vpv));
				/* Make sure the types are compatible. */
				WT_ASSERT(session,
				    tolower(pv.type) == tolower(vpv.type));
				vpv.u = pv.u;
				len = __pack_size(session, &vpv);
				WT_RET(__wt_buf_grow(session,
				    value, value->size + len));
				vp = (uint8_t *)value->mem + value->size;
				WT_RET(__pack_write(session, &vpv, &vp, len));
				value->size += len;
				break;
			}
		}
	}

	return (0);
}
diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c
new file mode 100644
index 00000000000..8605ea41c80
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_rename.c
@@ -0,0 +1,276 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __rename_file --
 *	WT_SESSION::rename for a file.
 */
static int
__rename_file(
    WT_SESSION_IMPL *session, const char *uri, const char *newuri)
{
	WT_DECL_RET;
	int exist;
	const char *filename, *newfile, *newvalue, *oldvalue;

	newvalue = oldvalue = NULL;

	filename = uri;
	newfile = newuri;
	if (!WT_PREFIX_SKIP(filename, "file:") ||
	    !WT_PREFIX_SKIP(newfile, "file:"))
		return (EINVAL);

	/* Close any btree handles in the file. */
	WT_ERR(__wt_conn_dhandle_close_all(session, uri, 0));

	/*
	 * First, check if the file being renamed exists in the system.  Doing
	 * this check first matches the table rename behavior because we return
	 * WT_NOTFOUND when the renamed file doesn't exist (subsequently mapped
	 * to ENOENT by the session layer).
	 */
	WT_ERR(__wt_metadata_search(session, uri, &oldvalue));

	/*
	 * Check to see if the proposed name is already in use, in either the
	 * metadata or the filesystem.
	 */
	switch (ret = __wt_metadata_search(session, newuri, &newvalue)) {
	case 0:
		WT_ERR_MSG(session, EEXIST, "%s", newuri);
		/* NOTREACHED */
	case WT_NOTFOUND:
		break;
	default:
		WT_ERR(ret);
	}
	WT_ERR(__wt_exist(session, newfile, &exist));
	if (exist)
		WT_ERR_MSG(session, EEXIST, "%s", newfile);

	/* Replace the old file entries with new file entries. */
	WT_ERR(__wt_metadata_remove(session, uri));
	WT_ERR(__wt_metadata_insert(session, newuri, oldvalue));

	/* Rename the underlying file. */
	WT_ERR(__wt_rename(session, filename, newfile));
	if (WT_META_TRACKING(session))
		WT_ERR(__wt_meta_track_fileop(session, uri, newuri));

err:	__wt_free(session, newvalue);
	__wt_free(session, oldvalue);
	return (ret);
}

/*
 * __rename_tree --
 *	Rename an index or colgroup reference.
 */
static int
__rename_tree(WT_SESSION_IMPL *session,
    WT_TABLE *table, const char *newuri, const char *name, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_DECL_ITEM(nn);
	WT_DECL_ITEM(ns);
	WT_DECL_ITEM(nv);
	WT_DECL_ITEM(os);
	WT_DECL_RET;
	const char *newname, *olduri, *suffix, *value;
	int is_colgroup;

	olduri = table->name;
	value = NULL;

	newname = newuri;
	(void)WT_PREFIX_SKIP(newname, "table:");

	/*
	 * Create the new data source URI and update the schema value.
	 *
	 * 'name' has the format (colgroup|index):<tablename>[:<suffix>];
	 * we need the suffix.
	 */
	is_colgroup = WT_PREFIX_MATCH(name, "colgroup:");
	if (!is_colgroup && !WT_PREFIX_MATCH(name, "index:"))
		WT_ERR_MSG(session, EINVAL,
		    "expected a 'colgroup:' or 'index:' source: '%s'", name);

	suffix = strchr(name, ':');
	/* An existing table should have a well formed name. */
	WT_ASSERT(session, suffix != NULL);
	suffix = strchr(suffix + 1, ':');

	WT_ERR(__wt_scr_alloc(session, 0, &nn));
	WT_ERR(__wt_buf_fmt(session, nn, "%s%s%s",
	    is_colgroup ? "colgroup:" : "index:",
	    newname,
	    (suffix == NULL) ? "" : suffix));

	/* Skip the colon, if any. */
	if (suffix != NULL)
		++suffix;

	/* Read the old schema value. */
	WT_ERR(__wt_metadata_search(session, name, &value));

	/*
	 * Calculate the new data source URI.  Use the existing table structure
	 * and substitute the new name temporarily.
	 */
	WT_ERR(__wt_scr_alloc(session, 0, &ns));
	table->name = newuri;
	if (is_colgroup)
		WT_ERR(__wt_schema_colgroup_source(
		    session, table, suffix, value, ns));
	else
		WT_ERR(__wt_schema_index_source(
		    session, table, suffix, value, ns));

	if ((ret = __wt_config_getones(session, value, "source", &cval)) != 0)
		WT_ERR_MSG(session, EINVAL,
		    "index or column group has no data source: %s", value);

	/* Take a copy of the old data source. */
	WT_ERR(__wt_scr_alloc(session, 0, &os));
	WT_ERR(__wt_buf_fmt(session, os, "%.*s", (int)cval.len, cval.str));

	/* Overwrite it with the new data source. */
	WT_ERR(__wt_scr_alloc(session, 0, &nv));
	WT_ERR(__wt_buf_fmt(session, nv, "%.*s%s%s",
	    (int)WT_PTRDIFF(cval.str, value), value,
	    (const char *)ns->data,
	    cval.str + cval.len));

	/*
	 * Remove the old metadata entry.
	 * Insert the new metadata entry.
	 */
	WT_ERR(__wt_metadata_remove(session, name));
	WT_ERR(__wt_metadata_insert(session, nn->data, nv->data));

	/* Rename the file. */
	WT_ERR(__wt_schema_rename(session, os->data, ns->data, cfg));

	/* NOTE: table->name is always restored to the old URI below. */
err:	__wt_scr_free(&nn);
	__wt_scr_free(&ns);
	__wt_scr_free(&nv);
	__wt_scr_free(&os);
	__wt_free(session, value);
	table->name = olduri;
	return (ret);
}

/*
 * __metadata_rename --
 *	Rename an entry in the metadata table.
 */
static int
__metadata_rename(WT_SESSION_IMPL *session, const char *uri, const char *newuri)
{
	WT_DECL_RET;
	const char *value;

	WT_RET(__wt_metadata_search(session, uri, &value));
	WT_ERR(__wt_metadata_remove(session, uri));
	WT_ERR(__wt_metadata_insert(session, newuri, value));

err:	__wt_free(session, value);
	return (ret);
}

/*
 * __rename_table --
 *	WT_SESSION::rename for a table.
 */
static int
__rename_table(WT_SESSION_IMPL *session,
    const char *uri, const char *newuri, const char *cfg[])
{
	WT_DECL_RET;
	WT_TABLE *table;
	u_int i;
	const char *oldname;

	oldname = uri;
	(void)WT_PREFIX_SKIP(oldname, "table:");

	WT_RET(__wt_schema_get_table(
	    session, oldname, strlen(oldname), 0, &table));

	/* Rename the column groups. */
	for (i = 0; i < WT_COLGROUPS(table); i++)
		WT_ERR(__rename_tree(session, table, newuri,
		    table->cgroups[i]->name, cfg));

	/* Rename the indices. */
	WT_ERR(__wt_schema_open_indices(session, table));
	for (i = 0; i < table->nindices; i++)
		WT_ERR(__rename_tree(session, table, newuri,
		    table->indices[i]->name, cfg));

	/*
	 * Discard the in-memory table before touching its metadata entry;
	 * NULL the pointer so the error path doesn't release it twice.
	 */
	__wt_schema_remove_table(session, table);
	table = NULL;

	/* Rename the table. */
	WT_ERR(__metadata_rename(session, uri, newuri));

err:	if (table != NULL)
		__wt_schema_release_table(session, table);
	return (ret);
}

/*
 * __wt_schema_rename --
 *	WT_SESSION::rename.
 */
int
__wt_schema_rename(WT_SESSION_IMPL *session,
    const char *uri, const char *newuri, const char *cfg[])
{
	WT_DATA_SOURCE *dsrc;
	WT_DECL_RET;
	const char *p, *t;

	/* The target type must match the source type. */
	for (p = uri, t = newuri; *p == *t && *p != ':'; ++p, ++t)
		;
	if (*p != ':' || *t != ':')
		WT_RET_MSG(session, EINVAL,
		    "rename target type must match URI: %s to %s", uri, newuri);

	/*
	 * We track rename operations, if we fail in the middle, we want to
	 * back it all out.
	 */
	WT_RET(__wt_meta_track_on(session));

	/* Dispatch on the URI prefix; custom data sources come last. */
	if (WT_PREFIX_MATCH(uri, "file:"))
		ret = __rename_file(session, uri, newuri);
	else if (WT_PREFIX_MATCH(uri, "lsm:"))
		ret = __wt_lsm_tree_rename(session, uri, newuri, cfg);
	else if (WT_PREFIX_MATCH(uri, "table:"))
		ret = __rename_table(session, uri, newuri, cfg);
	else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
		ret = dsrc->rename == NULL ?
		    __wt_object_unsupported(session, uri) :
		    dsrc->rename(dsrc,
		    &session->iface, uri, newuri, (WT_CONFIG_ARG *)cfg);
	else
		ret = __wt_bad_object_type(session, uri);

	/* Bump the schema generation so that stale data is ignored. */
	++S2C(session)->schema_gen;

	WT_TRET(__wt_meta_track_off(session, ret != 0));

	/* If we didn't find a metadata entry, map that error to ENOENT. */
	return (ret == WT_NOTFOUND ? ENOENT : ret);
}
diff --git a/src/third_party/wiredtiger/src/schema/schema_stat.c b/src/third_party/wiredtiger/src/schema/schema_stat.c
new file mode 100644
index 00000000000..cb8e7f6c418
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_stat.c
@@ -0,0 +1,114 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_curstat_colgroup_init --
 *	Initialize the statistics for a column group.
 */
int
__wt_curstat_colgroup_init(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
{
	WT_COLGROUP *colgroup;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;

	WT_RET(__wt_schema_get_colgroup(session, uri, NULL, &colgroup));

	/* Delegate to the underlying data source's statistics cursor. */
	WT_RET(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf, "statistics:%s", colgroup->source));
	ret = __wt_curstat_init(session, buf->data, cfg, cst);

err:	__wt_scr_free(&buf);
	return (ret);
}

/*
 * __wt_curstat_index_init --
 *	Initialize the statistics for an index.
 */
int
__wt_curstat_index_init(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
{
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_INDEX *idx;

	WT_RET(__wt_schema_get_index(session, uri, NULL, &idx));

	/* Delegate to the underlying data source's statistics cursor. */
	WT_RET(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf, "statistics:%s", idx->source));
	ret = __wt_curstat_init(session, buf->data, cfg, cst);

err:	__wt_scr_free(&buf);
	return (ret);
}

/*
 * __wt_curstat_table_init --
 *	Initialize the statistics for a table.
 */
int
__wt_curstat_table_init(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
{
	WT_CURSOR *stat_cursor;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_DSRC_STATS *new, *stats;
	WT_TABLE *table;
	u_int i;
	const char *name;

	name = uri + strlen("table:");
	WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table));

	WT_ERR(__wt_scr_alloc(session, 0, &buf));

	/*
	 * Process the column groups.
	 *
	 * Set the cursor to reference the data source statistics; we don't
	 * initialize it, instead we copy (rather than aggregate), the first
	 * column's statistics, which has the same effect.
	 */
	stats = &cst->u.dsrc_stats;
	for (i = 0; i < WT_COLGROUPS(table); i++) {
		WT_ERR(__wt_buf_fmt(
		    session, buf, "statistics:%s", table->cgroups[i]->name));
		WT_ERR(__wt_curstat_open(
		    session, buf->data, cfg, &stat_cursor));
		new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
		if (i == 0)
			*stats = *new;
		else
			__wt_stat_aggregate_dsrc_stats(new, stats);
		WT_ERR(stat_cursor->close(stat_cursor));
	}

	/* Process the indices. */
	WT_ERR(__wt_schema_open_indices(session, table));
	for (i = 0; i < table->nindices; i++) {
		WT_ERR(__wt_buf_fmt(
		    session, buf, "statistics:%s", table->indices[i]->name));
		WT_ERR(__wt_curstat_open(
		    session, buf->data, cfg, &stat_cursor));
		new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
		__wt_stat_aggregate_dsrc_stats(new, stats);
		WT_ERR(stat_cursor->close(stat_cursor));
	}

	__wt_curstat_dsrc_final(cst);

err:	__wt_schema_release_table(session, table);

	__wt_scr_free(&buf);
	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/schema/schema_truncate.c b/src/third_party/wiredtiger/src/schema/schema_truncate.c
new file mode 100644
index 00000000000..1da3b103f10
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_truncate.c
@@ -0,0 +1,183 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __truncate_file --
 *	WT_SESSION::truncate for a file.
 */
static int
__truncate_file(WT_SESSION_IMPL *session, const char *name)
{
	const char *filename;
	uint32_t allocsize;

	filename = name;
	if (!WT_PREFIX_SKIP(filename, "file:"))
		return (EINVAL);

	/* Open and lock the file. */
	WT_RET(__wt_session_get_btree(
	    session, name, NULL, NULL, WT_DHANDLE_EXCLUSIVE));

	/* Get the allocation size. */
	allocsize = S2BT(session)->allocsize;

	WT_RET(__wt_session_release_btree(session));

	/* Close any btree handles in the file. */
	WT_RET(__wt_conn_dhandle_close_all(session, name, 0));

	/* Delete the root address and truncate the file. */
	WT_RET(__wt_meta_checkpoint_clear(session, name));
	WT_RET(__wt_block_manager_truncate(session, filename, allocsize));

	return (0);
}

/*
 * __truncate_table --
 *	WT_SESSION::truncate for a table.
+ */ +static int +__truncate_table(WT_SESSION_IMPL *session, const char *name, const char *cfg[]) +{ + WT_DECL_RET; + WT_TABLE *table; + u_int i; + + WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table)); + + /* Truncate the column groups. */ + for (i = 0; i < WT_COLGROUPS(table); i++) + WT_ERR(__wt_schema_truncate( + session, table->cgroups[i]->source, cfg)); + + /* Truncate the indices. */ + WT_ERR(__wt_schema_open_indices(session, table)); + for (i = 0; i < table->nindices; i++) + WT_ERR(__wt_schema_truncate( + session, table->indices[i]->source, cfg)); + +err: __wt_schema_release_table(session, table); + return (ret); +} + +/* + * __truncate_dsrc -- + * WT_SESSION::truncate for a data-source without a truncate operation. + */ +static int +__truncate_dsrc(WT_SESSION_IMPL *session, const char *uri) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + const char *cfg[2]; + + /* Open a cursor and traverse the object, removing every entry. */ + cfg[0] = WT_CONFIG_BASE(session, session_open_cursor); + cfg[1] = NULL; + WT_RET(__wt_open_cursor(session, uri, NULL, cfg, &cursor)); + while ((ret = cursor->next(cursor)) == 0) + WT_ERR(cursor->remove(cursor)); + WT_ERR_NOTFOUND_OK(ret); + +err: WT_TRET(cursor->close(cursor)); + return (ret); +} + +/* + * __wt_schema_truncate -- + * WT_SESSION::truncate without a range. + */ +int +__wt_schema_truncate( + WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) +{ + WT_DATA_SOURCE *dsrc; + WT_DECL_RET; + const char *tablename; + + tablename = uri; + + if (WT_PREFIX_MATCH(uri, "file:")) { + ret = __truncate_file(session, uri); + } else if (WT_PREFIX_MATCH(uri, "lsm:")) + ret = __wt_lsm_tree_truncate(session, uri, cfg); + else if (WT_PREFIX_SKIP(tablename, "table:")) + ret = __truncate_table(session, tablename, cfg); + else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) + ret = dsrc->truncate == NULL ? 
+ __truncate_dsrc(session, uri) : + dsrc->truncate( + dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg); + else + ret = __wt_bad_object_type(session, uri); + + /* If we didn't find a metadata entry, map that error to ENOENT. */ + return (ret == WT_NOTFOUND ? ENOENT : ret); +} + +/* + * __wt_range_truncate -- + * Truncate of a cursor range, default implementation. + */ +int +__wt_range_truncate(WT_CURSOR *start, WT_CURSOR *stop) +{ + WT_DECL_RET; + int cmp; + + if (start == NULL) { + do { + WT_RET(stop->remove(stop)); + } while ((ret = stop->prev(stop)) == 0); + WT_RET_NOTFOUND_OK(ret); + } else { + cmp = -1; + do { + if (stop != NULL) + WT_RET(start->compare(start, stop, &cmp)); + WT_RET(start->remove(start)); + } while (cmp < 0 && (ret = start->next(start)) == 0); + WT_RET_NOTFOUND_OK(ret); + } + return (0); +} + +/* + * __wt_schema_range_truncate -- + * WT_SESSION::truncate with a range. + */ +int +__wt_schema_range_truncate( + WT_SESSION_IMPL *session, WT_CURSOR *start, WT_CURSOR *stop) +{ + WT_CURSOR *cursor; + WT_DATA_SOURCE *dsrc; + WT_DECL_RET; + const char *uri; + + cursor = (start != NULL) ? 
start : stop; + uri = cursor->internal_uri; + + if (WT_PREFIX_MATCH(uri, "file:")) + WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)cursor)->btree, + ret = __wt_btcur_range_truncate( + (WT_CURSOR_BTREE *)start, (WT_CURSOR_BTREE *)stop)); + else if (WT_PREFIX_MATCH(uri, "table:")) + ret = __wt_table_range_truncate( + (WT_CURSOR_TABLE *)start, (WT_CURSOR_TABLE *)stop); + else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL && + dsrc->range_truncate != NULL) + ret = dsrc->range_truncate(dsrc, &session->iface, start, stop); + else + ret = __wt_range_truncate(start, stop); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/schema/schema_util.c b/src/third_party/wiredtiger/src/schema/schema_util.c new file mode 100644 index 00000000000..263f56f1c41 --- /dev/null +++ b/src/third_party/wiredtiger/src/schema/schema_util.c @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_schema_get_source -- + * Find a matching data source or report an error. + */ +WT_DATA_SOURCE * +__wt_schema_get_source(WT_SESSION_IMPL *session, const char *name) +{ + WT_NAMED_DATA_SOURCE *ndsrc; + + TAILQ_FOREACH(ndsrc, &S2C(session)->dsrcqh, q) + if (WT_PREFIX_MATCH(name, ndsrc->prefix)) + return (ndsrc->dsrc); + return (NULL); +} + +/* + * __wt_str_name_check -- + * Disallow any use of the WiredTiger name space. + */ +int +__wt_str_name_check(WT_SESSION_IMPL *session, const char *str) +{ + const char *name, *sep; + int skipped; + + /* + * Check if name is somewhere in the WiredTiger name space: it would be + * "bad" if the application truncated the metadata file. Skip any + * leading URI prefix, check and then skip over a table name. 
+ */ + name = str; + for (skipped = 0; skipped < 2; skipped++) { + if ((sep = strchr(name, ':')) == NULL) + break; + + name = sep + 1; + if (WT_PREFIX_MATCH(name, "WiredTiger")) + WT_RET_MSG(session, EINVAL, + "%s: the \"WiredTiger\" name space may not be " + "used by applications", name); + } + + /* + * Disallow JSON quoting characters -- the config string parsing code + * supports quoted strings, but there's no good reason to use them in + * names and we're not going to do the testing. + */ + if (strpbrk(name, "{},:[]\\\"'") != NULL) + WT_RET_MSG(session, EINVAL, + "%s: WiredTiger objects should not include grouping " + "characters in their names", + name); + + return (0); +} + +/* + * __wt_name_check -- + * Disallow any use of the WiredTiger name space. + */ +int +__wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len) +{ + WT_DECL_RET; + WT_DECL_ITEM(tmp); + + WT_RET(__wt_scr_alloc(session, len, &tmp)); + + WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)len, str)); + + ret = __wt_str_name_check(session, tmp->data); + +err: __wt_scr_free(&tmp); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/schema/schema_worker.c b/src/third_party/wiredtiger/src/schema/schema_worker.c new file mode 100644 index 00000000000..8e7ed3925f6 --- /dev/null +++ b/src/third_party/wiredtiger/src/schema/schema_worker.c @@ -0,0 +1,134 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_schema_worker -- + * Get Btree handles for the object and cycle through calls to an + * underlying worker function with each handle. 
+ */ +int +__wt_schema_worker(WT_SESSION_IMPL *session, + const char *uri, + int (*file_func)(WT_SESSION_IMPL *, const char *[]), + int (*name_func)(WT_SESSION_IMPL *, const char *, int *), + const char *cfg[], uint32_t open_flags) +{ + WT_COLGROUP *colgroup; + WT_DATA_SOURCE *dsrc; + WT_DECL_RET; + WT_INDEX *idx; + WT_SESSION *wt_session; + WT_TABLE *table; + const char *tablename; + u_int i; + int skip; + + table = NULL; + tablename = uri; + + skip = 0; + if (name_func != NULL) + WT_ERR(name_func(session, uri, &skip)); + + /* If the callback said to skip this object, we're done. */ + if (skip) + return (0); + + /* Get the btree handle(s) and call the underlying function. */ + if (WT_PREFIX_MATCH(uri, "file:")) { + if (file_func != NULL) { + /* + * If the operation requires exclusive access, close + * any open file handles, including checkpoints. + */ + if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) + WT_ERR(__wt_conn_dhandle_close_all( + session, uri, 0)); + + WT_ERR(__wt_session_get_btree_ckpt( + session, uri, cfg, open_flags)); + ret = file_func(session, cfg); + WT_TRET(__wt_session_release_btree(session)); + } + } else if (WT_PREFIX_MATCH(uri, "colgroup:")) { + WT_ERR(__wt_schema_get_colgroup(session, uri, NULL, &colgroup)); + WT_ERR(__wt_schema_worker(session, colgroup->source, + file_func, name_func, cfg, open_flags)); + } else if (WT_PREFIX_SKIP(tablename, "index:")) { + idx = NULL; + WT_ERR(__wt_schema_get_index(session, uri, NULL, &idx)); + WT_ERR(__wt_schema_worker(session, idx->source, + file_func, name_func, cfg, open_flags)); + } else if (WT_PREFIX_MATCH(uri, "lsm:")) { + /* + * LSM compaction is handled elsewhere, but if we get here + * trying to compact files, don't descend into an LSM tree. 
+ */ + if (file_func != __wt_compact) + WT_ERR(__wt_lsm_tree_worker(session, + uri, file_func, name_func, cfg, open_flags)); + } else if (WT_PREFIX_SKIP(tablename, "table:")) { + WT_ERR(__wt_schema_get_table(session, + tablename, strlen(tablename), 0, &table)); + WT_ASSERT(session, session->dhandle == NULL); + + /* + * We could make a recursive call for each colgroup or index + * URI, but since we have already opened the table, we can take + * a short cut and skip straight to the sources. If we have a + * name function, it needs to know about the intermediate URIs. + */ + for (i = 0; i < WT_COLGROUPS(table); i++) { + colgroup = table->cgroups[i]; + skip = 0; + if (name_func != NULL) + WT_ERR(name_func( + session, colgroup->name, &skip)); + if (!skip) + WT_ERR(__wt_schema_worker( + session, colgroup->source, + file_func, name_func, cfg, open_flags)); + } + + WT_ERR(__wt_schema_open_indices(session, table)); + for (i = 0; i < table->nindices; i++) { + idx = table->indices[i]; + skip = 0; + if (name_func != NULL) + WT_ERR(name_func(session, idx->name, &skip)); + if (!skip) + WT_ERR(__wt_schema_worker(session, idx->source, + file_func, name_func, cfg, open_flags)); + } + } else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) { + wt_session = (WT_SESSION *)session; + if (file_func == __wt_compact && dsrc->compact != NULL) + WT_ERR(dsrc->compact( + dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); + else if (file_func == __wt_salvage && dsrc->salvage != NULL) + WT_ERR(dsrc->salvage( + dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); + else if (file_func == __wt_verify && dsrc->verify != NULL) + WT_ERR(dsrc->verify( + dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); + else if (file_func == __wt_checkpoint) + ; + else if (file_func == __wt_checkpoint_list) + ; + else if (file_func == __wt_checkpoint_sync) + ; + else + WT_ERR(__wt_object_unsupported(session, uri)); + } else + WT_ERR(__wt_bad_object_type(session, uri)); + +err: if (table != NULL) + 
	__wt_schema_release_table(session, table);
	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
new file mode 100644
index 00000000000..39b9dd0de61
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -0,0 +1,1054 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int __session_checkpoint(WT_SESSION *, const char *);
static int __session_rollback_transaction(WT_SESSION *, const char *);

/*
 * __wt_session_reset_cursors --
 *	Reset all open cursors.
 */
int
__wt_session_reset_cursors(WT_SESSION_IMPL *session)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;

	TAILQ_FOREACH(cursor, &session->cursors, q) {
		/*
		 * Stop when there are no positioned cursors.
		 * NOTE(review): assumes session->ncursors counts positioned
		 * cursors and is decremented by cursor reset -- confirm
		 * against the cursor implementations.
		 */
		if (session->ncursors == 0)
			break;
		WT_TRET(cursor->reset(cursor));
	}
	return (ret);
}

/*
 * __wt_session_copy_values --
 *	Copy values into all positioned cursors, so that they don't keep
 *	transaction IDs pinned.
 */
int
__wt_session_copy_values(WT_SESSION_IMPL *session)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;

	TAILQ_FOREACH(cursor, &session->cursors, q)
		if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) {
			/*
			 * Copy the internal (tree-referencing) value into
			 * cursor-owned memory and flip the flag to "external"
			 * so the cursor no longer pins the page/transaction.
			 */
			F_CLR(cursor, WT_CURSTD_VALUE_INT);
			WT_RET(__wt_buf_set(session, &cursor->value,
			    cursor->value.data, cursor->value.size));
			F_SET(cursor, WT_CURSTD_VALUE_EXT);
		}

	return (ret);
}

/*
 * __session_clear --
 *	Clear a session structure.
 */
static void
__session_clear(WT_SESSION_IMPL *session)
{
	/*
	 * There's no serialization support around the review of the hazard
	 * array, which means threads checking for hazard pointers first check
	 * the active field (which may be 0) and then use the hazard pointer
	 * (which cannot be NULL).
	 *
	 * Additionally, the session structure can include information that
	 * persists past the session's end-of-life, stored as part of page
	 * splits.
	 *
	 * For these reasons, be careful when clearing the session structure:
	 * only the prefix up to WT_SESSION_CLEAR_SIZE is zeroed, the rest is
	 * reset field-by-field.
	 */
	memset(session, 0, WT_SESSION_CLEAR_SIZE(session));
	session->hazard_size = 0;
	session->nhazard = 0;
}

/*
 * __session_close --
 *	WT_SESSION->close method.
 */
static int
__session_close(WT_SESSION *wt_session, const char *config)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	conn = (WT_CONNECTION_IMPL *)wt_session->connection;
	session = (WT_SESSION_IMPL *)wt_session;

	SESSION_API_CALL(session, close, config, cfg);
	/* Close takes no configuration beyond what the API macro checks. */
	WT_UNUSED(cfg);

	/* Rollback any active transaction. */
	if (F_ISSET(&session->txn, TXN_RUNNING))
		WT_TRET(__session_rollback_transaction(wt_session, NULL));

	/*
	 * Also release any pinned transaction ID from a non-transactional
	 * operation.
	 */
	if (conn->txn_global.states != NULL)
		__wt_txn_release_snapshot(session);

	/* Close all open cursors. */
	while ((cursor = TAILQ_FIRST(&session->cursors)) != NULL) {
		/*
		 * Notify the user that we are closing the cursor handle
		 * via the registered close callback.
		 */
		if (session->event_handler->handle_close != NULL)
			WT_TRET(session->event_handler->handle_close(
			    session->event_handler, wt_session, cursor));
		WT_TRET(cursor->close(cursor));
	}

	WT_ASSERT(session, session->ncursors == 0);

	/* Discard cached handles. */
	__wt_session_close_cache(session);

	/* Close all tables. */
	__wt_schema_close_tables(session);

	/* Discard metadata tracking. */
	__wt_meta_track_discard(session);

	/* Discard scratch buffers. */
	__wt_scr_discard(session);

	/* Free transaction information. */
	__wt_txn_destroy(session);

	/* Confirm we're not holding any hazard pointers. */
	__wt_hazard_close(session);

	/* Cleanup */
	if (session->block_manager_cleanup != NULL)
		WT_TRET(session->block_manager_cleanup(session));
	if (session->reconcile_cleanup != NULL)
		WT_TRET(session->reconcile_cleanup(session));

	/* Free the eviction exclusive-lock information. */
	__wt_free(session, session->excl);

	/* Destroy the thread's mutex. */
	WT_TRET(__wt_cond_destroy(session, &session->cond));

	/* The API lock protects opening and closing of sessions. */
	__wt_spin_lock(session, &conn->api_lock);

	/* Decrement the count of open sessions. */
	WT_STAT_FAST_CONN_DECR(session, session_open);

	/*
	 * Sessions are re-used, clear the structure: the clear sets the active
	 * field to 0, which will exclude the hazard array from review by the
	 * eviction thread. Because some session fields are accessed by other
	 * threads, the structure must be cleared carefully.
	 *
	 * We don't need to publish here, because regardless of the active field
	 * being non-zero, the hazard pointer is always valid.
	 */
	__session_clear(session);
	/*
	 * Switch to the default session: the handle we just cleared must not
	 * be used for the remaining unlock/return path.
	 */
	session = conn->default_session;

	/*
	 * Decrement the count of active sessions if that's possible: a session
	 * being closed may or may not be at the end of the array, step toward
	 * the beginning of the array until we reach an active session.
	 */
	while (conn->sessions[conn->session_cnt - 1].active == 0)
		if (--conn->session_cnt == 0)
			break;

	__wt_spin_unlock(session, &conn->api_lock);

err:	API_END_RET_NOTFOUND_MAP(session, ret);
}

/*
 * __session_reconfigure --
 *	WT_SESSION->reconfigure method.
 */
static int
__session_reconfigure(WT_SESSION *wt_session, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, reconfigure, config, cfg);

	/* Reconfiguring mid-transaction would change its semantics. */
	if (F_ISSET(&session->txn, TXN_RUNNING))
		WT_ERR_MSG(session, EINVAL, "transaction in progress");

	WT_TRET(__wt_session_reset_cursors(session));

	/*
	 * Pick up a new isolation level, if one was given; it applies both to
	 * the session and to its transaction state.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
	if (cval.len != 0)
		session->isolation = session->txn.isolation =
		    WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
		    TXN_ISO_SNAPSHOT :
		    WT_STRING_MATCH("read-uncommitted", cval.str, cval.len) ?
		    TXN_ISO_READ_UNCOMMITTED : TXN_ISO_READ_COMMITTED;

err:	API_END_RET_NOTFOUND_MAP(session, ret);
}

/*
 * __wt_open_cursor --
 *	Internal version of WT_SESSION::open_cursor.
 */
int
__wt_open_cursor(WT_SESSION_IMPL *session,
    const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_COLGROUP *colgroup;
	WT_DATA_SOURCE *dsrc;
	WT_DECL_RET;

	*cursorp = NULL;

	/*
	 * Open specific cursor types we know about, or call the generic data
	 * source open function.
	 *
	 * Unwind a set of string comparisons into a switch statement hoping
	 * the compiler can make it fast, but list the common choices first
	 * instead of sorting so if/else patterns are still fast.
	 */
	switch (uri[0]) {
	/*
	 * Common cursor types.
	 */
	case 't':
		if (WT_PREFIX_MATCH(uri, "table:"))
			WT_RET(__wt_curtable_open(session, uri, cfg, cursorp));
		break;
	case 'c':
		if (WT_PREFIX_MATCH(uri, "colgroup:")) {
			/*
			 * Column groups are a special case: open a cursor on
			 * the underlying data source.
			 */
			WT_RET(__wt_schema_get_colgroup(
			    session, uri, NULL, &colgroup));
			WT_RET(__wt_open_cursor(
			    session, colgroup->source, owner, cfg, cursorp));
		} else if (WT_PREFIX_MATCH(uri, "config:"))
			WT_RET(__wt_curconfig_open(
			    session, uri, cfg, cursorp));
		break;
	case 'i':
		if (WT_PREFIX_MATCH(uri, "index:"))
			WT_RET(__wt_curindex_open(
			    session, uri, owner, cfg, cursorp));
		break;
	case 'l':
		if (WT_PREFIX_MATCH(uri, "lsm:"))
			WT_RET(__wt_clsm_open(
			    session, uri, owner, cfg, cursorp));
		else if (WT_PREFIX_MATCH(uri, "log:"))
			WT_RET(__wt_curlog_open(session, uri, cfg, cursorp));
		break;

	/*
	 * Less common cursor types.
	 */
	case 'f':
		if (WT_PREFIX_MATCH(uri, "file:"))
			WT_RET(__wt_curfile_open(
			    session, uri, owner, cfg, cursorp));
		break;
	case 'm':
		if (WT_PREFIX_MATCH(uri, WT_METADATA_URI))
			WT_RET(__wt_curmetadata_open(
			    session, uri, owner, cfg, cursorp));
		break;
	case 'b':
		if (WT_PREFIX_MATCH(uri, "backup:"))
			WT_RET(__wt_curbackup_open(
			    session, uri, cfg, cursorp));
		break;
	case 's':
		if (WT_PREFIX_MATCH(uri, "statistics:"))
			WT_RET(__wt_curstat_open(session, uri, cfg, cursorp));
		break;
	default:
		break;
	}

	/* Fall back to any registered data source for unknown schemes. */
	if (*cursorp == NULL &&
	    (dsrc = __wt_schema_get_source(session, uri)) != NULL)
		WT_RET(dsrc->open_cursor == NULL ?
		    __wt_object_unsupported(session, uri) :
		    __wt_curds_open(session, uri, owner, cfg, dsrc, cursorp));

	if (*cursorp == NULL)
		return (__wt_bad_object_type(session, uri));

	/*
	 * When opening simple tables, the table code calls this function on the
	 * underlying data source, in which case the application's URI has been
	 * copied.
	 */
	if ((*cursorp)->uri == NULL &&
	    (ret = __wt_strdup(session, uri, &(*cursorp)->uri)) != 0)
		WT_TRET((*cursorp)->close(*cursorp));

	return (ret);
}

/*
 * __session_open_cursor --
 *	WT_SESSION->open_cursor method.
 */
static int
__session_open_cursor(WT_SESSION *wt_session,
    const char *uri, WT_CURSOR *to_dup, const char *config, WT_CURSOR **cursorp)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cursor = *cursorp = NULL;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, open_cursor, config, cfg);

	/* Exactly one of a URI or a cursor to duplicate must be passed. */
	if ((to_dup == NULL && uri == NULL) || (to_dup != NULL && uri != NULL))
		WT_ERR_MSG(session, EINVAL,
		    "should be passed either a URI or a cursor to duplicate, "
		    "but not both");

	if (to_dup != NULL) {
		uri = to_dup->uri;
		/*
		 * Duplication is only supported for the cursor types listed
		 * here, or for URIs backed by a registered data source.
		 */
		if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
		    !WT_PREFIX_MATCH(uri, "index:") &&
		    !WT_PREFIX_MATCH(uri, "file:") &&
		    !WT_PREFIX_MATCH(uri, "lsm:") &&
		    !WT_PREFIX_MATCH(uri, WT_METADATA_URI) &&
		    !WT_PREFIX_MATCH(uri, "table:") &&
		    __wt_schema_get_source(session, uri) == NULL)
			WT_ERR(__wt_bad_object_type(session, uri));
	}

	WT_ERR(__wt_open_cursor(session, uri, NULL, cfg, &cursor));
	if (to_dup != NULL)
		WT_ERR(__wt_cursor_dup_position(to_dup, cursor));

	*cursorp = cursor;

	/* On error, discard the partially-opened cursor. */
	if (0) {
err:		if (cursor != NULL)
			WT_TRET(cursor->close(cursor));
	}

	/*
	 * Opening a cursor on a non-existent data source will set ret to
	 * either of ENOENT or WT_NOTFOUND at this point. However,
	 * applications may reasonably do this inside a transaction to check
	 * for the existence of a table or index.
	 *
	 * Prefer WT_NOTFOUND here: that does not force running transactions to
	 * roll back. It will be mapped back to ENOENT.
	 */
	if (ret == ENOENT)
		ret = WT_NOTFOUND;

	API_END_RET_NOTFOUND_MAP(session, ret);
}

/*
 * __wt_session_create_strip --
 *	Discard any configuration information from a schema entry that is not
 * applicable to an session.create call, here for the wt dump command utility,
 * which only wants to dump the schema information needed for load.
 */
int
__wt_session_create_strip(WT_SESSION *wt_session,
    const char *v1, const char *v2, const char **value_ret)
{
	WT_SESSION_IMPL *session = (WT_SESSION_IMPL *)wt_session;
	/* Collapse against the base session.create config to drop extras. */
	const char *cfg[] =
	    { WT_CONFIG_BASE(session, session_create), v1, v2, NULL };

	return (__wt_config_collapse(session, cfg, value_ret));
}

/*
 * __session_create --
 *	WT_SESSION->create method.
 */
static int
__session_create(WT_SESSION *wt_session, const char *uri, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, create, config, cfg);
	WT_UNUSED(cfg);

	/* Disallow objects in the WiredTiger name space. */
	WT_ERR(__wt_str_name_check(session, uri));

	/*
	 * Type configuration only applies to tables, column groups and indexes.
	 * We don't want applications to attempt to layer LSM on top of their
	 * extended data-sources, and the fact we allow LSM as a valid URI is an
	 * invitation to that mistake: nip it in the bud.
	 */
	if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
	    !WT_PREFIX_MATCH(uri, "index:") &&
	    !WT_PREFIX_MATCH(uri, "table:")) {
		/*
		 * We can't disallow type entirely, a configuration string might
		 * innocently include it, for example, a dump/load pair. If the
		 * URI type prefix and the type are the same, let it go.
		 */
		if ((ret =
		    __wt_config_getones(session, config, "type", &cval)) == 0 &&
		    (strncmp(uri, cval.str, cval.len) != 0 ||
		    uri[cval.len] != ':'))
			WT_ERR_MSG(session, EINVAL,
			    "%s: unsupported type configuration", uri);
		/* A missing "type" key is fine; any other error is not. */
		WT_ERR_NOTFOUND_OK(ret);
	}

	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_schema_create(session, uri, config));

err:	API_END_RET_NOTFOUND_MAP(session, ret);
}

/*
 * __session_log_printf --
 *	WT_SESSION->log_printf method.
 */
static int
__session_log_printf(WT_SESSION *wt_session, const char *fmt, ...)
    WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
{
	WT_SESSION_IMPL *session;
	WT_DECL_RET;
	va_list ap;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL_NOCONF(session, log_printf);

	va_start(ap, fmt);
	ret = __wt_log_vprintf(session, fmt, ap);
	va_end(ap);

err:	API_END_RET(session, ret);
}

/*
 * __session_rename --
 *	WT_SESSION->rename method.
 */
static int
__session_rename(WT_SESSION *wt_session,
    const char *uri, const char *newuri, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, rename, config, cfg);

	/* Disallow objects in the WiredTiger name space. */
	WT_ERR(__wt_str_name_check(session, uri));
	WT_ERR(__wt_str_name_check(session, newuri));

	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_schema_rename(session, uri, newuri, cfg));

err:	API_END_RET_NOTFOUND_MAP(session, ret);
}

/*
 * __session_compact --
 *	WT_SESSION->compact method.
 */
static int
__session_compact(WT_SESSION *wt_session, const char *uri, const char *config)
{
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;

	/* Disallow objects in the WiredTiger name space. */
	WT_RET(__wt_str_name_check(session, uri));

	if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
	    !WT_PREFIX_MATCH(uri, "file:") &&
	    !WT_PREFIX_MATCH(uri, "index:") &&
	    !WT_PREFIX_MATCH(uri, "lsm:") &&
	    !WT_PREFIX_MATCH(uri, "table:"))
		return (__wt_bad_object_type(session, uri));

	/*
	 * NOTE(review): unlike the other methods there is no SESSION_API_CALL
	 * here; presumably __wt_session_compact performs its own API setup --
	 * confirm against session_compact.c.
	 */
	return (__wt_session_compact(wt_session, uri, config));
}

/*
 * __session_drop --
 *	WT_SESSION->drop method.
 */
static int
__session_drop(WT_SESSION *wt_session, const char *uri, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, drop, config, cfg);

	/* Disallow objects in the WiredTiger name space.
*/ + WT_ERR(__wt_str_name_check(session, uri)); + + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_schema_drop(session, uri, cfg)); + +err: /* Note: drop operations cannot be unrolled (yet?). */ + API_END_RET_NOTFOUND_MAP(session, ret); +} + +/* + * __session_salvage -- + * WT_SESSION->salvage method. + */ +static int +__session_salvage(WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + + SESSION_API_CALL(session, salvage, config, cfg); + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_schema_worker(session, uri, __wt_salvage, + NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_SALVAGE)); + +err: API_END_RET_NOTFOUND_MAP(session, ret); +} + +/* + * __session_truncate -- + * WT_SESSION->truncate method. + */ +static int +__session_truncate(WT_SESSION *wt_session, + const char *uri, WT_CURSOR *start, WT_CURSOR *stop, const char *config) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + WT_CURSOR *cursor; + int cmp; + + session = (WT_SESSION_IMPL *)wt_session; + SESSION_TXN_API_CALL(session, truncate, config, cfg); + + /* + * If the URI is specified, we don't need a start/stop, if start/stop + * is specified, we don't need a URI. + * + * If no URI is specified, and both cursors are specified, start/stop + * must reference the same object. + * + * Any specified cursor must have been initialized. + */ + if ((uri == NULL && start == NULL && stop == NULL) || + (uri != NULL && (start != NULL || stop != NULL))) + WT_ERR_MSG(session, EINVAL, + "the truncate method should be passed either a URI or " + "start/stop cursors, but not both"); + + if (uri != NULL) { + /* Disallow objects in the WiredTiger name space. */ + WT_ERR(__wt_str_name_check(session, uri)); + + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_schema_truncate(session, uri, cfg)); + goto done; + } + + /* + * Cursor truncate is only supported for some objects, check for the + * supporting methods we need, range_truncate and compare. 
+ */ + cursor = start == NULL ? stop : start; + if (cursor->compare == NULL) + WT_ERR(__wt_bad_object_type(session, cursor->uri)); + + /* + * If both cursors set, check they're correctly ordered with respect to + * each other. We have to test this before any search, the search can + * change the initial cursor position. + * + * Rather happily, the compare routine will also confirm the cursors + * reference the same object and the keys are set. + */ + if (start != NULL && stop != NULL) { + WT_ERR(start->compare(start, stop, &cmp)); + if (cmp > 0) + WT_ERR_MSG(session, EINVAL, + "the start cursor position is after the stop " + "cursor position"); + } + + /* + * Truncate does not require keys actually exist so that applications + * can discard parts of the object's name space without knowing exactly + * what records currently appear in the object. For this reason, do a + * search-near, rather than a search. Additionally, we have to correct + * after calling search-near, to position the start/stop cursors on the + * next record greater than/less than the original key. If the cursors + * hit the beginning/end of the object, or the start/stop keys cross, + * we're done, the range must be empty. + */ + if (start != NULL) { + WT_ERR(start->search_near(start, &cmp)); + if (cmp < 0 && (ret = start->next(start)) != 0) { + WT_ERR_NOTFOUND_OK(ret); + goto done; + } + } + if (stop != NULL) { + WT_ERR(stop->search_near(stop, &cmp)); + if (cmp > 0 && (ret = stop->prev(stop)) != 0) { + WT_ERR_NOTFOUND_OK(ret); + goto done; + } + + if (start != NULL) { + WT_ERR(start->compare(start, stop, &cmp)); + if (cmp > 0) + goto done; + } + } + + WT_ERR(__wt_schema_range_truncate(session, start, stop)); + +done: +err: TXN_API_END_RETRY(session, ret, 0); + return ((ret) == WT_NOTFOUND ? ENOENT : (ret)); +} + +/* + * __session_upgrade -- + * WT_SESSION->upgrade method. 
 */
static int
__session_upgrade(WT_SESSION *wt_session, const char *uri, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;

	SESSION_API_CALL(session, upgrade, config, cfg);
	/* Upgrade requires exclusive access to the underlying btree. */
	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_schema_worker(session, uri, __wt_upgrade,
	    NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_UPGRADE));

err:	API_END_RET_NOTFOUND_MAP(session, ret);
}

/*
 * __session_verify --
 *	WT_SESSION->verify method.
 */
static int
__session_verify(WT_SESSION *wt_session, const char *uri, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;

	SESSION_API_CALL(session, verify, config, cfg);
	/* Verify also takes the handle exclusive, mirroring upgrade/salvage. */
	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_schema_worker(session, uri, __wt_verify,
	    NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_VERIFY));

err:	API_END_RET_NOTFOUND_MAP(session, ret);
}

/*
 * __session_begin_transaction --
 *	WT_SESSION->begin_transaction method.
 */
static int
__session_begin_transaction(WT_SESSION *wt_session, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, begin_transaction, config, cfg);
	WT_STAT_FAST_CONN_INCR(session, txn_begin);

	/* Nested transactions are not supported. */
	if (F_ISSET(&session->txn, TXN_RUNNING))
		WT_ERR_MSG(session, EINVAL, "Transaction already running");

	/*
	 * There is no transaction active in this thread; check if the cache is
	 * full, if we have to block for eviction, this is the best time to do
	 * it.
	 */
	WT_ERR(__wt_cache_full_check(session));

	ret = __wt_txn_begin(session, cfg);

err:	API_END_RET(session, ret);
}

/*
 * __session_commit_transaction --
 *	WT_SESSION->commit_transaction method.
 */
static int
__session_commit_transaction(WT_SESSION *wt_session, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, commit_transaction, config, cfg);
	WT_STAT_FAST_CONN_INCR(session, txn_commit);

	/* A transaction that saw an error can only be rolled back. */
	txn = &session->txn;
	if (F_ISSET(txn, TXN_ERROR)) {
		__wt_errx(session, "failed transaction requires rollback");
		ret = EINVAL;
	}

	if (ret == 0)
		ret = __wt_txn_commit(session, cfg);
	else {
		/* Commit refused: reset cursors and roll back instead. */
		WT_TRET(__wt_session_reset_cursors(session));
		WT_TRET(__wt_txn_rollback(session, cfg));
	}

err:	API_END_RET(session, ret);
}

/*
 * __session_rollback_transaction --
 *	WT_SESSION->rollback_transaction method.
 */
static int
__session_rollback_transaction(WT_SESSION *wt_session, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, rollback_transaction, config, cfg);
	WT_STAT_FAST_CONN_INCR(session, txn_rollback);

	/* Unposition cursors before discarding the transaction's state. */
	WT_TRET(__wt_session_reset_cursors(session));

	WT_TRET(__wt_txn_rollback(session, cfg));

err:	API_END_RET(session, ret);
}

/*
 * __session_transaction_pinned_range --
 *	WT_SESSION->transaction_pinned_range method.
 */
static int
__session_transaction_pinned_range(WT_SESSION *wt_session, uint64_t *prange)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_TXN_STATE *txn_state;
	uint64_t pinned;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL_NOCONF(session, pinned_range);

	txn_state = WT_SESSION_TXN_STATE(session);

	/* Assign pinned to the lesser of id or snap_min */
	if (txn_state->id != WT_TXN_NONE &&
	    TXNID_LT(txn_state->id, txn_state->snap_min))
		pinned = txn_state->id;
	else
		pinned = txn_state->snap_min;

	/* Report the distance from the global current transaction ID. */
	if (pinned == WT_TXN_NONE)
		*prange = 0;
	else
		*prange = S2C(session)->txn_global.current - pinned;

err:	API_END_RET(session, ret);
}

/*
 * __session_checkpoint --
 *	WT_SESSION->checkpoint method.
 */
static int
__session_checkpoint(WT_SESSION *wt_session, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;

	session = (WT_SESSION_IMPL *)wt_session;

	txn = &session->txn;

	WT_STAT_FAST_CONN_INCR(session, txn_checkpoint);
	SESSION_API_CALL(session, checkpoint, config, cfg);

	/*
	 * Checkpoints require a snapshot to write a transactionally consistent
	 * snapshot of the data.
	 *
	 * We can't use an application's transaction: if it has uncommitted
	 * changes, they will be written in the checkpoint and may appear after
	 * a crash.
	 *
	 * Use a real snapshot transaction: we don't want any chance of the
	 * snapshot being updated during the checkpoint. Eviction is prevented
	 * from evicting anything newer than this because we track the oldest
	 * transaction ID in the system that is not visible to all readers.
	 */
	if (F_ISSET(txn, TXN_RUNNING))
		WT_ERR_MSG(session, EINVAL,
		    "Checkpoint not permitted in a transaction");

	/*
	 * Reset open cursors. Do this explicitly, even though it will happen
	 * implicitly in the call to begin_transaction for the checkpoint, the
	 * checkpoint code will acquire the schema lock before we do that, and
	 * some implementation of WT_CURSOR::reset might need the schema lock.
	 */
	WT_ERR(__wt_session_reset_cursors(session));

	/*
	 * Don't highjack the session checkpoint thread for eviction.
	 *
	 * Application threads are not generally available for potentially slow
	 * operations, but checkpoint does enough I/O it may be called upon to
	 * perform slow operations for the block manager.
	 */
	F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);

	/*
	 * Only one checkpoint can be active at a time, and checkpoints must run
	 * in the same order as they update the metadata. It's probably a bad
	 * idea to run checkpoints out of multiple threads, but serialize them
	 * here to ensure we don't get into trouble.
	 */
	WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 1);
	__wt_spin_lock(session, &S2C(session)->checkpoint_lock);

	ret = __wt_txn_checkpoint(session, cfg);

	WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0);
	__wt_spin_unlock(session, &S2C(session)->checkpoint_lock);

	/* Clear the wait flags even on the error path. */
err:	F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);

	API_END_RET_NOTFOUND_MAP(session, ret);
}

/*
 * __wt_open_internal_session --
 *	Allocate a session for WiredTiger's use.
 */
int
__wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name,
    int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp)
{
	WT_SESSION_IMPL *session;

	*sessionp = NULL;

	WT_RET(__wt_open_session(conn, NULL, NULL, &session));
	session->name = name;

	/*
	 * Public sessions are automatically closed during WT_CONNECTION->close.
	 * If the session handles for internal threads were to go on the public
	 * list, there would be complex ordering issues during close.
	 * Set a flag to avoid this: internal sessions are not closed
	 * automatically.
	 */
	F_SET(session, WT_SESSION_INTERNAL);

	/*
	 * Some internal threads must keep running after we close all data
	 * handles. Make sure these threads don't open their own handles.
	 */
	if (!uses_dhandles)
		F_SET(session, WT_SESSION_NO_DATA_HANDLES);

	/*
	 * Acquiring the metadata handle requires the schema lock; we've seen
	 * problems in the past where a worker thread has acquired the schema
	 * lock unexpectedly, relatively late in the run, and deadlocked. Be
	 * defensive, get it now. The metadata file may not exist when the
	 * connection first creates its default session or the shared cache
	 * pool creates its sessions, let our caller decline this work.
	 */
	if (open_metadata) {
		WT_ASSERT(session, !F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
		WT_RET(__wt_metadata_open(session));
	}

	*sessionp = session;
	return (0);
}

/*
 * __wt_open_session --
 *	Allocate a session handle. The internal parameter is used for sessions
 * opened by WiredTiger for its own use.
 */
int
__wt_open_session(WT_CONNECTION_IMPL *conn,
    WT_EVENT_HANDLER *event_handler, const char *config,
    WT_SESSION_IMPL **sessionp)
{
	/* The method table shared by every session handle. */
	static const WT_SESSION stds = {
		NULL,
		__session_close,
		__session_reconfigure,
		__session_open_cursor,
		__session_create,
		__session_compact,
		__session_drop,
		__session_log_printf,
		__session_rename,
		__session_salvage,
		__session_truncate,
		__session_upgrade,
		__session_verify,
		__session_begin_transaction,
		__session_commit_transaction,
		__session_rollback_transaction,
		__session_checkpoint,
		__session_transaction_pinned_range
	};
	WT_DECL_RET;
	WT_SESSION_IMPL *session, *session_ret;
	uint32_t i;

	*sessionp = NULL;

	session = conn->default_session;
	session_ret = NULL;

	__wt_spin_lock(session, &conn->api_lock);

	/*
	 * Make sure we don't try to open a new session after the application
	 * closes the connection. This is particularly intended to catch
	 * cases where server threads open sessions.
	 */
	WT_ASSERT(session, F_ISSET(conn, WT_CONN_SERVER_RUN));

	/* Find the first inactive session slot. */
	for (session_ret = conn->sessions,
	    i = 0; i < conn->session_size; ++session_ret, ++i)
		if (!session_ret->active)
			break;
	if (i == conn->session_size)
		WT_ERR_MSG(session, ENOMEM,
		    "only configured to support %" PRIu32 " sessions"
		    " (including %" PRIu32 " internal)",
		    conn->session_size, WT_NUM_INTERNAL_SESSIONS);

	/*
	 * If the active session count is increasing, update it. We don't worry
	 * about correcting the session count on error, as long as we don't mark
	 * this session as active, we'll clean it up on close.
	 */
	if (i >= conn->session_cnt)	/* Defend against off-by-one errors. */
		conn->session_cnt = i + 1;

	session_ret->id = i;
	session_ret->iface = stds;
	session_ret->iface.connection = &conn->iface;

	WT_ERR(__wt_cond_alloc(session, "session", 0, &session_ret->cond));

	__wt_random_init(session_ret->rnd);

	__wt_event_handler_set(session_ret,
	    event_handler == NULL ? session->event_handler : event_handler);

	TAILQ_INIT(&session_ret->cursors);
	SLIST_INIT(&session_ret->dhandles);

	/* Initialize transaction support: default to read-committed. */
	session_ret->isolation = TXN_ISO_READ_COMMITTED;
	WT_ERR(__wt_txn_init(session_ret));

	/*
	 * The session's hazard pointer memory isn't discarded during normal
	 * session close because access to it isn't serialized. Allocate the
	 * first time we open this session.
	 */
	if (session_ret->hazard == NULL)
		WT_ERR(__wt_calloc_def(
		    session, conn->hazard_max, &session_ret->hazard));

	/*
	 * Set an initial size for the hazard array. It will be grown as
	 * required up to hazard_max. The hazard_size is reset on close, since
	 * __wt_hazard_close ensures the array is cleared - so it is safe to
	 * reset the starting size on each open.
	 */
	session_ret->hazard_size = WT_HAZARD_INCR;

	/*
	 * Configuration: currently, the configuration for open_session is the
	 * same as session.reconfigure, so use that function.
	 */
	if (config != NULL)
		WT_ERR(
		    __session_reconfigure((WT_SESSION *)session_ret, config));

	session_ret->name = NULL;

	/*
	 * Publish: make the entry visible to server threads. There must be a
	 * barrier for two reasons, to ensure structure fields are set before
	 * any other thread will consider the session, and to push the session
	 * count to ensure the eviction thread can't review too few slots.
	 */
	WT_PUBLISH(session_ret->active, 1);

	WT_STATIC_ASSERT(offsetof(WT_SESSION_IMPL, iface) == 0);
	*sessionp = session_ret;

	WT_STAT_FAST_CONN_INCR(session, session_open);

err:	__wt_spin_unlock(session, &conn->api_lock);
	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c
new file mode 100644
index 00000000000..6eca8a58d13
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_compact.c
@@ -0,0 +1,236 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * Compaction is the place where the underlying block manager becomes visible
 * in the higher engine btree and API layers. As there is currently only one
 * block manager, this code is written with it in mind: other block managers
 * may need changes to support compaction, and a smart block manager might need
 * far less support from the engine.
 *
 * First, the default block manager cannot entirely own compaction because it
 * has no way to find a block after it moves other than a request from the
 * btree layer with the new address. In other words, if internal page X points
 * to leaf page Y, and page Y moves, the address of page Y has to be updated in
 * page X.
Generally, this is solved by building a translation layer in the + * block manager so internal pages don't require updates to relocate blocks: + * however, the translation table must be durable, has its own garbage + * collection issues and might be slower, all of which have their own problems. + * + * Second, the btree layer cannot entirely own compaction because page + * addresses are opaque, it cannot know where a page is in the file from the + * address cookie. + * + * For these reasons, compaction is a cooperative process between the btree + * layer and the block manager. The btree layer walks files, and asks the + * block manager if rewriting a particular block would reduce the file + * footprint: if writing the page will help, the page is marked dirty so it + * will eventually be written. As pages are written, the original page + * potentially becomes available for reuse and if enough pages at the end of + * the file are available for reuse, the file can be truncated, and compaction + * succeeds. + * + * However, writing a page is not by itself sufficient to make a page available + * for reuse. The original version of the page is still referenced by at least + * the most recent checkpoint in the file. To make a page available for reuse, + * we have to checkpoint the file so we can discard the checkpoint referencing + * the original version of the block; once no checkpoint references a block, it + * becomes available for reuse. + * + * Compaction is not necessarily possible in WiredTiger, even in a file with + * lots of available space. If a block at the end of the file is referenced by + * a named checkpoint, there is nothing we can do to compact the file, no + * matter how many times we rewrite the block, the named checkpoint can't be + * discarded and so the reference count on the original block will never go to + * zero. 
What's worse, because the block manager doesn't reference count + * blocks, it can't easily know this is the case, and so we'll waste a lot of + * effort trying to compact files that can't be compacted. + * + * Now, to the actual process. First, we checkpoint the high-level object + * (which is potentially composed of multiple files): there are potentially + * many dirty blocks in the cache, and we want to write them out and then + * discard previous checkpoints so we have as many blocks as possible on the + * file's "available for reuse" list when we start compaction. + * + * Then, we compact the high-level object. + * + * Compacting the object is done 10% at a time, that is, we try and move blocks + * from the last 10% of the file into the beginning of the file (the 10% is + * hard coded in the block manager). The reason for this is because we are + * walking the file in logical order, not block offset order, and we can fail + * to compact a file if we write the wrong blocks first. + * + * For example, imagine a file with 10 blocks in the first 10% of a file, 1,000 + * blocks in the 3rd quartile of the file, and 10 blocks in the last 10% of the + * file. If we were to rewrite blocks from more than the last 10% of the file, + * and found the 1,000 blocks in the 3rd quartile of the file first, we'd copy + * 10 of them without ever rewriting the blocks from the end of the file which + * would allow us to compact the file. So, we compact the last 10% of the + * file, and if that works, we compact the last 10% of the file again, and so + * on. Note the block manager uses a first-fit block selection algorithm + * during compaction to maximize block movement. + * + * After each 10% compaction, we checkpoint two more times (seriously, twice). 
+ * The second and third checkpoints are because the block manager checkpoints
+ * in two steps: blocks made available for reuse during a checkpoint are put on
+ * a special checkpoint-available list and only moved to the real available
+ * list after the metadata has been updated with the new checkpoint's
+ * information.  (Otherwise it is possible to allocate a rewritten block, crash
+ * before the metadata is updated, and see corruption.)  For this reason,
+ * blocks allocated to write the checkpoint itself cannot be taken from the
+ * blocks made available by the checkpoint.
+ *
+ * To say it another way, the second checkpoint puts the blocks from the end of
+ * the file that were made available by compaction onto the checkpoint-available
+ * list, but then potentially writes the checkpoint itself at the end of the
+ * file, which would prevent any file truncation.  When the metadata is updated
+ * for the second checkpoint, the blocks freed by compaction become available
+ * for the third checkpoint, so the third checkpoint's blocks are written
+ * towards the beginning of the file, and then the file can be truncated.
+ */
+
+/*
+ * __wt_compact_uri_analyze --
+ *	Extract information relevant to deciding what work compact needs to
+ *	do from a URI that is part of a table schema.
+ *	Called via the schema_worker function.
+ *	Increments the session's LSM or file counter for the URI and sets
+ *	*skip for LSM trees so the walk doesn't descend into their chunks.
+ */
+int
+__wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip)
+{
+	/*
+	 * Add references to schema URI objects to the list of objects to be
+	 * compacted.  Skip over LSM trees or we will get false positives on
+	 * the "file:" URIs for the chunks.
+	 */
+	if (WT_PREFIX_MATCH(uri, "lsm:")) {
+		session->compact->lsm_count++;
+		*skip = 1;
+	} else if (WT_PREFIX_MATCH(uri, "file:"))
+		session->compact->file_count++;
+
+	return (0);
+}
+
+/*
+ * __session_compact_check_timeout --
+ *	Check if the timeout has been exceeded.
+ */ +static int +__session_compact_check_timeout( + WT_SESSION_IMPL *session, struct timespec begin) +{ + struct timespec end; + + if (session->compact->max_time == 0) + return (0); + + WT_RET(__wt_epoch(session, &end)); + if (session->compact->max_time < + WT_TIMEDIFF(end, begin) / WT_BILLION) + WT_RET(ETIMEDOUT); + return (0); +} + +/* + * __compact_file -- + * Function to alternate between checkpoints and compaction calls. + */ +static int +__compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) +{ + WT_DECL_RET; + WT_DECL_ITEM(t); + WT_SESSION *wt_session; + WT_TXN *txn; + int i; + struct timespec start_time; + + txn = &session->txn; + wt_session = &session->iface; + + /* + * File compaction requires checkpoints, which will fail in a + * transactional context. Check now so the error message isn't + * confusing. + */ + if (session->compact->file_count != 0 && F_ISSET(txn, TXN_RUNNING)) + WT_ERR_MSG(session, EINVAL, + " File compaction not permitted in a transaction"); + + /* + * Force the checkpoint: we don't want to skip it because the work we + * need to have done is done in the underlying block manager. + */ + WT_ERR(__wt_scr_alloc(session, 128, &t)); + WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri)); + + WT_ERR(__wt_epoch(session, &start_time)); + + /* + * We compact 10% of the file on each pass, try 10 times (which is + * probably overkill), and quit if we make no progress. Check for a + * timeout each time through the loop. 
+ */ + for (i = 0; i < 10; ++i) { + WT_ERR(wt_session->checkpoint(wt_session, t->data)); + + session->compaction = 0; + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_schema_worker( + session, uri, __wt_compact, NULL, cfg, 0)); + WT_ERR(ret); + if (!session->compaction) + break; + + WT_ERR(wt_session->checkpoint(wt_session, t->data)); + WT_ERR(wt_session->checkpoint(wt_session, t->data)); + WT_ERR(__session_compact_check_timeout(session, start_time)); + } + +err: __wt_scr_free(&t); + return (ret); +} + +/* + * __wt_session_compact -- + */ +int +__wt_session_compact( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_COMPACT compact; + WT_CONFIG_ITEM cval; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + SESSION_API_CALL(session, compact, config, cfg); + + /* Setup the structure in the session handle */ + memset(&compact, 0, sizeof(WT_COMPACT)); + session->compact = &compact; + + WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval)); + session->compact->max_time = (uint64_t)cval.val; + + /* Find the types of data sources are being compacted. */ + WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker( + session, uri, NULL, __wt_compact_uri_analyze, cfg, 0)); + WT_ERR(ret); + + if (session->compact->lsm_count != 0) + WT_ERR(__wt_schema_worker( + session, uri, NULL, __wt_lsm_compact, cfg, 0)); + if (session->compact->file_count != 0) + WT_ERR(__compact_file(session, uri, cfg)); + +err: session->compact = NULL; + API_END_RET_NOTFOUND_MAP(session, ret); +} diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c new file mode 100644 index 00000000000..0c07e5fa259 --- /dev/null +++ b/src/third_party/wiredtiger/src/session/session_dhandle.c @@ -0,0 +1,478 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_session_dhandle_incr_use --
+ *	Increment the session data source's in-use counter.
+ *	The increment is done via the WT_ATOMIC_ADD4 macro; presumably the
+ *	counter is shared between sessions -- confirm against callers.
+ */
+void
+__wt_session_dhandle_incr_use(WT_SESSION_IMPL *session)
+{
+	WT_DATA_HANDLE *dhandle;
+
+	dhandle = session->dhandle;
+
+	(void)WT_ATOMIC_ADD4(dhandle->session_inuse, 1);
+}
+
+/*
+ * __session_dhandle_decr_use --
+ *	Decrement the session data source's in-use counter.
+ */
+static int
+__session_dhandle_decr_use(WT_SESSION_IMPL *session)
+{
+	WT_DATA_HANDLE *dhandle;
+	WT_DECL_RET;
+
+	dhandle = session->dhandle;
+
+	/*
+	 * Decrement the in-use count on the underlying data-source -- if we're
+	 * the last reference, set the time-of-death timestamp.
+	 */
+	WT_ASSERT(session, dhandle->session_inuse > 0);
+	if (WT_ATOMIC_SUB4(dhandle->session_inuse, 1) == 0)
+		WT_TRET(__wt_seconds(session, &dhandle->timeofdeath));
+	return (0);
+}
+
+/*
+ * __session_add_btree --
+ *	Add a handle to the session's cache.
+ *	Optionally returns the new cache entry through dhandle_cachep.
+ */
+static int
+__session_add_btree(
+    WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE **dhandle_cachep)
+{
+	WT_DATA_HANDLE_CACHE *dhandle_cache;
+
+	WT_RET(__wt_calloc_def(session, 1, &dhandle_cache));
+	dhandle_cache->dhandle = session->dhandle;
+
+	/* New entries go at the head of the session's handle list. */
+	SLIST_INSERT_HEAD(&session->dhandles, dhandle_cache, l);
+
+	if (dhandle_cachep != NULL)
+		*dhandle_cachep = dhandle_cache;
+
+	return (0);
+}
+
+/*
+ * __wt_session_lock_btree --
+ *	Lock a btree handle.
+ *	Returns 0 with the lock held, EBUSY if special flags conflict, or
+ *	WT_NOTFOUND (with no lock held) if the handle must be (re)opened.
+ */
+int
+__wt_session_lock_btree(WT_SESSION_IMPL *session, uint32_t flags)
+{
+	enum { NOLOCK, READLOCK, WRITELOCK } locked;
+	WT_BTREE *btree;
+	WT_DATA_HANDLE *dhandle;
+	uint32_t special_flags;
+
+	btree = S2BT(session);
+	dhandle = session->dhandle;
+	locked = NOLOCK;
+
+	/*
+	 * Special operation flags will cause the handle to be reopened.
+	 * For example, a handle opened with WT_BTREE_BULK cannot use the same
+	 * internal data structures as a handle opened for ordinary access.
+	 */
+	special_flags = LF_ISSET(WT_BTREE_SPECIAL_FLAGS);
+	WT_ASSERT(session,
+	    special_flags == 0 || LF_ISSET(WT_DHANDLE_EXCLUSIVE));
+
+	if (LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
+		/*
+		 * Try to get an exclusive handle lock and fail immediately if
+		 * it's unavailable.  We don't expect exclusive operations on
+		 * trees to be mixed with ordinary cursor access, but if there
+		 * is a use case in the future, we could make blocking here
+		 * configurable.
+		 *
+		 * Special flags will cause the handle to be reopened, which
+		 * will get the necessary lock, so don't bother here.
+		 */
+		if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) || special_flags == 0) {
+			WT_RET(__wt_try_writelock(session, dhandle->rwlock));
+			F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
+			locked = WRITELOCK;
+		}
+	} else if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
+		return (EBUSY);
+	else {
+		WT_RET(__wt_readlock(session, dhandle->rwlock));
+		locked = READLOCK;
+	}
+
+	/*
+	 * At this point, we have the requested lock -- if that is all that was
+	 * required, we're done.  Otherwise, check that the handle is open and
+	 * that no special flags are required.
+	 */
+	if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) ||
+	    (F_ISSET(dhandle, WT_DHANDLE_OPEN) && special_flags == 0))
+		return (0);
+
+	/*
+	 * The handle needs to be opened.  If we locked the handle above,
+	 * unlock it before returning.
+	 */
+	switch (locked) {
+	case NOLOCK:
+		break;
+	case READLOCK:
+		WT_RET(__wt_readunlock(session, dhandle->rwlock));
+		break;
+	case WRITELOCK:
+		F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+		WT_RET(__wt_writeunlock(session, dhandle->rwlock));
+		break;
+	}
+
+	/* Treat an unopened handle just like a non-existent handle. */
+	return (WT_NOTFOUND);
+}
+
+/*
+ * __wt_session_release_btree --
+ *	Unlock a btree handle.
+ */
+int
+__wt_session_release_btree(WT_SESSION_IMPL *session)
+{
+	enum { NOLOCK, READLOCK, WRITELOCK } locked;
+	WT_BTREE *btree;
+	WT_DATA_HANDLE *dhandle;
+	WT_DECL_RET;
+
+	btree = S2BT(session);
+	dhandle = session->dhandle;
+
+	/*
+	 * Decrement the data-source's in-use counter.  We ignore errors because
+	 * they're insignificant and handling them complicates error handling in
+	 * this function more than I'm willing to live with.
+	 */
+	(void)__session_dhandle_decr_use(session);
+
+	locked = F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) ? WRITELOCK : READLOCK;
+	if (F_ISSET(dhandle, WT_DHANDLE_DISCARD_CLOSE)) {
+		/*
+		 * If configured to discard on last close, trade any read lock
+		 * for an exclusive lock.  If the exchange succeeds, setup for
+		 * discard.  It is expected acquiring an exclusive lock will fail
+		 * sometimes since the handle may still be in use: in that case
+		 * we're done.
+		 */
+		if (locked == READLOCK) {
+			locked = NOLOCK;
+			WT_ERR(__wt_readunlock(session, dhandle->rwlock));
+			ret = __wt_try_writelock(session, dhandle->rwlock);
+			if (ret != 0) {
+				/* EBUSY means the handle is still in use. */
+				if (ret == EBUSY)
+					ret = 0;
+				goto err;
+			}
+			locked = WRITELOCK;
+			F_CLR(dhandle, WT_DHANDLE_DISCARD_CLOSE);
+			F_SET(dhandle,
+			    WT_DHANDLE_DISCARD | WT_DHANDLE_EXCLUSIVE);
+		}
+	}
+
+	/*
+	 * If we had special flags set, close the handle so that future access
+	 * can get a handle without special flags.
+	 */
+	if (F_ISSET(dhandle, WT_DHANDLE_DISCARD) ||
+	    F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) {
+		WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));
+		F_CLR(dhandle, WT_DHANDLE_DISCARD);
+
+		WT_TRET(__wt_conn_btree_sync_and_close(session, 0));
+	}
+
+	if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE))
+		F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+
+	/* Drop whichever handle lock is still held, then clear the handle. */
+err:	switch (locked) {
+	case NOLOCK:
+		break;
+	case READLOCK:
+		WT_TRET(__wt_readunlock(session, dhandle->rwlock));
+		break;
+	case WRITELOCK:
+		WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+		break;
+	}
+
+	session->dhandle = NULL;
+	return (ret);
+}
+
+/*
+ * __wt_session_get_btree_ckpt --
+ *	Check the configuration strings for a checkpoint name, get a btree
+ *	handle for the given name, set session->dhandle.
+ */
+int
+__wt_session_get_btree_ckpt(WT_SESSION_IMPL *session,
+    const char *uri, const char *cfg[], uint32_t flags)
+{
+	WT_CONFIG_ITEM cval;
+	WT_DECL_RET;
+	int last_ckpt;
+	const char *checkpoint;
+
+	last_ckpt = 0;
+	checkpoint = NULL;
+
+	/*
+	 * This function exists to handle checkpoint configuration.  Callers
+	 * that never open a checkpoint call the underlying function directly.
+	 */
+	WT_RET_NOTFOUND_OK(
+	    __wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
+	if (cval.len != 0) {
+		/*
+		 * The internal checkpoint name is special, find the last
+		 * unnamed checkpoint of the object.
+		 */
+		if (WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) {
+			last_ckpt = 1;
+retry:			WT_RET(__wt_meta_checkpoint_last_name(
+			    session, uri, &checkpoint));
+		} else
+			WT_RET(__wt_strndup(
+			    session, cval.str, cval.len, &checkpoint));
+	}
+
+	ret = __wt_session_get_btree(session, uri, checkpoint, cfg, flags);
+
+	__wt_free(session, checkpoint);
+
+	/*
+	 * There's a potential race: we get the name of the most recent unnamed
+	 * checkpoint, but if it's discarded (or locked so it can be discarded)
+	 * by the time we try to open it, we'll fail the open.
Retry in those
+	 * cases, a new "last" checkpoint should surface, and we can't return an
+	 * error, the application will be justifiably upset if we can't open the
+	 * last checkpoint instance of an object.
+	 *
+	 * The check against WT_NOTFOUND is correct: if there was no checkpoint
+	 * for the object (that is, the object has never been in a checkpoint),
+	 * we returned immediately after the call to search for that name.
+	 */
+	if (last_ckpt && (ret == WT_NOTFOUND || ret == EBUSY))
+		goto retry;
+	return (ret);
+}
+
+/*
+ * __session_discard_btree --
+ *	Discard our reference to the btree.
+ */
+static void
+__session_discard_btree(
+    WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE *dhandle_cache)
+{
+	WT_DATA_HANDLE *saved_dhandle;
+
+	SLIST_REMOVE(
+	    &session->dhandles, dhandle_cache, __wt_data_handle_cache, l);
+
+	/*
+	 * __wt_conn_btree_close operates on session->dhandle: temporarily
+	 * point it at the handle being discarded for the call.
+	 */
+	saved_dhandle = session->dhandle;
+	session->dhandle = dhandle_cache->dhandle;
+
+	__wt_overwrite_and_free(session, dhandle_cache);
+	__wt_conn_btree_close(session);
+
+	/* Restore the original handle in the session. */
+	session->dhandle = saved_dhandle;
+}
+
+/*
+ * __wt_session_close_cache --
+ *	Close any cached handles in a session.
+ */
+void
+__wt_session_close_cache(WT_SESSION_IMPL *session)
+{
+	WT_DATA_HANDLE_CACHE *dhandle_cache;
+
+	while ((dhandle_cache = SLIST_FIRST(&session->dhandles)) != NULL)
+		__session_discard_btree(session, dhandle_cache);
+}
+
+/*
+ * __session_dhandle_sweep --
+ *	Discard any session dhandles that are not open.
+ */
+static int
+__session_dhandle_sweep(WT_SESSION_IMPL *session, uint32_t flags)
+{
+	WT_DATA_HANDLE *dhandle;
+	WT_DATA_HANDLE_CACHE *dhandle_cache, *dhandle_cache_next;
+	time_t now;
+
+	/*
+	 * Check the local flag WT_DHANDLE_LOCK_ONLY; a common caller with that
+	 * flag is in the path to discard the handle, don't sweep in that case.
+	 */
+	if (LF_ISSET(WT_DHANDLE_LOCK_ONLY))
+		return (0);
+
+	/*
+	 * Periodically sweep for dead handles; if we've swept recently, don't
+	 * do it again.
+	 */
+	WT_RET(__wt_seconds(session, &now));
+	if (now - session->last_sweep < WT_DHANDLE_SWEEP_PERIOD)
+		return (0);
+	session->last_sweep = now;
+
+	WT_STAT_FAST_CONN_INCR(session, dh_session_sweeps);
+
+	dhandle_cache = SLIST_FIRST(&session->dhandles);
+	while (dhandle_cache != NULL) {
+		/* Save the next entry: the current one may be discarded. */
+		dhandle_cache_next = SLIST_NEXT(dhandle_cache, l);
+		dhandle = dhandle_cache->dhandle;
+		/*
+		 * Skip the handle the session is currently using, handles
+		 * still in use, and handles not dead long enough.
+		 */
+		if (dhandle != session->dhandle &&
+		    dhandle->session_inuse == 0 &&
+		    now - dhandle->timeofdeath > WT_DHANDLE_SWEEP_WAIT) {
+			WT_STAT_FAST_CONN_INCR(session, dh_session_handles);
+			__session_discard_btree(session, dhandle_cache);
+		}
+		dhandle_cache = dhandle_cache_next;
+	}
+	return (0);
+}
+
+/*
+ * __wt_session_get_btree --
+ *	Get a btree handle for the given name, set session->dhandle.
+ */
+int
+__wt_session_get_btree(WT_SESSION_IMPL *session,
+    const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags)
+{
+	WT_DATA_HANDLE *dhandle;
+	WT_DATA_HANDLE_CACHE *dhandle_cache;
+	WT_DECL_RET;
+	uint64_t hash;
+	int candidate;
+
+	WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES));
+
+	dhandle = NULL;
+	candidate = 0;
+
+	hash = __wt_hash_city64(uri, strlen(uri));
+	SLIST_FOREACH(dhandle_cache, &session->dhandles, l) {
+		dhandle = dhandle_cache->dhandle;
+		if (hash != dhandle->name_hash ||
+		    strcmp(uri, dhandle->name) != 0)
+			continue;
+		if (checkpoint == NULL && dhandle->checkpoint == NULL)
+			break;
+		if (checkpoint != NULL && dhandle->checkpoint != NULL &&
+		    strcmp(checkpoint, dhandle->checkpoint) == 0)
+			break;
+	}
+
+	if (dhandle_cache != NULL) {
+		candidate = 1;
+		/* We found the data handle, don't try to get it again. */
+		LF_SET(WT_DHANDLE_HAVE_REF);
+		session->dhandle = dhandle;
+
+		/*
+		 * Try to lock the file; if we succeed, our "exclusive" state
+		 * must match.
+		 */
+		ret = __wt_session_lock_btree(session, flags);
+		if (ret == WT_NOTFOUND)
+			dhandle_cache = NULL;
+		else
+			WT_RET(ret);
+	}
+
+	if (dhandle_cache == NULL) {
+		/* Sweep the handle list to remove any dead handles. */
+		WT_RET(__session_dhandle_sweep(session, flags));
+
+		/*
+		 * Acquire the schema lock if we don't already hold it, find
+		 * and/or open the handle.
+		 */
+		WT_WITH_SCHEMA_LOCK(session, ret =
+		    __wt_conn_btree_get(session, uri, checkpoint, cfg, flags));
+		WT_RET(ret);
+
+		if (!candidate)
+			WT_RET(__session_add_btree(session, NULL));
+		WT_ASSERT(session, LF_ISSET(WT_DHANDLE_LOCK_ONLY) ||
+		    F_ISSET(session->dhandle, WT_DHANDLE_OPEN));
+	}
+
+	/* Increment the data-source's in-use counter. */
+	__wt_session_dhandle_incr_use(session);
+
+	WT_ASSERT(session, LF_ISSET(WT_DHANDLE_EXCLUSIVE) ==
+	    F_ISSET(session->dhandle, WT_DHANDLE_EXCLUSIVE));
+	F_SET(session->dhandle, LF_ISSET(WT_DHANDLE_DISCARD_CLOSE));
+
+	return (0);
+}
+
+/*
+ * __wt_session_lock_checkpoint --
+ *	Lock the btree handle for the given checkpoint name.
+ */
+int
+__wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint)
+{
+	WT_DATA_HANDLE *dhandle, *saved_dhandle;
+	WT_DECL_RET;
+
+	saved_dhandle = session->dhandle;
+
+	/*
+	 * Get the checkpoint handle exclusive, so no one else can access it
+	 * while we are creating the new checkpoint.
+	 */
+	WT_ERR(__wt_session_get_btree(session, saved_dhandle->name,
+	    checkpoint, NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));
+
+	/*
+	 * Flush any pages in this checkpoint from the cache (we are about to
+	 * re-write the checkpoint which will mean cached pages no longer have
+	 * valid contents).  This is especially noticeable with memory mapped
+	 * files, since changes to the underlying file are visible to the in
+	 * memory pages.
+	 */
+	WT_ERR(__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+
+	/*
+	 * We lock checkpoint handles that we are overwriting, so the handle
+	 * must be closed when we release it.
+	 */
+	dhandle = session->dhandle;
+	F_SET(dhandle, WT_DHANDLE_DISCARD);
+
+	WT_ASSERT(session, WT_META_TRACKING(session));
+	WT_ERR(__wt_meta_track_handle_lock(session, 0));
+
+	/* Restore the original btree in the session. */
+err:	session->dhandle = saved_dhandle;
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/session/session_salvage.c b/src/third_party/wiredtiger/src/session/session_salvage.c
new file mode 100644
index 00000000000..1512c6515ec
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_salvage.c
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_salvage --
+ *	Salvage a single file.
+ */
+int
+__wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_CKPT *ckptbase;
+	WT_DATA_HANDLE *dhandle;
+	WT_DECL_RET;
+
+	dhandle = session->dhandle;
+
+	/*
+	 * XXX
+	 * The salvage process reads and discards previous checkpoints, so the
+	 * underlying block manager has to ignore any previous checkpoint
+	 * entries when creating a new checkpoint, in other words, we can't use
+	 * the metadata checkpoint list, it has all of those checkpoint listed
+	 * and we don't care about them.  Build a clean checkpoint list and use
+	 * it instead.
+	 *
+	 * Don't first clear the metadata checkpoint list and call the function
+	 * to get a list of checkpoints: a crash between clearing the metadata
+	 * checkpoint list and creating a new checkpoint list would look like a
+	 * create or open of a file without a checkpoint to roll-forward from,
+	 * and the contents of the file would be discarded.
+	 */
+	/*
+	 * NOTE(review): two entries are allocated, presumably the zeroed
+	 * second entry terminates the list -- confirm against the checkpoint
+	 * list functions.
+	 */
+	WT_RET(__wt_calloc_def(session, 2, &ckptbase));
+	WT_ERR(__wt_strdup(session, WT_CHECKPOINT, &ckptbase[0].name));
+	F_SET(&ckptbase[0], WT_CKPT_ADD);
+
+	WT_ERR(__wt_bt_salvage(session, ckptbase, cfg));
+
+	/*
+	 * If no checkpoint was created, well, it's probably bad news, but there
+	 * is nothing to do but clear any recorded checkpoints for the file.  If
+	 * a checkpoint was created, life is good, replace any existing list of
+	 * checkpoints with the single new one.
+	 */
+	if (ckptbase[0].raw.data == NULL)
+		WT_ERR(__wt_meta_checkpoint_clear(session, dhandle->name));
+	else
+		WT_ERR(__wt_meta_ckptlist_set(
+		    session, dhandle->name, ckptbase, NULL));
+
+err:	__wt_meta_ckptlist_free(session, ckptbase);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/support/cksum.c b/src/third_party/wiredtiger/src/support/cksum.c
new file mode 100644
index 00000000000..1eaa345d1fe
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/cksum.c
@@ -0,0 +1,1306 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain.  We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors.  We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "wt_internal.h" + +/* + * This file contains two implementations for computing CRC: one that uses + * hardware CRC instructions, available on newer x86_64/amd64, and one that uses + * a fast software algorithm. __wt_cksum() provides a common entry point that + * indirects to one of these two methods. + */ +static uint32_t (*__wt_cksum_func)(const void *chunk, size_t len); + +/* + * The CRC slicing tables are used by __wt_cksum_sw. + */ +static const uint32_t g_crc_slicing[8][256] = { +#ifdef WORDS_BIGENDIAN + /* + * Big endian tables have entries that are byte reversed from little + * endian tables. 
+ */ + { + 0x00000000, 0x03836bf2, 0xf7703be1, 0xf4f35013, + 0x1f979ac7, 0x1c14f135, 0xe8e7a126, 0xeb64cad4, + 0xcf58d98a, 0xccdbb278, 0x3828e26b, 0x3bab8999, + 0xd0cf434d, 0xd34c28bf, 0x27bf78ac, 0x243c135e, + 0x6fc75e10, 0x6c4435e2, 0x98b765f1, 0x9b340e03, + 0x7050c4d7, 0x73d3af25, 0x8720ff36, 0x84a394c4, + 0xa09f879a, 0xa31cec68, 0x57efbc7b, 0x546cd789, + 0xbf081d5d, 0xbc8b76af, 0x487826bc, 0x4bfb4d4e, + 0xde8ebd20, 0xdd0dd6d2, 0x29fe86c1, 0x2a7ded33, + 0xc11927e7, 0xc29a4c15, 0x36691c06, 0x35ea77f4, + 0x11d664aa, 0x12550f58, 0xe6a65f4b, 0xe52534b9, + 0x0e41fe6d, 0x0dc2959f, 0xf931c58c, 0xfab2ae7e, + 0xb149e330, 0xb2ca88c2, 0x4639d8d1, 0x45bab323, + 0xaede79f7, 0xad5d1205, 0x59ae4216, 0x5a2d29e4, + 0x7e113aba, 0x7d925148, 0x8961015b, 0x8ae26aa9, + 0x6186a07d, 0x6205cb8f, 0x96f69b9c, 0x9575f06e, + 0xbc1d7b41, 0xbf9e10b3, 0x4b6d40a0, 0x48ee2b52, + 0xa38ae186, 0xa0098a74, 0x54fada67, 0x5779b195, + 0x7345a2cb, 0x70c6c939, 0x8435992a, 0x87b6f2d8, + 0x6cd2380c, 0x6f5153fe, 0x9ba203ed, 0x9821681f, + 0xd3da2551, 0xd0594ea3, 0x24aa1eb0, 0x27297542, + 0xcc4dbf96, 0xcfced464, 0x3b3d8477, 0x38beef85, + 0x1c82fcdb, 0x1f019729, 0xebf2c73a, 0xe871acc8, + 0x0315661c, 0x00960dee, 0xf4655dfd, 0xf7e6360f, + 0x6293c661, 0x6110ad93, 0x95e3fd80, 0x96609672, + 0x7d045ca6, 0x7e873754, 0x8a746747, 0x89f70cb5, + 0xadcb1feb, 0xae487419, 0x5abb240a, 0x59384ff8, + 0xb25c852c, 0xb1dfeede, 0x452cbecd, 0x46afd53f, + 0x0d549871, 0x0ed7f383, 0xfa24a390, 0xf9a7c862, + 0x12c302b6, 0x11406944, 0xe5b33957, 0xe63052a5, + 0xc20c41fb, 0xc18f2a09, 0x357c7a1a, 0x36ff11e8, + 0xdd9bdb3c, 0xde18b0ce, 0x2aebe0dd, 0x29688b2f, + 0x783bf682, 0x7bb89d70, 0x8f4bcd63, 0x8cc8a691, + 0x67ac6c45, 0x642f07b7, 0x90dc57a4, 0x935f3c56, + 0xb7632f08, 0xb4e044fa, 0x401314e9, 0x43907f1b, + 0xa8f4b5cf, 0xab77de3d, 0x5f848e2e, 0x5c07e5dc, + 0x17fca892, 0x147fc360, 0xe08c9373, 0xe30ff881, + 0x086b3255, 0x0be859a7, 0xff1b09b4, 0xfc986246, + 0xd8a47118, 0xdb271aea, 0x2fd44af9, 0x2c57210b, + 0xc733ebdf, 0xc4b0802d, 0x3043d03e, 
0x33c0bbcc, + 0xa6b54ba2, 0xa5362050, 0x51c57043, 0x52461bb1, + 0xb922d165, 0xbaa1ba97, 0x4e52ea84, 0x4dd18176, + 0x69ed9228, 0x6a6ef9da, 0x9e9da9c9, 0x9d1ec23b, + 0x767a08ef, 0x75f9631d, 0x810a330e, 0x828958fc, + 0xc97215b2, 0xcaf17e40, 0x3e022e53, 0x3d8145a1, + 0xd6e58f75, 0xd566e487, 0x2195b494, 0x2216df66, + 0x062acc38, 0x05a9a7ca, 0xf15af7d9, 0xf2d99c2b, + 0x19bd56ff, 0x1a3e3d0d, 0xeecd6d1e, 0xed4e06ec, + 0xc4268dc3, 0xc7a5e631, 0x3356b622, 0x30d5ddd0, + 0xdbb11704, 0xd8327cf6, 0x2cc12ce5, 0x2f424717, + 0x0b7e5449, 0x08fd3fbb, 0xfc0e6fa8, 0xff8d045a, + 0x14e9ce8e, 0x176aa57c, 0xe399f56f, 0xe01a9e9d, + 0xabe1d3d3, 0xa862b821, 0x5c91e832, 0x5f1283c0, + 0xb4764914, 0xb7f522e6, 0x430672f5, 0x40851907, + 0x64b90a59, 0x673a61ab, 0x93c931b8, 0x904a5a4a, + 0x7b2e909e, 0x78adfb6c, 0x8c5eab7f, 0x8fddc08d, + 0x1aa830e3, 0x192b5b11, 0xedd80b02, 0xee5b60f0, + 0x053faa24, 0x06bcc1d6, 0xf24f91c5, 0xf1ccfa37, + 0xd5f0e969, 0xd673829b, 0x2280d288, 0x2103b97a, + 0xca6773ae, 0xc9e4185c, 0x3d17484f, 0x3e9423bd, + 0x756f6ef3, 0x76ec0501, 0x821f5512, 0x819c3ee0, + 0x6af8f434, 0x697b9fc6, 0x9d88cfd5, 0x9e0ba427, + 0xba37b779, 0xb9b4dc8b, 0x4d478c98, 0x4ec4e76a, + 0xa5a02dbe, 0xa623464c, 0x52d0165f, 0x51537dad + },{ + 0x00000000, 0x7798a213, 0xee304527, 0x99a8e734, + 0xdc618a4e, 0xabf9285d, 0x3251cf69, 0x45c96d7a, + 0xb8c3149d, 0xcf5bb68e, 0x56f351ba, 0x216bf3a9, + 0x64a29ed3, 0x133a3cc0, 0x8a92dbf4, 0xfd0a79e7, + 0x81f1c53f, 0xf669672c, 0x6fc18018, 0x1859220b, + 0x5d904f71, 0x2a08ed62, 0xb3a00a56, 0xc438a845, + 0x3932d1a2, 0x4eaa73b1, 0xd7029485, 0xa09a3696, + 0xe5535bec, 0x92cbf9ff, 0x0b631ecb, 0x7cfbbcd8, + 0x02e38b7f, 0x757b296c, 0xecd3ce58, 0x9b4b6c4b, + 0xde820131, 0xa91aa322, 0x30b24416, 0x472ae605, + 0xba209fe2, 0xcdb83df1, 0x5410dac5, 0x238878d6, + 0x664115ac, 0x11d9b7bf, 0x8871508b, 0xffe9f298, + 0x83124e40, 0xf48aec53, 0x6d220b67, 0x1abaa974, + 0x5f73c40e, 0x28eb661d, 0xb1438129, 0xc6db233a, + 0x3bd15add, 0x4c49f8ce, 0xd5e11ffa, 0xa279bde9, + 0xe7b0d093, 0x90287280, 
0x098095b4, 0x7e1837a7, + 0x04c617ff, 0x735eb5ec, 0xeaf652d8, 0x9d6ef0cb, + 0xd8a79db1, 0xaf3f3fa2, 0x3697d896, 0x410f7a85, + 0xbc050362, 0xcb9da171, 0x52354645, 0x25ade456, + 0x6064892c, 0x17fc2b3f, 0x8e54cc0b, 0xf9cc6e18, + 0x8537d2c0, 0xf2af70d3, 0x6b0797e7, 0x1c9f35f4, + 0x5956588e, 0x2ecefa9d, 0xb7661da9, 0xc0febfba, + 0x3df4c65d, 0x4a6c644e, 0xd3c4837a, 0xa45c2169, + 0xe1954c13, 0x960dee00, 0x0fa50934, 0x783dab27, + 0x06259c80, 0x71bd3e93, 0xe815d9a7, 0x9f8d7bb4, + 0xda4416ce, 0xaddcb4dd, 0x347453e9, 0x43ecf1fa, + 0xbee6881d, 0xc97e2a0e, 0x50d6cd3a, 0x274e6f29, + 0x62870253, 0x151fa040, 0x8cb74774, 0xfb2fe567, + 0x87d459bf, 0xf04cfbac, 0x69e41c98, 0x1e7cbe8b, + 0x5bb5d3f1, 0x2c2d71e2, 0xb58596d6, 0xc21d34c5, + 0x3f174d22, 0x488fef31, 0xd1270805, 0xa6bfaa16, + 0xe376c76c, 0x94ee657f, 0x0d46824b, 0x7ade2058, + 0xf9fac3fb, 0x8e6261e8, 0x17ca86dc, 0x605224cf, + 0x259b49b5, 0x5203eba6, 0xcbab0c92, 0xbc33ae81, + 0x4139d766, 0x36a17575, 0xaf099241, 0xd8913052, + 0x9d585d28, 0xeac0ff3b, 0x7368180f, 0x04f0ba1c, + 0x780b06c4, 0x0f93a4d7, 0x963b43e3, 0xe1a3e1f0, + 0xa46a8c8a, 0xd3f22e99, 0x4a5ac9ad, 0x3dc26bbe, + 0xc0c81259, 0xb750b04a, 0x2ef8577e, 0x5960f56d, + 0x1ca99817, 0x6b313a04, 0xf299dd30, 0x85017f23, + 0xfb194884, 0x8c81ea97, 0x15290da3, 0x62b1afb0, + 0x2778c2ca, 0x50e060d9, 0xc94887ed, 0xbed025fe, + 0x43da5c19, 0x3442fe0a, 0xadea193e, 0xda72bb2d, + 0x9fbbd657, 0xe8237444, 0x718b9370, 0x06133163, + 0x7ae88dbb, 0x0d702fa8, 0x94d8c89c, 0xe3406a8f, + 0xa68907f5, 0xd111a5e6, 0x48b942d2, 0x3f21e0c1, + 0xc22b9926, 0xb5b33b35, 0x2c1bdc01, 0x5b837e12, + 0x1e4a1368, 0x69d2b17b, 0xf07a564f, 0x87e2f45c, + 0xfd3cd404, 0x8aa47617, 0x130c9123, 0x64943330, + 0x215d5e4a, 0x56c5fc59, 0xcf6d1b6d, 0xb8f5b97e, + 0x45ffc099, 0x3267628a, 0xabcf85be, 0xdc5727ad, + 0x999e4ad7, 0xee06e8c4, 0x77ae0ff0, 0x0036ade3, + 0x7ccd113b, 0x0b55b328, 0x92fd541c, 0xe565f60f, + 0xa0ac9b75, 0xd7343966, 0x4e9cde52, 0x39047c41, + 0xc40e05a6, 0xb396a7b5, 0x2a3e4081, 0x5da6e292, + 0x186f8fe8, 0x6ff72dfb, 
0xf65fcacf, 0x81c768dc, + 0xffdf5f7b, 0x8847fd68, 0x11ef1a5c, 0x6677b84f, + 0x23bed535, 0x54267726, 0xcd8e9012, 0xba163201, + 0x471c4be6, 0x3084e9f5, 0xa92c0ec1, 0xdeb4acd2, + 0x9b7dc1a8, 0xece563bb, 0x754d848f, 0x02d5269c, + 0x7e2e9a44, 0x09b63857, 0x901edf63, 0xe7867d70, + 0xa24f100a, 0xd5d7b219, 0x4c7f552d, 0x3be7f73e, + 0xc6ed8ed9, 0xb1752cca, 0x28ddcbfe, 0x5f4569ed, + 0x1a8c0497, 0x6d14a684, 0xf4bc41b0, 0x8324e3a3 + },{ + 0x00000000, 0x7e9241a5, 0x0d526f4f, 0x73c02eea, + 0x1aa4de9e, 0x64369f3b, 0x17f6b1d1, 0x6964f074, + 0xc53e5138, 0xbbac109d, 0xc86c3e77, 0xb6fe7fd2, + 0xdf9a8fa6, 0xa108ce03, 0xd2c8e0e9, 0xac5aa14c, + 0x8a7da270, 0xf4efe3d5, 0x872fcd3f, 0xf9bd8c9a, + 0x90d97cee, 0xee4b3d4b, 0x9d8b13a1, 0xe3195204, + 0x4f43f348, 0x31d1b2ed, 0x42119c07, 0x3c83dda2, + 0x55e72dd6, 0x2b756c73, 0x58b54299, 0x2627033c, + 0x14fb44e1, 0x6a690544, 0x19a92bae, 0x673b6a0b, + 0x0e5f9a7f, 0x70cddbda, 0x030df530, 0x7d9fb495, + 0xd1c515d9, 0xaf57547c, 0xdc977a96, 0xa2053b33, + 0xcb61cb47, 0xb5f38ae2, 0xc633a408, 0xb8a1e5ad, + 0x9e86e691, 0xe014a734, 0x93d489de, 0xed46c87b, + 0x8422380f, 0xfab079aa, 0x89705740, 0xf7e216e5, + 0x5bb8b7a9, 0x252af60c, 0x56ead8e6, 0x28789943, + 0x411c6937, 0x3f8e2892, 0x4c4e0678, 0x32dc47dd, + 0xd98065c7, 0xa7122462, 0xd4d20a88, 0xaa404b2d, + 0xc324bb59, 0xbdb6fafc, 0xce76d416, 0xb0e495b3, + 0x1cbe34ff, 0x622c755a, 0x11ec5bb0, 0x6f7e1a15, + 0x061aea61, 0x7888abc4, 0x0b48852e, 0x75dac48b, + 0x53fdc7b7, 0x2d6f8612, 0x5eafa8f8, 0x203de95d, + 0x49591929, 0x37cb588c, 0x440b7666, 0x3a9937c3, + 0x96c3968f, 0xe851d72a, 0x9b91f9c0, 0xe503b865, + 0x8c674811, 0xf2f509b4, 0x8135275e, 0xffa766fb, + 0xcd7b2126, 0xb3e96083, 0xc0294e69, 0xbebb0fcc, + 0xd7dfffb8, 0xa94dbe1d, 0xda8d90f7, 0xa41fd152, + 0x0845701e, 0x76d731bb, 0x05171f51, 0x7b855ef4, + 0x12e1ae80, 0x6c73ef25, 0x1fb3c1cf, 0x6121806a, + 0x47068356, 0x3994c2f3, 0x4a54ec19, 0x34c6adbc, + 0x5da25dc8, 0x23301c6d, 0x50f03287, 0x2e627322, + 0x8238d26e, 0xfcaa93cb, 0x8f6abd21, 0xf1f8fc84, + 0x989c0cf0, 
0xe60e4d55, 0x95ce63bf, 0xeb5c221a, + 0x4377278b, 0x3de5662e, 0x4e2548c4, 0x30b70961, + 0x59d3f915, 0x2741b8b0, 0x5481965a, 0x2a13d7ff, + 0x864976b3, 0xf8db3716, 0x8b1b19fc, 0xf5895859, + 0x9ceda82d, 0xe27fe988, 0x91bfc762, 0xef2d86c7, + 0xc90a85fb, 0xb798c45e, 0xc458eab4, 0xbacaab11, + 0xd3ae5b65, 0xad3c1ac0, 0xdefc342a, 0xa06e758f, + 0x0c34d4c3, 0x72a69566, 0x0166bb8c, 0x7ff4fa29, + 0x16900a5d, 0x68024bf8, 0x1bc26512, 0x655024b7, + 0x578c636a, 0x291e22cf, 0x5ade0c25, 0x244c4d80, + 0x4d28bdf4, 0x33bafc51, 0x407ad2bb, 0x3ee8931e, + 0x92b23252, 0xec2073f7, 0x9fe05d1d, 0xe1721cb8, + 0x8816eccc, 0xf684ad69, 0x85448383, 0xfbd6c226, + 0xddf1c11a, 0xa36380bf, 0xd0a3ae55, 0xae31eff0, + 0xc7551f84, 0xb9c75e21, 0xca0770cb, 0xb495316e, + 0x18cf9022, 0x665dd187, 0x159dff6d, 0x6b0fbec8, + 0x026b4ebc, 0x7cf90f19, 0x0f3921f3, 0x71ab6056, + 0x9af7424c, 0xe46503e9, 0x97a52d03, 0xe9376ca6, + 0x80539cd2, 0xfec1dd77, 0x8d01f39d, 0xf393b238, + 0x5fc91374, 0x215b52d1, 0x529b7c3b, 0x2c093d9e, + 0x456dcdea, 0x3bff8c4f, 0x483fa2a5, 0x36ade300, + 0x108ae03c, 0x6e18a199, 0x1dd88f73, 0x634aced6, + 0x0a2e3ea2, 0x74bc7f07, 0x077c51ed, 0x79ee1048, + 0xd5b4b104, 0xab26f0a1, 0xd8e6de4b, 0xa6749fee, + 0xcf106f9a, 0xb1822e3f, 0xc24200d5, 0xbcd04170, + 0x8e0c06ad, 0xf09e4708, 0x835e69e2, 0xfdcc2847, + 0x94a8d833, 0xea3a9996, 0x99fab77c, 0xe768f6d9, + 0x4b325795, 0x35a01630, 0x466038da, 0x38f2797f, + 0x5196890b, 0x2f04c8ae, 0x5cc4e644, 0x2256a7e1, + 0x0471a4dd, 0x7ae3e578, 0x0923cb92, 0x77b18a37, + 0x1ed57a43, 0x60473be6, 0x1387150c, 0x6d1554a9, + 0xc14ff5e5, 0xbfddb440, 0xcc1d9aaa, 0xb28fdb0f, + 0xdbeb2b7b, 0xa5796ade, 0xd6b94434, 0xa82b0591 + },{ + 0x00000000, 0xb8aa45dd, 0x812367bf, 0x39892262, + 0xf331227b, 0x4b9b67a6, 0x721245c4, 0xcab80019, + 0xe66344f6, 0x5ec9012b, 0x67402349, 0xdfea6694, + 0x1552668d, 0xadf82350, 0x94710132, 0x2cdb44ef, + 0x3db164e9, 0x851b2134, 0xbc920356, 0x0438468b, + 0xce804692, 0x762a034f, 0x4fa3212d, 0xf70964f0, + 0xdbd2201f, 0x637865c2, 0x5af147a0, 0xe25b027d, + 
0x28e30264, 0x904947b9, 0xa9c065db, 0x116a2006, + 0x8b1425d7, 0x33be600a, 0x0a374268, 0xb29d07b5, + 0x782507ac, 0xc08f4271, 0xf9066013, 0x41ac25ce, + 0x6d776121, 0xd5dd24fc, 0xec54069e, 0x54fe4343, + 0x9e46435a, 0x26ec0687, 0x1f6524e5, 0xa7cf6138, + 0xb6a5413e, 0x0e0f04e3, 0x37862681, 0x8f2c635c, + 0x45946345, 0xfd3e2698, 0xc4b704fa, 0x7c1d4127, + 0x50c605c8, 0xe86c4015, 0xd1e56277, 0x694f27aa, + 0xa3f727b3, 0x1b5d626e, 0x22d4400c, 0x9a7e05d1, + 0xe75fa6ab, 0x5ff5e376, 0x667cc114, 0xded684c9, + 0x146e84d0, 0xacc4c10d, 0x954de36f, 0x2de7a6b2, + 0x013ce25d, 0xb996a780, 0x801f85e2, 0x38b5c03f, + 0xf20dc026, 0x4aa785fb, 0x732ea799, 0xcb84e244, + 0xdaeec242, 0x6244879f, 0x5bcda5fd, 0xe367e020, + 0x29dfe039, 0x9175a5e4, 0xa8fc8786, 0x1056c25b, + 0x3c8d86b4, 0x8427c369, 0xbdaee10b, 0x0504a4d6, + 0xcfbca4cf, 0x7716e112, 0x4e9fc370, 0xf63586ad, + 0x6c4b837c, 0xd4e1c6a1, 0xed68e4c3, 0x55c2a11e, + 0x9f7aa107, 0x27d0e4da, 0x1e59c6b8, 0xa6f38365, + 0x8a28c78a, 0x32828257, 0x0b0ba035, 0xb3a1e5e8, + 0x7919e5f1, 0xc1b3a02c, 0xf83a824e, 0x4090c793, + 0x51fae795, 0xe950a248, 0xd0d9802a, 0x6873c5f7, + 0xa2cbc5ee, 0x1a618033, 0x23e8a251, 0x9b42e78c, + 0xb799a363, 0x0f33e6be, 0x36bac4dc, 0x8e108101, + 0x44a88118, 0xfc02c4c5, 0xc58be6a7, 0x7d21a37a, + 0x3fc9a052, 0x8763e58f, 0xbeeac7ed, 0x06408230, + 0xccf88229, 0x7452c7f4, 0x4ddbe596, 0xf571a04b, + 0xd9aae4a4, 0x6100a179, 0x5889831b, 0xe023c6c6, + 0x2a9bc6df, 0x92318302, 0xabb8a160, 0x1312e4bd, + 0x0278c4bb, 0xbad28166, 0x835ba304, 0x3bf1e6d9, + 0xf149e6c0, 0x49e3a31d, 0x706a817f, 0xc8c0c4a2, + 0xe41b804d, 0x5cb1c590, 0x6538e7f2, 0xdd92a22f, + 0x172aa236, 0xaf80e7eb, 0x9609c589, 0x2ea38054, + 0xb4dd8585, 0x0c77c058, 0x35fee23a, 0x8d54a7e7, + 0x47eca7fe, 0xff46e223, 0xc6cfc041, 0x7e65859c, + 0x52bec173, 0xea1484ae, 0xd39da6cc, 0x6b37e311, + 0xa18fe308, 0x1925a6d5, 0x20ac84b7, 0x9806c16a, + 0x896ce16c, 0x31c6a4b1, 0x084f86d3, 0xb0e5c30e, + 0x7a5dc317, 0xc2f786ca, 0xfb7ea4a8, 0x43d4e175, + 0x6f0fa59a, 0xd7a5e047, 0xee2cc225, 0x568687f8, + 
0x9c3e87e1, 0x2494c23c, 0x1d1de05e, 0xa5b7a583, + 0xd89606f9, 0x603c4324, 0x59b56146, 0xe11f249b, + 0x2ba72482, 0x930d615f, 0xaa84433d, 0x122e06e0, + 0x3ef5420f, 0x865f07d2, 0xbfd625b0, 0x077c606d, + 0xcdc46074, 0x756e25a9, 0x4ce707cb, 0xf44d4216, + 0xe5276210, 0x5d8d27cd, 0x640405af, 0xdcae4072, + 0x1616406b, 0xaebc05b6, 0x973527d4, 0x2f9f6209, + 0x034426e6, 0xbbee633b, 0x82674159, 0x3acd0484, + 0xf075049d, 0x48df4140, 0x71566322, 0xc9fc26ff, + 0x5382232e, 0xeb2866f3, 0xd2a14491, 0x6a0b014c, + 0xa0b30155, 0x18194488, 0x219066ea, 0x993a2337, + 0xb5e167d8, 0x0d4b2205, 0x34c20067, 0x8c6845ba, + 0x46d045a3, 0xfe7a007e, 0xc7f3221c, 0x7f5967c1, + 0x6e3347c7, 0xd699021a, 0xef102078, 0x57ba65a5, + 0x9d0265bc, 0x25a82061, 0x1c210203, 0xa48b47de, + 0x88500331, 0x30fa46ec, 0x0973648e, 0xb1d92153, + 0x7b61214a, 0xc3cb6497, 0xfa4246f5, 0x42e80328 + },{ + 0x00000000, 0xac6f1138, 0x58df2270, 0xf4b03348, + 0xb0be45e0, 0x1cd154d8, 0xe8616790, 0x440e76a8, + 0x910b67c5, 0x3d6476fd, 0xc9d445b5, 0x65bb548d, + 0x21b52225, 0x8dda331d, 0x796a0055, 0xd505116d, + 0xd361228f, 0x7f0e33b7, 0x8bbe00ff, 0x27d111c7, + 0x63df676f, 0xcfb07657, 0x3b00451f, 0x976f5427, + 0x426a454a, 0xee055472, 0x1ab5673a, 0xb6da7602, + 0xf2d400aa, 0x5ebb1192, 0xaa0b22da, 0x066433e2, + 0x57b5a81b, 0xfbdab923, 0x0f6a8a6b, 0xa3059b53, + 0xe70bedfb, 0x4b64fcc3, 0xbfd4cf8b, 0x13bbdeb3, + 0xc6becfde, 0x6ad1dee6, 0x9e61edae, 0x320efc96, + 0x76008a3e, 0xda6f9b06, 0x2edfa84e, 0x82b0b976, + 0x84d48a94, 0x28bb9bac, 0xdc0ba8e4, 0x7064b9dc, + 0x346acf74, 0x9805de4c, 0x6cb5ed04, 0xc0dafc3c, + 0x15dfed51, 0xb9b0fc69, 0x4d00cf21, 0xe16fde19, + 0xa561a8b1, 0x090eb989, 0xfdbe8ac1, 0x51d19bf9, + 0xae6a5137, 0x0205400f, 0xf6b57347, 0x5ada627f, + 0x1ed414d7, 0xb2bb05ef, 0x460b36a7, 0xea64279f, + 0x3f6136f2, 0x930e27ca, 0x67be1482, 0xcbd105ba, + 0x8fdf7312, 0x23b0622a, 0xd7005162, 0x7b6f405a, + 0x7d0b73b8, 0xd1646280, 0x25d451c8, 0x89bb40f0, + 0xcdb53658, 0x61da2760, 0x956a1428, 0x39050510, + 0xec00147d, 0x406f0545, 0xb4df360d, 
0x18b02735, + 0x5cbe519d, 0xf0d140a5, 0x046173ed, 0xa80e62d5, + 0xf9dff92c, 0x55b0e814, 0xa100db5c, 0x0d6fca64, + 0x4961bccc, 0xe50eadf4, 0x11be9ebc, 0xbdd18f84, + 0x68d49ee9, 0xc4bb8fd1, 0x300bbc99, 0x9c64ada1, + 0xd86adb09, 0x7405ca31, 0x80b5f979, 0x2cdae841, + 0x2abedba3, 0x86d1ca9b, 0x7261f9d3, 0xde0ee8eb, + 0x9a009e43, 0x366f8f7b, 0xc2dfbc33, 0x6eb0ad0b, + 0xbbb5bc66, 0x17daad5e, 0xe36a9e16, 0x4f058f2e, + 0x0b0bf986, 0xa764e8be, 0x53d4dbf6, 0xffbbcace, + 0x5cd5a26e, 0xf0bab356, 0x040a801e, 0xa8659126, + 0xec6be78e, 0x4004f6b6, 0xb4b4c5fe, 0x18dbd4c6, + 0xcddec5ab, 0x61b1d493, 0x9501e7db, 0x396ef6e3, + 0x7d60804b, 0xd10f9173, 0x25bfa23b, 0x89d0b303, + 0x8fb480e1, 0x23db91d9, 0xd76ba291, 0x7b04b3a9, + 0x3f0ac501, 0x9365d439, 0x67d5e771, 0xcbbaf649, + 0x1ebfe724, 0xb2d0f61c, 0x4660c554, 0xea0fd46c, + 0xae01a2c4, 0x026eb3fc, 0xf6de80b4, 0x5ab1918c, + 0x0b600a75, 0xa70f1b4d, 0x53bf2805, 0xffd0393d, + 0xbbde4f95, 0x17b15ead, 0xe3016de5, 0x4f6e7cdd, + 0x9a6b6db0, 0x36047c88, 0xc2b44fc0, 0x6edb5ef8, + 0x2ad52850, 0x86ba3968, 0x720a0a20, 0xde651b18, + 0xd80128fa, 0x746e39c2, 0x80de0a8a, 0x2cb11bb2, + 0x68bf6d1a, 0xc4d07c22, 0x30604f6a, 0x9c0f5e52, + 0x490a4f3f, 0xe5655e07, 0x11d56d4f, 0xbdba7c77, + 0xf9b40adf, 0x55db1be7, 0xa16b28af, 0x0d043997, + 0xf2bff359, 0x5ed0e261, 0xaa60d129, 0x060fc011, + 0x4201b6b9, 0xee6ea781, 0x1ade94c9, 0xb6b185f1, + 0x63b4949c, 0xcfdb85a4, 0x3b6bb6ec, 0x9704a7d4, + 0xd30ad17c, 0x7f65c044, 0x8bd5f30c, 0x27bae234, + 0x21ded1d6, 0x8db1c0ee, 0x7901f3a6, 0xd56ee29e, + 0x91609436, 0x3d0f850e, 0xc9bfb646, 0x65d0a77e, + 0xb0d5b613, 0x1cbaa72b, 0xe80a9463, 0x4465855b, + 0x006bf3f3, 0xac04e2cb, 0x58b4d183, 0xf4dbc0bb, + 0xa50a5b42, 0x09654a7a, 0xfdd57932, 0x51ba680a, + 0x15b41ea2, 0xb9db0f9a, 0x4d6b3cd2, 0xe1042dea, + 0x34013c87, 0x986e2dbf, 0x6cde1ef7, 0xc0b10fcf, + 0x84bf7967, 0x28d0685f, 0xdc605b17, 0x700f4a2f, + 0x766b79cd, 0xda0468f5, 0x2eb45bbd, 0x82db4a85, + 0xc6d53c2d, 0x6aba2d15, 0x9e0a1e5d, 0x32650f65, + 0xe7601e08, 0x4b0f0f30, 0xbfbf3c78, 
0x13d02d40, + 0x57de5be8, 0xfbb14ad0, 0x0f017998, 0xa36e68a0 + },{ + 0x00000000, 0x196b30ef, 0xc3a08cdb, 0xdacbbc34, + 0x7737f5b2, 0x6e5cc55d, 0xb4977969, 0xadfc4986, + 0x1f180660, 0x0673368f, 0xdcb88abb, 0xc5d3ba54, + 0x682ff3d2, 0x7144c33d, 0xab8f7f09, 0xb2e44fe6, + 0x3e300cc0, 0x275b3c2f, 0xfd90801b, 0xe4fbb0f4, + 0x4907f972, 0x506cc99d, 0x8aa775a9, 0x93cc4546, + 0x21280aa0, 0x38433a4f, 0xe288867b, 0xfbe3b694, + 0x561fff12, 0x4f74cffd, 0x95bf73c9, 0x8cd44326, + 0x8d16f485, 0x947dc46a, 0x4eb6785e, 0x57dd48b1, + 0xfa210137, 0xe34a31d8, 0x39818dec, 0x20eabd03, + 0x920ef2e5, 0x8b65c20a, 0x51ae7e3e, 0x48c54ed1, + 0xe5390757, 0xfc5237b8, 0x26998b8c, 0x3ff2bb63, + 0xb326f845, 0xaa4dc8aa, 0x7086749e, 0x69ed4471, + 0xc4110df7, 0xdd7a3d18, 0x07b1812c, 0x1edab1c3, + 0xac3efe25, 0xb555ceca, 0x6f9e72fe, 0x76f54211, + 0xdb090b97, 0xc2623b78, 0x18a9874c, 0x01c2b7a3, + 0xeb5b040e, 0xf23034e1, 0x28fb88d5, 0x3190b83a, + 0x9c6cf1bc, 0x8507c153, 0x5fcc7d67, 0x46a74d88, + 0xf443026e, 0xed283281, 0x37e38eb5, 0x2e88be5a, + 0x8374f7dc, 0x9a1fc733, 0x40d47b07, 0x59bf4be8, + 0xd56b08ce, 0xcc003821, 0x16cb8415, 0x0fa0b4fa, + 0xa25cfd7c, 0xbb37cd93, 0x61fc71a7, 0x78974148, + 0xca730eae, 0xd3183e41, 0x09d38275, 0x10b8b29a, + 0xbd44fb1c, 0xa42fcbf3, 0x7ee477c7, 0x678f4728, + 0x664df08b, 0x7f26c064, 0xa5ed7c50, 0xbc864cbf, + 0x117a0539, 0x081135d6, 0xd2da89e2, 0xcbb1b90d, + 0x7955f6eb, 0x603ec604, 0xbaf57a30, 0xa39e4adf, + 0x0e620359, 0x170933b6, 0xcdc28f82, 0xd4a9bf6d, + 0x587dfc4b, 0x4116cca4, 0x9bdd7090, 0x82b6407f, + 0x2f4a09f9, 0x36213916, 0xecea8522, 0xf581b5cd, + 0x4765fa2b, 0x5e0ecac4, 0x84c576f0, 0x9dae461f, + 0x30520f99, 0x29393f76, 0xf3f28342, 0xea99b3ad, + 0xd6b7081c, 0xcfdc38f3, 0x151784c7, 0x0c7cb428, + 0xa180fdae, 0xb8ebcd41, 0x62207175, 0x7b4b419a, + 0xc9af0e7c, 0xd0c43e93, 0x0a0f82a7, 0x1364b248, + 0xbe98fbce, 0xa7f3cb21, 0x7d387715, 0x645347fa, + 0xe88704dc, 0xf1ec3433, 0x2b278807, 0x324cb8e8, + 0x9fb0f16e, 0x86dbc181, 0x5c107db5, 0x457b4d5a, + 0xf79f02bc, 0xeef43253, 
0x343f8e67, 0x2d54be88, + 0x80a8f70e, 0x99c3c7e1, 0x43087bd5, 0x5a634b3a, + 0x5ba1fc99, 0x42cacc76, 0x98017042, 0x816a40ad, + 0x2c96092b, 0x35fd39c4, 0xef3685f0, 0xf65db51f, + 0x44b9faf9, 0x5dd2ca16, 0x87197622, 0x9e7246cd, + 0x338e0f4b, 0x2ae53fa4, 0xf02e8390, 0xe945b37f, + 0x6591f059, 0x7cfac0b6, 0xa6317c82, 0xbf5a4c6d, + 0x12a605eb, 0x0bcd3504, 0xd1068930, 0xc86db9df, + 0x7a89f639, 0x63e2c6d6, 0xb9297ae2, 0xa0424a0d, + 0x0dbe038b, 0x14d53364, 0xce1e8f50, 0xd775bfbf, + 0x3dec0c12, 0x24873cfd, 0xfe4c80c9, 0xe727b026, + 0x4adbf9a0, 0x53b0c94f, 0x897b757b, 0x90104594, + 0x22f40a72, 0x3b9f3a9d, 0xe15486a9, 0xf83fb646, + 0x55c3ffc0, 0x4ca8cf2f, 0x9663731b, 0x8f0843f4, + 0x03dc00d2, 0x1ab7303d, 0xc07c8c09, 0xd917bce6, + 0x74ebf560, 0x6d80c58f, 0xb74b79bb, 0xae204954, + 0x1cc406b2, 0x05af365d, 0xdf648a69, 0xc60fba86, + 0x6bf3f300, 0x7298c3ef, 0xa8537fdb, 0xb1384f34, + 0xb0faf897, 0xa991c878, 0x735a744c, 0x6a3144a3, + 0xc7cd0d25, 0xdea63dca, 0x046d81fe, 0x1d06b111, + 0xafe2fef7, 0xb689ce18, 0x6c42722c, 0x752942c3, + 0xd8d50b45, 0xc1be3baa, 0x1b75879e, 0x021eb771, + 0x8ecaf457, 0x97a1c4b8, 0x4d6a788c, 0x54014863, + 0xf9fd01e5, 0xe096310a, 0x3a5d8d3e, 0x2336bdd1, + 0x91d2f237, 0x88b9c2d8, 0x52727eec, 0x4b194e03, + 0xe6e50785, 0xff8e376a, 0x25458b5e, 0x3c2ebbb1 + },{ + 0x00000000, 0xc82c0368, 0x905906d0, 0x587505b8, + 0xd1c5e0a5, 0x19e9e3cd, 0x419ce675, 0x89b0e51d, + 0x53fd2d4e, 0x9bd12e26, 0xc3a42b9e, 0x0b8828f6, + 0x8238cdeb, 0x4a14ce83, 0x1261cb3b, 0xda4dc853, + 0xa6fa5b9c, 0x6ed658f4, 0x36a35d4c, 0xfe8f5e24, + 0x773fbb39, 0xbf13b851, 0xe766bde9, 0x2f4abe81, + 0xf50776d2, 0x3d2b75ba, 0x655e7002, 0xad72736a, + 0x24c29677, 0xecee951f, 0xb49b90a7, 0x7cb793cf, + 0xbd835b3d, 0x75af5855, 0x2dda5ded, 0xe5f65e85, + 0x6c46bb98, 0xa46ab8f0, 0xfc1fbd48, 0x3433be20, + 0xee7e7673, 0x2652751b, 0x7e2770a3, 0xb60b73cb, + 0x3fbb96d6, 0xf79795be, 0xafe29006, 0x67ce936e, + 0x1b7900a1, 0xd35503c9, 0x8b200671, 0x430c0519, + 0xcabce004, 0x0290e36c, 0x5ae5e6d4, 0x92c9e5bc, + 0x48842def, 
0x80a82e87, 0xd8dd2b3f, 0x10f12857, + 0x9941cd4a, 0x516dce22, 0x0918cb9a, 0xc134c8f2, + 0x7a07b77a, 0xb22bb412, 0xea5eb1aa, 0x2272b2c2, + 0xabc257df, 0x63ee54b7, 0x3b9b510f, 0xf3b75267, + 0x29fa9a34, 0xe1d6995c, 0xb9a39ce4, 0x718f9f8c, + 0xf83f7a91, 0x301379f9, 0x68667c41, 0xa04a7f29, + 0xdcfdece6, 0x14d1ef8e, 0x4ca4ea36, 0x8488e95e, + 0x0d380c43, 0xc5140f2b, 0x9d610a93, 0x554d09fb, + 0x8f00c1a8, 0x472cc2c0, 0x1f59c778, 0xd775c410, + 0x5ec5210d, 0x96e92265, 0xce9c27dd, 0x06b024b5, + 0xc784ec47, 0x0fa8ef2f, 0x57ddea97, 0x9ff1e9ff, + 0x16410ce2, 0xde6d0f8a, 0x86180a32, 0x4e34095a, + 0x9479c109, 0x5c55c261, 0x0420c7d9, 0xcc0cc4b1, + 0x45bc21ac, 0x8d9022c4, 0xd5e5277c, 0x1dc92414, + 0x617eb7db, 0xa952b4b3, 0xf127b10b, 0x390bb263, + 0xb0bb577e, 0x78975416, 0x20e251ae, 0xe8ce52c6, + 0x32839a95, 0xfaaf99fd, 0xa2da9c45, 0x6af69f2d, + 0xe3467a30, 0x2b6a7958, 0x731f7ce0, 0xbb337f88, + 0xf40e6ef5, 0x3c226d9d, 0x64576825, 0xac7b6b4d, + 0x25cb8e50, 0xede78d38, 0xb5928880, 0x7dbe8be8, + 0xa7f343bb, 0x6fdf40d3, 0x37aa456b, 0xff864603, + 0x7636a31e, 0xbe1aa076, 0xe66fa5ce, 0x2e43a6a6, + 0x52f43569, 0x9ad83601, 0xc2ad33b9, 0x0a8130d1, + 0x8331d5cc, 0x4b1dd6a4, 0x1368d31c, 0xdb44d074, + 0x01091827, 0xc9251b4f, 0x91501ef7, 0x597c1d9f, + 0xd0ccf882, 0x18e0fbea, 0x4095fe52, 0x88b9fd3a, + 0x498d35c8, 0x81a136a0, 0xd9d43318, 0x11f83070, + 0x9848d56d, 0x5064d605, 0x0811d3bd, 0xc03dd0d5, + 0x1a701886, 0xd25c1bee, 0x8a291e56, 0x42051d3e, + 0xcbb5f823, 0x0399fb4b, 0x5becfef3, 0x93c0fd9b, + 0xef776e54, 0x275b6d3c, 0x7f2e6884, 0xb7026bec, + 0x3eb28ef1, 0xf69e8d99, 0xaeeb8821, 0x66c78b49, + 0xbc8a431a, 0x74a64072, 0x2cd345ca, 0xe4ff46a2, + 0x6d4fa3bf, 0xa563a0d7, 0xfd16a56f, 0x353aa607, + 0x8e09d98f, 0x4625dae7, 0x1e50df5f, 0xd67cdc37, + 0x5fcc392a, 0x97e03a42, 0xcf953ffa, 0x07b93c92, + 0xddf4f4c1, 0x15d8f7a9, 0x4dadf211, 0x8581f179, + 0x0c311464, 0xc41d170c, 0x9c6812b4, 0x544411dc, + 0x28f38213, 0xe0df817b, 0xb8aa84c3, 0x708687ab, + 0xf93662b6, 0x311a61de, 0x696f6466, 0xa143670e, + 0x7b0eaf5d, 
0xb322ac35, 0xeb57a98d, 0x237baae5, + 0xaacb4ff8, 0x62e74c90, 0x3a924928, 0xf2be4a40, + 0x338a82b2, 0xfba681da, 0xa3d38462, 0x6bff870a, + 0xe24f6217, 0x2a63617f, 0x721664c7, 0xba3a67af, + 0x6077affc, 0xa85bac94, 0xf02ea92c, 0x3802aa44, + 0xb1b24f59, 0x799e4c31, 0x21eb4989, 0xe9c74ae1, + 0x9570d92e, 0x5d5cda46, 0x0529dffe, 0xcd05dc96, + 0x44b5398b, 0x8c993ae3, 0xd4ec3f5b, 0x1cc03c33, + 0xc68df460, 0x0ea1f708, 0x56d4f2b0, 0x9ef8f1d8, + 0x174814c5, 0xdf6417ad, 0x87111215, 0x4f3d117d + },{ + 0x00000000, 0x277d3c49, 0x4efa7892, 0x698744db, + 0x6d821d21, 0x4aff2168, 0x237865b3, 0x040559fa, + 0xda043b42, 0xfd79070b, 0x94fe43d0, 0xb3837f99, + 0xb7862663, 0x90fb1a2a, 0xf97c5ef1, 0xde0162b8, + 0xb4097684, 0x93744acd, 0xfaf30e16, 0xdd8e325f, + 0xd98b6ba5, 0xfef657ec, 0x97711337, 0xb00c2f7e, + 0x6e0d4dc6, 0x4970718f, 0x20f73554, 0x078a091d, + 0x038f50e7, 0x24f26cae, 0x4d752875, 0x6a08143c, + 0x9965000d, 0xbe183c44, 0xd79f789f, 0xf0e244d6, + 0xf4e71d2c, 0xd39a2165, 0xba1d65be, 0x9d6059f7, + 0x43613b4f, 0x641c0706, 0x0d9b43dd, 0x2ae67f94, + 0x2ee3266e, 0x099e1a27, 0x60195efc, 0x476462b5, + 0x2d6c7689, 0x0a114ac0, 0x63960e1b, 0x44eb3252, + 0x40ee6ba8, 0x679357e1, 0x0e14133a, 0x29692f73, + 0xf7684dcb, 0xd0157182, 0xb9923559, 0x9eef0910, + 0x9aea50ea, 0xbd976ca3, 0xd4102878, 0xf36d1431, + 0x32cb001a, 0x15b63c53, 0x7c317888, 0x5b4c44c1, + 0x5f491d3b, 0x78342172, 0x11b365a9, 0x36ce59e0, + 0xe8cf3b58, 0xcfb20711, 0xa63543ca, 0x81487f83, + 0x854d2679, 0xa2301a30, 0xcbb75eeb, 0xecca62a2, + 0x86c2769e, 0xa1bf4ad7, 0xc8380e0c, 0xef453245, + 0xeb406bbf, 0xcc3d57f6, 0xa5ba132d, 0x82c72f64, + 0x5cc64ddc, 0x7bbb7195, 0x123c354e, 0x35410907, + 0x314450fd, 0x16396cb4, 0x7fbe286f, 0x58c31426, + 0xabae0017, 0x8cd33c5e, 0xe5547885, 0xc22944cc, + 0xc62c1d36, 0xe151217f, 0x88d665a4, 0xafab59ed, + 0x71aa3b55, 0x56d7071c, 0x3f5043c7, 0x182d7f8e, + 0x1c282674, 0x3b551a3d, 0x52d25ee6, 0x75af62af, + 0x1fa77693, 0x38da4ada, 0x515d0e01, 0x76203248, + 0x72256bb2, 0x555857fb, 0x3cdf1320, 0x1ba22f69, + 
0xc5a34dd1, 0xe2de7198, 0x8b593543, 0xac24090a, + 0xa82150f0, 0x8f5c6cb9, 0xe6db2862, 0xc1a6142b, + 0x64960134, 0x43eb3d7d, 0x2a6c79a6, 0x0d1145ef, + 0x09141c15, 0x2e69205c, 0x47ee6487, 0x609358ce, + 0xbe923a76, 0x99ef063f, 0xf06842e4, 0xd7157ead, + 0xd3102757, 0xf46d1b1e, 0x9dea5fc5, 0xba97638c, + 0xd09f77b0, 0xf7e24bf9, 0x9e650f22, 0xb918336b, + 0xbd1d6a91, 0x9a6056d8, 0xf3e71203, 0xd49a2e4a, + 0x0a9b4cf2, 0x2de670bb, 0x44613460, 0x631c0829, + 0x671951d3, 0x40646d9a, 0x29e32941, 0x0e9e1508, + 0xfdf30139, 0xda8e3d70, 0xb30979ab, 0x947445e2, + 0x90711c18, 0xb70c2051, 0xde8b648a, 0xf9f658c3, + 0x27f73a7b, 0x008a0632, 0x690d42e9, 0x4e707ea0, + 0x4a75275a, 0x6d081b13, 0x048f5fc8, 0x23f26381, + 0x49fa77bd, 0x6e874bf4, 0x07000f2f, 0x207d3366, + 0x24786a9c, 0x030556d5, 0x6a82120e, 0x4dff2e47, + 0x93fe4cff, 0xb48370b6, 0xdd04346d, 0xfa790824, + 0xfe7c51de, 0xd9016d97, 0xb086294c, 0x97fb1505, + 0x565d012e, 0x71203d67, 0x18a779bc, 0x3fda45f5, + 0x3bdf1c0f, 0x1ca22046, 0x7525649d, 0x525858d4, + 0x8c593a6c, 0xab240625, 0xc2a342fe, 0xe5de7eb7, + 0xe1db274d, 0xc6a61b04, 0xaf215fdf, 0x885c6396, + 0xe25477aa, 0xc5294be3, 0xacae0f38, 0x8bd33371, + 0x8fd66a8b, 0xa8ab56c2, 0xc12c1219, 0xe6512e50, + 0x38504ce8, 0x1f2d70a1, 0x76aa347a, 0x51d70833, + 0x55d251c9, 0x72af6d80, 0x1b28295b, 0x3c551512, + 0xcf380123, 0xe8453d6a, 0x81c279b1, 0xa6bf45f8, + 0xa2ba1c02, 0x85c7204b, 0xec406490, 0xcb3d58d9, + 0x153c3a61, 0x32410628, 0x5bc642f3, 0x7cbb7eba, + 0x78be2740, 0x5fc31b09, 0x36445fd2, 0x1139639b, + 0x7b3177a7, 0x5c4c4bee, 0x35cb0f35, 0x12b6337c, + 0x16b36a86, 0x31ce56cf, 0x58491214, 0x7f342e5d, + 0xa1354ce5, 0x864870ac, 0xefcf3477, 0xc8b2083e, + 0xccb751c4, 0xebca6d8d, 0x824d2956, 0xa530151f + } +#else + { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 
0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 
0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 + },{ + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, + 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, + 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, + 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, + 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, + 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, + 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, + 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 
0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, + 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, + 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, + 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, + 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, + 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, + 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, + 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, + 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, + 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, + 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, + 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, + 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, + 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, + 0x0a104fa2, 0x19b2d7d5, 
0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, + 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 + },{ + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, + 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, + 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, + 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, + 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, + 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, + 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, + 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, + 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, + 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, + 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, + 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, + 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, + 0x655baed3, 
0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, + 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, + 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, + 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, + 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, + 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, + 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, + 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, + 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 + },{ + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, + 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, + 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, + 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, + 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, + 
0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, + 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, + 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, + 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, + 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, + 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, + 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, + 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, + 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, + 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, + 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, + 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, + 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, + 
0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, + 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, + 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, + 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 + },{ + 0x00000000, 0x38116fac, 0x7022df58, 0x4833b0f4, + 0xe045beb0, 0xd854d11c, 0x906761e8, 0xa8760e44, + 0xc5670b91, 0xfd76643d, 0xb545d4c9, 0x8d54bb65, + 0x2522b521, 0x1d33da8d, 0x55006a79, 0x6d1105d5, + 0x8f2261d3, 0xb7330e7f, 0xff00be8b, 0xc711d127, + 0x6f67df63, 0x5776b0cf, 0x1f45003b, 0x27546f97, + 0x4a456a42, 0x725405ee, 0x3a67b51a, 0x0276dab6, + 0xaa00d4f2, 0x9211bb5e, 0xda220baa, 0xe2336406, + 0x1ba8b557, 0x23b9dafb, 0x6b8a6a0f, 0x539b05a3, + 0xfbed0be7, 0xc3fc644b, 0x8bcfd4bf, 0xb3debb13, + 0xdecfbec6, 0xe6ded16a, 0xaeed619e, 0x96fc0e32, + 0x3e8a0076, 0x069b6fda, 0x4ea8df2e, 0x76b9b082, + 0x948ad484, 0xac9bbb28, 0xe4a80bdc, 0xdcb96470, + 0x74cf6a34, 0x4cde0598, 0x04edb56c, 0x3cfcdac0, + 0x51eddf15, 0x69fcb0b9, 0x21cf004d, 0x19de6fe1, + 0xb1a861a5, 0x89b90e09, 0xc18abefd, 0xf99bd151, + 0x37516aae, 0x0f400502, 0x4773b5f6, 0x7f62da5a, + 0xd714d41e, 0xef05bbb2, 0xa7360b46, 0x9f2764ea, + 0xf236613f, 0xca270e93, 0x8214be67, 0xba05d1cb, + 0x1273df8f, 0x2a62b023, 0x625100d7, 0x5a406f7b, + 0xb8730b7d, 0x806264d1, 0xc851d425, 0xf040bb89, + 0x5836b5cd, 0x6027da61, 0x28146a95, 0x10050539, + 0x7d1400ec, 0x45056f40, 0x0d36dfb4, 0x3527b018, + 0x9d51be5c, 0xa540d1f0, 0xed736104, 0xd5620ea8, + 0x2cf9dff9, 0x14e8b055, 0x5cdb00a1, 0x64ca6f0d, + 0xccbc6149, 0xf4ad0ee5, 0xbc9ebe11, 0x848fd1bd, + 0xe99ed468, 0xd18fbbc4, 0x99bc0b30, 0xa1ad649c, + 0x09db6ad8, 0x31ca0574, 0x79f9b580, 0x41e8da2c, + 0xa3dbbe2a, 0x9bcad186, 0xd3f96172, 
0xebe80ede, + 0x439e009a, 0x7b8f6f36, 0x33bcdfc2, 0x0badb06e, + 0x66bcb5bb, 0x5eadda17, 0x169e6ae3, 0x2e8f054f, + 0x86f90b0b, 0xbee864a7, 0xf6dbd453, 0xcecabbff, + 0x6ea2d55c, 0x56b3baf0, 0x1e800a04, 0x269165a8, + 0x8ee76bec, 0xb6f60440, 0xfec5b4b4, 0xc6d4db18, + 0xabc5decd, 0x93d4b161, 0xdbe70195, 0xe3f66e39, + 0x4b80607d, 0x73910fd1, 0x3ba2bf25, 0x03b3d089, + 0xe180b48f, 0xd991db23, 0x91a26bd7, 0xa9b3047b, + 0x01c50a3f, 0x39d46593, 0x71e7d567, 0x49f6bacb, + 0x24e7bf1e, 0x1cf6d0b2, 0x54c56046, 0x6cd40fea, + 0xc4a201ae, 0xfcb36e02, 0xb480def6, 0x8c91b15a, + 0x750a600b, 0x4d1b0fa7, 0x0528bf53, 0x3d39d0ff, + 0x954fdebb, 0xad5eb117, 0xe56d01e3, 0xdd7c6e4f, + 0xb06d6b9a, 0x887c0436, 0xc04fb4c2, 0xf85edb6e, + 0x5028d52a, 0x6839ba86, 0x200a0a72, 0x181b65de, + 0xfa2801d8, 0xc2396e74, 0x8a0ade80, 0xb21bb12c, + 0x1a6dbf68, 0x227cd0c4, 0x6a4f6030, 0x525e0f9c, + 0x3f4f0a49, 0x075e65e5, 0x4f6dd511, 0x777cbabd, + 0xdf0ab4f9, 0xe71bdb55, 0xaf286ba1, 0x9739040d, + 0x59f3bff2, 0x61e2d05e, 0x29d160aa, 0x11c00f06, + 0xb9b60142, 0x81a76eee, 0xc994de1a, 0xf185b1b6, + 0x9c94b463, 0xa485dbcf, 0xecb66b3b, 0xd4a70497, + 0x7cd10ad3, 0x44c0657f, 0x0cf3d58b, 0x34e2ba27, + 0xd6d1de21, 0xeec0b18d, 0xa6f30179, 0x9ee26ed5, + 0x36946091, 0x0e850f3d, 0x46b6bfc9, 0x7ea7d065, + 0x13b6d5b0, 0x2ba7ba1c, 0x63940ae8, 0x5b856544, + 0xf3f36b00, 0xcbe204ac, 0x83d1b458, 0xbbc0dbf4, + 0x425b0aa5, 0x7a4a6509, 0x3279d5fd, 0x0a68ba51, + 0xa21eb415, 0x9a0fdbb9, 0xd23c6b4d, 0xea2d04e1, + 0x873c0134, 0xbf2d6e98, 0xf71ede6c, 0xcf0fb1c0, + 0x6779bf84, 0x5f68d028, 0x175b60dc, 0x2f4a0f70, + 0xcd796b76, 0xf56804da, 0xbd5bb42e, 0x854adb82, + 0x2d3cd5c6, 0x152dba6a, 0x5d1e0a9e, 0x650f6532, + 0x081e60e7, 0x300f0f4b, 0x783cbfbf, 0x402dd013, + 0xe85bde57, 0xd04ab1fb, 0x9879010f, 0xa0686ea3 + },{ + 0x00000000, 0xef306b19, 0xdb8ca0c3, 0x34bccbda, + 0xb2f53777, 0x5dc55c6e, 0x697997b4, 0x8649fcad, + 0x6006181f, 0x8f367306, 0xbb8ab8dc, 0x54bad3c5, + 0xd2f32f68, 0x3dc34471, 0x097f8fab, 0xe64fe4b2, + 0xc00c303e, 0x2f3c5b27, 
0x1b8090fd, 0xf4b0fbe4, + 0x72f90749, 0x9dc96c50, 0xa975a78a, 0x4645cc93, + 0xa00a2821, 0x4f3a4338, 0x7b8688e2, 0x94b6e3fb, + 0x12ff1f56, 0xfdcf744f, 0xc973bf95, 0x2643d48c, + 0x85f4168d, 0x6ac47d94, 0x5e78b64e, 0xb148dd57, + 0x370121fa, 0xd8314ae3, 0xec8d8139, 0x03bdea20, + 0xe5f20e92, 0x0ac2658b, 0x3e7eae51, 0xd14ec548, + 0x570739e5, 0xb83752fc, 0x8c8b9926, 0x63bbf23f, + 0x45f826b3, 0xaac84daa, 0x9e748670, 0x7144ed69, + 0xf70d11c4, 0x183d7add, 0x2c81b107, 0xc3b1da1e, + 0x25fe3eac, 0xcace55b5, 0xfe729e6f, 0x1142f576, + 0x970b09db, 0x783b62c2, 0x4c87a918, 0xa3b7c201, + 0x0e045beb, 0xe13430f2, 0xd588fb28, 0x3ab89031, + 0xbcf16c9c, 0x53c10785, 0x677dcc5f, 0x884da746, + 0x6e0243f4, 0x813228ed, 0xb58ee337, 0x5abe882e, + 0xdcf77483, 0x33c71f9a, 0x077bd440, 0xe84bbf59, + 0xce086bd5, 0x213800cc, 0x1584cb16, 0xfab4a00f, + 0x7cfd5ca2, 0x93cd37bb, 0xa771fc61, 0x48419778, + 0xae0e73ca, 0x413e18d3, 0x7582d309, 0x9ab2b810, + 0x1cfb44bd, 0xf3cb2fa4, 0xc777e47e, 0x28478f67, + 0x8bf04d66, 0x64c0267f, 0x507ceda5, 0xbf4c86bc, + 0x39057a11, 0xd6351108, 0xe289dad2, 0x0db9b1cb, + 0xebf65579, 0x04c63e60, 0x307af5ba, 0xdf4a9ea3, + 0x5903620e, 0xb6330917, 0x828fc2cd, 0x6dbfa9d4, + 0x4bfc7d58, 0xa4cc1641, 0x9070dd9b, 0x7f40b682, + 0xf9094a2f, 0x16392136, 0x2285eaec, 0xcdb581f5, + 0x2bfa6547, 0xc4ca0e5e, 0xf076c584, 0x1f46ae9d, + 0x990f5230, 0x763f3929, 0x4283f2f3, 0xadb399ea, + 0x1c08b7d6, 0xf338dccf, 0xc7841715, 0x28b47c0c, + 0xaefd80a1, 0x41cdebb8, 0x75712062, 0x9a414b7b, + 0x7c0eafc9, 0x933ec4d0, 0xa7820f0a, 0x48b26413, + 0xcefb98be, 0x21cbf3a7, 0x1577387d, 0xfa475364, + 0xdc0487e8, 0x3334ecf1, 0x0788272b, 0xe8b84c32, + 0x6ef1b09f, 0x81c1db86, 0xb57d105c, 0x5a4d7b45, + 0xbc029ff7, 0x5332f4ee, 0x678e3f34, 0x88be542d, + 0x0ef7a880, 0xe1c7c399, 0xd57b0843, 0x3a4b635a, + 0x99fca15b, 0x76ccca42, 0x42700198, 0xad406a81, + 0x2b09962c, 0xc439fd35, 0xf08536ef, 0x1fb55df6, + 0xf9fab944, 0x16cad25d, 0x22761987, 0xcd46729e, + 0x4b0f8e33, 0xa43fe52a, 0x90832ef0, 0x7fb345e9, + 0x59f09165, 0xb6c0fa7c, 
0x827c31a6, 0x6d4c5abf, + 0xeb05a612, 0x0435cd0b, 0x308906d1, 0xdfb96dc8, + 0x39f6897a, 0xd6c6e263, 0xe27a29b9, 0x0d4a42a0, + 0x8b03be0d, 0x6433d514, 0x508f1ece, 0xbfbf75d7, + 0x120cec3d, 0xfd3c8724, 0xc9804cfe, 0x26b027e7, + 0xa0f9db4a, 0x4fc9b053, 0x7b757b89, 0x94451090, + 0x720af422, 0x9d3a9f3b, 0xa98654e1, 0x46b63ff8, + 0xc0ffc355, 0x2fcfa84c, 0x1b736396, 0xf443088f, + 0xd200dc03, 0x3d30b71a, 0x098c7cc0, 0xe6bc17d9, + 0x60f5eb74, 0x8fc5806d, 0xbb794bb7, 0x544920ae, + 0xb206c41c, 0x5d36af05, 0x698a64df, 0x86ba0fc6, + 0x00f3f36b, 0xefc39872, 0xdb7f53a8, 0x344f38b1, + 0x97f8fab0, 0x78c891a9, 0x4c745a73, 0xa344316a, + 0x250dcdc7, 0xca3da6de, 0xfe816d04, 0x11b1061d, + 0xf7fee2af, 0x18ce89b6, 0x2c72426c, 0xc3422975, + 0x450bd5d8, 0xaa3bbec1, 0x9e87751b, 0x71b71e02, + 0x57f4ca8e, 0xb8c4a197, 0x8c786a4d, 0x63480154, + 0xe501fdf9, 0x0a3196e0, 0x3e8d5d3a, 0xd1bd3623, + 0x37f2d291, 0xd8c2b988, 0xec7e7252, 0x034e194b, + 0x8507e5e6, 0x6a378eff, 0x5e8b4525, 0xb1bb2e3c + },{ + 0x00000000, 0x68032cc8, 0xd0065990, 0xb8057558, + 0xa5e0c5d1, 0xcde3e919, 0x75e69c41, 0x1de5b089, + 0x4e2dfd53, 0x262ed19b, 0x9e2ba4c3, 0xf628880b, + 0xebcd3882, 0x83ce144a, 0x3bcb6112, 0x53c84dda, + 0x9c5bfaa6, 0xf458d66e, 0x4c5da336, 0x245e8ffe, + 0x39bb3f77, 0x51b813bf, 0xe9bd66e7, 0x81be4a2f, + 0xd27607f5, 0xba752b3d, 0x02705e65, 0x6a7372ad, + 0x7796c224, 0x1f95eeec, 0xa7909bb4, 0xcf93b77c, + 0x3d5b83bd, 0x5558af75, 0xed5dda2d, 0x855ef6e5, + 0x98bb466c, 0xf0b86aa4, 0x48bd1ffc, 0x20be3334, + 0x73767eee, 0x1b755226, 0xa370277e, 0xcb730bb6, + 0xd696bb3f, 0xbe9597f7, 0x0690e2af, 0x6e93ce67, + 0xa100791b, 0xc90355d3, 0x7106208b, 0x19050c43, + 0x04e0bcca, 0x6ce39002, 0xd4e6e55a, 0xbce5c992, + 0xef2d8448, 0x872ea880, 0x3f2bddd8, 0x5728f110, + 0x4acd4199, 0x22ce6d51, 0x9acb1809, 0xf2c834c1, + 0x7ab7077a, 0x12b42bb2, 0xaab15eea, 0xc2b27222, + 0xdf57c2ab, 0xb754ee63, 0x0f519b3b, 0x6752b7f3, + 0x349afa29, 0x5c99d6e1, 0xe49ca3b9, 0x8c9f8f71, + 0x917a3ff8, 0xf9791330, 0x417c6668, 0x297f4aa0, + 0xe6ecfddc, 
0x8eefd114, 0x36eaa44c, 0x5ee98884, + 0x430c380d, 0x2b0f14c5, 0x930a619d, 0xfb094d55, + 0xa8c1008f, 0xc0c22c47, 0x78c7591f, 0x10c475d7, + 0x0d21c55e, 0x6522e996, 0xdd279cce, 0xb524b006, + 0x47ec84c7, 0x2fefa80f, 0x97eadd57, 0xffe9f19f, + 0xe20c4116, 0x8a0f6dde, 0x320a1886, 0x5a09344e, + 0x09c17994, 0x61c2555c, 0xd9c72004, 0xb1c40ccc, + 0xac21bc45, 0xc422908d, 0x7c27e5d5, 0x1424c91d, + 0xdbb77e61, 0xb3b452a9, 0x0bb127f1, 0x63b20b39, + 0x7e57bbb0, 0x16549778, 0xae51e220, 0xc652cee8, + 0x959a8332, 0xfd99affa, 0x459cdaa2, 0x2d9ff66a, + 0x307a46e3, 0x58796a2b, 0xe07c1f73, 0x887f33bb, + 0xf56e0ef4, 0x9d6d223c, 0x25685764, 0x4d6b7bac, + 0x508ecb25, 0x388de7ed, 0x808892b5, 0xe88bbe7d, + 0xbb43f3a7, 0xd340df6f, 0x6b45aa37, 0x034686ff, + 0x1ea33676, 0x76a01abe, 0xcea56fe6, 0xa6a6432e, + 0x6935f452, 0x0136d89a, 0xb933adc2, 0xd130810a, + 0xccd53183, 0xa4d61d4b, 0x1cd36813, 0x74d044db, + 0x27180901, 0x4f1b25c9, 0xf71e5091, 0x9f1d7c59, + 0x82f8ccd0, 0xeafbe018, 0x52fe9540, 0x3afdb988, + 0xc8358d49, 0xa036a181, 0x1833d4d9, 0x7030f811, + 0x6dd54898, 0x05d66450, 0xbdd31108, 0xd5d03dc0, + 0x8618701a, 0xee1b5cd2, 0x561e298a, 0x3e1d0542, + 0x23f8b5cb, 0x4bfb9903, 0xf3feec5b, 0x9bfdc093, + 0x546e77ef, 0x3c6d5b27, 0x84682e7f, 0xec6b02b7, + 0xf18eb23e, 0x998d9ef6, 0x2188ebae, 0x498bc766, + 0x1a438abc, 0x7240a674, 0xca45d32c, 0xa246ffe4, + 0xbfa34f6d, 0xd7a063a5, 0x6fa516fd, 0x07a63a35, + 0x8fd9098e, 0xe7da2546, 0x5fdf501e, 0x37dc7cd6, + 0x2a39cc5f, 0x423ae097, 0xfa3f95cf, 0x923cb907, + 0xc1f4f4dd, 0xa9f7d815, 0x11f2ad4d, 0x79f18185, + 0x6414310c, 0x0c171dc4, 0xb412689c, 0xdc114454, + 0x1382f328, 0x7b81dfe0, 0xc384aab8, 0xab878670, + 0xb66236f9, 0xde611a31, 0x66646f69, 0x0e6743a1, + 0x5daf0e7b, 0x35ac22b3, 0x8da957eb, 0xe5aa7b23, + 0xf84fcbaa, 0x904ce762, 0x2849923a, 0x404abef2, + 0xb2828a33, 0xda81a6fb, 0x6284d3a3, 0x0a87ff6b, + 0x17624fe2, 0x7f61632a, 0xc7641672, 0xaf673aba, + 0xfcaf7760, 0x94ac5ba8, 0x2ca92ef0, 0x44aa0238, + 0x594fb2b1, 0x314c9e79, 0x8949eb21, 0xe14ac7e9, + 0x2ed97095, 
0x46da5c5d, 0xfedf2905, 0x96dc05cd, + 0x8b39b544, 0xe33a998c, 0x5b3fecd4, 0x333cc01c, + 0x60f48dc6, 0x08f7a10e, 0xb0f2d456, 0xd8f1f89e, + 0xc5144817, 0xad1764df, 0x15121187, 0x7d113d4f + },{ + 0x00000000, 0x493c7d27, 0x9278fa4e, 0xdb448769, + 0x211d826d, 0x6821ff4a, 0xb3657823, 0xfa590504, + 0x423b04da, 0x0b0779fd, 0xd043fe94, 0x997f83b3, + 0x632686b7, 0x2a1afb90, 0xf15e7cf9, 0xb86201de, + 0x847609b4, 0xcd4a7493, 0x160ef3fa, 0x5f328edd, + 0xa56b8bd9, 0xec57f6fe, 0x37137197, 0x7e2f0cb0, + 0xc64d0d6e, 0x8f717049, 0x5435f720, 0x1d098a07, + 0xe7508f03, 0xae6cf224, 0x7528754d, 0x3c14086a, + 0x0d006599, 0x443c18be, 0x9f789fd7, 0xd644e2f0, + 0x2c1de7f4, 0x65219ad3, 0xbe651dba, 0xf759609d, + 0x4f3b6143, 0x06071c64, 0xdd439b0d, 0x947fe62a, + 0x6e26e32e, 0x271a9e09, 0xfc5e1960, 0xb5626447, + 0x89766c2d, 0xc04a110a, 0x1b0e9663, 0x5232eb44, + 0xa86bee40, 0xe1579367, 0x3a13140e, 0x732f6929, + 0xcb4d68f7, 0x827115d0, 0x593592b9, 0x1009ef9e, + 0xea50ea9a, 0xa36c97bd, 0x782810d4, 0x31146df3, + 0x1a00cb32, 0x533cb615, 0x8878317c, 0xc1444c5b, + 0x3b1d495f, 0x72213478, 0xa965b311, 0xe059ce36, + 0x583bcfe8, 0x1107b2cf, 0xca4335a6, 0x837f4881, + 0x79264d85, 0x301a30a2, 0xeb5eb7cb, 0xa262caec, + 0x9e76c286, 0xd74abfa1, 0x0c0e38c8, 0x453245ef, + 0xbf6b40eb, 0xf6573dcc, 0x2d13baa5, 0x642fc782, + 0xdc4dc65c, 0x9571bb7b, 0x4e353c12, 0x07094135, + 0xfd504431, 0xb46c3916, 0x6f28be7f, 0x2614c358, + 0x1700aeab, 0x5e3cd38c, 0x857854e5, 0xcc4429c2, + 0x361d2cc6, 0x7f2151e1, 0xa465d688, 0xed59abaf, + 0x553baa71, 0x1c07d756, 0xc743503f, 0x8e7f2d18, + 0x7426281c, 0x3d1a553b, 0xe65ed252, 0xaf62af75, + 0x9376a71f, 0xda4ada38, 0x010e5d51, 0x48322076, + 0xb26b2572, 0xfb575855, 0x2013df3c, 0x692fa21b, + 0xd14da3c5, 0x9871dee2, 0x4335598b, 0x0a0924ac, + 0xf05021a8, 0xb96c5c8f, 0x6228dbe6, 0x2b14a6c1, + 0x34019664, 0x7d3deb43, 0xa6796c2a, 0xef45110d, + 0x151c1409, 0x5c20692e, 0x8764ee47, 0xce589360, + 0x763a92be, 0x3f06ef99, 0xe44268f0, 0xad7e15d7, + 0x572710d3, 0x1e1b6df4, 0xc55fea9d, 0x8c6397ba, + 
0xb0779fd0, 0xf94be2f7, 0x220f659e, 0x6b3318b9, + 0x916a1dbd, 0xd856609a, 0x0312e7f3, 0x4a2e9ad4, + 0xf24c9b0a, 0xbb70e62d, 0x60346144, 0x29081c63, + 0xd3511967, 0x9a6d6440, 0x4129e329, 0x08159e0e, + 0x3901f3fd, 0x703d8eda, 0xab7909b3, 0xe2457494, + 0x181c7190, 0x51200cb7, 0x8a648bde, 0xc358f6f9, + 0x7b3af727, 0x32068a00, 0xe9420d69, 0xa07e704e, + 0x5a27754a, 0x131b086d, 0xc85f8f04, 0x8163f223, + 0xbd77fa49, 0xf44b876e, 0x2f0f0007, 0x66337d20, + 0x9c6a7824, 0xd5560503, 0x0e12826a, 0x472eff4d, + 0xff4cfe93, 0xb67083b4, 0x6d3404dd, 0x240879fa, + 0xde517cfe, 0x976d01d9, 0x4c2986b0, 0x0515fb97, + 0x2e015d56, 0x673d2071, 0xbc79a718, 0xf545da3f, + 0x0f1cdf3b, 0x4620a21c, 0x9d642575, 0xd4585852, + 0x6c3a598c, 0x250624ab, 0xfe42a3c2, 0xb77edee5, + 0x4d27dbe1, 0x041ba6c6, 0xdf5f21af, 0x96635c88, + 0xaa7754e2, 0xe34b29c5, 0x380faeac, 0x7133d38b, + 0x8b6ad68f, 0xc256aba8, 0x19122cc1, 0x502e51e6, + 0xe84c5038, 0xa1702d1f, 0x7a34aa76, 0x3308d751, + 0xc951d255, 0x806daf72, 0x5b29281b, 0x1215553c, + 0x230138cf, 0x6a3d45e8, 0xb179c281, 0xf845bfa6, + 0x021cbaa2, 0x4b20c785, 0x906440ec, 0xd9583dcb, + 0x613a3c15, 0x28064132, 0xf342c65b, 0xba7ebb7c, + 0x4027be78, 0x091bc35f, 0xd25f4436, 0x9b633911, + 0xa777317b, 0xee4b4c5c, 0x350fcb35, 0x7c33b612, + 0x866ab316, 0xcf56ce31, 0x14124958, 0x5d2e347f, + 0xe54c35a1, 0xac704886, 0x7734cfef, 0x3e08b2c8, + 0xc451b7cc, 0x8d6dcaeb, 0x56294d82, 0x1f1530a5 + } +#endif +}; + +/* + * __wt_cksum_sw -- + * Return a checksum for a chunk of memory, computed in software. + * + * Slicing-by-8 algorithm by Michael E. Kounavis and Frank L. Berry from + * Intel Corp.: + * http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf + * + * Based on Peter Kanowski's posting: + * http://www.strchr.com/crc32_popcnt + * + * The big endian version calculates the same result at each step, except the + * value of the crc is byte reversed from what it would be at that step for + * little endian. 
 */
static uint32_t
__wt_cksum_sw(const void *chunk, size_t len)
{
	uint32_t crc, next;
	size_t nqwords;
	const uint8_t *p;

	/* CRC32C convention: start from all-ones, invert on return. */
	crc = 0xffffffff;

	/* Checksum one byte at a time to the first 4B boundary. */
	for (p = chunk;
	    ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 &&
	    len > 0; ++p, --len)
#ifdef WORDS_BIGENDIAN
		crc = g_crc_slicing[0][((crc >> 24) ^ *p) & 0xFF] ^ (crc << 8);
#else
		crc = g_crc_slicing[0][(crc ^ *p) & 0xFF] ^ (crc >> 8);
#endif

	/*
	 * Checksum in 8B chunks: consume two 32-bit words per iteration and
	 * combine eight table lookups (slicing-by-8).  The alignment loop
	 * above guarantees p is 4B aligned for the 32-bit loads.
	 */
	for (nqwords = len / sizeof(uint64_t); nqwords; nqwords--) {
		crc ^= *(uint32_t *)p;
		p += sizeof(uint32_t);
		next = *(uint32_t *)p;
		p += sizeof(uint32_t);
		crc =
#ifdef WORDS_BIGENDIAN
		    g_crc_slicing[4][(crc	) & 0xFF] ^
		    g_crc_slicing[5][(crc >> 8) & 0xFF] ^
		    g_crc_slicing[6][(crc >> 16) & 0xFF] ^
		    g_crc_slicing[7][(crc >> 24)] ^
		    g_crc_slicing[0][(next	) & 0xFF] ^
		    g_crc_slicing[1][(next >> 8) & 0xFF] ^
		    g_crc_slicing[2][(next >> 16) & 0xFF] ^
		    g_crc_slicing[3][(next >> 24)];
#else
		    g_crc_slicing[7][(crc	) & 0xFF] ^
		    g_crc_slicing[6][(crc >> 8) & 0xFF] ^
		    g_crc_slicing[5][(crc >> 16) & 0xFF] ^
		    g_crc_slicing[4][(crc >> 24)] ^
		    g_crc_slicing[3][(next	) & 0xFF] ^
		    g_crc_slicing[2][(next >> 8) & 0xFF] ^
		    g_crc_slicing[1][(next >> 16) & 0xFF] ^
		    g_crc_slicing[0][(next >> 24)];
#endif
	}

	/*
	 * Checksum trailing bytes one byte at a time.  len &= 0x7 is the
	 * remainder after the 8-byte chunks above.
	 */
#ifdef WORDS_BIGENDIAN
	for (len &= 0x7; len > 0; ++p, len--)
		crc = g_crc_slicing[0][((crc >> 24) ^ *p) & 0xFF] ^ (crc << 8);

	/* Do final byte swap to produce a result identical to little endian */
	crc =
	    ((crc << 24) & 0xFF000000) |
	    ((crc << 8) & 0x00FF0000) |
	    ((crc >> 8) & 0x0000FF00) |
	    ((crc >> 24) & 0x000000FF);
#else
	for (len &= 0x7; len > 0; ++p, len--)
		crc = g_crc_slicing[0][(crc ^ *p) & 0xFF] ^ (crc >> 8);
#endif
	return (~crc);
}

#if (defined(__amd64) || defined(__x86_64))
/*
 * __wt_cksum_hw --
 *	Return a checksum for a chunk of memory, computed in hardware
 * using 8 byte steps.
 */
static uint32_t
__wt_cksum_hw(const void *chunk, size_t len)
{
	uint32_t crc;
	size_t nqwords;
	const uint8_t *p;
	const uint64_t *p64;

	crc = 0xffffffff;

	/* Checksum one byte at a time to the first 4B boundary. */
	for (p = chunk;
	    ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 &&
	    len > 0; ++p, --len) {
		/* Hand-encoded SSE4.2 crc32b %cl, %esi (byte-wide CRC32C). */
		__asm__ __volatile__(
		    ".byte 0xF2, 0x0F, 0x38, 0xF0, 0xF1"
		    : "=S" (crc)
		    : "0" (crc), "c" (*p));
	}

	p64 = (const uint64_t *)p;
	/* Checksum in 8B chunks. */
	for (nqwords = len / sizeof(uint64_t); nqwords; nqwords--) {
		/* Hand-encoded SSE4.2 crc32q %rcx, %rsi (8-byte CRC32C). */
		__asm__ __volatile__ (
		    ".byte 0xF2, 0x48, 0x0F, 0x38, 0xF1, 0xF1"
		    : "=S"(crc)
		    : "0"(crc), "c" (*p64));
		p64++;
	}

	/* Checksum trailing bytes one byte at a time. */
	p = (const uint8_t *)p64;
	for (len &= 0x7; len > 0; ++p, len--) {
		__asm__ __volatile__(
		    ".byte 0xF2, 0x0F, 0x38, 0xF0, 0xF1"
		    : "=S" (crc)
		    : "0" (crc), "c" (*p));
	}
	return (~crc);
}
#endif

#if defined(_M_AMD64)
/*
 * __wt_cksum_hw --
 *	Return a checksum for a chunk of memory, computed in hardware
 * using 8 byte steps.  MSVC variant using the SSE4.2 intrinsics.
 */
static uint32_t
__wt_cksum_hw(const void *chunk, size_t len)
{
	uint32_t crc;
	size_t nqwords;
	const uint8_t *p;
	const uint64_t *p64;

	crc = 0xffffffff;

	/* Checksum one byte at a time to the first 4B boundary.
*/ + for (p = chunk; + ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 && + len > 0; ++p, --len) { + crc = _mm_crc32_u8(crc, *p); + } + + p64 = (const uint64_t *)p; + /* Checksum in 8B chunks. */ + for (nqwords = len / sizeof(uint64_t); nqwords; nqwords--) { + crc = (uint32_t)_mm_crc32_u64(crc, *p64); + p64++; + } + + /* Checksum trailing bytes one byte at a time. */ + p = (const uint8_t *)p64; + for (len &= 0x7; len > 0; ++p, len--) { + crc = _mm_crc32_u8(crc, *p); + } + + return (~crc); +} +#endif + +/* + * __wt_cksum -- + * Return a checksum for a chunk of memory using the fastest method + * available. + */ +uint32_t +__wt_cksum(const void *chunk, size_t len) +{ + return (*__wt_cksum_func)(chunk, len); +} + +/* + * __wt_cksum_init -- + * Detect CRC hardware and set the checksum function. + */ +void +__wt_cksum_init(void) +{ +#define CPUID_ECX_HAS_SSE42 (1 << 20) + +#if (defined(__amd64) || defined(__x86_64)) + unsigned int eax, ebx, ecx, edx; + + __asm__ __volatile__ ( + "cpuid" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "a" (1)); + + if (ecx & CPUID_ECX_HAS_SSE42) + __wt_cksum_func = __wt_cksum_hw; + else + __wt_cksum_func = __wt_cksum_sw; + +#elif defined(_M_AMD64) + int cpuInfo[4]; + + __cpuid(cpuInfo, 1); + + if (cpuInfo[2] & CPUID_ECX_HAS_SSE42) + __wt_cksum_func = __wt_cksum_hw; + else + __wt_cksum_func = __wt_cksum_sw; +#else + __wt_cksum_func = __wt_cksum_sw; +#endif +} diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c new file mode 100644 index 00000000000..3e874078fbf --- /dev/null +++ b/src/third_party/wiredtiger/src/support/err.c @@ -0,0 +1,527 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __handle_error_default -- + * Default WT_EVENT_HANDLER->handle_error implementation: send to stderr. 
+ */ +static int +__handle_error_default(WT_EVENT_HANDLER *handler, + WT_SESSION *session, int error, const char *errmsg) +{ + WT_UNUSED(handler); + WT_UNUSED(session); + WT_UNUSED(error); + + return (fprintf(stderr, "%s\n", errmsg) >= 0 && + fflush(stderr) == 0 ? 0 : __wt_errno()); +} + +/* + * __handle_message_default -- + * Default WT_EVENT_HANDLER->handle_message implementation: send to stdout. + */ +static int +__handle_message_default(WT_EVENT_HANDLER *handler, + WT_SESSION *session, const char *message) +{ + WT_UNUSED(handler); + WT_UNUSED(session); + + return (printf("%s\n", message) >= 0 && + fflush(stdout) == 0 ? 0 : __wt_errno()); +} + +/* + * __handle_progress_default -- + * Default WT_EVENT_HANDLER->handle_progress implementation: ignore. + */ +static int +__handle_progress_default(WT_EVENT_HANDLER *handler, + WT_SESSION *session, const char *operation, uint64_t progress) +{ + WT_UNUSED(handler); + WT_UNUSED(session); + WT_UNUSED(operation); + WT_UNUSED(progress); + + return (0); +} + +/* + * __handle_close_default -- + * Default WT_EVENT_HANDLER->handle_close implementation: ignore. + */ +static int +__handle_close_default(WT_EVENT_HANDLER *handler, + WT_SESSION *session, WT_CURSOR *cursor) +{ + WT_UNUSED(handler); + WT_UNUSED(session); + WT_UNUSED(cursor); + + return (0); +} + +static WT_EVENT_HANDLER __event_handler_default = { + __handle_error_default, + __handle_message_default, + __handle_progress_default, + __handle_close_default +}; + +/* + * __handler_failure -- + * Report the failure of an application-configured event handler. + */ +static void +__handler_failure(WT_SESSION_IMPL *session, + int error, const char *which, int error_handler_failed) +{ + WT_EVENT_HANDLER *handler; + WT_SESSION *wt_session; + + /* + * !!! + * SECURITY: + * Buffer placed at the end of the stack in case snprintf overflows. 
+ */ + char s[256]; + + (void)snprintf(s, sizeof(s), + "application %s event handler failed: %s", + which, wiredtiger_strerror(error)); + + /* + * Use the error handler to report the failure, unless it was the error + * handler that failed. If it was the error handler that failed, or a + * call to the error handler fails, use the default error handler. + */ + wt_session = (WT_SESSION *)session; + handler = session->event_handler; + if (!error_handler_failed && + handler->handle_error != __handle_error_default && + handler->handle_error(handler, wt_session, error, s) == 0) + return; + + (void)__handle_error_default(NULL, wt_session, error, s); +} + +/* + * __wt_event_handler_set -- + * Set an event handler, fill in any NULL methods with the defaults. + */ +void +__wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler) +{ + if (handler == NULL) + handler = &__event_handler_default; + else { + if (handler->handle_error == NULL) + handler->handle_error = __handle_error_default; + if (handler->handle_message == NULL) + handler->handle_message = __handle_message_default; + if (handler->handle_progress == NULL) + handler->handle_progress = __handle_progress_default; + } + + session->event_handler = handler; +} + +/* + * __wt_eventv -- + * Report a message to an event handler. + */ +int +__wt_eventv(WT_SESSION_IMPL *session, int msg_event, int error, + const char *file_name, int line_number, const char *fmt, va_list ap) +{ + WT_EVENT_HANDLER *handler; + WT_DECL_RET; + WT_SESSION *wt_session; + struct timespec ts; + size_t len, remain, wlen; + int prefix_cnt; + const char *err, *prefix; + char *end, *p, tid[128]; + + /* + * We're using a stack buffer because we want error messages no matter + * what, and allocating a WT_ITEM, or the memory it needs, might fail. + * + * !!! + * SECURITY: + * Buffer placed at the end of the stack in case snprintf overflows. + */ + char s[2048]; + + /* + * !!! + * This function MUST handle a NULL WT_SESSION_IMPL handle. 
 *
 * Without a session, we don't have event handlers or prefixes for the
 * error message.  Write the error to stderr and call it a day.  (It's
 * almost impossible for that to happen given how early we allocate the
 * first session, but if the allocation of the first session fails, for
 * example, we can end up here without a session.)
 */
	if (session == NULL)
		return (fprintf(stderr, "WiredTiger Error%s%s\n",
		    error == 0 ? "" : ": ",
		    error == 0 ? "" : wiredtiger_strerror(error)) >= 0 &&
		    fflush(stderr) == 0 ? 0 : __wt_errno());

	p = s;
	end = s + sizeof(s);

	/*
	 * We have several prefixes for the error message:
	 * a timestamp and the process and thread ids, the database error
	 * prefix, the data-source's name, and the session's name.  Write them
	 * as a comma-separate list, followed by a colon.
	 *
	 * Truncation handling throughout: when snprintf reports a result
	 * that didn't fit (wlen >= remain), p is pinned to end, so every
	 * subsequent write sees remain == 0 and becomes a no-op.
	 */
	prefix_cnt = 0;
	if (__wt_epoch(session, &ts) == 0) {
		__wt_thread_id(tid, sizeof(tid));
		remain = WT_PTRDIFF(end, p);
		wlen = (size_t)snprintf(p, remain,
		    "[%" PRIuMAX ":%" PRIuMAX "][%s]",
		    (uintmax_t)ts.tv_sec, (uintmax_t)ts.tv_nsec / 1000, tid);
		p = wlen >= remain ? end : p + wlen;
		prefix_cnt = 1;
	}
	/* Connection-wide error prefix, if configured. */
	if ((prefix = S2C(session)->error_prefix) != NULL) {
		remain = WT_PTRDIFF(end, p);
		wlen = (size_t)snprintf(p, remain,
		    "%s%s", prefix_cnt == 0 ? "" : ", ", prefix);
		p = wlen >= remain ? end : p + wlen;
		prefix_cnt = 1;
	}
	/* Name of the data handle the session has open, if any. */
	prefix = session->dhandle == NULL ? NULL : session->dhandle->name;
	if (prefix != NULL) {
		remain = WT_PTRDIFF(end, p);
		wlen = (size_t)snprintf(p, remain,
		    "%s%s", prefix_cnt == 0 ? "" : ", ", prefix);
		p = wlen >= remain ? end : p + wlen;
		prefix_cnt = 1;
	}
	/* Session name, if set. */
	if ((prefix = session->name) != NULL) {
		remain = WT_PTRDIFF(end, p);
		wlen = (size_t)snprintf(p, remain,
		    "%s%s", prefix_cnt == 0 ? "" : ", ", prefix);
		p = wlen >= remain ? end : p + wlen;
		prefix_cnt = 1;
	}
	if (prefix_cnt != 0) {
		remain = WT_PTRDIFF(end, p);
		wlen = (size_t)snprintf(p, remain, ": ");
		p = wlen >= remain ? end : p + wlen;
	}

	if (file_name != NULL) {
		remain = WT_PTRDIFF(end, p);
		wlen = (size_t)
		    snprintf(p, remain, "%s, %d: ", file_name, line_number);
		p = wlen >= remain ? end : p + wlen;
	}

	/* The caller's formatted message itself. */
	remain = WT_PTRDIFF(end, p);
	wlen = (size_t)vsnprintf(p, remain, fmt, ap);
	p = wlen >= remain ? end : p + wlen;

	if (error != 0) {
		/*
		 * When the engine calls __wt_err on error, it often outputs an
		 * error message including the string associated with the error
		 * it's returning.  We could change the calls to call __wt_errx,
		 * but it's simpler to not append an error string if all we are
		 * doing is duplicating an existing error string.
		 *
		 * Use strcmp to compare: both strings are nul-terminated, and
		 * we don't want to run past the end of the buffer.
		 */
		err = wiredtiger_strerror(error);
		len = strlen(err);
		if (WT_PTRDIFF(p, s) < len || strcmp(p - len, err) != 0) {
			remain = WT_PTRDIFF(end, p);
			(void)snprintf(p, remain, ": %s", err);
		}
	}

	/*
	 * If a handler fails, return the error status: if we're in the process
	 * of handling an error, any return value we provide will be ignored by
	 * our caller, our caller presumably already has an error value it will
	 * be returning.
	 *
	 * If an application-specified or default informational message handler
	 * fails, complain using the application-specified or default error
	 * handler.
	 *
	 * If an application-specified error message handler fails, complain
	 * using the default error handler.  If the default error handler fails,
	 * there's nothing to do.
 */
	wt_session = (WT_SESSION *)session;
	handler = session->event_handler;
	if (msg_event) {
		ret = handler->handle_message(handler, wt_session, s);
		if (ret != 0)
			__handler_failure(session, ret, "message", 0);
	} else {
		ret = handler->handle_error(handler, wt_session, error, s);
		/* Don't recurse if the default error handler itself failed. */
		if (ret != 0 && handler->handle_error != __handle_error_default)
			__handler_failure(session, ret, "error", 1);
	}

	return (ret);
}

/*
 * __wt_err --
 *	Report an error.
 */
void
__wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...)
    WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
{
	va_list ap;

	/*
	 * Ignore error returns from underlying event handlers, we already have
	 * an error value to return.
	 */
	va_start(ap, fmt);
	(void)__wt_eventv(session, 0, error, NULL, 0, fmt, ap);
	va_end(ap);
}

/*
 * __wt_errx --
 *	Report an error with no error code.
 */
void
__wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...)
    WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
{
	va_list ap;

	/*
	 * Ignore error returns from underlying event handlers, we already have
	 * an error value to return.
	 */
	va_start(ap, fmt);
	(void)__wt_eventv(session, 0, 0, NULL, 0, fmt, ap);
	va_end(ap);
}

/*
 * __wt_ext_err_printf --
 *	Extension API call to print to the error stream.
 */
int
__wt_ext_err_printf(
    WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...)
    WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	va_list ap;

	/* Fall back to the connection's default session if none given. */
	if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
		session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;

	va_start(ap, fmt);
	ret = __wt_eventv(session, 0, 0, NULL, 0, fmt, ap);
	va_end(ap);
	return (ret);
}

/*
 * info_msg --
 *	Informational message.
 */
static int
info_msg(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
{
	WT_EVENT_HANDLER *handler;
	WT_SESSION *wt_session;

	/*
	 * !!!
	 * SECURITY:
	 * Buffer placed at the end of the stack in case snprintf overflows.
	 */
	char s[2048];

	(void)vsnprintf(s, sizeof(s), fmt, ap);

	wt_session = (WT_SESSION *)session;
	handler = session->event_handler;
	return (handler->handle_message(handler, wt_session, s));
}

/*
 * __wt_msg --
 *	Informational message.
 */
int
__wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...)
    WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
{
	WT_DECL_RET;
	va_list ap;

	va_start(ap, fmt);
	ret = info_msg(session, fmt, ap);
	va_end(ap);

	return (ret);
}

/*
 * __wt_ext_msg_printf --
 *	Extension API call to print to the message stream.
 */
int
__wt_ext_msg_printf(
    WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...)
    WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	va_list ap;

	/* Fall back to the connection's default session if none given. */
	if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
		session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;

	va_start(ap, fmt);
	ret = info_msg(session, fmt, ap);
	va_end(ap);
	return (ret);
}

/*
 * __wt_progress --
 *	Progress message.  Handler failures are reported but deliberately
 * not returned to the caller: progress reporting is best-effort.
 */
int
__wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v)
{
	WT_DECL_RET;
	WT_EVENT_HANDLER *handler;
	WT_SESSION *wt_session;

	wt_session = (WT_SESSION *)session;
	handler = session->event_handler;
	if (handler != NULL && handler->handle_progress != NULL)
		if ((ret = handler->handle_progress(handler,
		    wt_session, s == NULL ? session->name : s, v)) != 0)
			__handler_failure(session, ret, "progress", 0);
	return (0);
}

/*
 * __wt_assert --
 *	Assert and other unexpected failures, includes file/line information
 * for debugging.
 */
void
__wt_assert(WT_SESSION_IMPL *session,
    int error, const char *file_name, int line_number, const char *fmt, ...)
    WT_GCC_FUNC_ATTRIBUTE((format (printf, 5, 6)))
{
	va_list ap;

	va_start(ap, fmt);
	(void)__wt_eventv(session, 0, error, file_name, line_number, fmt, ap);
	va_end(ap);

#ifdef HAVE_DIAGNOSTIC
	__wt_abort(session);			/* Drop core if testing. */
	/* NOTREACHED */
#endif
}

/*
 * __wt_panic --
 *	A standard error message when we panic.  Sets the connection panic
 * flag; in diagnostic builds the process aborts instead of returning.
 */
int
__wt_panic(WT_SESSION_IMPL *session)
{
	F_SET(S2C(session), WT_CONN_PANIC);
	__wt_errx(session, "%s",
	    "the WiredTiger library cannot continue; the process must exit "
	    "and restart");

#if !defined(HAVE_DIAGNOSTIC)
	/*
	 * Chaos reigns within.
	 * Reflect, repent, and reboot.
	 * Order shall return.
	 */
	return (WT_PANIC);
#endif

	__wt_abort(session);			/* Drop core if testing. */
	/* NOTREACHED */
}

/*
 * __wt_illegal_value --
 *	A standard error message when we detect an illegal value.  In
 * non-diagnostic builds this escalates to panic; diagnostic builds abort.
 */
int
__wt_illegal_value(WT_SESSION_IMPL *session, const char *name)
{
	__wt_errx(session, "%s%s%s",
	    name == NULL ? "" : name, name == NULL ? "" : ": ",
	    "encountered an illegal file format or internal value");

#if !defined(HAVE_DIAGNOSTIC)
	return (__wt_panic(session));
#endif

	__wt_abort(session);			/* Drop core if testing. */
	/* NOTREACHED */
}

/*
 * __wt_object_unsupported --
 *	Print a standard error message for an object that doesn't support a
 * particular operation.  Always returns ENOTSUP.
 */
int
__wt_object_unsupported(WT_SESSION_IMPL *session, const char *uri)
{
	WT_RET_MSG(session, ENOTSUP, "unsupported object operation: %s", uri);
}

/*
 * __wt_bad_object_type --
 *	Print a standard error message when given an unknown or unsupported
 * object type.
 */
int
__wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri)
{
	/*
	 * Distinguish "real WiredTiger URI scheme, operation unsupported"
	 * from "URI scheme we've never heard of".
	 */
	if (WT_PREFIX_MATCH(uri, "backup:") ||
	    WT_PREFIX_MATCH(uri, "colgroup:") ||
	    WT_PREFIX_MATCH(uri, "config:") ||
	    WT_PREFIX_MATCH(uri, "file:") ||
	    WT_PREFIX_MATCH(uri, "index:") ||
	    WT_PREFIX_MATCH(uri, "log:") ||
	    WT_PREFIX_MATCH(uri, "lsm:") ||
	    WT_PREFIX_MATCH(uri, "statistics:") ||
	    WT_PREFIX_MATCH(uri, "table:"))
		return (__wt_object_unsupported(session, uri));

	WT_RET_MSG(session, ENOTSUP, "unknown object type: %s", uri);
}
diff --git a/src/third_party/wiredtiger/src/support/filename.c b/src/third_party/wiredtiger/src/support/filename.c
new file mode 100644
index 00000000000..bd5d03fa633
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/filename.c
@@ -0,0 +1,49 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_filename --
 *	Build a file name in a scratch buffer, automatically calculate the
 * length of the file name.  Convenience wrapper over __wt_nfilename for
 * nul-terminated names.
 */
int
__wt_filename(WT_SESSION_IMPL *session, const char *name, char **path)
{
	return (__wt_nfilename(session, name, strlen(name), path));
}

/*
 * __wt_nfilename --
 *	Build a file name in a scratch buffer.  If the name is already an
 * absolute path duplicate it, otherwise generate a path relative to the
 * connection home directory.
 */
int
__wt_nfilename(
    WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path)
{
	WT_CONNECTION_IMPL *conn;
	size_t len;
	char *buf;

	conn = S2C(session);
	*path = NULL;

	if (__wt_absolute_path(name))
		WT_RET(__wt_strndup(session, name, namelen, path));
	else {
		/* home + separator + name + trailing nul. */
		len = strlen(conn->home) + 1 + namelen + 1;
		WT_RET(__wt_calloc(session, 1, len, &buf));
		snprintf(buf, len, "%s%s%.*s",
		    conn->home, __wt_path_separator(), (int)namelen, name);
		/* Caller owns and must free the returned allocation. */
		*path = buf;
	}

	return (0);
}
diff --git a/src/third_party/wiredtiger/src/support/global.c b/src/third_party/wiredtiger/src/support/global.c
new file mode 100644
index 00000000000..10f718d57f7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/global.c
@@ -0,0 +1,118 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

WT_PROCESS __wt_process;		/* Per-process structure */
static int __wt_pthread_once_failed;	/* If initialization failed */

/*
 * __system_is_little_endian --
 *	Check if the system is little endian.  Returns 0 on little-endian
 * hosts, EINVAL (after printing a message) on big-endian hosts.
 */
static int
__system_is_little_endian(void)
{
	uint64_t v;
	int little;

	/* On a little-endian host the low-order byte of 1 is non-zero. */
	v = 1;
	little = *((uint8_t *)&v) == 0 ? 0 : 1;

	if (little)
		return (0);

	fprintf(stderr,
	    "This release of the WiredTiger data engine does not support "
	    "big-endian systems; contact WiredTiger for more information.\n");
	return (EINVAL);
}

/*
 * __wt_global_once --
 *	Global initialization, run once.  Failures are recorded in
 * __wt_pthread_once_failed for __wt_library_init to report.
 */
static void
__wt_global_once(void)
{
	WT_DECL_RET;

	if ((ret = __system_is_little_endian()) != 0) {
		__wt_pthread_once_failed = ret;
		return;
	}

	if ((ret =
	    __wt_spin_init(NULL, &__wt_process.spinlock, "global")) != 0) {
		__wt_pthread_once_failed = ret;
		return;
	}

	/* Select the hardware or software checksum implementation. */
	__wt_cksum_init();

	TAILQ_INIT(&__wt_process.connqh);

#ifdef HAVE_DIAGNOSTIC
	/* Load debugging code the compiler might optimize out.
*/ + (void)__wt_breakpoint(); +#endif +} + +/* + * __wt_library_init -- + * Some things to do, before we do anything else. + */ +int +__wt_library_init(void) +{ + static int first = 1; + WT_DECL_RET; + + /* + * Do per-process initialization once, before anything else, but only + * once. I don't know how heavy-weight the function (pthread_once, in + * the POSIX world), might be, so I'm front-ending it with a local + * static and only using that function to avoid a race. + */ + if (first) { + if ((ret = __wt_once(__wt_global_once)) != 0) + __wt_pthread_once_failed = ret; + first = 0; + } + return (__wt_pthread_once_failed); +} + +#ifdef HAVE_DIAGNOSTIC +/* + * __wt_breakpoint -- + * A simple place to put a breakpoint, if you need one. + */ +int +__wt_breakpoint(void) +{ + return (0); +} + +/* + * __wt_attach -- + * A routine to wait for the debugging to attach. + */ +void +__wt_attach(WT_SESSION_IMPL *session) +{ +#ifdef HAVE_ATTACH + __wt_errx(session, "process ID %" PRIdMAX + ": waiting for debugger...", (intmax_t)getpid()); + + /* Sleep forever, the debugger will interrupt us when it attaches. */ + for (;;) + __wt_sleep(100, 0); +#else + WT_UNUSED(session); +#endif +} +#endif diff --git a/src/third_party/wiredtiger/src/support/hash_city.c b/src/third_party/wiredtiger/src/support/hash_city.c new file mode 100644 index 00000000000..c6978f6bfe6 --- /dev/null +++ b/src/third_party/wiredtiger/src/support/hash_city.c @@ -0,0 +1,323 @@ +/*- + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. 
We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * Copyright (c) 2011 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + * CityHash, by Geoff Pike and Jyrki Alakuijala + * + * This file provides CityHash64() and related functions. + * + * It's probably possible to create even faster hash functions by + * writing a program that systematically explores some of the space of + * possible hash functions, by using SIMD instructions, or by + * compromising on hash quality. + */ + +#include <string.h> +#include "wt_internal.h" + +/* + * Google City Hash implementation. Based on source code from: + * http://code.google.com/p/cityhash/ + */ + +typedef struct _uint128 uint128; +struct _uint128 { + uint64_t first; + uint64_t second; +}; + +#define Uint128Low64(x) (x).first +#define Uint128High64(x) (x).second + +static uint64_t UNALIGNED_LOAD64(const char *p) { + uint64_t result; + memcpy(&result, p, sizeof(result)); + return (result); +} + +static uint32_t UNALIGNED_LOAD32(const char *p) { + uint32_t result; + memcpy(&result, p, sizeof(result)); + return (result); +} + +#if !defined(WORDS_BIGENDIAN) + +#define uint32_in_expected_order(x) (x) +#define uint64_in_expected_order(x) (x) + +#else + +#ifdef __APPLE__ +/* Mac OS X / Darwin features */ +#include <libkern/OSByteOrder.h> +#define bswap_32(x) OSSwapInt32(x) +#define bswap_64(x) OSSwapInt64(x) + +#else +#include <byteswap.h> +#endif + +#define uint32_in_expected_order(x) (bswap_32(x)) +#define uint64_in_expected_order(x) (bswap_64(x)) + +#endif /* WORDS_BIGENDIAN */ + +static uint64_t Fetch64(const char *p) { + return uint64_in_expected_order(UNALIGNED_LOAD64(p)); +} + +static uint32_t Fetch32(const char *p) { + return uint32_in_expected_order(UNALIGNED_LOAD32(p)); +} + +/* Some primes between 2^63 and 2^64 for various uses. */ +static const uint64_t k0 = 0xc3a5c85c97cb3127ULL; +static const uint64_t k1 = 0xb492b66fbe98f273ULL; +static const uint64_t k2 = 0x9ae16a3b2f90404fULL; +static const uint64_t k3 = 0xc949d7c7509e6557ULL; + +/* + * Hash 128 input bits down to 64 bits of output. 
+ * This is intended to be a reasonably good hash function. + */ +static inline uint64_t Hash128to64(const uint128 x) { + /* Murmur-inspired hashing. */ + const uint64_t kMul = 0x9ddfea08eb382d69ULL; + uint64_t a, b; + + a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; + a ^= (a >> 47); + b = (Uint128High64(x) ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return (b); +} + +/* + * Bitwise right rotate. Normally this will compile to a single + * instruction, especially if the shift is a manifest constant. + */ +static uint64_t Rotate(uint64_t val, int shift) { + /* Avoid shifting by 64: doing so yields an undefined result. */ + return shift == 0 ? val : ((val >> shift) | (val << (64 - shift))); +} + +/* + * Equivalent to Rotate(), but requires the second arg to be non-zero. + * On x86-64, and probably others, it's possible for this to compile + * to a single instruction if both args are already in registers. + */ +static uint64_t RotateByAtLeast1(uint64_t val, int shift) { + return (val >> shift) | (val << (64 - shift)); +} + +static uint64_t ShiftMix(uint64_t val) { + return val ^ (val >> 47); +} + +static uint64_t HashLen16(uint64_t u, uint64_t v) { + uint128 result; + + result.first = u; + result.second = v; + return Hash128to64(result); +} + +static uint64_t HashLen0to16(const char *s, size_t len) { + uint64_t a64, b64; + uint32_t y, z; + uint8_t a8, b8, c8; + if (len > 8) { + a64 = Fetch64(s); + b64 = Fetch64(s + len - 8); + return HashLen16( + a64, RotateByAtLeast1(b64 + len, (int)len)) ^ b64; + } + if (len >= 4) { + a64 = Fetch32(s); + return HashLen16(len + (a64 << 3), Fetch32(s + len - 4)); + } + if (len > 0) { + a8 = (uint8_t)s[0]; + b8 = (uint8_t)s[len >> 1]; + c8 = (uint8_t)s[len - 1]; + y = (uint32_t)(a8) + ((uint32_t)(b8) << 8); + z = (uint32_t)len + ((uint32_t)(c8) << 2); + return ShiftMix(y * k2 ^ z * k3) * k2; + } + return (k2); +} + +/* + * This probably works well for 16-byte strings as well, but it may be overkill + * in that case. 
+ */ +static uint64_t HashLen17to32(const char *s, size_t len) { + uint64_t a = Fetch64(s) * k1; + uint64_t b = Fetch64(s + 8); + uint64_t c = Fetch64(s + len - 8) * k2; + uint64_t d = Fetch64(s + len - 16) * k0; + return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d, + a + Rotate(b ^ k3, 20) + len - c); +} + +/* + * Return a 16-byte hash for 48 bytes. Quick and dirty. + * Callers do best to use "random-looking" values for a and b. + * static pair<uint64, uint64> WeakHashLen32WithSeeds( + */ +static void WeakHashLen32WithSeeds6(uint64_t w, uint64_t x, + uint64_t y, uint64_t z, uint64_t a, uint64_t b, uint128 *ret) { + uint64_t c; + + a += w; + b = Rotate(b + a + z, 21); + c = a; + a += x; + a += y; + b += Rotate(a, 44); + + ret->first = (uint64_t) (a + z); + ret->second = (uint64_t) (b + c); +} + +/* + * Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. + * static pair<uint64, uint64> WeakHashLen32WithSeeds( + */ +static void WeakHashLen32WithSeeds( + const char* s, uint64_t a, uint64_t b, uint128 *ret) { + WeakHashLen32WithSeeds6(Fetch64(s), + Fetch64(s + 8), + Fetch64(s + 16), + Fetch64(s + 24), + a, + b, + ret); +} + +/* Return an 8-byte hash for 33 to 64 bytes. 
*/ +static uint64_t HashLen33to64(const char *s, size_t len) { + uint64_t a, b, c, r, vf, vs, wf, ws, z; + z = Fetch64(s + 24); + a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0; + b = Rotate(a + z, 52); + c = Rotate(a, 37); + a += Fetch64(s + 8); + c += Rotate(a, 7); + a += Fetch64(s + 16); + vf = a + z; + vs = b + Rotate(a, 31) + c; + a = Fetch64(s + 16) + Fetch64(s + len - 32); + z = Fetch64(s + len - 8); + b = Rotate(a + z, 52); + c = Rotate(a, 37); + a += Fetch64(s + len - 24); + c += Rotate(a, 7); + a += Fetch64(s + len - 16); + wf = a + z; + ws = b + Rotate(a, 31) + c; + r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0); + return ShiftMix(r * k0 + vs) * k2; +} + +static inline uint64_t CityHash64(const char *s, size_t len) { + uint64_t temp, x, y, z; + uint128 v, w; + + if (len <= 32) { + if (len <= 16) { + return HashLen0to16(s, len); + } else { + return HashLen17to32(s, len); + } + } else if (len <= 64) { + return HashLen33to64(s, len); + } + + /* + * For strings over 64 bytes we hash the end first, and then as we + * loop we keep 56 bytes of state: v, w, x, y, and z. + */ + x = Fetch64(s + len - 40); + y = Fetch64(s + len - 16) + Fetch64(s + len - 56); + z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24)); + WeakHashLen32WithSeeds(s + len - 64, len, z, &v); + WeakHashLen32WithSeeds(s + len - 32, y + k1, x, &w); + x = x * k1 + Fetch64(s); + + /* + * Use len to count multiples of 64, and operate on 64-byte chunks. 
+ */ + for (len = (len - 1) >> 6; len != 0; len--) { + x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; + y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; + x ^= w.second; + y += v.first + Fetch64(s + 40); + z = Rotate(z + w.first, 33) * k1; + WeakHashLen32WithSeeds(s, v.second * k1, x + w.first, &v); + WeakHashLen32WithSeeds( + s + 32, z + w.second, y + Fetch64(s + 16), &w); + temp = z; + z = x; + x = temp; + s += 64; + } + return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z, + HashLen16(v.second, w.second) + x); +} + +/* + * __wt_hash_city64 -- + * WiredTiger wrapper around third party hash implementation. + */ +uint64_t +__wt_hash_city64(const void *s, size_t len) +{ + return (CityHash64(s, len)); +} diff --git a/src/third_party/wiredtiger/src/support/hash_fnv.c b/src/third_party/wiredtiger/src/support/hash_fnv.c new file mode 100644 index 00000000000..68f8537a4a0 --- /dev/null +++ b/src/third_party/wiredtiger/src/support/hash_fnv.c @@ -0,0 +1,161 @@ +/*- + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code + * + * @(#) $Revision: 5.1 $ + * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $ + * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $ + * + *** + * + * Fowler/Noll/Vo hash + * + * The basis of this hash algorithm was taken from an idea sent + * as reviewer comments to the IEEE POSIX P1003.2 committee by: + * + * Phong Vo (http://www.research.att.com/info/kpv/) + * Glenn Fowler (http://www.research.att.com/~gsf/) + * + * In a subsequent ballot round: + * + * Landon Curt Noll (http://www.isthe.com/chongo/) + * + * improved on their algorithm. Some people tried this hash + * and found that it worked rather well. In an EMail message + * to Landon, they named it the ``Fowler/Noll/Vo'' or FNV hash. + * + * FNV hashes are designed to be fast while maintaining a low + * collision rate. The FNV speed allows one to quickly hash lots + * of data while maintaining a reasonable collision rate. See: + * + * http://www.isthe.com/chongo/tech/comp/fnv/index.html + * + * for more details as well as other forms of the FNV hash. + * + *** + * + * To use the recommended 64 bit FNV-1a hash, pass FNV1A_64_INIT as the + * uint64_t hashval argument to fnv_64a_buf() or fnv_64a_str(). + * + *** + * + * Please do not copyright this code. This code is in the public domain. + * + * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO + * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + * + * By: + * chongo <Landon Curt Noll> /\oo/\ + * http://www.isthe.com/chongo/ + * + * Share and Enjoy! :-) + */ + +#include <stdlib.h> +#include "wt_internal.h" + +/* + * This file contains a 64 bit hash implementation of the FNV 1a 64 bit hash + * function. The implementation is from a third party. + * + * The code has been updated to remove unnecessary content and better comply + * with WiredTiger coding standards. The original source code can be found at: + * FNV 1a 64 bit: http://www.isthe.com/chongo/src/fnv/hash_64a.c + */ + +/* + * 64 bit FNV-1 non-zero initial basis + * + * The FNV-1 initial basis is the FNV-0 hash of the following 32 octets: + * + * chongo <Landon Curt Noll> /\../\ + * + * NOTE: The \'s above are not back-slashing escape characters. + * They are literal ASCII backslash 0x5c characters. + * + * NOTE: The FNV-1a initial basis is the same value as FNV-1 by definition. + */ +#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL) + +/* + * fnv_64a_buf -- + * Perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer + * + * input: + * buf - start of buffer to hash + * len - length of buffer in octets + * hval - previous hash value or 0 if first call + * + * returns: + * 64 bit hash as a static hash type + * + * NOTE: To use the recommended 64 bit FNV-1a hash, use FNV1A_64_INIT as the + * hval arg on the first call to either fnv_64a_buf() or fnv_64a_str(). 
+ */ +static inline uint64_t +fnv_64a_buf(const void *buf, size_t len, uint64_t hval) +{ + const unsigned char *bp = buf; /* start of buffer */ + const unsigned char *be = bp + len; /* beyond end of buffer */ + + /* + * FNV-1a hash each octet of the buffer + */ + while (bp < be) { + + /* xor the bottom with the current octet */ + hval ^= (uint64_t)*bp++; + + /* + * Multiply by the 64 bit FNV magic prime mod 2^64. The + * following shift operation is generally faster than + * a multiply operation. + */ + hval += (hval << 1) + (hval << 4) + (hval << 5) + + (hval << 7) + (hval << 8) + (hval << 40); + } + + /* return our new hash value */ + return (hval); +} + +/* + * __wt_hash_fnv64 -- + * WiredTiger wrapper around third party hash implementation. + */ +uint64_t +__wt_hash_fnv64(const void *string, size_t len) +{ + return (fnv_64a_buf(string, len, FNV1A_64_INIT)); +} diff --git a/src/third_party/wiredtiger/src/support/hazard.c b/src/third_party/wiredtiger/src/support/hazard.c new file mode 100644 index 00000000000..12350ab52f4 --- /dev/null +++ b/src/third_party/wiredtiger/src/support/hazard.c @@ -0,0 +1,244 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +#ifdef HAVE_DIAGNOSTIC +static void __hazard_dump(WT_SESSION_IMPL *); +#endif + +/* + * __wt_hazard_set -- + * Set a hazard pointer. + */ +int +__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ) +{ + WT_BTREE *btree; + WT_HAZARD *hp; + int restarts = 0; + + btree = S2BT(session); + *busyp = 0; + + /* If a file can never be evicted, hazard pointers aren't required. 
*/ + if (F_ISSET(btree, WT_BTREE_NO_HAZARD)) + return (0); + + /* + * Do the dance: + * + * The memory location which makes a page "real" is the WT_REF's state + * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the + * page eviction server. + * + * Add the WT_REF reference to the session's hazard list and flush the + * write, then see if the page's state is still valid. If so, we can + * use the page because the page eviction server will see our hazard + * pointer before it discards the page (the eviction server sets the + * state to WT_REF_LOCKED, then flushes memory and checks the hazard + * pointers). + * + * For sessions with many active hazard pointers, skip most of the + * active slots: there may be a free slot in there, but checking is + * expensive. Most hazard pointers are released quickly: optimize + * for that case. + */ + for (hp = session->hazard + session->nhazard;; ++hp) { + /* Expand the number of hazard pointers if available.*/ + if (hp >= session->hazard + session->hazard_size) { + if (session->hazard_size >= S2C(session)->hazard_max) + break; + /* Restart the search. */ + if (session->nhazard < session->hazard_size && + restarts++ == 0) { + hp = session->hazard; + continue; + } + WT_PUBLISH(session->hazard_size, + WT_MIN(session->hazard_size + WT_HAZARD_INCR, + S2C(session)->hazard_max)); + } + + if (hp->page != NULL) + continue; + + hp->page = ref->page; +#ifdef HAVE_DIAGNOSTIC + hp->file = file; + hp->line = line; +#endif + /* Publish the hazard pointer before reading page's state. */ + WT_FULL_BARRIER(); + + /* + * Check if the page state is still valid, where valid means a + * state of WT_REF_MEM and the pointer is unchanged. (The + * pointer can change, it means the page was evicted between + * the time we set our hazard pointer and the publication. It + * would theoretically be possible for the page to be evicted + * and a different page read into the same memory, so the + * pointer hasn't changed but the contents have. 
That's OK, we + * found this page using the tree's key space, whatever page we + * find here is the page for us to use.) + */ + if (ref->page == hp->page && ref->state == WT_REF_MEM) { + ++session->nhazard; + return (0); + } + + /* + * The page isn't available, it's being considered for eviction + * (or being evicted, for all we know). If the eviction server + * sees our hazard pointer before evicting the page, it will + * return the page to use, no harm done, if it doesn't, it will + * go ahead and complete the eviction. + * + * We don't bother publishing this update: the worst case is we + * prevent some random page from being evicted. + */ + hp->page = NULL; + *busyp = 1; + return (0); + } + + __wt_errx(session, "session %p: hazard pointer table full", session); +#ifdef HAVE_DIAGNOSTIC + __hazard_dump(session); +#endif + + return (ENOMEM); +} + +/* + * __wt_hazard_clear -- + * Clear a hazard pointer. + */ +int +__wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_HAZARD *hp; + + btree = S2BT(session); + + /* If a file can never be evicted, hazard pointers aren't required. */ + if (F_ISSET(btree, WT_BTREE_NO_HAZARD)) + return (0); + + /* + * Clear the caller's hazard pointer. + * The common pattern is LIFO, so do a reverse search. + */ + for (hp = session->hazard + session->hazard_size - 1; + hp >= session->hazard; + --hp) + if (hp->page == page) { + /* + * We don't publish the hazard pointer clear in the + * general case. It's not required for correctness; + * it gives an eviction thread faster access to the + * page were the page selected for eviction, but the + * generation number was just set, it's unlikely the + * page will be selected for eviction. + */ + hp->page = NULL; + + /* + * If this was the last hazard pointer in the session, + * we may need to update our transactional context. + */ + --session->nhazard; + return (0); + } + + /* + * A serious error, we should always find the hazard pointer. 
Panic, + * because using a page we didn't have pinned down implies corruption. + */ + WT_PANIC_RET(session, EINVAL, + "session %p: clear hazard pointer: %p: not found", session, page); +} + +/* + * __wt_hazard_close -- + * Verify that no hazard pointers are set. + */ +void +__wt_hazard_close(WT_SESSION_IMPL *session) +{ + WT_HAZARD *hp; + int found; + + /* + * Check for a set hazard pointer and complain if we find one. We could + * just check the session's hazard pointer count, but this is a useful + * diagnostic. + */ + for (found = 0, hp = session->hazard; + hp < session->hazard + session->hazard_size; ++hp) + if (hp->page != NULL) { + found = 1; + break; + } + if (session->nhazard == 0 && !found) + return; + + __wt_errx(session, + "session %p: close hazard pointer table: table not empty", session); + +#ifdef HAVE_DIAGNOSTIC + __hazard_dump(session); +#endif + + /* + * Clear any hazard pointers because it's not a correctness problem + * (any hazard pointer we find can't be real because the session is + * being closed when we're called). We do this work because session + * close isn't that common that it's an expensive check, and we don't + * want to let a hazard pointer lie around, keeping a page from being + * evicted. + * + * We don't panic: this shouldn't be a correctness issue (at least, I + * can't think of a reason it would be). + */ + for (hp = session->hazard; + hp < session->hazard + session->hazard_size; ++hp) + if (hp->page != NULL) { + hp->page = NULL; + --session->nhazard; + } + + if (session->nhazard != 0) + __wt_errx(session, + "session %p: close hazard pointer table: count didn't " + "match entries", + session); +} + +#ifdef HAVE_DIAGNOSTIC +/* + * __hazard_dump -- + * Display the list of hazard pointers. 
+ */ +static void +__hazard_dump(WT_SESSION_IMPL *session) +{ + WT_HAZARD *hp; + + for (hp = session->hazard; + hp < session->hazard + session->hazard_size; ++hp) + if (hp->page != NULL) + __wt_errx(session, + "session %p: hazard pointer %p: %s, line %d", + session, hp->page, hp->file, hp->line); +} +#endif diff --git a/src/third_party/wiredtiger/src/support/hex.c b/src/third_party/wiredtiger/src/support/hex.c new file mode 100644 index 00000000000..9ee3e723fa2 --- /dev/null +++ b/src/third_party/wiredtiger/src/support/hex.c @@ -0,0 +1,215 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +static const u_char hex[] = "0123456789abcdef"; + +/* + * __fill_hex -- + * In-memory conversion of raw bytes to a hexadecimal representation. + */ +static inline void +__fill_hex(const uint8_t *src, size_t src_max, + uint8_t *dest, size_t dest_max, size_t *lenp) +{ + uint8_t *dest_orig; + + dest_orig = dest; + if (dest_max > 0) /* save a byte for nul-termination */ + --dest_max; + for (; src_max > 0 && dest_max > 1; + src_max -= 1, dest_max -= 2, ++src) { + *dest++ = hex[(*src & 0xf0) >> 4]; + *dest++ = hex[*src & 0x0f]; + } + *dest++ = '\0'; + if (lenp != NULL) + *lenp = WT_PTRDIFF(dest, dest_orig); +} + +/* + * __wt_raw_to_hex -- + * Convert a chunk of data to a nul-terminated printable hex string. + */ +int +__wt_raw_to_hex( + WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to) +{ + size_t len; + + /* + * Every byte takes up 2 spaces, plus a trailing nul byte. + */ + len = size * 2 + 1; + WT_RET(__wt_buf_init(session, to, len)); + + __fill_hex(from, size, to->mem, len, &to->size); + return (0); +} + +/* + * __wt_raw_to_esc_hex -- + * Convert a chunk of data to a nul-terminated printable string using + * escaped hex, as necessary. 
+ */ +int +__wt_raw_to_esc_hex( + WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to) +{ + size_t i; + const uint8_t *p; + u_char *t; + + /* + * In the worst case, every character takes up 3 spaces, plus a + * trailing nul byte. + */ + WT_RET(__wt_buf_init(session, to, size * 3 + 1)); + + /* + * In the worst case, every character takes up 3 spaces, plus a + * trailing nul byte. + */ + for (p = from, t = to->mem, i = size; i > 0; --i, ++p) + if (isprint((int)*p)) { + if (*p == '\\') + *t++ = '\\'; + *t++ = *p; + } else { + *t++ = '\\'; + *t++ = hex[(*p & 0xf0) >> 4]; + *t++ = hex[*p & 0x0f]; + } + *t++ = '\0'; + to->size = WT_PTRDIFF(t, to->mem); + return (0); +} + +/* + * __wt_hex2byte -- + * Convert a pair of hex characters into a byte. + */ +int +__wt_hex2byte(const u_char *from, u_char *to) +{ + uint8_t byte; + + switch (from[0]) { + case '0': byte = 0; break; + case '1': byte = 1 << 4; break; + case '2': byte = 2 << 4; break; + case '3': byte = 3 << 4; break; + case '4': byte = 4 << 4; break; + case '5': byte = 5 << 4; break; + case '6': byte = 6 << 4; break; + case '7': byte = 7 << 4; break; + case '8': byte = 8 << 4; break; + case '9': byte = 9 << 4; break; + case 'a': byte = 10 << 4; break; + case 'b': byte = 11 << 4; break; + case 'c': byte = 12 << 4; break; + case 'd': byte = 13 << 4; break; + case 'e': byte = 14 << 4; break; + case 'f': byte = 15 << 4; break; + default: + return (1); + } + + switch (from[1]) { + case '0': break; + case '1': byte |= 1; break; + case '2': byte |= 2; break; + case '3': byte |= 3; break; + case '4': byte |= 4; break; + case '5': byte |= 5; break; + case '6': byte |= 6; break; + case '7': byte |= 7; break; + case '8': byte |= 8; break; + case '9': byte |= 9; break; + case 'a': byte |= 10; break; + case 'b': byte |= 11; break; + case 'c': byte |= 12; break; + case 'd': byte |= 13; break; + case 'e': byte |= 14; break; + case 'f': byte |= 15; break; + default: + return (1); + } + *to = byte; + return (0); +} 
+ +/* + * __hex_fmterr -- + * Hex format error message. + */ +static int +__hex_fmterr(WT_SESSION_IMPL *session) +{ + WT_RET_MSG(session, EINVAL, "Invalid format in hexadecimal string"); +} + +/* + * __wt_hex_to_raw -- + * Convert a nul-terminated printable hex string to a chunk of data. + */ +int +__wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to) +{ + return (__wt_nhex_to_raw(session, from, strlen(from), to)); +} + +/* + * __wt_nhex_to_raw -- + * Convert a printable hex string to a chunk of data. + */ +int +__wt_nhex_to_raw( + WT_SESSION_IMPL *session, const char *from, size_t size, WT_ITEM *to) +{ + const u_char *p; + u_char *t; + + if (size % 2 != 0) + return (__hex_fmterr(session)); + + WT_RET(__wt_buf_init(session, to, size / 2)); + + for (p = (u_char *)from, t = to->mem; size > 0; p += 2, size -= 2, ++t) + if (__wt_hex2byte(p, t)) + return (__hex_fmterr(session)); + + to->size = WT_PTRDIFF(t, to->mem); + return (0); +} + +/* + * __wt_esc_hex_to_raw -- + * Convert a printable string, encoded in escaped hex, to a chunk of data. + */ +int +__wt_esc_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to) +{ + const u_char *p; + u_char *t; + + WT_RET(__wt_buf_init(session, to, strlen(from))); + + for (p = (u_char *)from, t = to->mem; *p != '\0'; ++p, ++t) { + if ((*t = *p) != '\\') + continue; + ++p; + if (p[0] != '\\') { + if (p[0] == '\0' || p[1] == '\0' || __wt_hex2byte(p, t)) + return (__hex_fmterr(session)); + ++p; + } + } + to->size = WT_PTRDIFF(t, to->mem); + return (0); +} diff --git a/src/third_party/wiredtiger/src/support/huffman.c b/src/third_party/wiredtiger/src/support/huffman.c new file mode 100644 index 00000000000..5a06b72d33e --- /dev/null +++ b/src/third_party/wiredtiger/src/support/huffman.c @@ -0,0 +1,899 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "wt_internal.h" + +#define __HUFFMAN_DETAIL 0 /* Set to 1 for debugging output. */ + +/* Length of header in compressed message, in bits. */ +#define WT_HUFFMAN_HEADER 3 + +/* + * Maximum allowed length of Huffman code words, which otherwise can range up + * to (#symbols - 1) bits long. Lower value to use less memory for tables, + * higher value for better compression. Max value = 16 (or 32-7=25 or 64-7=57 + * if adjust data types). FYI, JPEG uses 16. A side effect of limiting max + * code length is that the worst case compression (a message of the least + * frequent symbols) is shorter. + */ +#define MAX_CODE_LENGTH 16 + +typedef struct __wt_freqtree_node { + /* + * Data structure representing a node of the huffman tree. It holds a + * 64-bit weight and pointers to the left and right child nodes. The + * node either has two child nodes or none. + */ + uint8_t symbol; /* only used in leaf nodes */ + uint64_t weight; + struct __wt_freqtree_node *left; /* bit 0 */ + struct __wt_freqtree_node *right; /* bit 1 */ +} WT_FREQTREE_NODE; + +typedef struct __wt_huffman_code { + uint16_t pattern; /* requirement: length of field's type + * in bits >= MAX_CODE_LENGTH. + */ + uint8_t length; +} WT_HUFFMAN_CODE; + +typedef struct __wt_huffman_obj { + /* + * Data structure here defines specific instance of the encoder/decoder. + */ + u_int numSymbols; /* Symbols: UINT16_MAX or UINT8_MAX */ + + uint16_t max_depth, min_depth; /* Tree max/min depths */ + + /* + * use: codes[symbol] = struct with pattern and length. + * Used in encoding and decoding. + * memory: codes[0-to-(number of symbols - 1)] + */ + WT_HUFFMAN_CODE *codes; + + /* + * use: code2symbol[Huffman_code] = symbol. + * Used in decoding. + * memory: code2symbol[1 << max_code_length] + */ + uint8_t *code2symbol; +} WT_HUFFMAN_OBJ; + +/* + * Queue element data structure. + * + * Consists of a pointer to a huffman tree node, and a pointer to the next + * element in the queue. 
+ */ +typedef struct node_queue_elem { + WT_FREQTREE_NODE *node; + struct node_queue_elem *next; +} NODE_QUEUE_ELEM; + +/* + * Queue of huffman tree nodes. + * + * Contains a pointer to the beginning and the end of the queue, which is + * implemented as a linked list. + */ +typedef struct node_queue { + NODE_QUEUE_ELEM *first; + NODE_QUEUE_ELEM *last; +} NODE_QUEUE; + +/* + * Internal data structure used to preserve the symbol when rearranging the + * frequency array. + */ +typedef struct __indexed_byte { + uint32_t symbol; /* not uint8_t: match external data structure */ + uint32_t frequency; +} INDEXED_SYMBOL; + +static int indexed_freq_compare(const void *, const void *); +static int indexed_symbol_compare(const void *, const void *); +static void make_table( + WT_SESSION_IMPL *, uint8_t *, uint16_t, WT_HUFFMAN_CODE *, u_int); +static void node_queue_close(WT_SESSION_IMPL *, NODE_QUEUE *); +static void node_queue_dequeue( + WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE **); +static int node_queue_enqueue( + WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE *); +static uint32_t profile_tree( + WT_FREQTREE_NODE *, uint16_t, uint16_t *, uint16_t *); +static void recursive_free_node(WT_SESSION_IMPL *, WT_FREQTREE_NODE *); +static void set_codes(WT_FREQTREE_NODE *, WT_HUFFMAN_CODE *, uint16_t, uint8_t); + +#define node_queue_is_empty(queue) \ + ((queue) == NULL || (queue)->first == NULL) + +/* + * indexed_symbol_compare -- + * Qsort comparator to order the table by symbol, lowest to highest. + */ +static int +indexed_symbol_compare(const void *a, const void *b) +{ + return (((INDEXED_SYMBOL *)a)->symbol > + ((INDEXED_SYMBOL *)b)->symbol ? 1 : + (((INDEXED_SYMBOL *)a)->symbol < + ((INDEXED_SYMBOL *)b)->symbol ? -1 : 0)); +} + +/* + * indexed_freq_compare -- + * Qsort comparator to order the table by frequency (the most frequent + * symbols will be at the end of the array). 
+ */ +static int +indexed_freq_compare(const void *a, const void *b) +{ + return (((INDEXED_SYMBOL *)a)->frequency > + ((INDEXED_SYMBOL *)b)->frequency ? 1 : + (((INDEXED_SYMBOL *)a)->frequency < + ((INDEXED_SYMBOL *)b)->frequency ? -1 : 0)); +} + +/* + * profile_tree -- + * Traverses tree to determine #leaves under each node, max depth, min + * depth of leaf. + */ +static uint32_t +profile_tree(WT_FREQTREE_NODE *node, + uint16_t len, uint16_t *max_depth, uint16_t *min_depth) +{ + uint32_t leaf_cnt; + + if (node->left == NULL && node->right == NULL) { /* leaf */ + leaf_cnt = 1; + if (*max_depth < len) + *max_depth = len; + if (*min_depth > len) + *min_depth = len; + } else { + /* + * internal node -- way tree constructed internal always has + * left and right children + */ + leaf_cnt = + profile_tree(node->left, len + 1, max_depth, min_depth) + + profile_tree(node->right, len + 1, max_depth, min_depth); + } + node->weight = leaf_cnt; /* abuse weight field */ + return (leaf_cnt); +} + +/* + * set_codes -- + * Computes Huffman code for each symbol in tree. + * + * Method is standard way in the literature, except that limits maximum code + * length. A known max code length is important for limiting memory use by + * the tables and for knowing how large data types need to be such as the field + * that holds the code pattern. 
+ */ +static void +set_codes(WT_FREQTREE_NODE *node, + WT_HUFFMAN_CODE *codes, uint16_t pattern, uint8_t len) +{ + WT_HUFFMAN_CODE *code; + uint16_t patternleft, patternright, half; + uint8_t remaining; + + if (node->left == NULL && node->right == NULL) { + code = &codes[node->symbol]; + code->pattern = pattern; + code->length = len; +#if __HUFFMAN_DETAIL + printf("%" PRIx16 ": code %" PRIx16 ", len %" PRIu8 "\n", + node->symbol, pattern, len); +#endif + } else { + /* + * Check each subtree individually to see if can afford to split + * up bits into possibly shorter codes, or if need to employ all + * remaining bits up to MAX_CODE_LENGTH to consecutively number + * leaves. + */ + remaining = MAX_CODE_LENGTH - len; + /* + * If not already in "low-bit mode", but need to be, open up + * lower-order bits for consecutive numbering. + */ + if (len < MAX_CODE_LENGTH && + ((half = 1 << (remaining - 1)) < node->left->weight || + half < node->right->weight)) { + pattern = pattern << remaining; + len = MAX_CODE_LENGTH; + } + + if (len < MAX_CODE_LENGTH) { + patternleft = (pattern << 1) | 0; + patternright = (pattern << 1) | 1; + len++; + } else { /* "low bit mode" */ + patternleft = pattern; + patternright = pattern + node->left->weight; + /* len unchanged */ + } + + set_codes(node->left, codes, patternleft, len); + set_codes(node->right, codes, patternright, len); + } +} + +/* + * make_table -- + * Computes Huffman table used for subsequent lookups in encoding and + * decoding. With the table, encoding from a symbol to Huffman code and + * decoding from a code to a symbol are simple array lookups. + */ +static void +make_table(WT_SESSION_IMPL *session, uint8_t *code2symbol, + uint16_t max_depth, WT_HUFFMAN_CODE *codes, u_int symcnt) +{ + uint32_t j, c1, c2; /* Exceeds uint16_t bounds at loop boundary. */ + uint16_t c, i; + uint8_t len, shift; + + /* Zero out, for assertion below. 
+ */
+	for (j = 0, c2 = (1U << max_depth); j < c2; j++)
+		code2symbol[j] = 0;
+
+	/*
+	 * Here's the magic: flood all bit patterns for lower-order bits to
+	 * point to same symbol.
+	 *
+	 * NOTE(review): the assert below cannot distinguish "still unset"
+	 * from symbol index 0, so an overlap with symbol 0's range would go
+	 * undetected.
+	 */
+	for (i = 0; i < symcnt; i++) {
+		if ((len = codes[i].length) == 0)
+			continue;
+
+		/*
+		 * The size of the array index should be enough to hold largest
+		 * index into symbol table. Pre-existing symbols were packed
+		 * 0-255, so 8 bits is enough. Don't want to make it larger
+		 * than necessary, we allocate (2 ^ max-code-length) of them.
+		 */
+		c = codes[i].pattern;
+		shift = max_depth - len;
+		c1 = (uint32_t)c << shift;
+		c2 = (uint32_t)(c + 1) << shift;
+		for (j = c1; j < c2; j++) {
+			WT_ASSERT(session, code2symbol[j] == 0);
+			/* Narrowing cast is safe: i < symcnt <= 256. */
+			code2symbol[j] = i;
+		}
+	}
+}
+
+/*
+ * recursive_free_node --
+ *	Recursively free the huffman frequency tree's nodes.
+ */
+static void
+recursive_free_node(WT_SESSION_IMPL *session, WT_FREQTREE_NODE *node)
+{
+	if (node != NULL) {
+		recursive_free_node(session, node->left);
+		recursive_free_node(session, node->right);
+		__wt_free(session, node);
+	}
+}
+
+/*
+ * __wt_huffman_open --
+ *	Take a frequency table and return a pointer to a descriptor object.
+ */
+int
+__wt_huffman_open(WT_SESSION_IMPL *session,
+    void *symbol_frequency_array, u_int symcnt, u_int numbytes, void *retp)
+{
+	INDEXED_SYMBOL *indexed_freqs, *sym;
+	NODE_QUEUE *combined_nodes, *leaves;
+	WT_DECL_RET;
+	WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode;
+	WT_HUFFMAN_OBJ *huffman;
+	uint64_t w1, w2;
+	uint16_t i;
+
+	indexed_freqs = symbol_frequency_array;
+
+	combined_nodes = leaves = NULL;
+	node = node2 = tempnode = NULL;
+
+	WT_RET(__wt_calloc_def(session, 1, &huffman));
+
+	/*
+	 * The frequency table is 4B pairs of symbol and frequency. The symbol
+	 * is either 1 or 2 bytes and the frequency ranges from 1 to UINT32_MAX
+	 * (a frequency of 0 means the value is never expected to appear in the
+	 * input). Validate the symbols are within range. 
+ */
+	if (numbytes != 1 && numbytes != 2)
+		WT_ERR_MSG(session, EINVAL,
+		    "illegal number of symbol bytes specified for a huffman "
+		    "table");
+
+	if (symcnt == 0)
+		WT_ERR_MSG(session, EINVAL,
+		    "illegal number of symbols specified for a huffman table");
+
+	huffman->numSymbols = numbytes == 2 ? UINT16_MAX : UINT8_MAX;
+
+	/*
+	 * Order the array by symbol and check for invalid symbols and
+	 * duplicates.
+	 */
+	qsort((void *)indexed_freqs,
+	    symcnt, sizeof(INDEXED_SYMBOL), indexed_symbol_compare);
+	for (i = 0; i < symcnt; ++i) {
+		if (i > 0 &&
+		    indexed_freqs[i].symbol == indexed_freqs[i - 1].symbol)
+			WT_ERR_MSG(session, EINVAL,
+			    "duplicate symbol %" PRIx32
+			    " specified in a huffman table",
+			    indexed_freqs[i].symbol);
+		if (indexed_freqs[i].symbol > huffman->numSymbols)
+			WT_ERR_MSG(session, EINVAL,
+			    "illegal symbol %" PRIx32
+			    " specified in a huffman table",
+			    indexed_freqs[i].symbol);
+	}
+
+	/*
+	 * Massage frequencies.
+	 *
+	 * NOTE(review): from here on indexed_freqs aliases a private
+	 * 256-entry table; the caller's symbol_frequency_array is only read
+	 * directly in the folding loop below and is never modified.
+	 */
+	indexed_freqs = NULL;
+	WT_ERR(__wt_calloc_def(session, 256, &indexed_freqs));
+
+	/*
+	 * Minimum of frequency==1 so everybody gets a Huffman code, in case
+	 * data evolves and we need to represent this value.
+	 */
+	for (i = 0; i < 256; i++) {
+		sym = &indexed_freqs[i];
+		sym->symbol = i;
+		sym->frequency = 1;
+	}
+	/*
+	 * Avoid large tables by splitting UTF-16 frequencies into high byte
+	 * and low byte.
+	 */
+	for (i = 0; i < symcnt; i++) {
+		sym = &((INDEXED_SYMBOL *)symbol_frequency_array)[i];
+		indexed_freqs[sym->symbol & 0xff].frequency += sym->frequency;
+		if (numbytes == 2)
+			indexed_freqs[(sym->symbol >> 8) & 0xff].frequency +=
+			    sym->frequency;
+	}
+	huffman->numSymbols = symcnt = 256;
+
+	/*
+	 * The array must be sorted by frequency to be able to use a linear time
+	 * construction algorithm.
+	 */
+	qsort((void *)indexed_freqs,
+	    symcnt, sizeof(INDEXED_SYMBOL), indexed_freq_compare);
+
+	/* We need two node queues to build the tree. 
+ */
+	WT_ERR(__wt_calloc_def(session, 1, &leaves));
+	WT_ERR(__wt_calloc_def(session, 1, &combined_nodes));
+
+	/*
+	 * Adding the leaves to the queue.
+	 *
+	 * Discard symbols with a frequency of 0; this assumes these symbols
+	 * never occur in the source stream, and the purpose is to reduce the
+	 * huffman tree's size.
+	 */
+	for (i = 0; i < symcnt; ++i)
+		if (indexed_freqs[i].frequency > 0) {
+			WT_ERR(__wt_calloc_def(session, 1, &tempnode));
+			/* Cast is safe: symbols were folded to 0..255 above. */
+			tempnode->symbol = (uint8_t)indexed_freqs[i].symbol;
+			tempnode->weight = indexed_freqs[i].frequency;
+			WT_ERR(node_queue_enqueue(session, leaves, tempnode));
+			tempnode = NULL;
+		}
+
+	while (!node_queue_is_empty(leaves) ||
+	    !node_queue_is_empty(combined_nodes)) {
+		/*
+		 * We have to get the node with the smaller weight, examining
+		 * both queues' first element. We are collecting pairs of these
+		 * items, by alternating between node and node2:
+		 */
+		refnode = !node ? &node : &node2;
+
+		/*
+		 * To decide which queue must be used, we get the weights of
+		 * the first items from both:
+		 */
+		w1 = node_queue_is_empty(leaves) ?
+		    UINT64_MAX : leaves->first->node->weight;
+		w2 = node_queue_is_empty(combined_nodes) ?
+		    UINT64_MAX : combined_nodes->first->node->weight;
+
+		/*
+		 * Based on the two weights we finally can dequeue the smaller
+		 * element and place it to the alternating target node pointer:
+		 */
+		if (w1 < w2)
+			node_queue_dequeue(session, leaves, refnode);
+		else
+			node_queue_dequeue(session, combined_nodes, refnode);
+
+		/*
+		 * In every second run, we have both node and node2 initialized.
+		 */
+		if (node != NULL && node2 != NULL) {
+			WT_ERR(__wt_calloc_def(session, 1, &tempnode));
+
+			/* The new weight is the sum of the two weights. 
+ */
+			tempnode->weight = node->weight + node2->weight;
+			tempnode->left = node;
+			tempnode->right = node2;
+
+			/* Enqueue it to the combined nodes queue */
+			WT_ERR(node_queue_enqueue(
+			    session, combined_nodes, tempnode));
+			tempnode = NULL;
+
+			/* Reset the state pointers */
+			node = node2 = NULL;
+		}
+	}
+
+	/*
+	 * The remaining node is in the node variable, this is the root of the
+	 * tree. Calculate how many bytes it takes to hold numSymbols bytes
+	 * bits.
+	 */
+	huffman->max_depth = 0;
+	huffman->min_depth = MAX_CODE_LENGTH;
+	(void)profile_tree(node, 0, &huffman->max_depth, &huffman->min_depth);
+	/*
+	 * NOTE(review): set_codes caps code lengths at MAX_CODE_LENGTH, so
+	 * this clamp only keeps the table allocation below consistent with
+	 * what set_codes will actually emit.
+	 */
+	if (huffman->max_depth > MAX_CODE_LENGTH)
+		huffman->max_depth = MAX_CODE_LENGTH;
+
+	WT_ERR(__wt_calloc_def(session, huffman->numSymbols, &huffman->codes));
+	set_codes(node, huffman->codes, 0, 0);
+
+	WT_ERR(__wt_calloc_def(
+	    session, 1U << huffman->max_depth, &huffman->code2symbol));
+	make_table(session, huffman->code2symbol,
+	    huffman->max_depth, huffman->codes, huffman->numSymbols);
+
+#if __HUFFMAN_DETAIL
+	{
+	uint8_t symbol;
+	uint32_t weighted_length;
+
+	printf("leaf depth %" PRIu16 "..%" PRIu16 ", memory use: "
+	    "codes %u# * %uB + code2symbol %u# * %uB\n",
+	    huffman->min_depth, huffman->max_depth,
+	    huffman->numSymbols, (u_int)sizeof(WT_HUFFMAN_CODE),
+	    1U << huffman->max_depth, (u_int)sizeof(uint16_t));
+
+	/*
+	 * measure quality of computed Huffman codes, for different max bit
+	 * lengths (say, 16 vs 24 vs 32)
+	 */
+	weighted_length = 0;
+	for (i = 0; i < symcnt; i++) {
+		symbol = indexed_freqs[i].symbol;
+		weighted_length +=
+		    indexed_freqs[i].frequency * huffman->codes[symbol].length;
+		printf(
+		    "\t%" PRIu16 "->%" PRIu16 ". 
%" PRIu32 " * %" PRIu8 "\n",
+		    i, symbol,
+		    indexed_freqs[i].frequency, huffman->codes[symbol].length);
+	}
+	printf("weighted length of all codes (the smaller the better): "
+	    "%" PRIu32 "\n", weighted_length);
+	}
+#endif
+
+	*(void **)retp = huffman;
+
+	if (0) {
+		/*
+		 * Error path: WT_ERR jumps here with ret set; the ret == 0
+		 * guard is defensive in case of a direct goto without a code.
+		 */
+err:		if (ret == 0)
+			ret = WT_ERROR;
+	}
+	__wt_free(session, indexed_freqs);
+	if (leaves != NULL)
+		node_queue_close(session, leaves);
+	if (combined_nodes != NULL)
+		node_queue_close(session, combined_nodes);
+	if (node != NULL)
+		recursive_free_node(session, node);
+	if (node2 != NULL)
+		recursive_free_node(session, node2);
+	__wt_free(session, tempnode);
+	if (ret != 0)
+		__wt_huffman_close(session, huffman);
+	return (ret);
+}
+
+/*
+ * __wt_huffman_close --
+ *	Discard a Huffman descriptor object.
+ */
+void
+__wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg)
+{
+	WT_HUFFMAN_OBJ *huffman;
+
+	huffman = huffman_arg;
+
+	__wt_free(session, huffman->code2symbol);
+	__wt_free(session, huffman->codes);
+	__wt_free(session, huffman);
+}
+
+#if __HUFFMAN_DETAIL
+/*
+ * __wt_print_huffman_code --
+ *	Prints a symbol's Huffman code.
+ */
+int
+__wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
+{
+	WT_HUFFMAN_CODE code;
+	WT_HUFFMAN_OBJ *huffman;
+
+	huffman = huffman_arg;
+
+	if (symbol >= huffman->numSymbols)
+		printf("symbol %" PRIu16 " out of range\n", symbol);
+	else {
+		code = huffman->codes[symbol];
+		if (code.length == 0)
+			printf(
+			    "symbol %" PRIu16 " not defined -- 0 frequency\n",
+			    symbol);
+		else
+			/* should print code as binary */
+			printf(
+			    "%" PRIu16 " -> code pattern "
+			    "%" PRIx16 ", length %" PRIu8 "\n",
+			    symbol, code.pattern, code.length);
+	}
+
+	return (0);
+}
+#endif
+
+/*
+ * __wt_huffman_encode --
+ *	Take a byte string, encode it into the target.
+ *
+ * Translation from symbol to Huffman code is a simple array lookup.
+ *
+ * WT_HUFFMAN_OBJ contains an array called 'codes' with one WT_HUFFMAN_CODE per
+ * symbol. 
Then, given a symbol:
+ *	pattern = codes[symbol].pattern;
+ *	length = codes[symbol].length;
+ *
+ * To encode byte-string, we iterate over the input symbols. For each symbol,
+ * look it up via table, shift bits onto a shift register (an int long enough
+ * to hold the longest code word + up to 7 bits remaining from the previous),
+ * then drain out full bytes. Finally, at the end flush remaining bits
+ * and write header bits.
+ */
+int
+__wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg,
+    const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf)
+{
+	WT_DECL_RET;
+	WT_HUFFMAN_CODE code;
+	WT_HUFFMAN_OBJ *huffman;
+	WT_ITEM *tmp;
+	size_t max_len, outlen, bytes;
+	uint64_t bitpos;
+	const uint8_t *from;
+	uint8_t len, *out, padding_info, symbol;
+
+	/*
+	 * Shift register to accumulate bits from input.
+	 * Should be >= (MAX_CODE_LENGTH + 7), but also efficient to shift bits
+	 * and preferably in a machine register.
+	 */
+	uint32_t bits;
+
+	/* Count of bits in shift register ('bits' above). */
+	uint8_t valid;
+
+	huffman = huffman_arg;
+	from = from_arg;
+	tmp = NULL;
+
+	/*
+	 * We don't want to find all of our callers and ensure they don't pass
+	 * 0-length byte strings, but there's no reason to do any work.
+	 */
+	if (from_len == 0) {
+		to_buf->size = 0;
+		return (0);
+	}
+
+	/*
+	 * Compute the largest compressed output size, which is if all symbols
+	 * are least frequent and so have largest Huffman codes, and compressed
+	 * output may be larger than the input size. This way we don't have to
+	 * worry about resizing the buffer during compression. Use the shared
+	 * system buffer while compressing, then allocate a new buffer of the
+	 * right size and copy the result into it. 
+ */
+	max_len = (WT_HUFFMAN_HEADER +
+	    from_len * huffman->max_depth + 7 /* round up to full byte */) / 8;
+	WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
+
+	/*
+	 * Leave the first 3 bits of the encoded value empty, it holds the
+	 * number of bits actually used in the last byte of the encoded value.
+	 */
+	bits = 0;
+	bitpos = WT_HUFFMAN_HEADER;
+	valid = WT_HUFFMAN_HEADER;
+	out = tmp->mem;
+	for (bytes = 0; bytes < from_len; bytes++) {
+		WT_ASSERT(session, WT_PTR_IN_RANGE(from, from_arg, from_len));
+
+		symbol = *from++;
+
+		/* Translate symbol into Huffman code and stuff into buffer. */
+		code = huffman->codes[symbol];
+		len = code.length;
+		bits = (bits << len) | code.pattern;
+		valid += len;
+		bitpos += len;
+		while (valid >= 8) {
+			WT_ASSERT(session,
+			    WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize));
+			*out++ = (uint8_t)(bits >> (valid - 8));
+			valid -= 8;
+		}
+	}
+	if (valid > 0) {		/* Flush shift register. */
+		WT_ASSERT(session,
+		    WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize));
+		*out = (uint8_t)(bits << (8 - valid));
+	}
+
+	/*
+	 * At this point, bitpos is the total number of used bits (including
+	 * the 3 bits at the beginning of the buffer, which we'll set now to
+	 * the number of bits used in the last byte). Note if the number of
+	 * bits used in the last byte is 8, we set the 3 bits to 0, in other
+	 * words, the first 3 bits of the encoded value are the number of bits
+	 * used in the last byte, unless they're 0, in which case there are 8
+	 * bits used in the last byte.
+	 *
+	 * (bitpos % 8) is at most 7, so it fits in the 3 header bits.
+	 */
+	padding_info = (bitpos % 8) << (8 - WT_HUFFMAN_HEADER);
+	((uint8_t *)tmp->mem)[0] |= padding_info;
+
+	/* Copy result of exact known size into caller's buffer. 
+ */
+	outlen = (uint32_t)((bitpos + 7) / 8);
+	WT_ERR(__wt_buf_initsize(session, to_buf, outlen));
+	memcpy(to_buf->mem, tmp->mem, outlen);
+
+#if __HUFFMAN_DETAIL
+	/*
+	 * NOTE(review): max_len and outlen are size_t but are printed with
+	 * PRIu32 -- %zu would be the correct specifier (debug-only code).
+	 */
+	printf("encode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n",
+	    max_len, outlen);
+#endif
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+
+}
+
+/*
+ * __wt_huffman_decode --
+ *	Take a byte string, decode it into the target.
+ *
+ * Translation from Huffman code to symbol is a simple array lookup.
+ *
+ * WT_HUFFMAN_OBJ contains an array called 'code2symbol' indexed by code word
+ * and whose value is the corresponding symbol.
+ * From the symbol, we index into the 'codes' array to get the code length.
+ *
+ * When decoding a message, we don't know where the boundaries are between
+ * codes. The trick is that we collect enough bits for the longest code word,
+ * and construct the table such that for codes with fewer bits we flood the
+ * table with all of the bit patterns in the lower order bits. This works
+ * because the Huffman code is a unique prefix, and by the flooding we are
+ * treating bits beyond the unique prefix as don't care bits.
+ *
+ * For example, we have table of length 2^max_code_length (1<<max_code_length).
+ * For a code of length, max_code_length, the position code2symbol[code] =
+ *	symbol.
+ * For a code word of (max_length - 1), we fill code2symbol[code << 1] = symbol,
+ * as well as code2symbol[(code << 1) | 1] = symbol.
+ * And so on, so in general we fill:
+ *	code2symbol[(code) << shift inclusive .. (code+1) << shift exclusive].
+ *
+ * To decode a message, we read in enough bits from input to fill the shift
+ * register with at least MAX_CODE_LENGTH bits.
+ * We look up in the table code2symbol to obtain the symbol.
+ * We look up the symbol in 'codes' to obtain the code length
+ * Finally, subtract off these bits from the shift register. 
+ */
+int
+__wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg,
+    const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf)
+{
+	WT_DECL_RET;
+	WT_ITEM *tmp;
+	WT_HUFFMAN_OBJ *huffman;
+	size_t from_bytes, len, max_len, outlen;
+	uint64_t from_len_bits;
+	uint32_t bits, mask, max;
+	uint16_t pattern;
+	const uint8_t *from;
+	uint8_t padding_info, symbol, *to, valid;
+
+	huffman = huffman_arg;
+	from = from_arg;
+	tmp = NULL;
+
+	/*
+	 * We don't want to find all of our callers and ensure they don't pass
+	 * 0-length byte strings, but there's no reason to do any work.
+	 */
+	if (from_len == 0) {
+		to_buf->size = 0;
+		return (0);
+	}
+
+	/*
+	 * The first 3 bits are the number of used bits in the last byte, unless
+	 * they're 0, in which case there are 8 bits used in the last byte.
+	 */
+	padding_info = (*from & 0xE0) >> (8 - WT_HUFFMAN_HEADER);
+	from_len_bits = from_len * 8;
+	if (padding_info != 0)
+		from_len_bits -= 8U - padding_info;
+
+	/* Number of bits that have codes. */
+	from_len_bits -= WT_HUFFMAN_HEADER;
+
+	/*
+	 * Compute largest uncompressed output size, which is if all symbols are
+	 * most frequent and so have smallest Huffman codes and therefore
+	 * largest expansion. Use the shared system buffer while uncompressing,
+	 * then allocate a new buffer of exactly the right size and copy the
+	 * result into it.
+	 */
+	max_len = (uint32_t)(from_len_bits / huffman->min_depth);
+	WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
+	to = tmp->mem;
+
+	/* The first byte of input is a special case because of header bits. */
+	bits = *from++;
+	valid = 8 - WT_HUFFMAN_HEADER;
+	from_bytes = from_len - 1;
+
+	max = huffman->max_depth;
+	mask = (1U << max) - 1;
+	for (outlen = 0; from_len_bits > 0; outlen++) {
+		/*
+		 * Refill the 32-bit shift register a byte at a time; valid
+		 * stays below max + 8, so the left-shift cannot overflow as
+		 * long as max <= 24 (MAX_CODE_LENGTH is 16 -- see set_codes).
+		 */
+		while (valid < max && from_bytes > 0) {
+			WT_ASSERT(session,
+			    WT_PTR_IN_RANGE(from, from_arg, from_len));
+			bits = (bits << 8) | *from++;
+			valid += 8;
+			from_bytes--;
+		}
+		pattern = valid >= max ?	
/* short patterns near end */
+		    (bits >> (valid - max)) : (bits << (max - valid));
+		symbol = huffman->code2symbol[pattern & mask];
+		len = huffman->codes[symbol].length;
+		valid -= len;
+		WT_ASSERT(session, from_len_bits >= len);
+		from_len_bits -= len;
+
+		WT_ASSERT(session,
+		    WT_PTR_IN_RANGE(to, tmp->mem, tmp->memsize));
+		*to++ = symbol;
+	}
+
+	/* Return the number of bytes used. */
+	WT_ERR(__wt_buf_initsize(session, to_buf, outlen));
+	memcpy(to_buf->mem, tmp->mem, outlen);
+
+#if __HUFFMAN_DETAIL
+	/*
+	 * NOTE(review): max_len and outlen are size_t but are printed with
+	 * PRIu32 -- %zu would be the correct specifier (debug-only code).
+	 */
+	printf("decode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n",
+	    max_len, outlen);
+#endif
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * node_queue_close --
+ *	Delete a queue from memory.
+ *
+ * It does not delete the pointed huffman tree nodes!
+ */
+static void
+node_queue_close(WT_SESSION_IMPL *session, NODE_QUEUE *queue)
+{
+	NODE_QUEUE_ELEM *elem, *next_elem;
+
+	/* Freeing each element of the queue's linked list. */
+	for (elem = queue->first; elem != NULL; elem = next_elem) {
+		next_elem = elem->next;
+		__wt_free(session, elem);
+	}
+
+	/* Freeing the queue record itself. */
+	__wt_free(session, queue);
+}
+
+/*
+ * node_queue_enqueue --
+ *	Push a tree node to the end of the queue.
+ */
+static int
+node_queue_enqueue(
+    WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE *node)
+{
+	NODE_QUEUE_ELEM *elem;
+
+	/* Allocating a new linked list element */
+	WT_RET(__wt_calloc_def(session, 1, &elem));
+
+	/* It holds the tree node, and has no next element yet */
+	elem->node = node;
+	elem->next = NULL;
+
+	/* If the queue is empty, the first element will be the new one. */
+	if (queue->first == NULL)
+		queue->first = elem;
+
+	/*
+	 * If the queue is not empty, the last element's next pointer must be
+	 * updated. 
+ */
+	if (queue->last != NULL)
+		queue->last->next = elem;
+
+	/* The last element is the new one */
+	queue->last = elem;
+
+	return (0);
+}
+
+/*
+ * node_queue_dequeue --
+ *	Removes a node from the beginning of the queue and copies the node's
+ *	pointer to the location referred by the retp parameter.
+ *
+ * NOTE(review): the queue must be non-empty; an empty queue dereferences a
+ * NULL first pointer. Callers in __wt_huffman_open check emptiness first.
+ */
+static void
+node_queue_dequeue(
+    WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE **retp)
+{
+	NODE_QUEUE_ELEM *first_elem;
+
+	/*
+	 * Getting the first element of the queue and updating it to point to
+	 * the next element as first.
+	 */
+	first_elem = queue->first;
+	*retp = first_elem->node;
+	queue->first = first_elem->next;
+
+	/*
+	 * If the last element was the dequeued element, we have to update it
+	 * to NULL.
+	 */
+	if (queue->last == first_elem)
+		queue->last = NULL;
+
+	/* Freeing the linked list element that has been dequeued */
+	__wt_free(session, first_elem);
+}
diff --git a/src/third_party/wiredtiger/src/support/mutex.c b/src/third_party/wiredtiger/src/support/mutex.c
new file mode 100644
index 00000000000..ffe52cf28fd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/mutex.c
@@ -0,0 +1,257 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+/*
+ * __wt_spin_lock_register_lock --
+ *	Add a lock to the connection's list.
+ */
+int
+__wt_spin_lock_register_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	WT_CONNECTION_IMPL *conn;
+	u_int i;
+
+	/*
+	 * There is a spinlock we initialize before we have a connection, the
+	 * global library lock. In that case, the session will be NULL and
+	 * we can't track the lock. 
+ */
+	if (session == NULL)
+		return (0);
+
+	conn = S2C(session);
+
+	/* Claim the first empty slot with a CAS; racing registrants retry. */
+	for (i = 0; i < WT_SPINLOCK_MAX; i++)
+		if (conn->spinlock_list[i] == NULL &&
+		    WT_ATOMIC_CAS(conn->spinlock_list[i], NULL, t))
+			return (0);
+
+	WT_RET_MSG(session, ENOMEM,
+	    "spinlock connection registry failed, increase the connection's "
+	    "spinlock list size");
+}
+
+/*
+ * __wt_spin_lock_unregister_lock --
+ *	Remove a lock from the connection's list.
+ */
+void
+__wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	WT_CONNECTION_IMPL *conn;
+	u_int i;
+
+	conn = S2C(session);
+
+	for (i = 0; i < WT_SPINLOCK_MAX; i++)
+		if (conn->spinlock_list[i] == t)
+			 conn->spinlock_list[i] = NULL;
+
+	/*
+	 * XXX
+	 * The statistics thread reads through this array, there's a possible
+	 * race: if that thread reads the pointer then goes to sleep, then we
+	 * free the spinlock, then the statistics thread wakes up, it can read
+	 * free'd memory.
+	 *
+	 * This is performance debugging code, so we're not fixing the race for
+	 * now, minimize the window.
+	 */
+	WT_FULL_BARRIER();
+}
+
+/*
+ * __spin_lock_next_id --
+ *	Return the next spinlock caller ID.
+ */
+static int
+__spin_lock_next_id(WT_SESSION_IMPL *session, int *idp)
+{
+	static int lock_id = 0, next_id = 0;
+	WT_DECL_RET;
+
+	/* If we've ever registered this location, we already have an ID. */
+	if (*idp != WT_SPINLOCK_REGISTER)
+		return (0);
+
+	/*
+	 * We can't use the global spinlock to lock the ID allocation (duh!),
+	 * use a CAS instruction to serialize access to a local variable.
+	 * This work only gets done once per library instantiation, there
+	 * isn't a performance concern.
+	 */
+	while (!WT_ATOMIC_CAS(lock_id, 0, 1))
+		__wt_yield();
+
+	/* Allocate a blocking ID for this location. 
+ */
+	/* Re-check under the CAS "lock": another thread may have won. */
+	if (*idp == WT_SPINLOCK_REGISTER) {
+		if (next_id < WT_SPINLOCK_MAX_LOCATION_ID)
+			*idp = next_id++;
+		else
+			WT_ERR_MSG(session, ENOMEM,
+			    "spinlock caller location registry failed, "
+			    "increase the connection's blocking matrix size");
+	}
+
+err:	WT_PUBLISH(lock_id, 0);
+	return (ret);
+}
+
+/*
+ * __wt_spin_lock_register_caller --
+ *	Register a spin-lock caller's location information in the blocking
+ * matrix.
+ */
+int
+__wt_spin_lock_register_caller(WT_SESSION_IMPL *session,
+    const char *name, const char *file, int line, int *idp)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_CONNECTION_STATS_SPINLOCK *p;
+
+	conn = S2C(session);
+
+	/*
+	 * The caller's location ID is a static offset into a per-connection
+	 * structure, and that has problems: first, if there are multiple
+	 * connections, we'll need to hold some kind of lock to avoid racing
+	 * when setting that value, and second, if/when there are multiple
+	 * connections and/or a single connection is closed and re-opened, the
+	 * variable may be initialized and underlying connection information
+	 * may not.
+	 *
+	 * First, allocate a location ID if needed.
+	 */
+	WT_RET(__spin_lock_next_id(session, idp));
+
+	/*
+	 * Add the caller's information to the blocking matrix. We could race
+	 * here (if two threads of control register the same lock at the same
+	 * time), but we don't care as both threads are setting the identical
+	 * information.
+	 */
+	p = &conn->spinlock_block[*idp];
+	p->name = name;
+	/* Strip the directory path, keep only the basename of the file. */
+	if ((p->file = strrchr(file, '/')) == NULL)
+		p->file = file;
+	else
+		++p->file;
+	p->line = line;
+	return (0);
+}
+
+/*
+ * __wt_statlog_dump_spinlock --
+ *	Log the spin-lock statistics. 
+ */
+int
+__wt_statlog_dump_spinlock(WT_CONNECTION_IMPL *conn, const char *tag)
+{
+	WT_SPINLOCK *spin;
+	WT_CONNECTION_STATS_SPINLOCK *p, *t;
+	uint64_t block_manager, btree_page, ignore;
+	u_int i, j;
+
+	/*
+	 * Ignore rare acquisition of a spinlock using a base value of 10 per
+	 * second so we don't create graphs we don't care about.
+	 */
+	ignore = (uint64_t)(conn->stat_usecs / 1000000) * 10;
+
+	/* Output the number of times each spinlock was acquired. */
+	block_manager = btree_page = 0;
+	for (i = 0; i < WT_ELEMENTS(conn->spinlock_list); ++i) {
+		if ((spin = conn->spinlock_list[i]) == NULL)
+			continue;
+
+		/*
+		 * There are two sets of spinlocks we aggregate, the btree page
+		 * locks and the block manager per-file locks. The reason is
+		 * the block manager locks grow with the number of files open
+		 * (and LSM and bloom filters can open a lot of files), and
+		 * there are 16 btree page locks and splitting them out has not
+		 * historically been that informative.
+		 */
+		if (strcmp(spin->name, "block manager") == 0) {
+			block_manager += spin->counter;
+			if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+				spin->counter = 0;
+			continue;
+		}
+		if (strcmp(spin->name, "btree page") == 0) {
+			btree_page += spin->counter;
+			if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+				spin->counter = 0;
+			continue;
+		}
+
+		WT_RET_TEST((fprintf(conn->stat_fp,
+		    "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
+		    conn->stat_stamp,
+		    spin->counter <= ignore ? 0 : spin->counter,
+		    tag, spin->name) < 0),
+		    __wt_errno());
+		if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+			spin->counter = 0;
+	}
+	WT_RET_TEST((fprintf(conn->stat_fp,
+	    "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
+	    conn->stat_stamp,
+	    block_manager <= ignore ? 0 : block_manager,
+	    tag, "block manager") < 0),
+	    __wt_errno());
+	WT_RET_TEST((fprintf(conn->stat_fp,
+	    "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
+	    conn->stat_stamp,
+	    btree_page <= ignore ? 
0 : btree_page,
+	    tag, "btree page") < 0),
+	    __wt_errno());
+
+	/*
+	 * Output the number of times each location acquires its spinlock and
+	 * the blocking matrix.
+	 *
+	 * NOTE(review): p->total and p->blocked[j] are printed with %d --
+	 * confirm those fields are declared int; if they are unsigned or
+	 * 64-bit counters this is a format mismatch.
+	 */
+	for (i = 0; i < WT_ELEMENTS(conn->spinlock_block); ++i) {
+		p = &conn->spinlock_block[i];
+		if (p->name == NULL)
+			continue;
+
+		WT_RET_TEST((fprintf(conn->stat_fp,
+		    "%s %d %s spinlock %s acquired by %s(%d)\n",
+		    conn->stat_stamp,
+		    p->total <= ignore ? 0 : p->total,
+		    tag,
+		    p->name, p->file, p->line) < 0), __wt_errno());
+		if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+			p->total = 0;
+
+		for (j = 0; j < WT_ELEMENTS(conn->spinlock_block); ++j) {
+			t = &conn->spinlock_block[j];
+			if (t->name == NULL)
+				continue;
+
+			WT_RET_TEST((fprintf(conn->stat_fp,
+			    "%s %d %s spinlock %s: %s(%d) blocked by %s(%d)\n",
+			    conn->stat_stamp,
+			    p->blocked[j] <= ignore ? 0 : p->blocked[j],
+			    tag,
+			    p->name, p->file, p->line,
+			    t->file, t->line) < 0), __wt_errno());
+			if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+				p->blocked[j] = 0;
+		}
+	}
+
+	WT_FULL_BARRIER();			/* Minimize the window. */
+	return (0);
+}
+
+#endif /* SPINLOCK_PTHREAD_MUTEX_LOGGING */
diff --git a/src/third_party/wiredtiger/src/support/pow.c b/src/third_party/wiredtiger/src/support/pow.c
new file mode 100644
index 00000000000..a6bf6c7227f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/pow.c
@@ -0,0 +1,130 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. 
We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wt_internal.h"
+
+#ifdef __WIREDTIGER_UNUSED__
+
+/*
+ * __wt_nlpo2_round --
+ *	Round up to the next-largest power-of-two for a 32-bit unsigned value.
+ *
+ * In 12 operations, this code computes the next highest power of 2 for a 32-bit
+ * integer. The result may be expressed by the formula 1U << (lg(v - 1) + 1).
+ * Note that in the edge case where v is 0, it returns 0, which isn't a power of
+ * 2; you might append the expression v += (v == 0) to remedy this if it
+ * matters. It would be faster by 2 operations to use the formula and the
+ * log base 2 method that uses a lookup table, but in some situations, lookup
+ * tables are not suitable, so the above code may be best. (On a Athlon XP 2100+
+ * I've found the above shift-left and then OR code is as fast as using a single
+ * BSR assembly language instruction, which scans in reverse to find the highest
+ * set bit.) It works by copying the highest set bit to all of the lower bits,
+ * and then adding one, which results in carries that set all of the lower bits
+ * to 0 and one bit beyond the highest set bit to 1. If the original number was
+ * a power of 2, then the decrement will reduce it to one less, so that we round
+ * up to the same original value. Devised by Sean Anderson, September 14, 2001.
+ * Pete Hart pointed me to a couple newsgroup posts by him and William Lewis in
+ * February of 1997, where they arrive at the same algorithm.
+ *	http://graphics.stanford.edu/~seander/bithacks.html
+ *	Sean Eron Anderson, seander@cs.stanford.edu
+ */
+uint32_t
+__wt_nlpo2_round(uint32_t v)
+{
+	v--;				/* If v is a power-of-two, return it. */
+	v |= v >> 1;
+	v |= v >> 2;
+	v |= v >> 4;
+	v |= v >> 8;
+	v |= v >> 16;
+	return (v + 1);
+}
+
+/*
+ * __wt_nlpo2 --
+ *	Return the next largest power-of-two.
+ *
+ * NOTE(review): unlike __wt_nlpo2_round, an exact power-of-two input is
+ * rounded up to the next one (no leading decrement), and 0 returns 1.
+ */
+uint32_t
+__wt_nlpo2(uint32_t v)
+{
+	v |= v >> 1;
+	v |= v >> 2;
+	v |= v >> 4;
+	v |= v >> 8;
+	v |= v >> 16;
+	return (v + 1);
+}
+#endif /* __WIREDTIGER_UNUSED__ */
+
+/*
+ * __wt_log2_int --
+ *	Find the log base 2 of an integer in O(N) operations;
+ *	http://graphics.stanford.edu/~seander/bithacks.html
+ */
+uint32_t
+__wt_log2_int(uint32_t n)
+{
+	uint32_t l = 0;
+
+	while (n >>= 1)
+		l++;
+	return (l);
+}
+
+/*
+ * __wt_ispo2 --
+ *	Return if a number is a power-of-two.
+ */
+int
+__wt_ispo2(uint32_t v)
+{
+	/*
+	 * Only numbers that are powers of two will satisfy the relationship
+	 * (v & (v - 1) == 0).
+	 *
+	 * However n must be positive, this returns 0 as a power of 2; to fix
+	 * that, use: (! (v & (v - 1)) && v)
+	 */
+	return ((v & (v - 1)) == 0);
+}
+
+/*
+ * __wt_rduppo2 --
+ *	Round the given int up to the next multiple of N, where N is power of 2.
+ *
+ * NOTE(review): returns 0 (a silent failure sentinel) when po2 is not a
+ * power of two -- callers must not pass arbitrary values.
+ */
+uint32_t
+__wt_rduppo2(uint32_t n, uint32_t po2)
+{
+	uint32_t bits, res;
+
+	if (__wt_ispo2(po2)) {
+		bits = __wt_log2_int(po2);
+		res = (((n - 1) >> bits) + 1) << bits;
+	} else
+		res = 0;
+	return (res);
+}
diff --git a/src/third_party/wiredtiger/src/support/rand.c b/src/third_party/wiredtiger/src/support/rand.c
new file mode 100644
index 00000000000..b716eb8c58b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/rand.c
@@ -0,0 +1,69 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc. 
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wt_internal.h"
+
+/* The two 32-bit words of MWC state live in the caller-supplied array. */
+#undef	M_W
+#define	M_W	(rnd)[0]
+#undef	M_Z
+#define	M_Z	(rnd)[1]
+
+/*
+ * __wt_random_init --
+ *	Initialize return of a 32-bit pseudo-random number.
+ *
+ * The seeds are fixed constants, so every initialized generator produces the
+ * same deterministic sequence -- this is not a source of entropy.
+ */
+void
+__wt_random_init(uint32_t *rnd)
+{
+	M_W = 521288629;
+	M_Z = 362436069;
+}
+
+/*
+ * __wt_random --
+ *	Return a 32-bit pseudo-random number.
+ *
+ * This is an implementation of George Marsaglia's multiply-with-carry pseudo-
+ * random number generator. Computationally fast, with reasonable randomness
+ * properties.
+ *
+ * We have to be very careful about races here. Multiple threads can call
+ * __wt_random concurrently, and it is okay if those concurrent calls get the
+ * same return value. 
What is *not* okay is if reading the shared state races + * with an update and uses two different values for m_w or m_z. That could + * result in a value of zero, in which case they would be stuck on zero + * forever. Take local copies of the shared values to avoid this. + */ +uint32_t +__wt_random(uint32_t *rnd) +{ + uint32_t w = M_W, z = M_Z; + + M_Z = z = 36969 * (z & 65535) + (z >> 16); + M_W = w = 18000 * (w & 65535) + (w >> 16); + return (z << 16) + (w & 65535); +} diff --git a/src/third_party/wiredtiger/src/support/scratch.c b/src/third_party/wiredtiger/src/support/scratch.c new file mode 100644 index 00000000000..ca2cdac8377 --- /dev/null +++ b/src/third_party/wiredtiger/src/support/scratch.c @@ -0,0 +1,319 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_buf_grow_worker -- + * Grow a buffer that may be in-use, and ensure that all data is local to + * the buffer. + */ +int +__wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) +{ + size_t offset; + int copy_data; + + /* + * Maintain the existing data: there are 3 cases: + * No existing data: allocate the required memory, and initialize + * the data to reference it. + * Existing data local to the buffer: set the data to the same + * offset in the re-allocated memory. + * Existing data not-local to the buffer: copy the data into the + * buffer and set the data to reference it. + */ + if (WT_DATA_IN_ITEM(buf)) { + offset = WT_PTRDIFF(buf->data, buf->mem); + copy_data = 0; + } else { + offset = 0; + copy_data = buf->size ? 1 : 0; + } + + /* + * This function is also used to ensure data is local to the buffer, + * check to see if we actually need to grow anything. 
+ */ + if (size > buf->memsize) { + if (F_ISSET(buf, WT_ITEM_ALIGNED)) + WT_RET(__wt_realloc_aligned( + session, &buf->memsize, size, &buf->mem)); + else + WT_RET(__wt_realloc( + session, &buf->memsize, size, &buf->mem)); + } + + if (buf->data == NULL) { + buf->data = buf->mem; + buf->size = 0; + } else { + if (copy_data) + memcpy(buf->mem, buf->data, buf->size); + buf->data = (uint8_t *)buf->mem + offset; + } + + return (0); +} + +/* + * __wt_buf_fmt -- + * Grow a buffer to accommodate a formatted string. + */ +int +__wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4))) +{ + va_list ap; + size_t len; + + for (;;) { + va_start(ap, fmt); + len = (size_t)vsnprintf(buf->mem, buf->memsize, fmt, ap); + va_end(ap); + + /* Check if there was enough space. */ + if (len < buf->memsize) { + buf->data = buf->mem; + buf->size = len; + return (0); + } + + /* + * If not, double the size of the buffer: we're dealing with + * strings, and we don't expect these numbers to get huge. + */ + WT_RET(__wt_buf_extend(session, buf, len + 1)); + } +} + +/* + * __wt_buf_catfmt -- + * Grow a buffer to append a formatted string. + */ +int +__wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4))) +{ + va_list ap; + size_t len, space; + char *p; + + /* + * If we're appending data to an existing buffer, any data field should + * point into the allocated memory. (It wouldn't be insane to copy any + * previously existing data at this point, if data wasn't in the local + * buffer, but we don't and it would be bad if we didn't notice it.) 
+ */ + WT_ASSERT(session, buf->data == NULL || WT_DATA_IN_ITEM(buf)); + + for (;;) { + va_start(ap, fmt); + p = (char *)((uint8_t *)buf->mem + buf->size); + WT_ASSERT(session, buf->memsize >= buf->size); + space = buf->memsize - buf->size; + len = (size_t)vsnprintf(p, (size_t)space, fmt, ap); + va_end(ap); + + /* Check if there was enough space. */ + if (len < space) { + buf->size += len; + return (0); + } + + /* + * If not, double the size of the buffer: we're dealing with + * strings, and we don't expect these numbers to get huge. + */ + WT_RET(__wt_buf_extend(session, buf, buf->size + len + 1)); + } +} + +/* + * __wt_scr_alloc_func -- + * Scratch buffer allocation function. + */ +int +__wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ) +{ + WT_DECL_RET; + WT_ITEM *buf, **p, **best, **slot; + size_t allocated; + u_int i; + + /* Don't risk the caller not catching the error. */ + *scratchp = NULL; + + /* + * Each WT_SESSION_IMPL has an array of scratch buffers available for + * use by any function. We use WT_ITEM structures for scratch memory + * because we already have functions that do variable-length allocation + * on a WT_ITEM. Scratch buffers are allocated only by a single thread + * of control, so no locking is necessary. + * + * Walk the array, looking for a buffer we can use. + */ + for (i = 0, best = slot = NULL, + p = session->scratch; i < session->scratch_alloc; ++i, ++p) { + /* If we find an empty slot, remember it. */ + if ((buf = *p) == NULL) { + if (slot == NULL) + slot = p; + continue; + } + + if (F_ISSET(buf, WT_ITEM_INUSE)) + continue; + + /* + * If we find a buffer that's not in-use, check its size: we + * want the smallest buffer larger than the requested size, + * or the largest buffer if none are large enough. 
+ */ + if (best == NULL || + ((*best)->memsize < size && + buf->memsize > (*best)->memsize) || + (buf->memsize >= size && buf->memsize < (*best)->memsize)) + best = p; + + /* If we find a perfect match, use it. */ + if ((*best)->memsize == size) + break; + } + + /* + * If we didn't find a free buffer, extend the array and use the first + * slot we allocated. + */ + if (best == NULL && slot == NULL) { + allocated = session->scratch_alloc * sizeof(WT_ITEM *); + WT_ERR(__wt_realloc(session, &allocated, + (session->scratch_alloc + 10) * sizeof(WT_ITEM *), + &session->scratch)); +#ifdef HAVE_DIAGNOSTIC + allocated = session->scratch_alloc * sizeof(WT_SCRATCH_TRACK); + WT_ERR(__wt_realloc(session, &allocated, + (session->scratch_alloc + 10) * sizeof(WT_SCRATCH_TRACK), + &session->scratch_track)); +#endif + slot = session->scratch + session->scratch_alloc; + session->scratch_alloc += 10; + } + + /* + * If slot is non-NULL, we found an empty slot, try and allocate a + * buffer. + */ + if (best == NULL) { + WT_ASSERT(session, slot != NULL); + best = slot; + + WT_ERR(__wt_calloc_def(session, 1, best)); + + /* Scratch buffers must be aligned. */ + F_SET(*best, WT_ITEM_ALIGNED); + } + + /* Grow the buffer as necessary and return. */ + WT_ERR(__wt_buf_init(session, *best, size)); + F_SET(*best, WT_ITEM_INUSE); + +#ifdef HAVE_DIAGNOSTIC + session->scratch_track[best - session->scratch].file = file; + session->scratch_track[best - session->scratch].line = line; +#endif + + *scratchp = *best; + return (0); + +err: WT_RET_MSG(session, ret, + "session unable to allocate a scratch buffer"); +} + +/* + * __wt_scr_discard -- + * Free all memory associated with the scratch buffers. 
+ */ +void +__wt_scr_discard(WT_SESSION_IMPL *session) +{ + WT_ITEM **bufp; + u_int i; + + for (i = 0, + bufp = session->scratch; i < session->scratch_alloc; ++i, ++bufp) { + if (*bufp == NULL) + continue; + if (F_ISSET(*bufp, WT_ITEM_INUSE)) + __wt_errx(session, + "scratch buffer allocated and never discarded" +#ifdef HAVE_DIAGNOSTIC + ": %s: %d", + session-> + scratch_track[bufp - session->scratch].file, + session-> + scratch_track[bufp - session->scratch].line +#endif + ); + + __wt_buf_free(session, *bufp); + __wt_free(session, *bufp); + } + + __wt_free(session, session->scratch); +#ifdef HAVE_DIAGNOSTIC + __wt_free(session, session->scratch_track); +#endif +} + +/* + * __wt_ext_scr_alloc -- + * Allocate a scratch buffer, and return the memory reference. + */ +void * +__wt_ext_scr_alloc( + WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t size) +{ + WT_ITEM *buf; + WT_SESSION_IMPL *session; + + if ((session = (WT_SESSION_IMPL *)wt_session) == NULL) + session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session; + + return (__wt_scr_alloc(session, size, &buf) == 0 ? buf->mem : NULL); +} + +/* + * __wt_ext_scr_free -- + * Free a scratch buffer based on the memory reference. + */ +void +__wt_ext_scr_free(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *p) +{ + WT_ITEM **bufp; + WT_SESSION_IMPL *session; + u_int i; + + if ((session = (WT_SESSION_IMPL *)wt_session) == NULL) + session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session; + + for (i = 0, + bufp = session->scratch; i < session->scratch_alloc; ++i, ++bufp) + if (*bufp != NULL && (*bufp)->mem == p) { + /* + * Do NOT call __wt_scr_free() here, it clears the + * caller's pointer, which would truncate the list. 
+ */ + F_CLR(*bufp, WT_ITEM_INUSE); + return; + } + __wt_errx(session, "extension free'd non-existent scratch buffer"); +} diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c new file mode 100644 index 00000000000..bc468fbe938 --- /dev/null +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -0,0 +1,567 @@ +/* DO NOT EDIT: automatically built by dist/stat.py. */ + +#include "wt_internal.h" + +void +__wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats) +{ + /* Clear, so can also be called for reinitialization. */ + memset(stats, 0, sizeof(*stats)); + + stats->allocation_size.desc = + "block manager: file allocation unit size"; + stats->block_alloc.desc = "block manager: blocks allocated"; + stats->block_checkpoint_size.desc = "block manager: checkpoint size"; + stats->block_extension.desc = + "block manager: allocations requiring file extension"; + stats->block_free.desc = "block manager: blocks freed"; + stats->block_magic.desc = "block manager: file magic number"; + stats->block_major.desc = "block manager: file major version number"; + stats->block_minor.desc = "block manager: minor version number"; + stats->block_reuse_bytes.desc = + "block manager: file bytes available for reuse"; + stats->block_size.desc = "block manager: file size in bytes"; + stats->bloom_count.desc = "LSM: bloom filters in the LSM tree"; + stats->bloom_false_positive.desc = "LSM: bloom filter false positives"; + stats->bloom_hit.desc = "LSM: bloom filter hits"; + stats->bloom_miss.desc = "LSM: bloom filter misses"; + stats->bloom_page_evict.desc = + "LSM: bloom filter pages evicted from cache"; + stats->bloom_page_read.desc = + "LSM: bloom filter pages read into cache"; + stats->bloom_size.desc = "LSM: total size of bloom filters"; + stats->btree_column_deleted.desc = + "btree: column-store variable-size deleted values"; + stats->btree_column_fix.desc = + "btree: column-store fixed-size leaf pages"; + stats->btree_column_internal.desc = + 
"btree: column-store internal pages"; + stats->btree_column_variable.desc = + "btree: column-store variable-size leaf pages"; + stats->btree_compact_rewrite.desc = + "btree: pages rewritten by compaction"; + stats->btree_entries.desc = "btree: number of key/value pairs"; + stats->btree_fixed_len.desc = "btree: fixed-record size"; + stats->btree_maximum_depth.desc = "btree: maximum tree depth"; + stats->btree_maxintlitem.desc = + "btree: maximum internal page item size"; + stats->btree_maxintlpage.desc = "btree: maximum internal page size"; + stats->btree_maxleafitem.desc = "btree: maximum leaf page item size"; + stats->btree_maxleafpage.desc = "btree: maximum leaf page size"; + stats->btree_overflow.desc = "btree: overflow pages"; + stats->btree_row_internal.desc = "btree: row-store internal pages"; + stats->btree_row_leaf.desc = "btree: row-store leaf pages"; + stats->cache_bytes_read.desc = "cache: bytes read into cache"; + stats->cache_bytes_write.desc = "cache: bytes written from cache"; + stats->cache_eviction_checkpoint.desc = + "cache: checkpoint blocked page eviction"; + stats->cache_eviction_clean.desc = "cache: unmodified pages evicted"; + stats->cache_eviction_dirty.desc = "cache: modified pages evicted"; + stats->cache_eviction_fail.desc = + "cache: data source pages selected for eviction unable to be evicted"; + stats->cache_eviction_hazard.desc = + "cache: hazard pointer blocked page eviction"; + stats->cache_eviction_internal.desc = "cache: internal pages evicted"; + stats->cache_overflow_value.desc = + "cache: overflow values cached in memory"; + stats->cache_read.desc = "cache: pages read into cache"; + stats->cache_read_overflow.desc = + "cache: overflow pages read into cache"; + stats->cache_write.desc = "cache: pages written from cache"; + stats->compress_raw_fail.desc = + "compression: raw compression call failed, no additional data available"; + stats->compress_raw_fail_temporary.desc = + "compression: raw compression call failed, additional 
data available"; + stats->compress_raw_ok.desc = + "compression: raw compression call succeeded"; + stats->compress_read.desc = "compression: compressed pages read"; + stats->compress_write.desc = "compression: compressed pages written"; + stats->compress_write_fail.desc = + "compression: page written failed to compress"; + stats->compress_write_too_small.desc = + "compression: page written was too small to compress"; + stats->cursor_create.desc = "cursor: create calls"; + stats->cursor_insert.desc = "cursor: insert calls"; + stats->cursor_insert_bulk.desc = + "cursor: bulk-loaded cursor-insert calls"; + stats->cursor_insert_bytes.desc = + "cursor: cursor-insert key and value bytes inserted"; + stats->cursor_next.desc = "cursor: next calls"; + stats->cursor_prev.desc = "cursor: prev calls"; + stats->cursor_remove.desc = "cursor: remove calls"; + stats->cursor_remove_bytes.desc = + "cursor: cursor-remove key bytes removed"; + stats->cursor_reset.desc = "cursor: reset calls"; + stats->cursor_search.desc = "cursor: search calls"; + stats->cursor_search_near.desc = "cursor: search near calls"; + stats->cursor_update.desc = "cursor: update calls"; + stats->cursor_update_bytes.desc = + "cursor: cursor-update value bytes updated"; + stats->lsm_checkpoint_throttle.desc = + "LSM: sleep for LSM checkpoint throttle"; + stats->lsm_chunk_count.desc = "LSM: chunks in the LSM tree"; + stats->lsm_generation_max.desc = + "LSM: highest merge generation in the LSM tree"; + stats->lsm_lookup_no_bloom.desc = + "LSM: queries that could have benefited from a Bloom filter that did not exist"; + stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle"; + stats->rec_dictionary.desc = "reconciliation: dictionary matches"; + stats->rec_multiblock_internal.desc = + "reconciliation: internal page multi-block writes"; + stats->rec_multiblock_leaf.desc = + "reconciliation: leaf page multi-block writes"; + stats->rec_multiblock_max.desc = + "reconciliation: maximum blocks required for 
a page"; + stats->rec_overflow_key_internal.desc = + "reconciliation: internal-page overflow keys"; + stats->rec_overflow_key_leaf.desc = + "reconciliation: leaf-page overflow keys"; + stats->rec_overflow_value.desc = + "reconciliation: overflow values written"; + stats->rec_page_delete.desc = "reconciliation: pages deleted"; + stats->rec_page_match.desc = "reconciliation: page checksum matches"; + stats->rec_pages.desc = "reconciliation: page reconciliation calls"; + stats->rec_pages_eviction.desc = + "reconciliation: page reconciliation calls for eviction"; + stats->rec_prefix_compression.desc = + "reconciliation: leaf page key bytes discarded using prefix compression"; + stats->rec_suffix_compression.desc = + "reconciliation: internal page key bytes discarded using suffix compression"; + stats->session_compact.desc = "session: object compaction"; + stats->session_cursor_open.desc = "session: open cursor count"; + stats->txn_update_conflict.desc = "txn: update conflicts"; +} + +void +__wt_stat_refresh_dsrc_stats(void *stats_arg) +{ + WT_DSRC_STATS *stats; + + stats = (WT_DSRC_STATS *)stats_arg; + stats->allocation_size.v = 0; + stats->block_alloc.v = 0; + stats->block_checkpoint_size.v = 0; + stats->block_extension.v = 0; + stats->block_free.v = 0; + stats->block_magic.v = 0; + stats->block_major.v = 0; + stats->block_minor.v = 0; + stats->block_reuse_bytes.v = 0; + stats->block_size.v = 0; + stats->bloom_count.v = 0; + stats->bloom_false_positive.v = 0; + stats->bloom_hit.v = 0; + stats->bloom_miss.v = 0; + stats->bloom_page_evict.v = 0; + stats->bloom_page_read.v = 0; + stats->bloom_size.v = 0; + stats->btree_column_deleted.v = 0; + stats->btree_column_fix.v = 0; + stats->btree_column_internal.v = 0; + stats->btree_column_variable.v = 0; + stats->btree_compact_rewrite.v = 0; + stats->btree_entries.v = 0; + stats->btree_fixed_len.v = 0; + stats->btree_maximum_depth.v = 0; + stats->btree_maxintlitem.v = 0; + stats->btree_maxintlpage.v = 0; + 
stats->btree_maxleafitem.v = 0; + stats->btree_maxleafpage.v = 0; + stats->btree_overflow.v = 0; + stats->btree_row_internal.v = 0; + stats->btree_row_leaf.v = 0; + stats->cache_bytes_read.v = 0; + stats->cache_bytes_write.v = 0; + stats->cache_eviction_checkpoint.v = 0; + stats->cache_eviction_clean.v = 0; + stats->cache_eviction_dirty.v = 0; + stats->cache_eviction_fail.v = 0; + stats->cache_eviction_hazard.v = 0; + stats->cache_eviction_internal.v = 0; + stats->cache_overflow_value.v = 0; + stats->cache_read.v = 0; + stats->cache_read_overflow.v = 0; + stats->cache_write.v = 0; + stats->compress_raw_fail.v = 0; + stats->compress_raw_fail_temporary.v = 0; + stats->compress_raw_ok.v = 0; + stats->compress_read.v = 0; + stats->compress_write.v = 0; + stats->compress_write_fail.v = 0; + stats->compress_write_too_small.v = 0; + stats->cursor_create.v = 0; + stats->cursor_insert.v = 0; + stats->cursor_insert_bulk.v = 0; + stats->cursor_insert_bytes.v = 0; + stats->cursor_next.v = 0; + stats->cursor_prev.v = 0; + stats->cursor_remove.v = 0; + stats->cursor_remove_bytes.v = 0; + stats->cursor_reset.v = 0; + stats->cursor_search.v = 0; + stats->cursor_search_near.v = 0; + stats->cursor_update.v = 0; + stats->cursor_update_bytes.v = 0; + stats->lsm_checkpoint_throttle.v = 0; + stats->lsm_chunk_count.v = 0; + stats->lsm_generation_max.v = 0; + stats->lsm_lookup_no_bloom.v = 0; + stats->lsm_merge_throttle.v = 0; + stats->rec_dictionary.v = 0; + stats->rec_multiblock_internal.v = 0; + stats->rec_multiblock_leaf.v = 0; + stats->rec_multiblock_max.v = 0; + stats->rec_overflow_key_internal.v = 0; + stats->rec_overflow_key_leaf.v = 0; + stats->rec_overflow_value.v = 0; + stats->rec_page_delete.v = 0; + stats->rec_page_match.v = 0; + stats->rec_pages.v = 0; + stats->rec_pages_eviction.v = 0; + stats->rec_prefix_compression.v = 0; + stats->rec_suffix_compression.v = 0; + stats->session_compact.v = 0; + stats->txn_update_conflict.v = 0; +} + +void 
+__wt_stat_aggregate_dsrc_stats(const void *child, const void *parent) +{ + WT_DSRC_STATS *c, *p; + + c = (WT_DSRC_STATS *)child; + p = (WT_DSRC_STATS *)parent; + p->block_alloc.v += c->block_alloc.v; + p->block_checkpoint_size.v += c->block_checkpoint_size.v; + p->block_extension.v += c->block_extension.v; + p->block_free.v += c->block_free.v; + p->block_reuse_bytes.v += c->block_reuse_bytes.v; + p->block_size.v += c->block_size.v; + p->bloom_count.v += c->bloom_count.v; + p->bloom_false_positive.v += c->bloom_false_positive.v; + p->bloom_hit.v += c->bloom_hit.v; + p->bloom_miss.v += c->bloom_miss.v; + p->bloom_page_evict.v += c->bloom_page_evict.v; + p->bloom_page_read.v += c->bloom_page_read.v; + p->bloom_size.v += c->bloom_size.v; + p->btree_column_deleted.v += c->btree_column_deleted.v; + p->btree_column_fix.v += c->btree_column_fix.v; + p->btree_column_internal.v += c->btree_column_internal.v; + p->btree_column_variable.v += c->btree_column_variable.v; + p->btree_compact_rewrite.v += c->btree_compact_rewrite.v; + p->btree_entries.v += c->btree_entries.v; + if (c->btree_maximum_depth.v > p->btree_maximum_depth.v) + p->btree_maximum_depth.v = c->btree_maximum_depth.v; + p->btree_overflow.v += c->btree_overflow.v; + p->btree_row_internal.v += c->btree_row_internal.v; + p->btree_row_leaf.v += c->btree_row_leaf.v; + p->cache_bytes_read.v += c->cache_bytes_read.v; + p->cache_bytes_write.v += c->cache_bytes_write.v; + p->cache_eviction_checkpoint.v += c->cache_eviction_checkpoint.v; + p->cache_eviction_clean.v += c->cache_eviction_clean.v; + p->cache_eviction_dirty.v += c->cache_eviction_dirty.v; + p->cache_eviction_fail.v += c->cache_eviction_fail.v; + p->cache_eviction_hazard.v += c->cache_eviction_hazard.v; + p->cache_eviction_internal.v += c->cache_eviction_internal.v; + p->cache_overflow_value.v += c->cache_overflow_value.v; + p->cache_read.v += c->cache_read.v; + p->cache_read_overflow.v += c->cache_read_overflow.v; + p->cache_write.v += c->cache_write.v; + 
p->compress_raw_fail.v += c->compress_raw_fail.v; + p->compress_raw_fail_temporary.v += c->compress_raw_fail_temporary.v; + p->compress_raw_ok.v += c->compress_raw_ok.v; + p->compress_read.v += c->compress_read.v; + p->compress_write.v += c->compress_write.v; + p->compress_write_fail.v += c->compress_write_fail.v; + p->compress_write_too_small.v += c->compress_write_too_small.v; + p->cursor_create.v += c->cursor_create.v; + p->cursor_insert.v += c->cursor_insert.v; + p->cursor_insert_bulk.v += c->cursor_insert_bulk.v; + p->cursor_insert_bytes.v += c->cursor_insert_bytes.v; + p->cursor_next.v += c->cursor_next.v; + p->cursor_prev.v += c->cursor_prev.v; + p->cursor_remove.v += c->cursor_remove.v; + p->cursor_remove_bytes.v += c->cursor_remove_bytes.v; + p->cursor_reset.v += c->cursor_reset.v; + p->cursor_search.v += c->cursor_search.v; + p->cursor_search_near.v += c->cursor_search_near.v; + p->cursor_update.v += c->cursor_update.v; + p->cursor_update_bytes.v += c->cursor_update_bytes.v; + p->lsm_checkpoint_throttle.v += c->lsm_checkpoint_throttle.v; + if (c->lsm_generation_max.v > p->lsm_generation_max.v) + p->lsm_generation_max.v = c->lsm_generation_max.v; + p->lsm_lookup_no_bloom.v += c->lsm_lookup_no_bloom.v; + p->lsm_merge_throttle.v += c->lsm_merge_throttle.v; + p->rec_dictionary.v += c->rec_dictionary.v; + p->rec_multiblock_internal.v += c->rec_multiblock_internal.v; + p->rec_multiblock_leaf.v += c->rec_multiblock_leaf.v; + if (c->rec_multiblock_max.v > p->rec_multiblock_max.v) + p->rec_multiblock_max.v = c->rec_multiblock_max.v; + p->rec_overflow_key_internal.v += c->rec_overflow_key_internal.v; + p->rec_overflow_key_leaf.v += c->rec_overflow_key_leaf.v; + p->rec_overflow_value.v += c->rec_overflow_value.v; + p->rec_page_delete.v += c->rec_page_delete.v; + p->rec_page_match.v += c->rec_page_match.v; + p->rec_pages.v += c->rec_pages.v; + p->rec_pages_eviction.v += c->rec_pages_eviction.v; + p->rec_prefix_compression.v += c->rec_prefix_compression.v; + 
p->rec_suffix_compression.v += c->rec_suffix_compression.v; + p->session_compact.v += c->session_compact.v; + p->session_cursor_open.v += c->session_cursor_open.v; + p->txn_update_conflict.v += c->txn_update_conflict.v; +} + +void +__wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) +{ + /* Clear, so can also be called for reinitialization. */ + memset(stats, 0, sizeof(*stats)); + + stats->async_alloc_race.desc = + "async: number of allocation state races"; + stats->async_alloc_view.desc = + "async: number of op slots viewed for alloc"; + stats->async_cur_queue.desc = "async: current work queue length"; + stats->async_flush.desc = "async: number of async flush calls"; + stats->async_full.desc = "async: number of times op allocation failed"; + stats->async_max_queue.desc = "async: maximum work queue length"; + stats->async_nowork.desc = + "async: number of times worker found no work"; + stats->async_op_alloc.desc = "async: op allocations"; + stats->async_op_compact.desc = "async: op compact calls"; + stats->async_op_insert.desc = "async: op insert calls"; + stats->async_op_remove.desc = "async: op remove calls"; + stats->async_op_search.desc = "async: op search calls"; + stats->async_op_update.desc = "async: op update calls"; + stats->block_byte_map_read.desc = "block manager: mapped bytes read"; + stats->block_byte_read.desc = "block manager: bytes read"; + stats->block_byte_write.desc = "block manager: bytes written"; + stats->block_map_read.desc = "block manager: mapped blocks read"; + stats->block_preload.desc = "block manager: blocks pre-loaded"; + stats->block_read.desc = "block manager: blocks read"; + stats->block_write.desc = "block manager: blocks written"; + stats->cache_bytes_dirty.desc = + "cache: tracked dirty bytes in the cache"; + stats->cache_bytes_inuse.desc = "cache: bytes currently in the cache"; + stats->cache_bytes_max.desc = "cache: maximum bytes configured"; + stats->cache_bytes_read.desc = "cache: bytes read into cache"; + 
stats->cache_bytes_write.desc = "cache: bytes written from cache"; + stats->cache_eviction_checkpoint.desc = + "cache: checkpoint blocked page eviction"; + stats->cache_eviction_clean.desc = "cache: unmodified pages evicted"; + stats->cache_eviction_deepen.desc = + "cache: page split during eviction deepened the tree"; + stats->cache_eviction_dirty.desc = "cache: modified pages evicted"; + stats->cache_eviction_fail.desc = + "cache: pages selected for eviction unable to be evicted"; + stats->cache_eviction_force.desc = + "cache: pages evicted because they exceeded the in-memory maximum"; + stats->cache_eviction_force_fail.desc = + "cache: failed eviction of pages that exceeded the in-memory maximum"; + stats->cache_eviction_hazard.desc = + "cache: hazard pointer blocked page eviction"; + stats->cache_eviction_internal.desc = "cache: internal pages evicted"; + stats->cache_eviction_queue_empty.desc = + "cache: eviction server candidate queue empty when topping up"; + stats->cache_eviction_queue_not_empty.desc = + "cache: eviction server candidate queue not empty when topping up"; + stats->cache_eviction_server_evicting.desc = + "cache: eviction server evicting pages"; + stats->cache_eviction_server_not_evicting.desc = + "cache: eviction server populating queue, but not evicting pages"; + stats->cache_eviction_slow.desc = + "cache: eviction server unable to reach eviction goal"; + stats->cache_eviction_split.desc = + "cache: pages split during eviction"; + stats->cache_eviction_walk.desc = "cache: pages walked for eviction"; + stats->cache_pages_dirty.desc = + "cache: tracked dirty pages in the cache"; + stats->cache_pages_inuse.desc = + "cache: pages currently held in the cache"; + stats->cache_read.desc = "cache: pages read into cache"; + stats->cache_write.desc = "cache: pages written from cache"; + stats->cond_wait.desc = "conn: pthread mutex condition wait calls"; + stats->cursor_create.desc = "Btree: cursor create calls"; + stats->cursor_insert.desc = "Btree: 
cursor insert calls"; + stats->cursor_next.desc = "Btree: cursor next calls"; + stats->cursor_prev.desc = "Btree: cursor prev calls"; + stats->cursor_remove.desc = "Btree: cursor remove calls"; + stats->cursor_reset.desc = "Btree: cursor reset calls"; + stats->cursor_search.desc = "Btree: cursor search calls"; + stats->cursor_search_near.desc = "Btree: cursor search near calls"; + stats->cursor_update.desc = "Btree: cursor update calls"; + stats->dh_session_handles.desc = "dhandle: session dhandles swept"; + stats->dh_session_sweeps.desc = "dhandle: session sweep attempts"; + stats->file_open.desc = "conn: files currently open"; + stats->log_buffer_grow.desc = "log: log buffer size increases"; + stats->log_buffer_size.desc = "log: total log buffer size"; + stats->log_bytes_user.desc = "log: user provided log bytes written"; + stats->log_bytes_written.desc = "log: log bytes written"; + stats->log_close_yields.desc = + "log: yields waiting for previous log file close"; + stats->log_max_filesize.desc = "log: maximum log file size"; + stats->log_reads.desc = "log: log read operations"; + stats->log_scan_records.desc = "log: records processed by log scan"; + stats->log_scan_rereads.desc = + "log: log scan records requiring two reads"; + stats->log_scans.desc = "log: log scan operations"; + stats->log_slot_closes.desc = "log: consolidated slot closures"; + stats->log_slot_consolidated.desc = "log: logging bytes consolidated"; + stats->log_slot_joins.desc = "log: consolidated slot joins"; + stats->log_slot_races.desc = "log: consolidated slot join races"; + stats->log_slot_switch_fails.desc = + "log: slots selected for switching that were unavailable"; + stats->log_slot_toobig.desc = "log: record size exceeded maximum"; + stats->log_slot_toosmall.desc = + "log: failed to find a slot large enough for record"; + stats->log_slot_transitions.desc = + "log: consolidated slot join transitions"; + stats->log_sync.desc = "log: log sync operations"; + stats->log_writes.desc = 
"log: log write operations"; + stats->lsm_checkpoint_throttle.desc = + "LSM: sleep for LSM checkpoint throttle"; + stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle"; + stats->lsm_rows_merged.desc = "LSM: rows merged in an LSM tree"; + stats->lsm_work_queue_app.desc = + "LSM: App work units currently queued"; + stats->lsm_work_queue_manager.desc = + "LSM: Merge work units currently queued"; + stats->lsm_work_queue_max.desc = "LSM: tree queue hit maximum"; + stats->lsm_work_queue_switch.desc = + "LSM: Switch work units currently queued"; + stats->lsm_work_units_created.desc = + "LSM: tree maintenance operations scheduled"; + stats->lsm_work_units_discarded.desc = + "LSM: tree maintenance operations discarded"; + stats->lsm_work_units_done.desc = + "LSM: tree maintenance operations executed"; + stats->memory_allocation.desc = "conn: memory allocations"; + stats->memory_free.desc = "conn: memory frees"; + stats->memory_grow.desc = "conn: memory re-allocations"; + stats->read_io.desc = "conn: total read I/Os"; + stats->rec_pages.desc = "reconciliation: page reconciliation calls"; + stats->rec_pages_eviction.desc = + "reconciliation: page reconciliation calls for eviction"; + stats->rec_split_stashed_bytes.desc = + "reconciliation: split bytes currently awaiting free"; + stats->rec_split_stashed_objects.desc = + "reconciliation: split objects currently awaiting free"; + stats->rwlock_read.desc = + "conn: pthread mutex shared lock read-lock calls"; + stats->rwlock_write.desc = + "conn: pthread mutex shared lock write-lock calls"; + stats->session_cursor_open.desc = "session: open cursor count"; + stats->session_open.desc = "session: open session count"; + stats->txn_begin.desc = "txn: transaction begins"; + stats->txn_checkpoint.desc = "txn: transaction checkpoints"; + stats->txn_checkpoint_running.desc = + "txn: transaction checkpoint currently running"; + stats->txn_commit.desc = "txn: transactions committed"; + stats->txn_fail_cache.desc = + "txn: 
transaction failures due to cache overflow";
	stats->txn_pinned_range.desc =
	    "txn: transaction range of IDs currently pinned";
	stats->txn_rollback.desc = "txn: transactions rolled back";
	stats->write_io.desc = "conn: total write I/Os";
}

/*
 * __wt_stat_refresh_connection_stats --
 *	Reset the connection statistics counters to zero.  Description
 *	strings are left untouched, and point-in-time gauges (for example
 *	the async/LSM queue lengths, open session and cursor counts, and
 *	split-stash sizes) are deliberately not in this list because they
 *	reflect current state rather than accumulated events.
 *	NOTE(review): lists like this are normally generated from the
 *	statistics definition file -- regenerate rather than hand-edit.
 */
void
__wt_stat_refresh_connection_stats(void *stats_arg)
{
	WT_CONNECTION_STATS *stats;

	stats = (WT_CONNECTION_STATS *)stats_arg;
	stats->async_alloc_race.v = 0;
	stats->async_alloc_view.v = 0;
	stats->async_cur_queue.v = 0;
	stats->async_flush.v = 0;
	stats->async_full.v = 0;
	stats->async_max_queue.v = 0;
	stats->async_nowork.v = 0;
	stats->async_op_alloc.v = 0;
	stats->async_op_compact.v = 0;
	stats->async_op_insert.v = 0;
	stats->async_op_remove.v = 0;
	stats->async_op_search.v = 0;
	stats->async_op_update.v = 0;
	stats->block_byte_map_read.v = 0;
	stats->block_byte_read.v = 0;
	stats->block_byte_write.v = 0;
	stats->block_map_read.v = 0;
	stats->block_preload.v = 0;
	stats->block_read.v = 0;
	stats->block_write.v = 0;
	stats->cache_bytes_dirty.v = 0;
	stats->cache_bytes_read.v = 0;
	stats->cache_bytes_write.v = 0;
	stats->cache_eviction_checkpoint.v = 0;
	stats->cache_eviction_clean.v = 0;
	stats->cache_eviction_deepen.v = 0;
	stats->cache_eviction_dirty.v = 0;
	stats->cache_eviction_fail.v = 0;
	stats->cache_eviction_force.v = 0;
	stats->cache_eviction_force_fail.v = 0;
	stats->cache_eviction_hazard.v = 0;
	stats->cache_eviction_internal.v = 0;
	stats->cache_eviction_queue_empty.v = 0;
	stats->cache_eviction_queue_not_empty.v = 0;
	stats->cache_eviction_server_evicting.v = 0;
	stats->cache_eviction_server_not_evicting.v = 0;
	stats->cache_eviction_slow.v = 0;
	stats->cache_eviction_split.v = 0;
	stats->cache_eviction_walk.v = 0;
	stats->cache_pages_dirty.v = 0;
	stats->cache_read.v = 0;
	stats->cache_write.v = 0;
	stats->cond_wait.v = 0;
	stats->cursor_create.v = 0;
	stats->cursor_insert.v = 0;
	stats->cursor_next.v = 0;
	stats->cursor_prev.v = 0;
	stats->cursor_remove.v = 0;
	stats->cursor_reset.v = 0;
	stats->cursor_search.v = 0;
	stats->cursor_search_near.v = 0;
	stats->cursor_update.v = 0;
	stats->dh_session_handles.v = 0;
	stats->dh_session_sweeps.v = 0;
	stats->log_buffer_grow.v = 0;
	stats->log_bytes_user.v = 0;
	stats->log_bytes_written.v = 0;
	stats->log_close_yields.v = 0;
	stats->log_reads.v = 0;
	stats->log_scan_records.v = 0;
	stats->log_scan_rereads.v = 0;
	stats->log_scans.v = 0;
	stats->log_slot_closes.v = 0;
	stats->log_slot_consolidated.v = 0;
	stats->log_slot_joins.v = 0;
	stats->log_slot_races.v = 0;
	stats->log_slot_switch_fails.v = 0;
	stats->log_slot_toobig.v = 0;
	stats->log_slot_toosmall.v = 0;
	stats->log_slot_transitions.v = 0;
	stats->log_sync.v = 0;
	stats->log_writes.v = 0;
	stats->lsm_checkpoint_throttle.v = 0;
	stats->lsm_merge_throttle.v = 0;
	stats->lsm_rows_merged.v = 0;
	stats->lsm_work_queue_max.v = 0;
	stats->lsm_work_units_created.v = 0;
	stats->lsm_work_units_discarded.v = 0;
	stats->lsm_work_units_done.v = 0;
	stats->memory_allocation.v = 0;
	stats->memory_free.v = 0;
	stats->memory_grow.v = 0;
	stats->read_io.v = 0;
	stats->rec_pages.v = 0;
	stats->rec_pages_eviction.v = 0;
	stats->rwlock_read.v = 0;
	stats->rwlock_write.v = 0;
	stats->txn_begin.v = 0;
	stats->txn_checkpoint.v = 0;
	stats->txn_commit.v = 0;
	stats->txn_fail_cache.v = 0;
	stats->txn_rollback.v = 0;
	stats->write_io.v = 0;
}
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
new file mode 100644
index 00000000000..292d1a37ceb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -0,0 +1,554 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_txnid_cmp --
 *	Compare transaction IDs for sorting / searching.
+ */ +int +__wt_txnid_cmp(const void *v1, const void *v2) +{ + uint64_t id1, id2; + + id1 = *(uint64_t *)v1; + id2 = *(uint64_t *)v2; + + return ((id1 == id2) ? 0 : TXNID_LT(id1, id2) ? -1 : 1); +} + +/* + * __txn_sort_snapshot -- + * Sort a snapshot for faster searching and set the min/max bounds. + */ +static void +__txn_sort_snapshot(WT_SESSION_IMPL *session, uint32_t n, uint64_t snap_max) +{ + WT_TXN *txn; + + txn = &session->txn; + + if (n > 1) + qsort(txn->snapshot, n, sizeof(uint64_t), __wt_txnid_cmp); + txn->snapshot_count = n; + txn->snap_max = snap_max; + txn->snap_min = (n > 0 && TXNID_LE(txn->snapshot[0], snap_max)) ? + txn->snapshot[0] : snap_max; + F_SET(txn, TXN_HAS_SNAPSHOT); + WT_ASSERT(session, n == 0 || txn->snap_min != WT_TXN_NONE); +} + +/* + * __wt_txn_release_snapshot -- + * Release the snapshot in the current transaction. + */ +void +__wt_txn_release_snapshot(WT_SESSION_IMPL *session) +{ + WT_TXN *txn; + WT_TXN_STATE *txn_state; + + txn = &session->txn; + txn_state = &S2C(session)->txn_global.states[session->id]; + + if (txn_state->snap_min != WT_TXN_NONE) { + WT_ASSERT(session, + session->txn.isolation == TXN_ISO_READ_UNCOMMITTED || + !__wt_txn_visible_all(session, txn_state->snap_min)); + txn_state->snap_min = WT_TXN_NONE; + } + F_CLR(txn, TXN_HAS_SNAPSHOT); +} + +/* + * __wt_txn_update_oldest -- + * Sweep the running transactions to update the oldest ID required. + */ +void +__wt_txn_update_oldest(WT_SESSION_IMPL *session) +{ + /* + * !!! + * If a data-source is calling the WT_EXTENSION_API.transaction_oldest + * method (for the oldest transaction ID not yet visible to a running + * transaction), and then comparing that oldest ID against committed + * transactions to see if updates for a committed transaction are still + * visible to running transactions, the oldest transaction ID may be + * the same as the last committed transaction ID, if the transaction + * state wasn't refreshed after the last transaction committed. 
Push
	 * past the last committed transaction.
	 */
	__wt_txn_refresh(session, 0);
}

/*
 * __wt_txn_refresh --
 *	Allocate a transaction ID and/or a snapshot.  When get_snapshot is
 *	nonzero, also build and publish this session's snapshot of concurrent
 *	transaction IDs; either way, opportunistically advance the global
 *	oldest ID.  Runs lock-free against the global transaction table.
 */
void
__wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
{
	WT_CONNECTION_IMPL *conn;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *s, *txn_state;
	uint64_t current_id, id, oldest_id;
	uint64_t prev_oldest_id, snap_min;
	uint32_t i, n, oldest_session, session_cnt;
	int32_t count;

	conn = S2C(session);
	txn = &session->txn;
	txn_global = &conn->txn_global;
	txn_state = &txn_global->states[session->id];

	current_id = snap_min = txn_global->current;
	prev_oldest_id = txn_global->oldest_id;

	/* For pure read-only workloads, avoid scanning. */
	if (prev_oldest_id == current_id) {
		if (get_snapshot) {
			txn_state->snap_min = current_id;
			__txn_sort_snapshot(session, 0, current_id);
		}
		/* Check that the oldest ID has not moved in the meantime. */
		if (prev_oldest_id == txn_global->oldest_id &&
		    txn_global->scan_count == 0)
			return;
	}

	/*
	 * We're going to scan.  Increment the count of scanners to prevent the
	 * oldest ID from moving forwards.  Spin if the count is negative,
	 * which indicates that some thread is moving the oldest ID forwards
	 * (scan_count == -1 is used below as an exclusive-access marker).
	 */
	do {
		if ((count = txn_global->scan_count) < 0)
			WT_PAUSE();
	} while (count < 0 ||
	    !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));

	/* The oldest ID cannot change until the scan count goes to zero. */
	prev_oldest_id = txn_global->oldest_id;
	current_id = oldest_id = snap_min = txn_global->current;
	oldest_session = 0;

	/* Walk the array of concurrent transactions. */
	WT_ORDERED_READ(session_cnt, conn->session_cnt);
	for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
		/*
		 * Build our snapshot of any concurrent transaction IDs.
		 *
		 * Ignore our own ID: we always read our own updates.
		 *
		 * Also ignore the ID if it is older than the oldest ID we saw.
		 * This can happen if we race with a thread that is allocating
		 * an ID -- the ID will not be used because the thread will
		 * keep spinning until it gets a valid one.
		 */
		if (s != txn_state &&
		    (id = s->id) != WT_TXN_NONE &&
		    TXNID_LE(prev_oldest_id, id)) {
			if (get_snapshot)
				txn->snapshot[n++] = id;
			if (TXNID_LT(id, snap_min))
				snap_min = id;
		}

		/*
		 * Ignore the session's own snap_min: we are about to update
		 * it.
		 */
		if (get_snapshot && s == txn_state)
			continue;

		/*
		 * !!!
		 * Note: Don't ignore snap_min values older than the previous
		 * oldest ID.  Read-uncommitted operations publish snap_min
		 * values without incrementing scan_count to protect the global
		 * table.  See the comment in __wt_txn_cursor_op for
		 * more details.
		 */
		if ((id = s->snap_min) != WT_TXN_NONE &&
		    TXNID_LT(id, oldest_id)) {
			oldest_id = id;
			oldest_session = i;
		}
	}

	if (TXNID_LT(snap_min, oldest_id))
		oldest_id = snap_min;
	/* Our own running transaction also pins the oldest ID. */
	if (txn->id != WT_TXN_NONE && TXNID_LT(txn->id, oldest_id))
		oldest_id = txn->id;

	/*
	 * If we got a new snapshot, update the published snap_min for this
	 * session.
	 */
	if (get_snapshot) {
		WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min));
		WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
		txn_state->snap_min = snap_min;
	}

	/*
	 * Update the last running ID if we have a much newer value or we are
	 * forcing an update.
	 */
	if (!get_snapshot || snap_min > txn_global->last_running + 100)
		txn_global->last_running = snap_min;

	/*
	 * Update the oldest ID if we have a newer ID and we can get exclusive
	 * access.  During normal snapshot refresh, only do this if we have a
	 * much newer value.  Once we get exclusive access, do another pass to
	 * make sure nobody else is using an earlier ID.
	 */
	if (TXNID_LT(prev_oldest_id, oldest_id) &&
	    (!get_snapshot || oldest_id - prev_oldest_id > 100) &&
	    WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) {
		WT_ORDERED_READ(session_cnt, conn->session_cnt);
		for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
			if ((id = s->id) != WT_TXN_NONE &&
			    TXNID_LT(id, oldest_id))
				oldest_id = id;
			if ((id = s->snap_min) != WT_TXN_NONE &&
			    TXNID_LT(id, oldest_id))
				oldest_id = id;
		}
		if (TXNID_LT(txn_global->oldest_id, oldest_id))
			txn_global->oldest_id = oldest_id;
		/* Drop the exclusive-access marker (-1) back to zero. */
		txn_global->scan_count = 0;
	} else {
		if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
		    current_id - oldest_id > 10000 &&
		    txn_global->oldest_session != oldest_session) {
			(void)__wt_verbose(session, WT_VERB_TRANSACTION,
			    "old snapshot %" PRIu64
			    " pinned in session %d [%s]"
			    " with snap_min %" PRIu64 "\n",
			    oldest_id, oldest_session,
			    conn->sessions[oldest_session].lastop,
			    conn->sessions[oldest_session].txn.snap_min);
			txn_global->oldest_session = oldest_session;
		}
		WT_ASSERT(session, txn_global->scan_count > 0);
		(void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
	}

	if (get_snapshot)
		__txn_sort_snapshot(session, n, current_id);
}

/*
 * __wt_txn_begin --
 *	Begin a transaction.
 */
int
__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_TXN *txn;

	txn = &session->txn;

	/* An empty "isolation" setting inherits the session's default. */
	WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
	if (cval.len == 0)
		txn->isolation = session->isolation;
	else
		txn->isolation =
		    WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
		    TXN_ISO_SNAPSHOT :
		    WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
		    TXN_ISO_READ_COMMITTED : TXN_ISO_READ_UNCOMMITTED;

	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 */
	txn->txn_logsync = S2C(session)->txn_logsync;
	WT_RET(__wt_config_gets_def(session, cfg, "sync",
	    FLD_ISSET(txn->txn_logsync, WT_LOG_FLUSH), &cval));
	if (!cval.val)
		txn->txn_logsync = 0;

	F_SET(txn, TXN_RUNNING);
	if (txn->isolation == TXN_ISO_SNAPSHOT) {
		/*
		 * Copy values into positioned cursors before taking the
		 * snapshot so they don't reference updates the snapshot may
		 * not keep pinned.
		 */
		if (session->ncursors > 0)
			WT_RET(__wt_session_copy_values(session));
		__wt_txn_refresh(session, 1);
	}
	return (0);
}

/*
 * __wt_txn_release --
 *	Release the resources associated with the current transaction.
 *	Clears the published transaction ID, frees the log record buffer,
 *	releases the snapshot and resets the transaction flags; callers
 *	(commit and rollback) have already dealt with the operation list.
 */
void
__wt_txn_release(WT_SESSION_IMPL *session)
{
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *txn_state;

	txn = &session->txn;
	/* Commit or rollback must have emptied the operation list. */
	WT_ASSERT(session, txn->mod_count == 0);
	txn->notify = NULL;

	txn_global = &S2C(session)->txn_global;
	txn_state = &txn_global->states[session->id];

	/* Clear the transaction's ID from the global table. */
	if (F_ISSET(txn, TXN_HAS_ID)) {
		WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
		    txn->id != WT_TXN_NONE);
		/* Publish with a barrier so other threads see the clear. */
		WT_PUBLISH(txn_state->id, WT_TXN_NONE);
		txn->id = WT_TXN_NONE;
	}

	/* Free the scratch buffer allocated for logging. */
	__wt_logrec_free(session, &txn->logrec);

	/* Discard any memory from the session's split stash that we can. */
	if (session->split_stash_cnt > 0)
		__wt_split_stash_discard(session);

	/*
	 * Reset the transaction state to not running and release the snapshot.
	 */
	__wt_txn_release_snapshot(session);
	txn->isolation = session->isolation;
	F_CLR(txn, TXN_ERROR | TXN_HAS_ID | TXN_RUNNING);
}

/*
 * __wt_txn_commit --
 *	Commit the current transaction.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_OP *op;
	u_int i;

	txn = &session->txn;
	WT_ASSERT(session, !F_ISSET(txn, TXN_ERROR));

	if (!F_ISSET(txn, TXN_RUNNING))
		WT_RET_MSG(session, EINVAL, "No transaction is active");

	/* Commit notification.
 */
	if (txn->notify != NULL)
		WT_TRET(txn->notify->notify(txn->notify,
		    (WT_SESSION *)session, txn->id, 1));

	/* If we are logging, write a commit log record. */
	if (ret == 0 &&
	    txn->mod_count > 0 && S2C(session)->logging &&
	    !F_ISSET(session, WT_SESSION_NO_LOGGING))
		ret = __wt_txn_log_commit(session, cfg);

	/*
	 * If anything went wrong, roll back.
	 *
	 * !!!
	 * Nothing can fail after this point.
	 */
	if (ret != 0) {
		WT_TRET(__wt_txn_rollback(session, cfg));
		return (ret);
	}

	/* Free memory associated with updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
		__wt_txn_op_free(session, op);
	txn->mod_count = 0;

	/*
	 * We are about to release the snapshot: copy values into any
	 * positioned cursors so they don't point to updates that could be
	 * freed once we don't have a transaction ID pinned.
	 */
	if (session->ncursors > 0)
		WT_RET(__wt_session_copy_values(session));

	__wt_txn_release(session);
	return (0);
}

/*
 * __wt_txn_rollback --
 *	Roll back the current transaction.
 */
int
__wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_OP *op;
	u_int i;

	WT_UNUSED(cfg);

	txn = &session->txn;
	if (!F_ISSET(txn, TXN_RUNNING))
		WT_RET_MSG(session, EINVAL, "No transaction is active");

	/* Rollback notification. */
	if (txn->notify != NULL)
		WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session,
		    txn->id, 0));

	/* Rollback updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
		/* Metadata updates are never rolled back. */
		if (op->fileid == WT_METAFILE_ID)
			continue;

		switch (op->type) {
		case TXN_OP_BASIC:
		case TXN_OP_INMEM:
			/* Mark the update aborted so readers skip it. */
			op->u.upd->txnid = WT_TXN_ABORTED;
			break;
		case TXN_OP_REF:
			__wt_delete_page_rollback(session, op->u.ref);
			break;
		case TXN_OP_TRUNCATE_COL:
		case TXN_OP_TRUNCATE_ROW:
			/*
			 * Nothing to do: these operations are only logged for
			 * recovery.
The in-memory changes will be rolled back + * with a combination of TXN_OP_REF and TXN_OP_INMEM + * operations. + */ + break; + } + + /* Free any memory allocated for the operation. */ + __wt_txn_op_free(session, op); + } + txn->mod_count = 0; + + __wt_txn_release(session); + return (ret); +} + +/* + * __wt_txn_init -- + * Initialize a session's transaction data. + */ +int +__wt_txn_init(WT_SESSION_IMPL *session) +{ + WT_TXN *txn; + + txn = &session->txn; + txn->id = WT_TXN_NONE; + + WT_RET(__wt_calloc_def(session, + S2C(session)->session_size, &txn->snapshot)); + +#ifdef HAVE_DIAGNOSTIC + if (S2C(session)->txn_global.states != NULL) { + WT_TXN_STATE *txn_state; + txn_state = &S2C(session)->txn_global.states[session->id]; + WT_ASSERT(session, txn_state->snap_min == WT_TXN_NONE); + } +#endif + + /* + * Take care to clean these out in case we are reusing the transaction + * for eviction. + */ + txn->mod = NULL; + + txn->isolation = session->isolation; + return (0); +} + +/* + * __wt_txn_stats_update -- + * Update the transaction statistics for return to the application. + */ +void +__wt_txn_stats_update(WT_SESSION_IMPL *session) +{ + WT_TXN_GLOBAL *txn_global; + WT_CONNECTION_IMPL *conn; + WT_CONNECTION_STATS *stats; + + conn = S2C(session); + txn_global = &conn->txn_global; + stats = &conn->stats; + + WT_STAT_SET(stats, txn_pinned_range, + txn_global->current - txn_global->oldest_id); +} + +/* + * __wt_txn_destroy -- + * Destroy a session's transaction data. + */ +void +__wt_txn_destroy(WT_SESSION_IMPL *session) +{ + WT_TXN *txn; + + txn = &session->txn; + __wt_free(session, txn->mod); + __wt_free(session, txn->snapshot); +} + +/* + * __wt_txn_global_init -- + * Initialize the global transaction state. 
+ */ +int +__wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONNECTION_IMPL *conn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *s; + u_int i; + + WT_UNUSED(cfg); + conn = S2C(session); + + txn_global = &conn->txn_global; + txn_global->current = 1; + txn_global->oldest_id = 1; + txn_global->last_running = 1; + + WT_RET(__wt_calloc_def( + session, conn->session_size, &txn_global->states)); + for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++) + s->id = s->snap_min = WT_TXN_NONE; + + return (0); +} + +/* + * __wt_txn_global_destroy -- + * Destroy the global transaction state. + */ +void +__wt_txn_global_destroy(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_TXN_GLOBAL *txn_global; + + conn = S2C(session); + txn_global = &conn->txn_global; + + if (txn_global != NULL) + __wt_free(session, txn_global->states); +} diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c new file mode 100644 index 00000000000..555eec649c6 --- /dev/null +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -0,0 +1,944 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_checkpoint_name_ok -- + * Complain if the checkpoint name isn't acceptable. + */ +int +__wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len) +{ + /* Check for characters we don't want to see in a metadata file. */ + WT_RET(__wt_name_check(session, name, len)); + + /* + * The internal checkpoint name is special, applications aren't allowed + * to use it. Be aggressive and disallow any matching prefix, it makes + * things easier when checking in other places. 
 */
	if (len < strlen(WT_CHECKPOINT))
		return (0);
	if (!WT_PREFIX_MATCH(name, WT_CHECKPOINT))
		return (0);

	WT_RET_MSG(session, EINVAL,
	    "the checkpoint name \"%s\" is reserved", WT_CHECKPOINT);
}

/*
 * __checkpoint_name_check --
 *	Check for an attempt to name a checkpoint that includes anything
 *	other than a file object.
 */
static int
__checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;
	const char *fail;

	cursor = NULL;
	fail = NULL;

	/*
	 * This function exists as a place for this comment: named checkpoints
	 * are only supported on file objects, and not on LSM trees or Helium
	 * devices.  If a target list is configured for the checkpoint, this
	 * function is called with each target list entry; check the entry to
	 * make sure it's backed by a file.  If no target list is configured,
	 * confirm the metadata file contains no non-file objects.
	 */
	if (uri == NULL) {
		WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
		while ((ret = cursor->next(cursor)) == 0) {
			WT_ERR(cursor->get_key(cursor, &uri));
			if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
			    !WT_PREFIX_MATCH(uri, "file:") &&
			    !WT_PREFIX_MATCH(uri, "index:") &&
			    !WT_PREFIX_MATCH(uri, "table:")) {
				fail = uri;
				break;
			}
		}
		WT_ERR_NOTFOUND_OK(ret);
	} else
		if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
		    !WT_PREFIX_MATCH(uri, "file:") &&
		    !WT_PREFIX_MATCH(uri, "index:") &&
		    !WT_PREFIX_MATCH(uri, "table:"))
			fail = uri;

	if (fail != NULL)
		WT_ERR_MSG(session, EINVAL,
		    "%s object does not support named checkpoints", fail);

err:	if (cursor != NULL)
		WT_TRET(cursor->close(cursor));
	return (ret);
}

/*
 * __checkpoint_apply --
 *	Apply an operation to all files involved in a checkpoint.
 *	With op == NULL this only parses/validates the configuration; fullp,
 *	if non-NULL, is set to nonzero when no target list was given (a
 *	full-database checkpoint).
 */
static int
__checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[],
	int (*op)(WT_SESSION_IMPL *, const char *[]), int *fullp)
{
	WT_CONFIG targetconf;
	WT_CONFIG_ITEM cval, k, v;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	int ckpt_closed, named, target_list;

	target_list = 0;

	/* Flag if this is a named checkpoint, and check if the name is OK. */
	WT_RET(__wt_config_gets(session, cfg, "name", &cval));
	named = cval.len != 0;
	if (named)
		WT_RET(__wt_checkpoint_name_ok(session, cval.str, cval.len));

	/* Step through the targets and optionally operate on each one. */
	WT_ERR(__wt_config_gets(session, cfg, "target", &cval));
	WT_ERR(__wt_config_subinit(session, &targetconf, &cval));
	while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) {
		if (!target_list) {
			WT_ERR(__wt_scr_alloc(session, 512, &tmp));
			target_list = 1;
		}

		/* A target entry with a value means the URI wasn't quoted. */
		if (v.len != 0)
			WT_ERR_MSG(session, EINVAL,
			    "invalid checkpoint target %.*s: URIs may require "
			    "quoting",
			    (int)cval.len, (char *)cval.str);

		/* Some objects don't support named checkpoints. */
		if (named)
			WT_ERR(__checkpoint_name_check(session, k.str));

		if (op == NULL)
			continue;
		WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
		if ((ret = __wt_schema_worker(
		    session, tmp->data, op, NULL, cfg, 0)) != 0)
			WT_ERR_MSG(session, ret, "%s", (const char *)tmp->data);
	}
	WT_ERR_NOTFOUND_OK(ret);

	if (!target_list && named)
		/* Some objects don't support named checkpoints. */
		WT_ERR(__checkpoint_name_check(session, NULL));

	if (!target_list && op != NULL) {
		/*
		 * If the checkpoint is named or we're dropping checkpoints, we
		 * checkpoint both open and closed files; else, only checkpoint
		 * open files.
		 *
		 * XXX
		 * We don't optimize unnamed checkpoints of a list of targets,
		 * we open the targets and checkpoint them even if they are
		 * quiescent and don't need a checkpoint, believing applications
		 * unlikely to checkpoint a list of closed targets.
		 */
		ckpt_closed = named;
		if (!ckpt_closed) {
			WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
			ckpt_closed = cval.len != 0;
		}
		WT_ERR(ckpt_closed ?
		    __wt_meta_btree_apply(session, op, cfg) :
		    __wt_conn_btree_apply(session, 0, op, cfg));
	}

	if (fullp != NULL)
		*fullp = !target_list;

err:	__wt_scr_free(&tmp);
	return (ret);
}

/*
 * __checkpoint_data_source --
 *	Checkpoint all data sources.
 */
static int
__checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_NAMED_DATA_SOURCE *ndsrc;
	WT_DATA_SOURCE *dsrc;

	/*
	 * A place-holder, to support Helium devices: we assume calling the
	 * underlying data-source session checkpoint function is sufficient to
	 * checkpoint all objects in the data source, open or closed, and we
	 * don't attempt to optimize the checkpoint of individual targets.
	 * Those assumptions is correct for the Helium device, but it's not
	 * necessarily going to be true for other data sources.
	 *
	 * It's not difficult to support data-source checkpoints of individual
	 * targets (__wt_schema_worker is the underlying function that will do
	 * the work, and it's already written to support data-sources, although
	 * we'd probably need to pass the URI of the object to the data source
	 * checkpoint function which we don't currently do).  However, doing a
	 * full data checkpoint is trickier: currently, the connection code is
	 * written to ignore all objects other than "file:", and that code will
	 * require significant changes to work with data sources.
	 */
	TAILQ_FOREACH(ndsrc, &S2C(session)->dsrcqh, q) {
		dsrc = ndsrc->dsrc;
		if (dsrc->checkpoint != NULL)
			WT_RET(dsrc->checkpoint(dsrc,
			    (WT_SESSION *)session, (WT_CONFIG_ARG *)cfg));
	}
	return (0);
}

/*
 * __wt_checkpoint_list --
 *	Get a list of handles to flush.
 */
int
__wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_DATA_HANDLE *saved_dhandle;
	WT_DECL_RET;
	const char *name;

	WT_UNUSED(cfg);

	/* Should not be called with anything other than a file object. */
	WT_ASSERT(session, session->dhandle->checkpoint == NULL);
	WT_ASSERT(session,
	    memcmp(session->dhandle->name, "file:", strlen("file:")) == 0);

	/* Make sure there is space for the next entry. */
	WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated,
	    session->ckpt_handle_next + 1, &session->ckpt_handle));

	/* Not strictly necessary, but cleaner to clear the current handle. */
	name = session->dhandle->name;
	saved_dhandle = session->dhandle;
	session->dhandle = NULL;

	/* Ignore busy files, we'll deal with them in the checkpoint. */
	switch (ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) {
	case 0:
		/* get_btree set session->dhandle; remember it in the list. */
		session->ckpt_handle[
		    session->ckpt_handle_next++] = session->dhandle;
		break;
	case EBUSY:
		ret = 0;
		break;
	default:
		break;
	}

	session->dhandle = saved_dhandle;
	return (ret);
}

/*
 * __checkpoint_write_leaves --
 *	Write any dirty leaf pages for all checkpoint handles.
 */
static int
__checkpoint_write_leaves(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	u_int i;

	i = 0;

	/* Should not be called with any handle reference. */
	WT_ASSERT(session, session->dhandle == NULL);

	/*
	 * Get a list of handles we want to flush; this may pull closed objects
	 * into the session cache, but we're going to do that eventually anyway.
	 */
	WT_WITH_SCHEMA_LOCK(session,
	    ret = __checkpoint_apply(session, cfg, __wt_checkpoint_list, NULL));
	WT_ERR(ret);

	/*
	 * Walk the list, flushing the leaf pages from each file, then releasing
	 * the file.  Note that we increment inside the loop to simplify error
	 * handling.
	 */
	while (i < session->ckpt_handle_next) {
		dhandle = session->ckpt_handle[i++];
		WT_WITH_DHANDLE(session, dhandle,
		    ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES));
		WT_WITH_DHANDLE(session, dhandle,
		    WT_TRET(__wt_session_release_btree(session)));
		WT_ERR(ret);
	}

	/* On error, release any handles not yet released by the main loop. */
err:	while (i < session->ckpt_handle_next) {
		dhandle = session->ckpt_handle[i++];
		WT_WITH_DHANDLE(session, dhandle,
		    WT_TRET(__wt_session_release_btree(session)));
	}
	__wt_free(session, session->ckpt_handle);
	session->ckpt_handle_allocated = session->ckpt_handle_next = 0;
	return (ret);
}

/*
 * __wt_txn_checkpoint --
 *	Checkpoint a database or a list of objects in the database.
 */
int
__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_ISOLATION saved_isolation;
	int full, logging, tracking;
	const char *txn_cfg[] =
	    { WT_CONFIG_BASE(session, session_begin_transaction),
	      "isolation=snapshot", NULL };
	void *saved_meta_next;

	conn = S2C(session);
	saved_isolation = session->isolation;
	txn = &session->txn;
	full = logging = tracking = 0;

	/*
	 * Do a pass over the configuration arguments and figure out what kind
	 * kind of checkpoint this is.
	 */
	WT_RET(__checkpoint_apply(session, cfg, NULL, &full));

	/*
	 * Update the global oldest ID so we do all possible cleanup.
	 *
	 * This is particularly important for compact, so that all dirty pages
	 * can be fully written.
	 */
	__wt_txn_update_oldest(session);

	/* Flush data-sources before we start the checkpoint. */
	WT_ERR(__checkpoint_data_source(session, cfg));

	/* Flush dirty leaf pages before we start the checkpoint. */
	session->isolation = txn->isolation = TXN_ISO_READ_COMMITTED;
	WT_ERR(__checkpoint_write_leaves(session, cfg));

	/* Acquire the schema lock.
	 */
	F_SET(session, WT_SESSION_SCHEMA_LOCKED);
	__wt_spin_lock(session, &conn->schema_lock);

	WT_ERR(__wt_meta_track_on(session));
	tracking = 1;

	/* Tell logging that we are about to start a database checkpoint. */
	if (conn->logging && full)
		WT_ERR(__wt_txn_checkpoint_log(
		    session, full, WT_TXN_LOG_CKPT_PREPARE, NULL));

	/*
	 * Start a snapshot transaction for the checkpoint.
	 *
	 * Note: we don't go through the public API calls because they have
	 * side effects on cursors, which applications can hold open across
	 * calls to checkpoint.
	 */
	WT_ERR(__wt_txn_begin(session, txn_cfg));

	/* Tell logging that we have started a database checkpoint. */
	if (conn->logging && full) {
		WT_ERR(__wt_txn_checkpoint_log(
		    session, full, WT_TXN_LOG_CKPT_START, NULL));
		logging = 1;
	}

	WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint, NULL));

	/* Commit the transaction before syncing the file(s). */
	WT_ERR(__wt_txn_commit(session, NULL));

	/*
	 * Checkpoints have to hit disk (it would be reasonable to configure for
	 * lazy checkpoints, but we don't support them yet).
	 */
	if (F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_ERR(__checkpoint_apply(
		    session, cfg, __wt_checkpoint_sync, NULL));

	/* Checkpoint the metadata file. */
	SLIST_FOREACH(dhandle, &conn->dhlh, l) {
		if (WT_IS_METADATA(dhandle) ||
		    !WT_PREFIX_MATCH(dhandle->name, "file:"))
			break;
	}
	if (dhandle == NULL)
		WT_ERR_MSG(session, EINVAL,
		    "checkpoint unable to find open meta-data handle");

	/*
	 * Disable metadata tracking during the metadata checkpoint.
	 *
	 * We don't lock old checkpoints in the metadata file: there is no way
	 * to open one.  We are holding other handle locks, it is not safe to
	 * lock conn->spinlock.
	 */
	session->isolation = txn->isolation = TXN_ISO_READ_UNCOMMITTED;
	saved_meta_next = session->meta_track_next;
	session->meta_track_next = NULL;
	WT_WITH_DHANDLE(session, dhandle, ret = __wt_checkpoint(session, cfg));
	session->meta_track_next = saved_meta_next;

err:	/*
	 * XXX
	 * Rolling back the changes here is problematic.
	 *
	 * If we unroll here, we need a way to roll back changes to the avail
	 * list for each tree that was successfully synced before the error
	 * occurred.  Otherwise, the next time we try this operation, we will
	 * try to free an old checkpoint again.
	 *
	 * OTOH, if we commit the changes after a failure, we have partially
	 * overwritten the checkpoint, so what ends up on disk is not
	 * consistent.
	 */
	session->isolation = txn->isolation = TXN_ISO_READ_UNCOMMITTED;
	if (tracking)
		WT_TRET(__wt_meta_track_off(session, ret != 0));

	if (F_ISSET(txn, TXN_RUNNING))
		WT_TRET(__wt_txn_rollback(session, NULL));

	/* Tell logging that we have finished a database checkpoint. */
	if (logging)
		WT_TRET(__wt_txn_checkpoint_log(session, full,
		    (ret == 0) ? WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_FAIL,
		    NULL));

	if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) {
		F_CLR(session, WT_SESSION_SCHEMA_LOCKED);
		__wt_spin_unlock(session, &conn->schema_lock);
	}

	session->isolation = txn->isolation = saved_isolation;

	return (ret);
}

/*
 * __drop --
 *	Drop all checkpoints with a specific name.
 */
static void
__drop(WT_CKPT *ckptbase, const char *name, size_t len)
{
	WT_CKPT *ckpt;

	/*
	 * If we're dropping internal checkpoints, match to the '.' separating
	 * the checkpoint name from the generational number, and take all that
	 * we can find.  Applications aren't allowed to use any variant of this
	 * name, so the test is still pretty simple, if the leading bytes match,
	 * it's one we want to drop.
+ */ + if (strncmp(WT_CHECKPOINT, name, len) == 0) { + WT_CKPT_FOREACH(ckptbase, ckpt) + if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) + F_SET(ckpt, WT_CKPT_DELETE); + } else + WT_CKPT_FOREACH(ckptbase, ckpt) + if (WT_STRING_MATCH(ckpt->name, name, len)) + F_SET(ckpt, WT_CKPT_DELETE); +} + +/* + * __drop_from -- + * Drop all checkpoints after, and including, the named checkpoint. + */ +static void +__drop_from(WT_CKPT *ckptbase, const char *name, size_t len) +{ + WT_CKPT *ckpt; + int matched; + + /* + * There's a special case -- if the name is "all", then we delete all + * of the checkpoints. + */ + if (WT_STRING_MATCH("all", name, len)) { + WT_CKPT_FOREACH(ckptbase, ckpt) + F_SET(ckpt, WT_CKPT_DELETE); + return; + } + + /* + * We use the first checkpoint we can find, that is, if there are two + * checkpoints with the same name in the list, we'll delete from the + * first match to the end. + */ + matched = 0; + WT_CKPT_FOREACH(ckptbase, ckpt) { + if (!matched && !WT_STRING_MATCH(ckpt->name, name, len)) + continue; + + matched = 1; + F_SET(ckpt, WT_CKPT_DELETE); + } +} + +/* + * __drop_to -- + * Drop all checkpoints before, and including, the named checkpoint. + */ +static void +__drop_to(WT_CKPT *ckptbase, const char *name, size_t len) +{ + WT_CKPT *ckpt, *mark; + + /* + * We use the last checkpoint we can find, that is, if there are two + * checkpoints with the same name in the list, we'll delete from the + * beginning to the second match, not the first. + */ + mark = NULL; + WT_CKPT_FOREACH(ckptbase, ckpt) + if (WT_STRING_MATCH(ckpt->name, name, len)) + mark = ckpt; + + if (mark == NULL) + return; + + WT_CKPT_FOREACH(ckptbase, ckpt) { + F_SET(ckpt, WT_CKPT_DELETE); + + if (ckpt == mark) + break; + } +} + +/* + * __checkpoint_worker -- + * Checkpoint a tree. 
+ */ +static int +__checkpoint_worker( + WT_SESSION_IMPL *session, const char *cfg[], int is_checkpoint) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_CKPT *ckpt, *ckptbase; + WT_CONFIG dropconf; + WT_CONFIG_ITEM cval, k, v; + WT_CONNECTION_IMPL *conn; + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; + WT_LSN ckptlsn; + const char *name; + int deleted, force, hot_backup_locked, track_ckpt, was_modified; + char *name_alloc; + + btree = S2BT(session); + bm = btree->bm; + conn = S2C(session); + ckpt = ckptbase = NULL; + INIT_LSN(&ckptlsn); + dhandle = session->dhandle; + name_alloc = NULL; + hot_backup_locked = 0; + name_alloc = NULL; + track_ckpt = 1; + was_modified = btree->modified; + + /* Get the list of checkpoints for this file. */ + WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, &ckptbase)); + + /* This may be a named checkpoint, check the configuration. */ + cval.len = 0; + if (cfg != NULL) + WT_ERR(__wt_config_gets(session, cfg, "name", &cval)); + if (cval.len == 0) + name = WT_CHECKPOINT; + else { + WT_ERR(__wt_checkpoint_name_ok(session, cval.str, cval.len)); + WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc)); + name = name_alloc; + } + + /* We may be dropping specific checkpoints, check the configuration. */ + if (cfg != NULL) { + cval.len = 0; + WT_ERR(__wt_config_gets(session, cfg, "drop", &cval)); + if (cval.len != 0) { + WT_ERR(__wt_config_subinit(session, &dropconf, &cval)); + while ((ret = + __wt_config_next(&dropconf, &k, &v)) == 0) { + /* Disallow unsafe checkpoint names. 
*/ + if (v.len == 0) + WT_ERR(__wt_checkpoint_name_ok( + session, k.str, k.len)); + else + WT_ERR(__wt_checkpoint_name_ok( + session, v.str, v.len)); + + if (v.len == 0) + __drop(ckptbase, k.str, k.len); + else if (WT_STRING_MATCH("from", k.str, k.len)) + __drop_from(ckptbase, v.str, v.len); + else if (WT_STRING_MATCH("to", k.str, k.len)) + __drop_to(ckptbase, v.str, v.len); + else + WT_ERR_MSG(session, EINVAL, + "unexpected value for checkpoint " + "key: %.*s", + (int)k.len, k.str); + } + WT_ERR_NOTFOUND_OK(ret); + } + } + + /* Drop checkpoints with the same name as the one we're taking. */ + __drop(ckptbase, name, strlen(name)); + + /* + * Check for clean objects not requiring a checkpoint. + * + * If we're closing a handle, and the object is clean, we can skip the + * checkpoint, whatever checkpoints we have are sufficient. (We might + * not have any checkpoints if the object was never modified, and that's + * OK: the object creation code doesn't mark the tree modified so we can + * skip newly created trees here.) + * + * If the application repeatedly checkpoints an object (imagine hourly + * checkpoints using the same explicit or internal name), there's no + * reason to repeat the checkpoint for clean objects. The test is if + * the only checkpoint we're deleting is the last one in the list and + * it has the same name as the checkpoint we're about to take, skip the + * work. (We can't skip checkpoints that delete more than the last + * checkpoint because deleting those checkpoints might free up space in + * the file.) This means an application toggling between two (or more) + * checkpoint names will repeatedly take empty checkpoints, but that's + * not likely enough to make detection worthwhile. + * + * Checkpoint read-only objects otherwise: the application must be able + * to open the checkpoint in a cursor after taking any checkpoint, which + * means it must exist. 
+ */ + force = 0; + if (!btree->modified && cfg != NULL) { + ret = __wt_config_gets(session, cfg, "force", &cval); + if (ret != 0 && ret != WT_NOTFOUND) + WT_ERR(ret); + if (ret == 0 && cval.val != 0) + force = 1; + } + if (!btree->modified && !force) { + if (!is_checkpoint) + goto done; + + deleted = 0; + WT_CKPT_FOREACH(ckptbase, ckpt) + if (F_ISSET(ckpt, WT_CKPT_DELETE)) + ++deleted; + /* + * Complicated test: if we only deleted a single checkpoint, and + * it was the last checkpoint in the object, and it has the same + * name as the checkpoint we're taking (correcting for internal + * checkpoint names with their generational suffix numbers), we + * can skip the checkpoint, there's nothing to do. + */ + if (deleted == 1 && + F_ISSET(ckpt - 1, WT_CKPT_DELETE) && + (strcmp(name, (ckpt - 1)->name) == 0 || + (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && + WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT)))) + goto done; + } + + /* Add a new checkpoint entry at the end of the list. */ + WT_CKPT_FOREACH(ckptbase, ckpt) + ; + WT_ERR(__wt_strdup(session, name, &ckpt->name)); + F_SET(ckpt, WT_CKPT_ADD); + + /* + * We can't delete checkpoints if a backup cursor is open. WiredTiger + * checkpoints are uniquely named and it's OK to have multiple of them + * in the system: clear the delete flag for them, and otherwise fail. + * Hold the lock until we're done (blocking hot backups from starting), + * we don't want to race with a future hot backup. + */ + __wt_spin_lock(session, &conn->hot_backup_lock); + hot_backup_locked = 1; + if (conn->hot_backup) + WT_CKPT_FOREACH(ckptbase, ckpt) { + if (!F_ISSET(ckpt, WT_CKPT_DELETE)) + continue; + if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { + F_CLR(ckpt, WT_CKPT_DELETE); + continue; + } + WT_ERR_MSG(session, EBUSY, + "checkpoint %s blocked by hot backup: it would " + "delete an existing checkpoint, and checkpoints " + "cannot be deleted during a hot backup", + ckpt->name); + } + + /* + * Lock the checkpoints that will be deleted. 
+ * + * Checkpoints are only locked when tracking is enabled, which covers + * checkpoint and drop operations, but not close. The reasoning is + * there should be no access to a checkpoint during close, because any + * thread accessing a checkpoint will also have the current file handle + * open. + */ + if (WT_META_TRACKING(session)) + WT_CKPT_FOREACH(ckptbase, ckpt) { + if (!F_ISSET(ckpt, WT_CKPT_DELETE)) + continue; + + /* + * We can't delete checkpoints referenced by a cursor. + * WiredTiger checkpoints are uniquely named and it's + * OK to have multiple in the system: clear the delete + * flag for them, and otherwise fail. + */ + ret = __wt_session_lock_checkpoint(session, ckpt->name); + if (ret == 0) + continue; + if (ret == EBUSY && + WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { + F_CLR(ckpt, WT_CKPT_DELETE); + continue; + } + WT_ERR_MSG(session, ret, + "checkpoints cannot be dropped when in-use"); + } + + /* + * There are special files: those being bulk-loaded, salvaged, upgraded + * or verified during the checkpoint. We have to do something for those + * objects because a checkpoint is an external name the application can + * reference and the name must exist no matter what's happening during + * the checkpoint. For bulk-loaded files, we could block until the load + * completes, checkpoint the partial load, or magic up an empty-file + * checkpoint. The first is too slow, the second is insane, so do the + * third. + * Salvage, upgrade and verify don't currently require any work, all + * three hold the schema lock, blocking checkpoints. If we ever want to + * fix that (and I bet we eventually will, at least for verify), we can + * copy the last checkpoint the file has. That works if we guarantee + * salvage, upgrade and verify act on objects with previous checkpoints + * (true if handles are closed/re-opened between object creation and a + * subsequent salvage, upgrade or verify operation). 
Presumably, + * salvage and upgrade will discard all previous checkpoints when they + * complete, which is fine with us. This change will require reference + * counting checkpoints, and once that's done, we should use checkpoint + * copy instead of forcing checkpoints on clean objects to associate + * names with checkpoints. + */ + if (is_checkpoint) + switch (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) { + case 0: + break; + case WT_BTREE_BULK: + /* + * The only checkpoints a bulk-loaded file should have + * are fake ones we created without the underlying block + * manager. I'm leaving this code here because it's a + * cheap test and a nasty race. + */ + WT_CKPT_FOREACH(ckptbase, ckpt) + if (!F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_FAKE)) + WT_ERR_MSG(session, ret, + "block-manager checkpoint found " + "for a bulk-loaded file"); + track_ckpt = 0; + goto fake; + case WT_BTREE_SALVAGE: + case WT_BTREE_UPGRADE: + case WT_BTREE_VERIFY: + WT_ERR_MSG(session, EINVAL, + "checkpoints are blocked during salvage, upgrade " + "or verify operations"); + } + + /* + * If an object has never been used (in other words, if it could become + * a bulk-loaded file), then we must fake the checkpoint. This is good + * because we don't write physical checkpoint blocks for just-created + * files, but it's not just a good idea. The reason is because deleting + * a physical checkpoint requires writing the file, and fake checkpoints + * can't write the file. If you (1) create a physical checkpoint for an + * empty file which writes blocks, (2) start bulk-loading records into + * the file, (3) during the bulk-load perform another checkpoint with + * the same name; in order to keep from having two checkpoints with the + * same name you would have to use the bulk-load's fake checkpoint to + * delete a physical checkpoint, and that will end in tears. 
+ */ + if (is_checkpoint) + if (btree->bulk_load_ok) { + track_ckpt = 0; + goto fake; + } + + /* + * Mark the root page dirty to ensure something gets written. (If the + * tree is modified, we must write the root page anyway, this doesn't + * add additional writes to the process. If the tree is not modified, + * we have to dirty the root page to ensure something gets written.) + * This is really about paranoia: if the tree modification value gets + * out of sync with the set of dirty pages (modify is set, but there + * are no dirty pages), we perform a checkpoint without any writes, no + * checkpoint is created, and then things get bad. + */ + WT_ERR(__wt_page_modify_init(session, btree->root.page)); + __wt_page_modify_set(session, btree->root.page); + + /* + * Clear the tree's modified flag; any changes before we clear the flag + * are guaranteed to be part of this checkpoint (unless reconciliation + * skips updates for transactional reasons), and changes subsequent to + * the checkpoint start, which might not be included, will re-set the + * modified flag. The "unless reconciliation skips updates" problem is + * handled in the reconciliation code: if reconciliation skips updates, + * it sets the modified flag itself. Use a full barrier so we get the + * store done quickly, this isn't a performance path. + */ + btree->modified = 0; + WT_FULL_BARRIER(); + + /* Tell logging that a file checkpoint is starting. */ + if (conn->logging) + WT_ERR(__wt_txn_checkpoint_log( + session, 0, WT_TXN_LOG_CKPT_START, &ckptlsn)); + + /* Flush the file from the cache, creating the checkpoint. */ + if (is_checkpoint) + WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CHECKPOINT)); + else + WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CLOSE)); + + /* + * All blocks being written have been written; set the object's write + * generation. 
+ */ + WT_CKPT_FOREACH(ckptbase, ckpt) + if (F_ISSET(ckpt, WT_CKPT_ADD)) + ckpt->write_gen = btree->write_gen; + +fake: /* Update the object's metadata. */ + WT_ERR(__wt_meta_ckptlist_set( + session, dhandle->name, ckptbase, &ckptlsn)); + + /* + * If we wrote a checkpoint (rather than faking one), pages may be + * available for re-use. If tracking enabled, defer making pages + * available until transaction end. The exception is if the handle + * is being discarded, in which case the handle will be gone by the + * time we try to apply or unroll the meta tracking event. + */ + if (track_ckpt) { + if (WT_META_TRACKING(session) && is_checkpoint) + WT_ERR(__wt_meta_track_checkpoint(session)); + else + WT_ERR(bm->checkpoint_resolve(bm, session)); + } + + /* Tell logging that the checkpoint is complete. */ + if (conn->logging) + WT_ERR(__wt_txn_checkpoint_log( + session, 0, WT_TXN_LOG_CKPT_STOP, NULL)); + +done: err: + /* + * If the checkpoint didn't complete successfully, make sure the + * tree is marked dirty. + */ + if (ret != 0 && !btree->modified && was_modified) + btree->modified = 1; + + if (hot_backup_locked) + __wt_spin_unlock(session, &conn->hot_backup_lock); + + __wt_meta_ckptlist_free(session, ckptbase); + __wt_free(session, name_alloc); + + return (ret); +} + +/* + * __wt_checkpoint -- + * Checkpoint a file. + */ +int +__wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) +{ + /* Should not be called with a checkpoint handle. */ + WT_ASSERT(session, session->dhandle->checkpoint == NULL); + + /* Should be holding the schema lock. */ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)); + + return (__checkpoint_worker(session, cfg, 1)); +} + +/* + * __wt_checkpoint_sync -- + * Sync a file that has been checkpointed, and wait for the result. + */ +int +__wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_BM *bm; + + WT_UNUSED(cfg); + + bm = S2BT(session)->bm; + + /* Should not be called with a checkpoint handle. 
*/ + WT_ASSERT(session, session->dhandle->checkpoint == NULL); + + /* Should have an underlying block manager reference. */ + WT_ASSERT(session, bm != NULL); + + return (bm->sync(bm, session, 0)); +} + +/* + * __wt_checkpoint_close -- + * Checkpoint a single file as part of closing the handle. + */ +int +__wt_checkpoint_close(WT_SESSION_IMPL *session, int force) +{ + /* If closing an unmodified file, simply discard its blocks. */ + if (!S2BT(session)->modified || force) + return (__wt_cache_op(session, NULL, + force ? WT_SYNC_DISCARD_FORCE : WT_SYNC_DISCARD)); + + /* + * Else, checkpoint the file and optionally flush the writes (the + * checkpoint call will discard the blocks, there's no additional + * step needed). + */ + WT_RET(__checkpoint_worker(session, NULL, 0)); + if (F_ISSET(S2C(session), WT_CONN_CKPT_SYNC)) + WT_RET(__wt_checkpoint_sync(session, NULL)); + + return (0); +} diff --git a/src/third_party/wiredtiger/src/txn/txn_ext.c b/src/third_party/wiredtiger/src/txn/txn_ext.c new file mode 100644 index 00000000000..31d5506be5b --- /dev/null +++ b/src/third_party/wiredtiger/src/txn/txn_ext.c @@ -0,0 +1,104 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_ext_transaction_id -- + * Return the session's transaction ID. + */ +uint64_t +__wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session) +{ + WT_SESSION_IMPL *session; + + (void)wt_api; /* Unused parameters */ + session = (WT_SESSION_IMPL *)wt_session; + /* Ignore failures: the only case is running out of transaction IDs. */ + (void)__wt_txn_id_check(session); + return (session->txn.id); +} + +/* + * __wt_ext_transaction_isolation_level -- + * Return if the current transaction's isolation level. 
+ */ +int +__wt_ext_transaction_isolation_level( + WT_EXTENSION_API *wt_api, WT_SESSION *wt_session) +{ + WT_SESSION_IMPL *session; + WT_TXN *txn; + + (void)wt_api; /* Unused parameters */ + + session = (WT_SESSION_IMPL *)wt_session; + txn = &session->txn; + + if (txn->isolation == TXN_ISO_READ_COMMITTED) + return (WT_TXN_ISO_READ_COMMITTED); + if (txn->isolation == TXN_ISO_READ_UNCOMMITTED) + return (WT_TXN_ISO_READ_UNCOMMITTED); + return (WT_TXN_ISO_SNAPSHOT); +} + +/* + * __wt_ext_transaction_notify -- + * Request notification of transaction resolution. + */ +int +__wt_ext_transaction_notify( + WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_TXN_NOTIFY *notify) +{ + WT_SESSION_IMPL *session; + WT_TXN *txn; + + (void)wt_api; /* Unused parameters */ + + session = (WT_SESSION_IMPL *)wt_session; + txn = &session->txn; + + /* + * XXX + * For now, a single slot for notifications: I'm not bothering with + * more than one because more than one data-source in a transaction + * doesn't work anyway. + */ + if (txn->notify == notify) + return (0); + if (txn->notify != NULL) + return (ENOMEM); + + txn->notify = notify; + + return (0); +} + +/* + * __wt_ext_transaction_oldest -- + * Return the oldest transaction ID not yet visible to a running + * transaction. + */ +uint64_t +__wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api) +{ + return (((WT_CONNECTION_IMPL *)wt_api->conn)->txn_global.oldest_id); +} + +/* + * __wt_ext_transaction_visible -- + * Return if the current transaction can see the given transaction ID. 
+ */ +int +__wt_ext_transaction_visible( + WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint64_t transaction_id) +{ + (void)wt_api; /* Unused parameters */ + + return (__wt_txn_visible( + (WT_SESSION_IMPL *)wt_session, transaction_id)); +} diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c new file mode 100644 index 00000000000..03a71056a9a --- /dev/null +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -0,0 +1,500 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __txn_op_log -- + * Log an operation for the current transaction. + */ +static int +__txn_op_log(WT_SESSION_IMPL *session, + WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt) +{ + WT_DECL_RET; + WT_ITEM key, value; + WT_UPDATE *upd; + uint64_t recno; + + WT_CLEAR(key); + upd = op->u.upd; + value.data = WT_UPDATE_DATA(upd); + value.size = upd->size; + + /* + * Log the operation. It must be one of the following: + * 1) column store remove; + * 2) column store insert/update; + * 3) row store remove; or + * 4) row store insert/update. + */ + if (cbt->btree->type != BTREE_ROW) { + WT_ASSERT(session, cbt->ins != NULL); + recno = WT_INSERT_RECNO(cbt->ins); + WT_ASSERT(session, recno != 0); + + if (WT_UPDATE_DELETED_ISSET(upd)) + WT_ERR(__wt_logop_col_remove_pack(session, logrec, + op->fileid, recno)); + else + WT_ERR(__wt_logop_col_put_pack(session, logrec, + op->fileid, recno, &value)); + } else { + WT_ERR(__wt_cursor_row_leaf_key(cbt, &key)); + + if (WT_UPDATE_DELETED_ISSET(upd)) + WT_ERR(__wt_logop_row_remove_pack(session, logrec, + op->fileid, &key)); + else + WT_ERR(__wt_logop_row_put_pack(session, logrec, + op->fileid, &key, &value)); + } + +err: __wt_buf_free(session, &key); + return (ret); +} + +/* + * __txn_commit_printlog -- + * Print a commit log record. 
+ */ +static int +__txn_commit_printlog( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + /* The logging subsystem zero-pads records. */ + while (*pp < end && **pp) + WT_RET(__wt_txn_op_printlog(session, pp, end, out)); + return (0); +} + +/* + * __wt_txn_op_free -- + * Free memory associated with a transactional operation. + */ +void +__wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op) +{ + switch (op->type) { + case TXN_OP_BASIC: + case TXN_OP_INMEM: + case TXN_OP_REF: + case TXN_OP_TRUNCATE_COL: + break; + + case TXN_OP_TRUNCATE_ROW: + __wt_buf_free(session, &op->u.truncate_row.start); + __wt_buf_free(session, &op->u.truncate_row.stop); + break; + } +} + +/* + * __txn_logrec_init -- + * Allocate and initialize a buffer for a transaction's log records. + */ +static int +__txn_logrec_init(WT_SESSION_IMPL *session) +{ + WT_DECL_ITEM(logrec); + WT_DECL_RET; + WT_TXN *txn; + const char *fmt = WT_UNCHECKED_STRING(Iq); + uint32_t rectype = WT_LOGREC_COMMIT; + size_t header_size; + + txn = &session->txn; + if (txn->logrec != NULL) + return (0); + + WT_ASSERT(session, txn->id != WT_TXN_NONE); + WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, txn->id)); + WT_RET(__wt_logrec_alloc(session, header_size, &logrec)); + + WT_ERR(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, header_size, + fmt, rectype, txn->id)); + logrec->size += (uint32_t)header_size; + txn->logrec = logrec; + + if (0) { +err: __wt_logrec_free(session, &logrec); + } + return (ret); +} + +/* + * __wt_txn_log_op -- + * Write the last logged operation into the in-memory buffer. + */ +int +__wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + WT_ITEM *logrec; + WT_TXN *txn; + WT_TXN_OP *op; + + if (!S2C(session)->logging || F_ISSET(session, WT_SESSION_NO_LOGGING)) + return (0); + + txn = &session->txn; + + /* We'd better have a transaction. 
*/ + WT_ASSERT(session, + F_ISSET(txn, TXN_RUNNING) && F_ISSET(txn, TXN_HAS_ID)); + + WT_ASSERT(session, txn->mod_count > 0); + op = txn->mod + txn->mod_count - 1; + + WT_RET(__txn_logrec_init(session)); + logrec = txn->logrec; + + switch (op->type) { + case TXN_OP_BASIC: + return (__txn_op_log(session, logrec, op, cbt)); + case TXN_OP_INMEM: + case TXN_OP_REF: + /* Nothing to log, we're done. */ + return (0); + case TXN_OP_TRUNCATE_COL: + return (__wt_logop_col_truncate_pack(session, logrec, + op->fileid, + op->u.truncate_col.start, op->u.truncate_col.stop)); + case TXN_OP_TRUNCATE_ROW: + return (__wt_logop_row_truncate_pack(session, txn->logrec, + op->fileid, + &op->u.truncate_row.start, &op->u.truncate_row.stop, + (uint32_t)op->u.truncate_row.mode)); + WT_ILLEGAL_VALUE(session); + } + + /* NOTREACHED */ +} + +/* + * __wt_txn_log_commit -- + * Write the operations of a transaction to the log at commit time. + */ +int +__wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_TXN *txn; + + WT_UNUSED(cfg); + txn = &session->txn; + + /* Write updates to the log. */ + return (__wt_log_write(session, txn->logrec, NULL, txn->txn_logsync)); +} + +/* + * __txn_log_file_sync -- + * Write a log record for a file sync. 
+ */ +static int +__txn_log_file_sync(WT_SESSION_IMPL *session, uint32_t flags, WT_LSN *lsnp) +{ + WT_BTREE *btree; + WT_DECL_ITEM(logrec); + WT_DECL_RET; + size_t header_size; + uint32_t rectype = WT_LOGREC_FILE_SYNC; + int start; + const char *fmt = WT_UNCHECKED_STRING(III); + + btree = S2BT(session); + start = LF_ISSET(WT_TXN_LOG_CKPT_START); + + WT_RET(__wt_struct_size( + session, &header_size, fmt, rectype, btree->id, start)); + WT_RET(__wt_logrec_alloc(session, header_size, &logrec)); + + WT_ERR(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, header_size, + fmt, rectype, btree->id, start)); + logrec->size += (uint32_t)header_size; + + WT_ERR(__wt_log_write(session, logrec, lsnp, 0)); +err: __wt_logrec_free(session, &logrec); + return (ret); +} + +/* + * __wt_txn_checkpoint_logread -- + * Read a log record for a checkpoint operation. + */ +int +__wt_txn_checkpoint_logread( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + WT_LSN *ckpt_lsn) +{ + WT_ITEM ckpt_snapshot; + u_int ckpt_nsnapshot; + const char *fmt = WT_UNCHECKED_STRING(IQIU); + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &ckpt_lsn->file, &ckpt_lsn->offset, + &ckpt_nsnapshot, &ckpt_snapshot)); + WT_UNUSED(ckpt_nsnapshot); + WT_UNUSED(ckpt_snapshot); + *pp = end; + return (0); +} + +/* + * __wt_txn_checkpoint_log -- + * Write a log record for a checkpoint operation. + */ +int +__wt_txn_checkpoint_log( + WT_SESSION_IMPL *session, int full, uint32_t flags, WT_LSN *lsnp) +{ + WT_DECL_ITEM(logrec); + WT_DECL_RET; + WT_LSN *ckpt_lsn; + WT_TXN *txn; + uint8_t *end, *p; + size_t recsize; + uint32_t i, rectype = WT_LOGREC_CHECKPOINT; + const char *fmt = WT_UNCHECKED_STRING(IIQIU); + + txn = &session->txn; + ckpt_lsn = &txn->ckpt_lsn; + + /* + * If this is a file sync, log it unless there is a full checkpoint in + * progress. 
+ */ + if (!full) { + if (txn->full_ckpt) { + if (lsnp != NULL) + *lsnp = *ckpt_lsn; + return (0); + } else + return (__txn_log_file_sync(session, flags, lsnp)); + } + + switch (flags) { + case WT_TXN_LOG_CKPT_PREPARE: + txn->full_ckpt = 1; + *ckpt_lsn = S2C(session)->log->alloc_lsn; + break; + + case WT_TXN_LOG_CKPT_START: + /* Take a copy of the transaction snapshot. */ + txn->ckpt_nsnapshot = txn->snapshot_count; + recsize = txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE; + WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot)); + p = txn->ckpt_snapshot->mem; + end = p + recsize; + for (i = 0; i < txn->snapshot_count; i++) + WT_ERR(__wt_vpack_uint( + &p, WT_PTRDIFF(end, p), txn->snapshot[i])); + break; + + case WT_TXN_LOG_CKPT_STOP: + /* + * During a clean connection close, we get here without the + * prepare or start steps. In that case, log the current LSN + * as the checkpoint LSN. + */ + if (!txn->full_ckpt) { + txn->ckpt_nsnapshot = 0; + *ckpt_lsn = S2C(session)->log->alloc_lsn; + } + + /* Write the checkpoint log record. */ + WT_ERR(__wt_struct_size(session, &recsize, fmt, + rectype, ckpt_lsn->file, ckpt_lsn->offset, + txn->ckpt_nsnapshot, &txn->ckpt_snapshot)); + WT_ERR(__wt_logrec_alloc(session, recsize, &logrec)); + + WT_ERR(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, recsize, fmt, + rectype, ckpt_lsn->file, ckpt_lsn->offset, + txn->ckpt_nsnapshot, &txn->ckpt_snapshot)); + logrec->size += (uint32_t)recsize; + WT_ERR(__wt_log_write(session, logrec, lsnp, 0)); + + /* + * If this full checkpoint completed successfully and there is + * no hot backup in progress, tell the logging subsystem the + * checkpoint LSN so that it can archive. 
+ */ + if (!S2C(session)->hot_backup) + WT_ERR(__wt_log_ckpt(session, ckpt_lsn)); + + /* FALLTHROUGH */ + case WT_TXN_LOG_CKPT_FAIL: + /* Cleanup any allocated resources */ + INIT_LSN(ckpt_lsn); + txn->ckpt_nsnapshot = 0; + __wt_scr_free(&txn->ckpt_snapshot); + txn->full_ckpt = 0; + break; + } + +err: __wt_logrec_free(session, &logrec); + return (ret); +} + +/* + * __wt_txn_truncate_log -- + * Begin truncating a range of a file. + */ +int +__wt_txn_truncate_log( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) +{ + WT_BTREE *btree; + WT_ITEM *item; + WT_TXN_OP *op; + + btree = S2BT(session); + + WT_RET(__txn_next_op(session, &op)); + + if (btree->type == BTREE_ROW) { + op->type = TXN_OP_TRUNCATE_ROW; + op->u.truncate_row.mode = TXN_TRUNC_ALL; + WT_CLEAR(op->u.truncate_row.start); + WT_CLEAR(op->u.truncate_row.stop); + if (start != NULL) { + op->u.truncate_row.mode = TXN_TRUNC_START; + item = &op->u.truncate_row.start; + WT_RET(__wt_cursor_get_raw_key(&start->iface, item)); + WT_RET(__wt_buf_set( + session, item, item->data, item->size)); + } + if (stop != NULL) { + op->u.truncate_row.mode = + (op->u.truncate_row.mode == TXN_TRUNC_ALL) ? + TXN_TRUNC_STOP : TXN_TRUNC_BOTH; + item = &op->u.truncate_row.stop; + WT_RET(__wt_cursor_get_raw_key(&stop->iface, item)); + WT_RET(__wt_buf_set( + session, item, item->data, item->size)); + } + } else { + op->type = TXN_OP_TRUNCATE_COL; + op->u.truncate_col.start = + (start == NULL) ? 0 : start->recno; + op->u.truncate_col.stop = + (stop == NULL) ? 0 : stop->recno; + } + + /* Write that operation into the in-memory log. */ + WT_RET(__wt_txn_log_op(session, NULL)); + + WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOGGING_INMEM)); + F_SET(session, WT_SESSION_LOGGING_INMEM); + return (0); +} + +/* + * __wt_txn_truncate_end -- + * Finish truncating a range of a file. 
+ */ +int +__wt_txn_truncate_end(WT_SESSION_IMPL *session) +{ + F_CLR(session, WT_SESSION_LOGGING_INMEM); + return (0); +} + +/* + * __txn_printlog -- + * Print a log record in a human-readable format. + */ +static int +__txn_printlog( + WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie) +{ + FILE *out; + WT_LSN ckpt_lsn; + uint64_t txnid; + uint32_t fileid, rectype; + int32_t start; + const uint8_t *end, *p; + const char *msg; + + out = cookie; + + p = LOG_SKIP_HEADER(logrec->data); + end = (const uint8_t *)logrec->data + logrec->size; + + /* First, peek at the log record type. */ + WT_RET(__wt_logrec_read(session, &p, end, &rectype)); + + if (fprintf(out, " { \"lsn\" : [%" PRIu32 ",%" PRId64 "],\n", + lsnp->file, lsnp->offset) < 0) + return (errno); + + switch (rectype) { + case WT_LOGREC_CHECKPOINT: + WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), + WT_UNCHECKED_STRING(IQ), &ckpt_lsn.file, &ckpt_lsn.offset)); + if (fprintf(out, " \"type\" : \"checkpoint\"\n") < 0 || + fprintf( + out, " \"ckpt_lsn\" : [%" PRIu32 ",%" PRId64 "],\n", + ckpt_lsn.file, ckpt_lsn.offset) < 0) + return (errno); + break; + + case WT_LOGREC_COMMIT: + WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid)); + if (fprintf(out, " \"type\" : \"commit\"\n") < 0 || + fprintf(out, " \"txnid\" : %" PRIu64 ",\n", txnid) < 0) + return (errno); + WT_RET(__txn_commit_printlog(session, &p, end, out)); + break; + + case WT_LOGREC_FILE_SYNC: + WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), + WT_UNCHECKED_STRING(Ii), &fileid, &start)); + if (fprintf(out, " \"type\" : \"file_sync\"\n") < 0 || + fprintf(out, " \"fileid\" : %" PRIu32 "\n", + fileid) < 0 || + fprintf(out, " \"start\" : %" PRId32 "\n", start) < 0) + return (errno); + break; + + case WT_LOGREC_MESSAGE: + WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), + WT_UNCHECKED_STRING(S), &msg)); + if (fprintf(out, " \"type\" : \"message\"\n") < 0 || + fprintf(out, " \"message\" : \"%s\"\n", msg) < 0) 
+ return (errno); + break; + } + + if (fprintf(out, " },\n") < 0) + return (errno); + + return (0); +} + +/* + * __wt_txn_printlog -- + * Print the log in a human-readable format. + */ +int +__wt_txn_printlog(WT_SESSION *wt_session, FILE *out) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + + if (fprintf(out, "[\n") < 0) + return (errno); + WT_RET(__wt_log_scan( + session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, out)); + if (fprintf(out, "]\n") < 0) + return (errno); + + return (0); +} diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c new file mode 100644 index 00000000000..38c606320ef --- /dev/null +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -0,0 +1,491 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* State maintained during recovery. */ +typedef struct { + WT_SESSION_IMPL *session; + + /* Files from the metadata, indexed by file ID. */ + struct WT_RECOVERY_FILE { + const char *uri; /* File URI. */ + WT_CURSOR *c; /* Cursor used for recovery. */ + WT_LSN ckpt_lsn; /* File's checkpoint LSN. */ + } *files; + size_t file_alloc; /* Allocated size of files array. */ + u_int max_fileid; /* Maximum file ID seen. */ + u_int nfiles; /* Number of files in the metadata. */ + + WT_LSN ckpt_lsn; /* Start LSN for main recovery loop. */ + + int missing; /* Were there missing files? */ + int modified; /* Did recovery make any changes? */ + int metadata_only; /* + * Set during the first recovery pass, + * when only the metadata is recovered. + */ +} WT_RECOVERY; + +/* + * __recovery_cursor -- + * Get a cursor for a recovery operation. 
 */
static int
__recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
    WT_LSN *lsnp, u_int id, int duplicate, WT_CURSOR **cp)
{
	WT_CURSOR *c;
	const char *cfg[] = { WT_CONFIG_BASE(session, session_open_cursor),
	    "overwrite", NULL };
	int metadata_op;

	c = NULL;

	/* Track the largest file ID we have seen. */
	if (id > r->max_fileid)
		r->max_fileid = id;

	/*
	 * Metadata operations have an id of 0.  Match operations based
	 * on the id and the current pass of recovery for metadata.
	 *
	 * Only apply operations in the correct metadata phase, and if the LSN
	 * is more recent than the last checkpoint.  If there is no entry for a
	 * file, assume it was dropped or missing after a hot backup.
	 */
	metadata_op = (id == WT_METAFILE_ID);
	if (r->metadata_only != metadata_op)
		;				/* Wrong pass: skip silently. */
	else if (id >= r->nfiles || r->files[id].uri == NULL) {
		/* If a file is missing, output a verbose message once. */
		if (!r->missing)
			WT_RET(__wt_verbose(session, WT_VERB_RECOVERY,
			    "No file found with ID %u (max %u)",
			    id, r->nfiles));
		r->missing = 1;
	} else if (LOG_CMP(lsnp, &r->files[id].ckpt_lsn) >= 0) {
		/*
		 * We're going to apply the operation.  Get the cursor, opening
		 * one if none is cached.
		 */
		if ((c = r->files[id].c) == NULL) {
			WT_RET(__wt_open_cursor(
			    session, r->files[id].uri, NULL, cfg, &c));
			r->files[id].c = c;
		}
	}

	/*
	 * When a duplicate is requested, a second, uncached cursor is opened
	 * on the same URI; the caller is responsible for closing it (the
	 * cached cursor is closed by __recovery_free).
	 */
	if (duplicate && c != NULL)
		WT_RET(__wt_open_cursor(
		    session, r->files[id].uri, NULL, cfg, &c));

	*cp = c;
	return (0);
}

/*
 * Helper to get a cursor if this operation is to be applied during recovery.
 * NOTE: expands "cursor", "optype" and uses "break" to leave the enclosing
 * switch in __txn_op_apply -- only usable from that context.
 */
#define	GET_RECOVERY_CURSOR(session, r, lsnp, fileid, cp)		\
	WT_ERR(__recovery_cursor(					\
	    (session), (r), (lsnp), (fileid), 0, (cp)));		\
	WT_ERR(__wt_verbose((session), WT_VERB_RECOVERY,		\
	    "%s op %d to file %d at LSN %u/%" PRIuMAX,			\
	    (cursor == NULL) ? "Skipping" : "Applying",			\
	    optype, fileid, lsnp->file, (uintmax_t)lsnp->offset));	\
	if (cursor == NULL)						\
		break

/*
 * __txn_op_apply --
 *	Apply a transactional operation during recovery.
 */
static int
__txn_op_apply(
    WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
{
	WT_CURSOR *cursor, *start, *stop;
	WT_DECL_RET;
	WT_ITEM key, start_key, stop_key, value;
	WT_SESSION_IMPL *session;
	uint64_t recno, start_recno, stop_recno;
	uint32_t fileid, mode, optype, opsize;

	session = r->session;
	cursor = NULL;

	/* Peek at the size and the type. */
	WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize));
	/* Clamp "end" to this operation so unpacking cannot overrun it. */
	end = *pp + opsize;

	switch (optype) {
	case WT_LOGOP_COL_PUT:
		WT_ERR(__wt_logop_col_put_unpack(session, pp, end,
		    &fileid, &recno, &value));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		cursor->set_key(cursor, recno);
		__wt_cursor_set_raw_value(cursor, &value);
		WT_ERR(cursor->insert(cursor));
		break;

	case WT_LOGOP_COL_REMOVE:
		WT_ERR(__wt_logop_col_remove_unpack(session, pp, end,
		    &fileid, &recno));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		cursor->set_key(cursor, recno);
		WT_ERR(cursor->remove(cursor));
		break;

	case WT_LOGOP_COL_TRUNCATE:
		WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end,
		    &fileid, &start_recno, &stop_recno));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);

		/*
		 * Set up the cursors.  A zero record number is the logged
		 * sentinel for an open-ended range (a NULL truncate cursor).
		 */
		if (start_recno == 0) {
			start = NULL;
			stop = cursor;
		} else if (stop_recno == 0) {
			start = cursor;
			stop = NULL;
		} else {
			start = cursor;
			WT_ERR(__recovery_cursor(
			    session, r, lsnp, fileid, 1, &stop));
		}

		/* Set the keys. */
		if (start != NULL)
			start->set_key(start, start_recno);
		if (stop != NULL)
			stop->set_key(stop, stop_recno);

		WT_TRET(session->iface.truncate(&session->iface, NULL,
		    start, stop, NULL));
		/* If we opened a duplicate cursor, close it now. */
		if (stop != NULL && stop != cursor)
			WT_TRET(stop->close(stop));
		WT_ERR(ret);
		break;

	case WT_LOGOP_ROW_PUT:
		WT_ERR(__wt_logop_row_put_unpack(session, pp, end,
		    &fileid, &key, &value));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		__wt_cursor_set_raw_key(cursor, &key);
		__wt_cursor_set_raw_value(cursor, &value);
		WT_ERR(cursor->insert(cursor));
		break;

	case WT_LOGOP_ROW_REMOVE:
		WT_ERR(__wt_logop_row_remove_unpack(session, pp, end,
		    &fileid, &key));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		__wt_cursor_set_raw_key(cursor, &key);
		WT_ERR(cursor->remove(cursor));
		break;

	case WT_LOGOP_ROW_TRUNCATE:
		WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end,
		    &fileid, &start_key, &stop_key, &mode));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		/* Set up the cursors.  The logged mode selects the range. */
		start = stop = NULL;
		switch (mode) {
		case TXN_TRUNC_ALL:
			/* Both cursors stay NULL. */
			break;
		case TXN_TRUNC_BOTH:
			start = cursor;
			WT_ERR(__recovery_cursor(
			    session, r, lsnp, fileid, 1, &stop));
			break;
		case TXN_TRUNC_START:
			start = cursor;
			break;
		case TXN_TRUNC_STOP:
			stop = cursor;
			break;

		WT_ILLEGAL_VALUE_ERR(session);
		}

		/* Set the keys. */
		if (start != NULL)
			__wt_cursor_set_raw_key(start, &start_key);
		if (stop != NULL)
			__wt_cursor_set_raw_key(stop, &stop_key);

		WT_TRET(session->iface.truncate(&session->iface, NULL,
		    start, stop, NULL));
		/* If we opened a duplicate cursor, close it now. */
		if (stop != NULL && stop != cursor)
			WT_TRET(stop->close(stop));
		WT_ERR(ret);
		break;

	WT_ILLEGAL_VALUE_ERR(session);
	}

	/* Reset the cursor so it doesn't block eviction. */
	if (cursor != NULL)
		WT_ERR(cursor->reset(cursor));

	/* Record that recovery wrote something and a checkpoint is needed. */
	r->modified = 1;

err:	if (ret != 0)
		__wt_err(session, ret, "Operation failed during recovery");
	return (ret);
}

/*
 * __txn_commit_apply --
 *	Apply a commit record during recovery.
 */
static int
__txn_commit_apply(
    WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
{
	WT_UNUSED(lsnp);

	/*
	 * The logging subsystem zero-pads records: a zero byte where an
	 * operation type is expected marks the end of the payload.
	 */
	while (*pp < end && **pp)
		WT_RET(__txn_op_apply(r, lsnp, pp, end));

	return (0);
}

/*
 * __txn_log_recover --
 *	Roll the log forward to recover committed changes.
 */
static int
__txn_log_recover(
    WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie)
{
	WT_RECOVERY *r;
	const uint8_t *end, *p;
	uint64_t txnid;
	uint32_t rectype;

	r = cookie;
	p = LOG_SKIP_HEADER(logrec->data);
	end = (const uint8_t *)logrec->data + logrec->size;

	/* First, peek at the log record type. */
	WT_RET(__wt_logrec_read(session, &p, end, &rectype));

	switch (rectype) {
	case WT_LOGREC_CHECKPOINT:
		/* Checkpoints only matter during the metadata pass. */
		if (r->metadata_only)
			WT_RET(__wt_txn_checkpoint_logread(
			    session, &p, end, &r->ckpt_lsn));
		break;

	case WT_LOGREC_COMMIT:
		/* The transaction ID is logged but not needed for replay. */
		WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid));
		WT_UNUSED(txnid);
		WT_RET(__txn_commit_apply(r, lsnp, &p, end));
		break;
	}

	return (0);
}

/*
 * __recovery_setup_file --
 *	Set up the recovery slot for a file.
 */
static int
__recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_LSN lsn;
	uint32_t fileid;

	WT_RET(__wt_config_getones(r->session, config, "id", &cval));
	fileid = (uint32_t)cval.val;

	/* Grow the per-file array on demand; new slots are zeroed. */
	if (r->nfiles <= fileid) {
		WT_RET(__wt_realloc_def(
		    r->session, &r->file_alloc, fileid + 1, &r->files));
		r->nfiles = fileid + 1;
	}

	/*
	 * NOTE(review): if this slot was already set up (the metadata file is
	 * set up explicitly and then seen again by __recovery_file_scan), the
	 * previous strdup'd URI appears to leak -- confirm and free first.
	 */
	WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri));
	WT_RET(
	    __wt_config_getones(r->session, config, "checkpoint_lsn", &cval));
	/* If there is no checkpoint logged for the file, apply everything. */
	if (cval.type != WT_CONFIG_ITEM_STRUCT)
		INIT_LSN(&lsn);
	/*
	 * NOTE(review): the cast assumes lsn.offset has intmax_t width and
	 * the later "%" PRIu64 assumes it is a uint64_t -- verify against
	 * the WT_LSN declaration.
	 */
	else if (sscanf(cval.str, "(%" PRIu32 ",%" PRIdMAX ")",
	    &lsn.file, (intmax_t*)&lsn.offset) != 2)
		WT_RET_MSG(r->session, EINVAL,
		    "Failed to parse checkpoint LSN '%.*s'",
		    (int)cval.len, cval.str);
	r->files[fileid].ckpt_lsn = lsn;

	WT_RET(__wt_verbose(r->session, WT_VERB_RECOVERY,
	    "Recovering %s with id %u @ (%" PRIu32 ", %" PRIu64 ")",
	    uri, fileid, lsn.file, lsn.offset));

	return (0);

}

/*
 * __recovery_free --
 *	Free the recovery state.
 */
static int
__recovery_free(WT_RECOVERY *r)
{
	WT_CURSOR *c;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	u_int i;

	session = r->session;
	/* Close any cached per-file cursors and release the URIs. */
	for (i = 0; i < r->nfiles; i++) {
		__wt_free(session, r->files[i].uri);
		if ((c = r->files[i].c) != NULL)
			WT_TRET(c->close(c));
	}

	__wt_free(session, r->files);
	return (ret);
}

/*
 * __recovery_file_scan --
 *	Scan the files referenced from the metadata and gather information
 *	about them for recovery.
 */
static int
__recovery_file_scan(WT_RECOVERY *r)
{
	WT_DECL_RET;
	WT_CURSOR *c;
	const char *uri, *config;
	int cmp;

	/* Scan through all files in the metadata. */
	c = r->files[0].c;	/* Slot 0 holds the metadata cursor. */
	c->set_key(c, "file:");
	if ((ret = c->search_near(c, &cmp)) != 0) {
		/* Is the metadata empty? */
		if (ret == WT_NOTFOUND)
			ret = 0;
		goto err;
	}
	/* search_near may land just before the prefix; step forward. */
	if (cmp < 0)
		WT_ERR_NOTFOUND_OK(c->next(c));
	for (; ret == 0; ret = c->next(c)) {
		WT_ERR(c->get_key(c, &uri));
		if (!WT_PREFIX_MATCH(uri, "file:"))
			break;
		WT_ERR(c->get_value(c, &config));
		WT_ERR(__recovery_setup_file(r, uri, config));
	}
	WT_ERR_NOTFOUND_OK(ret);

err:	if (r->nfiles > r->max_fileid)
		r->max_fileid = r->nfiles;
	return (ret);
}

/*
 * __wt_txn_recover --
 *	Run recovery.
+ */ +int +__wt_txn_recover(WT_CONNECTION_IMPL *conn) +{ + WT_CURSOR *metac; + WT_DECL_RET; + WT_RECOVERY r; + WT_SESSION_IMPL *session; + struct WT_RECOVERY_FILE *metafile; + const char *config; + int was_backup; + + WT_CLEAR(r); + INIT_LSN(&r.ckpt_lsn); + was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0; + + /* We need a real session for recovery. */ + WT_RET(__wt_open_session(conn, NULL, NULL, &session)); + F_SET(session, WT_SESSION_NO_LOGGING); + r.session = session; + + WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); + WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); + WT_ERR(__wt_metadata_cursor(session, NULL, &metac)); + metafile = &r.files[WT_METAFILE_ID]; + metafile->c = metac; + + /* + * First, do a pass through the log to recover the metadata, and + * establish the last checkpoint LSN. Skip this when opening a hot + * backup: we already have the correct metadata in that case. + */ + if (!was_backup) { + r.metadata_only = 1; + if (IS_INIT_LSN(&metafile->ckpt_lsn)) + WT_ERR(__wt_log_scan(session, + NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r)); + else + WT_ERR(__wt_log_scan(session, + &metafile->ckpt_lsn, 0, __txn_log_recover, &r)); + + WT_ASSERT(session, + LOG_CMP(&r.ckpt_lsn, &conn->log->first_lsn) >= 0); + } + + /* Scan the metadata to find the live files and their IDs. */ + WT_ERR(__recovery_file_scan(&r)); + + /* + * We no longer need the metadata cursor: close it to avoid pinning any + * resources that could block eviction during recovery. + */ + r.files[0].c = NULL; + WT_ERR(metac->close(metac)); + + /* + * Now, recover all the files apart from the metadata. + * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. 
+ */ + r.metadata_only = 0; + WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, + "Main recovery loop: starting at %u/%" PRIuMAX, + r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset)); + if (IS_INIT_LSN(&r.ckpt_lsn)) + WT_ERR(__wt_log_scan(session, NULL, + WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, + __txn_log_recover, &r)); + else + WT_ERR(__wt_log_scan(session, &r.ckpt_lsn, + WT_LOGSCAN_RECOVER, + __txn_log_recover, &r)); + + conn->next_file_id = r.max_fileid; + + /* + * If recovery ran successfully forcibly log a checkpoint so the next + * open is fast and keep the metadata up to date with the checkpoint + * LSN and archiving. + */ + WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); + +err: WT_TRET(__recovery_free(&r)); + __wt_free(session, config); + WT_TRET(session->iface.close(&session->iface, NULL)); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/utilities/util.h b/src/third_party/wiredtiger/src/utilities/util.h new file mode 100644 index 00000000000..1f2f0b7211a --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include <wt_internal.h> + +typedef struct { + void *mem; /* Managed memory chunk */ + size_t memsize; /* Managed memory size */ +} ULINE; + +extern const char *home; /* Home directory */ +extern const char *progname; /* Program name */ +extern const char *usage_prefix; /* Global arguments */ +extern int verbose; /* Verbose flag */ + +extern WT_EVENT_HANDLER *verbose_handler; + +extern int __wt_opterr; /* if error message should be printed */ +extern int __wt_optind; /* index into parent argv vector */ +extern int __wt_optopt; /* character checked for validity */ +extern int __wt_optreset; /* reset getopt */ +extern char *__wt_optarg; /* argument associated with option */ + +int util_backup(WT_SESSION *, int, char *[]); +int util_cerr(const char *, const char *, int); +int util_compact(WT_SESSION *, int, char *[]); +void util_copyright(void); +int util_create(WT_SESSION *, int, char *[]); +int util_drop(WT_SESSION *, int, char *[]); +int util_dump(WT_SESSION *, int, char *[]); +int util_err(int, const char *, ...); +int util_flush(WT_SESSION *, const char *); +int util_list(WT_SESSION *, int, char *[]); +int util_load(WT_SESSION *, int, char *[]); +int util_loadtext(WT_SESSION *, int, char *[]); +char *util_name(const char *, const char *); +int util_printlog(WT_SESSION *, int, char *[]); +int util_read(WT_SESSION *, int, char *[]); +int util_read_line(ULINE *, int, int *); +int util_rename(WT_SESSION *, int, char *[]); +int util_salvage(WT_SESSION *, int, char *[]); +int util_stat(WT_SESSION *, int, char *[]); +int util_str2recno(const char *p, uint64_t *recnop); +int util_upgrade(WT_SESSION *, int, char *[]); +int util_verify(WT_SESSION *, int, char *[]); +int util_write(WT_SESSION *, int, char *[]); diff --git a/src/third_party/wiredtiger/src/utilities/util_backup.c b/src/third_party/wiredtiger/src/utilities/util_backup.c new file mode 100644 index 00000000000..aa61cc338f0 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_backup.c @@ -0,0 
+1,205 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +static int copy(const char *, const char *); +static int usage(void); + +#define CBUF_LEN (128 * 1024) /* Copy buffer and size. */ +static char *cbuf; + +/* + * append_target -- + * Build a list of comma-separated targets. + */ +static int +append_target(const char *target, char **bufp) +{ + static int first = 1; + static size_t len = 0, remain = 0; + static char *buf = NULL; + + /* 20 bytes of slop */ + if (remain < strlen(target) + 20) { + len += strlen(target) + 512; + remain += strlen(target) + 512; + if ((buf = realloc(buf, len)) == NULL) + return (util_err(errno, NULL)); + *bufp = buf; + } + if (first) { + first = 0; + strcpy(buf, "target=("); + } else + buf[strlen(buf) - 1] = ','; /* overwrite previous ")" */ + strcat(buf, "\""); + strcat(buf, target); + strcat(buf, "\")"); + remain -= strlen(target) + 1; + + return (0); +} + +int +util_backup(WT_SESSION *session, int argc, char *argv[]) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + int ch; + char *config; + const char *directory, *name; + + config = NULL; + while ((ch = __wt_getopt(progname, argc, argv, "t:")) != EOF) + switch (ch) { + case 't': + if (append_target(__wt_optarg, &config)) + return (1); + break; + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + if (argc != 1) { + (void)usage(); + goto err; + } + directory = *argv; + + if ((ret = session->open_cursor( + session, "backup:", NULL, config, &cursor)) != 0) { + fprintf(stderr, "%s: cursor open(backup:) failed: %s\n", + progname, wiredtiger_strerror(ret)); + goto err; + } + + /* Copy the files. 
*/ + while ( + (ret = cursor->next(cursor)) == 0 && + (ret = cursor->get_key(cursor, &name)) == 0) + if ((ret = copy(name, directory)) != 0) + goto err; + if (ret == WT_NOTFOUND) + ret = 0; + + if (ret != 0) { + fprintf(stderr, "%s: cursor next(backup:) failed: %s\n", + progname, wiredtiger_strerror(ret)); + goto err; + } + +err: if (config != NULL) + free(config); + if (cbuf != NULL) + free(cbuf); + + return (ret); +} + +static int +copy(const char *name, const char *directory) +{ + WT_DECL_RET; + ssize_t n; + int ifd, ofd; + + ret = 1; + ifd = ofd = -1; + + if (verbose && + printf("Backing up %s/%s to %s\n", home, name, directory) < 0) { + fprintf(stderr, "%s: %s\n", progname, strerror(errno)); + return (1); + } + + /* Allocate a large copy buffer (use it to build pathnames as well. */ + if (cbuf == NULL && (cbuf = malloc(CBUF_LEN)) == NULL) + goto memerr; + + /* Open the read file. */ + if (snprintf(cbuf, CBUF_LEN, "%s/%s", home, name) >= CBUF_LEN) + goto memerr; + if ((ifd = open(cbuf, O_BINARY | O_RDONLY, 0)) < 0) + goto readerr; + + /* Open the write file. */ + if (snprintf(cbuf, CBUF_LEN, "%s/%s", directory, name) >= CBUF_LEN) + goto memerr; + if ((ofd = open( + cbuf, O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, 0666)) < 0) + goto writerr; + + /* Copy the file. */ + while ((n = read(ifd, cbuf, CBUF_LEN)) > 0) + if (write(ofd, cbuf, (size_t)n) != n) + goto writerr; + if (n != 0) + goto readerr; + + /* + * Close file descriptors (forcing a flush on the write side), and + * check for any errors. + */ + ret = close(ifd); + ifd = -1; + if (ret != 0) + goto readerr; + + /* + * We need to know this file was successfully written, it's a backup. + */ +#ifdef _WIN32 + if (FlushFileBuffers((HANDLE)_get_osfhandle(ofd)) == 0) { + DWORD err = GetLastError(); + ret = err; + goto writerr; + } +#else + if (fsync(ofd)) + goto writerr; +#endif + ret = close(ofd); + ofd = -1; + if (ret != 0) + goto writerr; + + /* Success. 
*/ + ret = 0; + + if (0) { +readerr: fprintf(stderr, + "%s: %s/%s: %s\n", progname, home, name, strerror(errno)); + } + if (0) { +writerr: fprintf(stderr, "%s: %s/%s: %s\n", + progname, directory, name, strerror(errno)); + } + if (0) { +memerr: fprintf(stderr, "%s: %s\n", progname, strerror(errno)); + } + + if (ifd >= 0) + (void)close(ifd); + if (ofd >= 0) + (void)close(ofd); + + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "backup [-t uri] directory\n", + progname, usage_prefix); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_compact.c b/src/third_party/wiredtiger/src/utilities/util_compact.c new file mode 100644 index 00000000000..51d5461e43c --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_compact.c @@ -0,0 +1,59 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +static int usage(void); + +int +util_compact(WT_SESSION *session, int argc, char *argv[]) +{ + WT_DECL_RET; + int ch; + char *uri; + + uri = NULL; + while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) + switch (ch) { + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + /* The remaining argument is the table name. 
*/ + if (argc != 1) + return (usage()); + if ((uri = util_name(*argv, "table")) == NULL) + return (1); + + if ((ret = session->compact(session, uri, NULL)) != 0) { + fprintf(stderr, "%s: compact(%s): %s\n", + progname, uri, wiredtiger_strerror(ret)); + goto err; + } + + if (0) { +err: ret = 1; + } + + if (uri != NULL) + free(uri); + + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "compact uri\n", + progname, usage_prefix); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_cpyright.c b/src/third_party/wiredtiger/src/utilities/util_cpyright.c new file mode 100644 index 00000000000..21d82828863 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_cpyright.c @@ -0,0 +1,35 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +void +util_copyright(void) +{ + printf("%s\n", "Copyright (c) 2008-2014 WiredTiger, Inc."); + printf("%s\n\n", "All rights reserved."); + + printf("%s\n\n", + "This program is free software: you can redistribute it and/or\n" + "modify it under the terms of version 3 of the GNU General\n" + "Public License as published by the Free Software Foundation."); + + printf("%s\n\n", + "This program is distributed in the hope that it will be useful,\n" + "but WITHOUT ANY WARRANTY; without even the implied warranty of\n" + "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n" + "GNU General Public License for more details:"); + + printf("\t%s\n\n", + "http://www.gnu.org/licenses/gpl-3.0-standalone.html"); + + printf("%s\n", + "For a license to use the WiredTiger software under conditions\n" + "other than those described by the GNU General Public License,\n" + "or for technical support for this software, contact WiredTiger,\n" + "Inc. 
at info@wiredtiger.com."); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_create.c b/src/third_party/wiredtiger/src/utilities/util_create.c new file mode 100644 index 00000000000..ebff3a8ad05 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_create.c @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +static int usage(void); + +int +util_create(WT_SESSION *session, int argc, char *argv[]) +{ + WT_DECL_RET; + int ch; + const char *config, *uri; + + config = NULL; + while ((ch = __wt_getopt(progname, argc, argv, "c:")) != EOF) + switch (ch) { + case 'c': /* command-line configuration */ + config = __wt_optarg; + break; + case '?': + default: + return (usage()); + } + + argc -= __wt_optind; + argv += __wt_optind; + + /* The remaining argument is the uri. */ + if (argc != 1) + return (usage()); + + if ((uri = util_name(*argv, "table")) == NULL) + return (1); + + if ((ret = session->create(session, uri, config)) != 0) + return (util_err(ret, "%s: session.create", uri)); + return (0); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "create [-c configuration] uri\n", + progname, usage_prefix); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_drop.c b/src/third_party/wiredtiger/src/utilities/util_drop.c new file mode 100644 index 00000000000..6fe416882a3 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_drop.c @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "util.h" + +static int usage(void); + +int +util_drop(WT_SESSION *session, int argc, char *argv[]) +{ + WT_DECL_RET; + int ch; + char *name; + + while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) + switch (ch) { + case '?': + default: + return (usage()); + } + + argc -= __wt_optind; + argv += __wt_optind; + + /* The remaining argument is the uri. */ + if (argc != 1) + return (usage()); + if ((name = util_name(*argv, "table")) == NULL) + return (1); + + ret = session->drop(session, name, "force"); + + if (name != NULL) + free(name); + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "drop uri\n", + progname, usage_prefix); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_dump.c b/src/third_party/wiredtiger/src/utilities/util_dump.c new file mode 100644 index 00000000000..bd0590948b4 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_dump.c @@ -0,0 +1,701 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "util.h" + +static int dump_config(WT_SESSION *, const char *, int); +static int dump_json_begin(void); +static int dump_json_end(void); +static int dump_json_separator(void); +static int dump_json_table_begin(WT_CURSOR *, const char *, const char *); +static int dump_json_table_cg(WT_CURSOR *, const char *, const char *, + const char *, const char *); +static int dump_json_table_config(WT_SESSION *, const char *); +static int dump_json_table_end(void); +static int dump_prefix(int); +static int dump_record(WT_CURSOR *, const char *, int, int); +static int dump_suffix(void); +static int dump_table_config(WT_SESSION *, WT_CURSOR *, const char *); +static int dump_table_config_type(WT_SESSION *, + WT_CURSOR *, WT_CURSOR *, const char *, const char *, const char *); +static int dup_json_string(const char *, char **); +static int print_config(WT_SESSION *, const char *, const char *, const char *); +static int usage(void); + +int +util_dump(WT_SESSION *session, int argc, char *argv[]) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + size_t len; + int ch, hex, i, json, reverse; + char *checkpoint, *config, *name; + + hex = json = reverse = 0; + checkpoint = config = name = NULL; + while ((ch = __wt_getopt(progname, argc, argv, "c:f:jrx")) != EOF) + switch (ch) { + case 'c': + checkpoint = __wt_optarg; + break; + case 'f': /* output file */ + if (freopen(__wt_optarg, "w", stdout) == NULL) + return ( + util_err(errno, "%s: reopen", __wt_optarg)); + break; + case 'j': + json = 1; + break; + case 'r': + reverse = 1; + break; + case 'x': + hex = 1; + break; + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + /* -j and -x are incompatible. */ + if (hex && json) { + fprintf(stderr, + "%s: the -j and -x dump options are incompatible\n", + progname); + goto err; + } + + /* The remaining argument is the uri. 
*/ + if (argc < 1 || (argc != 1 && !json)) + return (usage()); + + if (json && (ret = dump_json_begin()) != 0) + goto err; + + for (i = 0; i < argc; i++) { + if (json && i > 0) + if ((ret = dump_json_separator()) != 0) + goto err; + if (name != NULL) { + free(name); + name = NULL; + } + if ((name = util_name(argv[i], "table")) == NULL) + goto err; + + if (json && dump_json_table_config(session, name) != 0) + goto err; + if (!json && dump_config(session, name, hex) != 0) + goto err; + + len = + checkpoint == NULL ? 0 : strlen("checkpoint=") + + strlen(checkpoint) + 1; + len += strlen(json ? "dump=json" : + (hex ? "dump=hex" : "dump=print")); + if ((config = malloc(len + 10)) == NULL) + goto err; + if (checkpoint == NULL) + config[0] = '\0'; + else { + (void)strcpy(config, "checkpoint="); + (void)strcat(config, checkpoint); + (void)strcat(config, ","); + } + (void)strcat(config, json ? "dump=json" : + (hex ? "dump=hex" : "dump=print")); + if ((ret = session->open_cursor( + session, name, NULL, config, &cursor)) != 0) { + fprintf(stderr, "%s: cursor open(%s) failed: %s\n", + progname, name, wiredtiger_strerror(ret)); + goto err; + } + + if ((ret = dump_record(cursor, name, reverse, json)) != 0) + goto err; + if (json && (ret = dump_json_table_end()) != 0) + goto err; + } + if (json && ((ret = dump_json_end()) != 0)) + goto err; + + if (0) { +err: ret = 1; + } + + if (config != NULL) + free(config); + if (name != NULL) + free(name); + + return (ret); +} + +/* + * dump_config -- + * Dump the config for the uri. + */ +static int +dump_config(WT_SESSION *session, const char *uri, int hex) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + int tret; + + /* Open a metadata cursor. 
*/ + if ((ret = session->open_cursor( + session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) { + fprintf(stderr, "%s: %s: session.open_cursor: %s\n", + progname, WT_METADATA_URI, wiredtiger_strerror(ret)); + return (1); + } + /* + * Search for the object itself, just to make sure it exists, we don't + * want to output a header if the user entered the wrong name. This is + * where we find out a table doesn't exist, use a simple error message. + */ + cursor->set_key(cursor, uri); + if ((ret = cursor->search(cursor)) == 0) { + if (dump_prefix(hex) != 0 || + dump_table_config(session, cursor, uri) != 0 || + dump_suffix() != 0) + ret = 1; + } else if (ret == WT_NOTFOUND) + ret = util_err(0, "%s: No such object exists", uri); + else + ret = util_err(ret, "%s", uri); + + if ((tret = cursor->close(cursor)) != 0) { + tret = util_cerr(uri, "close", tret); + if (ret == 0) + ret = tret; + } + + return (ret); +} + +/* + * dump_json_begin -- + * Output the dump file header prefix. + */ +static int +dump_json_begin(void) +{ + if (printf("{\n") < 0) + return (util_err(EIO, NULL)); + return (0); +} + +/* + * dump_json_end -- + * Output the dump file header suffix. + */ +static int +dump_json_end(void) +{ + if (printf("\n}\n") < 0) + return (util_err(EIO, NULL)); + return (0); +} + +/* + * dump_json_begin -- + * Output the dump file header prefix. + */ +static int +dump_json_separator(void) +{ + if (printf(",\n") < 0) + return (util_err(EIO, NULL)); + return (0); +} + +/* + * dump_json_table_begin -- + * Output the JSON syntax that starts a table, along with its config. + */ +static int +dump_json_table_begin(WT_CURSOR *cursor, const char *uri, const char *config) +{ + WT_DECL_RET; + const char *name; + char *jsonconfig; + + jsonconfig = NULL; + + /* Get the table name. 
*/ + if ((name = strchr(uri, ':')) == NULL) { + fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri); + return (1); + } + ++name; + + if ((ret = dup_json_string(config, &jsonconfig)) != 0) + return (util_cerr(uri, "config dup", ret)); + if (printf(" \"%s\" : [\n {\n", uri) < 0) + goto eio; + if (printf(" \"config\" : \"%s\",\n", jsonconfig) < 0) + goto eio; + + if ((ret = dump_json_table_cg( + cursor, uri, name, "colgroup:", "colgroups")) == 0) { + if (printf(",\n") < 0) + goto eio; + ret = + dump_json_table_cg(cursor, uri, name, "index:", "indices"); + } + + if (printf("\n },\n {\n \"data\" : [") < 0) + goto eio; + + if (0) { +eio: ret = util_err(EIO, NULL); + } + + free(jsonconfig); + return (ret); +} + +/* + * dump_json_table_cg -- + * Dump the column groups or indices for a table. + */ +static int +dump_json_table_cg(WT_CURSOR *cursor, + const char *uri, const char *name, const char *entry, const char *header) +{ + WT_DECL_RET; + const char *key, *skip, *value; + int exact, once; + char *jsonconfig; + static const char * const indent = " "; + + once = 0; + if (printf(" \"%s\" : [", header) < 0) + return (util_err(EIO, NULL)); + + /* + * For table dumps, we're done. + */ + if (cursor == NULL) { + if (printf("]") < 0) + return (util_err(EIO, NULL)); + else + return (0); + } + + /* + * Search the file looking for column group and index key/value pairs: + * for each one, look up the related source information and append it + * to the base record. + */ + cursor->set_key(cursor, entry); + if ((ret = cursor->search_near(cursor, &exact)) != 0) { + if (ret == WT_NOTFOUND) + return (0); + return (util_cerr(uri, "search_near", ret)); + } + if (exact >= 0) + goto match; + while ((ret = cursor->next(cursor)) == 0) { +match: if ((ret = cursor->get_key(cursor, &key)) != 0) + return (util_cerr(uri, "get_key", ret)); + + /* Check if we've finished the list of entries. */ + if (!WT_PREFIX_MATCH(key, entry)) + break; + + /* Check for a table name match. 
*/ + skip = key + strlen(entry); + if (strncmp( + skip, name, strlen(name)) != 0 || skip[strlen(name)] != ':') + continue; + + /* Get the value. */ + if ((ret = cursor->get_value(cursor, &value)) != 0) + return (util_cerr(uri, "get_value", ret)); + + if ((ret = dup_json_string(value, &jsonconfig)) != 0) + return (util_cerr(uri, "config dup", ret)); + ret = printf("%s\n" + "%s{\n" + "%s \"uri\" : \"%s\",\n" + "%s \"config\" : \"%s\"\n" + "%s}", + (once == 0 ? "" : ","), + indent, indent, key, indent, jsonconfig, indent); + free(jsonconfig); + if (ret < 0) + return (util_err(EIO, NULL)); + + once = 1; + } + if (printf("%s]", (once == 0 ? "" : "\n ")) < 0) + return (util_err(EIO, NULL)); + if (ret == 0 || ret == WT_NOTFOUND) + return (0); + return (util_cerr(uri, "next", ret)); +} + +/* + * dump_json_table_config -- + * Dump the config for the uri. + */ +static int +dump_json_table_config(WT_SESSION *session, const char *uri) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + WT_EXTENSION_API *wtext; + int tret; + const char *value; + + /* Dump the config. */ + if (WT_PREFIX_MATCH(uri, "table:")) { + /* Open a metadata cursor. */ + if ((ret = session->open_cursor( + session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) { + fprintf(stderr, "%s: %s: session.open_cursor: %s\n", + progname, WT_METADATA_URI, + wiredtiger_strerror(ret)); + return (1); + } + + /* + * Search for the object itself, to make sure it + * exists, and get its config string. This where we + * find out a table object doesn't exist, use a simple + * error message. 
 */
		cursor->set_key(cursor, uri);
		if ((ret = cursor->search(cursor)) == 0) {
			if ((ret = cursor->get_value(cursor, &value)) != 0)
				ret = util_cerr(uri, "get_value", ret);
			else if (dump_json_table_begin(cursor, uri,
			    value) != 0)
				ret = 1;
		} else if (ret == WT_NOTFOUND)
			ret = util_err(0, "%s: No such object exists", uri);
		else
			ret = util_err(ret, "%s", uri);

		/* Report a close failure only if nothing failed earlier. */
		if ((tret = cursor->close(cursor)) != 0) {
			tret = util_cerr(uri, "close", tret);
			if (ret == 0)
				ret = tret;
		}
	} else {
		/*
		 * We want to be able to dump the metadata file itself, but the
		 * configuration for that file lives in the turtle file.  Reach
		 * down into the library and ask for the file's configuration,
		 * that will work in all cases.
		 *
		 * This is where we find out a file object doesn't exist, use a
		 * simple error message.
		 */
		wtext = session->
		    connection->get_extension_api(session->connection);
		if ((ret =
		    wtext->metadata_search(wtext, session, uri, &value)) == 0) {
			if (dump_json_table_begin(NULL, uri, value) != 0)
				ret = 1;
		} else if (ret == WT_NOTFOUND)
			ret = util_err(0, "%s: No such object exists", uri);
		else
			ret = util_err(ret, "%s", uri);
	}

	return (ret);
}

/*
 * dump_json_table_end --
 *	Output the JSON syntax that ends a table.
 */
static int
dump_json_table_end(void)
{
	if (printf(" ]\n }\n ]") < 0)
		return (util_err(EIO, NULL));
	return (0);
}

/*
 * dump_table_config --
 *	Dump the config for a table: the table entry itself, then its
 *	column groups and indices.
 */
static int
dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
{
	WT_CURSOR *srch;
	WT_DECL_RET;
	int tret;
	const char *key, *name, *value;

	/* Get the table name: the part of the uri after the colon. */
	if ((name = strchr(uri, ':')) == NULL) {
		fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri);
		return (1);
	}
	++name;

	/*
	 * Dump out the config information: first, dump the uri entry itself
	 * (requires a lookup).
	 */
	cursor->set_key(cursor, uri);
	if ((ret = cursor->search(cursor)) != 0)
		return (util_cerr(uri, "search", ret));
	if ((ret = cursor->get_key(cursor, &key)) != 0)
		return (util_cerr(uri, "get_key", ret));
	if ((ret = cursor->get_value(cursor, &value)) != 0)
		return (util_cerr(uri, "get_value", ret));
	if (print_config(session, key, value, NULL) != 0)
		return (1);

	/*
	 * The underlying table configuration function needs a second cursor:
	 * open one before calling it, it makes error handling hugely simpler.
	 */
	if ((ret =
	    session->open_cursor(session, NULL, cursor, NULL, &srch)) != 0)
		return (util_cerr(uri, "open_cursor", ret));

	if ((ret = dump_table_config_type(
	    session, cursor, srch, uri, name, "colgroup:")) == 0)
		ret = dump_table_config_type(
		    session, cursor, srch, uri, name, "index:");

	if ((tret = srch->close(srch)) != 0) {
		tret = util_cerr(uri, "close", tret);
		if (ret == 0)
			ret = tret;
	}

	return (ret);
}

/*
 * dump_table_config_type --
 *	Dump the column groups or indices for a table; "entry" is the
 *	metadata prefix being scanned ("colgroup:" or "index:").
 */
static int
dump_table_config_type(WT_SESSION *session,
    WT_CURSOR *cursor, WT_CURSOR *srch,
    const char *uri, const char *name, const char *entry)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	const char *key, *skip, *value, *value_source;
	int exact;
	char *p;

	/*
	 * Search the file looking for column group and index key/value pairs:
	 * for each one, look up the related source information and append it
	 * to the base record.
	 */
	cursor->set_key(cursor, entry);
	if ((ret = cursor->search_near(cursor, &exact)) != 0) {
		if (ret == WT_NOTFOUND)
			return (0);
		return (util_cerr(uri, "search_near", ret));
	}
	/* exact >= 0: positioned at or after the prefix; process that key. */
	if (exact >= 0)
		goto match;
	while ((ret = cursor->next(cursor)) == 0) {
match:		if ((ret = cursor->get_key(cursor, &key)) != 0)
			return (util_cerr(uri, "get_key", ret));

		/* Check if we've finished the list of entries. */
		if (!WT_PREFIX_MATCH(key, entry))
			return (0);

		/* Check for a table name match. */
		skip = key + strlen(entry);
		if (strncmp(
		    skip, name, strlen(name)) != 0 || skip[strlen(name)] != ':')
			continue;

		/* Get the value. */
		if ((ret = cursor->get_value(cursor, &value)) != 0)
			return (util_cerr(uri, "get_value", ret));

		/* Crack it and get the underlying source. */
		if ((ret = __wt_config_getones(
		    (WT_SESSION_IMPL *)session, value, "source", &cval)) != 0)
			return (util_err(ret, "%s: source entry", key));

		/* Nul-terminate the source entry. */
		if ((p = malloc(cval.len + 10)) == NULL)
			return (util_err(errno, NULL));
		(void)strncpy(p, cval.str, cval.len);
		p[cval.len] = '\0';
		srch->set_key(srch, p);
		if ((ret = srch->search(srch)) != 0)
			ret = util_err(ret, "%s: %s", key, p);
		free(p);
		if (ret != 0)
			return (1);

		/* Get the source's value. */
		if ((ret = srch->get_value(srch, &value_source)) != 0)
			return (util_cerr(uri, "get_value", ret));

		/*
		 * The dumped configuration string is the original key plus the
		 * source's configuration.
		 */
		if (print_config(session, key, value, value_source) != 0)
			return (util_err(EIO, NULL));
	}
	if (ret == 0 || ret == WT_NOTFOUND)
		return (0);
	return (util_cerr(uri, "next", ret));
}

/*
 * dump_prefix --
 *	Output the dump file header prefix.
 */
static int
dump_prefix(int hex)
{
	int vmajor, vminor, vpatch;

	(void)wiredtiger_version(&vmajor, &vminor, &vpatch);

	if (printf(
	    "WiredTiger Dump (WiredTiger Version %d.%d.%d)\n",
	    vmajor, vminor, vpatch) < 0 ||
	    printf("Format=%s\n", hex ? "hex" : "print") < 0 ||
	    printf("Header\n") < 0)
		return (util_err(EIO, NULL));
	return (0);
}

/*
 * dump_record --
 *	Dump a single record, advance cursor to next/prev, along
 * with JSON formatting if needed.
 */
static int
dump_record(WT_CURSOR *cursor, const char *name, int reverse, int json)
{
	WT_DECL_RET;
	const char *infix, *key, *prefix, *suffix, *value;
	int once;

	once = 0;
	/* JSON emits comma-separated objects; plain dumps one line each. */
	if (json) {
		prefix = "\n{\n";
		infix = ",\n";
		suffix = "\n}";
	} else {
		prefix = "";
		infix = "\n";
		suffix = "\n";
	}
	while ((ret =
	    (reverse ? cursor->prev(cursor) : cursor->next(cursor))) == 0) {
		if ((ret = cursor->get_key(cursor, &key)) != 0)
			return (util_cerr(name, "get_key", ret));
		if ((ret = cursor->get_value(cursor, &value)) != 0)
			return (util_cerr(name, "get_value", ret));
		if (printf("%s%s%s%s%s%s", (json && once) ? "," : "",
		    prefix, key, infix, value, suffix) < 0)
			return (util_err(EIO, NULL));
		once = 1;
	}
	if (json && once && printf("\n") < 0)
		return (util_err(EIO, NULL));
	/* Walking off the end of the object is the normal exit. */
	return (ret == WT_NOTFOUND ? 0 :
	    util_cerr(name, (reverse ? "prev" : "next"), ret));
}

/*
 * dump_suffix --
 *	Output the dump file header suffix.
 */
static int
dump_suffix(void)
{
	if (printf("Data\n") < 0)
		return (util_err(EIO, NULL));
	return (0);
}

/*
 * dup_json_string --
 *	Like strdup, but escape any characters that are special for JSON.
 *	The result will be embedded in a JSON string.
 */
static int
dup_json_string(const char *str, char **result)
{
	size_t left, nchars;
	const char *p;
	char *q;

	/*
	 * First pass: size the escaped copy.
	 * NOTE(review): the extra nchars++ in the loop increment appears to
	 * over-count by one byte per input character, so the allocation is
	 * larger than strictly needed -- harmless, but confirm against
	 * __wt_json_unpack_char()'s return contract.
	 */
	nchars = 0;
	for (p = str; *p; p++, nchars++)
		nchars += __wt_json_unpack_char(*p, NULL, 0, 0);
	q = malloc(nchars + 1);
	if (q == NULL)
		return (1);
	*result = q;
	left = nchars;
	/* Second pass: emit the escaped bytes (nchars is reused per-char). */
	for (p = str; *p; p++, nchars++) {
		nchars = __wt_json_unpack_char(*p, (u_char *)q, left, 0);
		left -= nchars;
		q += nchars;
	}
	*q = '\0';
	return (0);
}

/*
 * print_config --
 *	Output a key/value URI pair by combining v1 and v2.
 */
static int
print_config(WT_SESSION *session,
    const char *key, const char *v1, const char *v2)
{
	WT_DECL_RET;
	const char *value_ret;

	/*
	 * The underlying call will ignore v2 if v1 is NULL -- check here and
	 * swap in that case.
	 */
	if (v1 == NULL) {
		v1 = v2;
		v2 = NULL;
	}

	if ((ret = __wt_session_create_strip(session, v1, v2, &value_ret)) != 0)
		return (util_err(ret, NULL));
	/* value_ret is allocated by the library; we own it and must free it. */
	ret = printf("%s\n%s\n", key, value_ret);
	free((char *)value_ret);
	if (ret < 0)
		return (util_err(EIO, NULL));
	return (0);
}

/*
 * usage --
 *	Display the command-line usage message for "wt dump".
 */
static int
usage(void)
{
	(void)fprintf(stderr,
	    "usage: %s %s "
	    "dump [-jrx] [-c checkpoint] [-f output-file] uri\n",
	    progname, usage_prefix);
	return (1);
}
diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c
new file mode 100644
index 00000000000..4a1489628d1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_list.c
@@ -0,0 +1,193 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "util.h"

static int list_print(WT_SESSION *, const char *, int, int);
static int list_print_checkpoint(WT_SESSION *, const char *);
static int usage(void);

/*
 * util_list --
 *	Implement the "wt list" command: parse the [-cv] options and an
 *	optional object name, then print matching metadata entries.
 */
int
util_list(WT_SESSION *session, int argc, char *argv[])
{
	WT_DECL_RET;
	int cflag, ch, vflag;
	char *name;

	cflag = vflag = 0;
	name = NULL;
	while ((ch = __wt_getopt(progname, argc, argv, "cv")) != EOF)
		switch (ch) {
		case 'c':		/* -c: show checkpoint information */
			cflag = 1;
			break;
		case 'v':		/* -v: show the full config value */
			vflag = 1;
			break;
		case '?':
		default:
			return (usage());
		}
	argc -= __wt_optind;
	argv += __wt_optind;

	switch (argc) {
	case 0:
		break;
	case 1:
		/* util_name allocates the qualified name; freed below. */
		if ((name = util_name(*argv, "table")) == NULL)
			return (1);
		break;
	default:
		return (usage());
	}

	ret = list_print(session, name, cflag, vflag);

	if (name != NULL)
		free(name);

	return (ret);
}

/*
 * list_print --
 *	List the high-level objects in the database; if "name" is non-NULL
 *	only objects with that prefix are shown.
 */
static int
list_print(WT_SESSION *session, const char *name, int cflag, int vflag)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;
	int found;
	const char *key, *value;

	/* Open the metadata file. */
	if ((ret = session->open_cursor(
	    session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) {
		/*
		 * If there is no metadata (yet), this will return ENOENT.
		 * Treat that the same as an empty metadata.
		 */
		if (ret == ENOENT)
			return (0);

		fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
		    progname, WT_METADATA_URI, wiredtiger_strerror(ret));
		return (1);
	}

	found = name == NULL;
	while ((ret = cursor->next(cursor)) == 0) {
		/* Get the key. */
		if ((ret = cursor->get_key(cursor, &key)) != 0)
			return (util_cerr("metadata", "get_key", ret));

		/*
		 * If a name is specified, only show objects that match.
		 */
		if (name != NULL) {
			if (!WT_PREFIX_MATCH(key, name))
				continue;
			found = 1;
		}

		/*
		 * XXX
		 * We don't normally say anything about the WiredTiger
		 * metadata, it's not a normal "object" in the database.  I'm
		 * making an exception for the checkpoint and verbose options.
		 */
		if (strcmp(key, WT_METADATA_URI) != 0 || cflag || vflag)
			printf("%s\n", key);

		if (!cflag && !vflag)
			continue;

		if (cflag && (ret = list_print_checkpoint(session, key)) != 0)
			return (ret);
		if (vflag) {
			if ((ret = cursor->get_value(cursor, &value)) != 0)
				return (
				    util_cerr("metadata", "get_value", ret));
			printf("%s\n", value);
		}
	}
	if (ret != WT_NOTFOUND)
		return (util_cerr("metadata", "next", ret));
	if (!found) {
		fprintf(stderr, "%s: %s: not found\n", progname, name);
		return (1);
	}

	return (0);
}

/*
 * list_print_checkpoint --
 *	List the checkpoint information: name, timestamp and size for each
 *	checkpoint of the object.
 */
static int
list_print_checkpoint(WT_SESSION *session, const char *key)
{
	WT_DECL_RET;
	WT_CKPT *ckpt, *ckptbase;
	size_t len;
	time_t t;
	uint64_t v;

	/*
	 * We may not find any checkpoints for this file, in which case we don't
	 * report an error, and continue our caller's loop.  Otherwise, read the
	 * list of checkpoints and print each checkpoint's name and time.
	 */
	if ((ret = __wt_metadata_get_ckptlist(session, key, &ckptbase)) != 0)
		return (ret == WT_NOTFOUND ? 0 : ret);

	/* Find the longest name, so we can pretty-print. */
	len = 0;
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (strlen(ckpt->name) > len)
			len = strlen(ckpt->name);
	++len;

	WT_CKPT_FOREACH(ckptbase, ckpt) {
		/*
		 * Call ctime, not ctime_r; ctime_r has portability problems,
		 * the Solaris version is different from the POSIX standard.
		 */
		t = (time_t)ckpt->sec;
		printf("\t%*s: %.24s", (int)len, ckpt->name, ctime(&t));

		/* Print the size scaled to the largest fitting unit. */
		v = ckpt->ckpt_size;
		if (v >= WT_PETABYTE)
			printf(" (%" PRIu64 " PB)\n", v / WT_PETABYTE);
		else if (v >= WT_TERABYTE)
			printf(" (%" PRIu64 " TB)\n", v / WT_TERABYTE);
		else if (v >= WT_GIGABYTE)
			printf(" (%" PRIu64 " GB)\n", v / WT_GIGABYTE);
		else if (v >= WT_MEGABYTE)
			printf(" (%" PRIu64 " MB)\n", v / WT_MEGABYTE);
		else if (v >= WT_KILOBYTE)
			printf(" (%" PRIu64 " KB)\n", v / WT_KILOBYTE);
		else
			printf(" (%" PRIu64 " B)\n", v);
	}

	__wt_metadata_free_ckptlist(session, ckptbase);
	return (0);
}

/*
 * usage --
 *	Display the command-line usage message for "wt list".
 */
static int
usage(void)
{
	(void)fprintf(stderr,
	    "usage: %s %s "
	    "list [-cv] [uri]\n",
	    progname, usage_prefix);
	return (1);
}
diff --git a/src/third_party/wiredtiger/src/utilities/util_load.c b/src/third_party/wiredtiger/src/utilities/util_load.c
new file mode 100644
index 00000000000..7d9dfa445dc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_load.c
@@ -0,0 +1,595 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "util.h"
#include "util_load.h"

static int config_read(char ***, int *);
static int config_rename(char **, const char *);
static void config_remove(char *, const char *);
static int format(void);
static int insert(WT_CURSOR *, const char *);
static int load_dump(WT_SESSION *);
static int usage(void);

static int append;		/* -a append (ignore record number keys) */
static char *cmdname;		/* -r rename */
static char **cmdconfig;	/* configuration pairs */
static int json;		/* -j input is JSON format */
static int no_overwrite;	/* -n don't overwrite existing data */

/*
 * util_load --
 *	Implement the "wt load" command: parse options, then load either a
 *	JSON dump or a standard WiredTiger dump from stdin (or -f file).
 */
int
util_load(WT_SESSION *session, int argc, char *argv[])
{
	int ch;
	const char *filename;
	uint32_t flags;

	flags = 0;

	filename = "<stdin>";
	while ((ch = __wt_getopt(progname, argc, argv, "af:jnr:")) != EOF)
		switch (ch) {
		case 'a':	/* append (ignore record number keys) */
			append = 1;
			break;
		case 'f':	/* input file: redirect stdin so the rest of
				   the code can read without special cases */
			if (freopen(__wt_optarg, "r", stdin) == NULL)
				return (
				    util_err(errno, "%s: reopen", __wt_optarg));
			else
				filename = __wt_optarg;
			break;
		case 'j':	/* input is JSON */
			json = 1;
			break;
		case 'n':	/* don't overwrite existing data */
			no_overwrite = 1;
			break;
		case 'r':	/* rename */
			cmdname = __wt_optarg;
			break;
		case '?':
		default:
			return (usage());
		}
	argc -= __wt_optind;
	argv += __wt_optind;

	/* -a and -n are mutually exclusive. */
	if (append == 1 && no_overwrite == 1)
		return (util_err(EINVAL,
		    "the -a (append) and -n (no-overwrite) flags are mutually "
		    "exclusive"));

	/* The remaining arguments are configuration uri/string pairs. */
	if (argc != 0) {
		if (argc % 2 != 0)
			return (usage());
		cmdconfig = argv;
	}

	if (json) {
		if (append)
			flags |= LOAD_JSON_APPEND;
		if (no_overwrite)
			flags |= LOAD_JSON_NO_OVERWRITE;
		return (util_load_json(session, filename, flags));
	} else
		return (load_dump(session));
}

/*
 * load_dump --
 *	Load from the WiredTiger dump format: read the header, create the
 *	objects it describes, then insert the data records.
 */
static int
load_dump(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;
	int hex, tret;
	char **list, **tlist, *uri, config[64];

	cursor = NULL;
	list = NULL;		/* -Wuninitialized */
	hex = 0;		/* -Wuninitialized */
	uri = NULL;

	/* Read the metadata file. */
	if ((ret = config_read(&list, &hex)) != 0)
		return (ret);

	/* Reorder and check the list. */
	if ((ret = config_reorder(list)) != 0)
		goto err;

	/* Update the config based on any command-line configuration. */
	if ((ret = config_update(session, list)) != 0)
		goto err;

	uri = list[0];
	/* Create the items in the list. */
	if ((ret = config_exec(session, list)) != 0)
		goto err;

	/* Open the insert cursor. */
	(void)snprintf(config, sizeof(config),
	    "dump=%s%s%s",
	    hex ? "hex" : "print",
	    append ? ",append" : "", no_overwrite ? ",overwrite=false" : "");
	if ((ret = session->open_cursor(
	    session, uri, NULL, config, &cursor)) != 0) {
		ret = util_err(ret, "%s: session.open", uri);
		goto err;
	}

	/*
	 * Check the append flag (it only applies to objects where the primary
	 * key is a record number).
	 */
	if (append && strcmp(cursor->key_format, "r") != 0) {
		fprintf(stderr,
		    "%s: %s: -a option illegal unless the primary key is a "
		    "record number\n",
		    progname, uri);
		ret = 1;
	} else
		ret = insert(cursor, uri);

err:	/*
	 * Technically, we don't have to close the cursor because the session
	 * handle will do it for us, but I'd like to see the flush to disk and
	 * the close succeed, it's better to fail early when loading files.
	 */
	if (cursor != NULL && (tret = cursor->close(cursor)) != 0) {
		tret = util_err(tret, "%s: cursor.close", uri);
		if (ret == 0)
			ret = tret;
	}
	if (ret == 0)
		ret = util_flush(session, uri);

	for (tlist = list; *tlist != NULL; ++tlist)
		free(*tlist);
	free(list);

	return (ret == 0 ? 0 : 1);
}

/*
 * config_exec --
 *	Create the tables/indices/colgroups implied by the list.
+ */ +int +config_exec(WT_SESSION *session, char **list) +{ + WT_DECL_RET; + + for (; *list != NULL; list += 2) + if ((ret = session->create(session, list[0], list[1])) != 0) + return (util_err(ret, "%s: session.create", list[0])); + return (0); +} + +/* + * config_list_free -- + * Add a value to the config list. + */ +int +config_list_add(CONFIG_LIST *clp, char *val) +{ + if (clp->entry + 1 >= clp->max_entry) + if ((clp->list = realloc(clp->list, (size_t) + (clp->max_entry += 100) * sizeof(char *))) == NULL) + /* List already freed by realloc. */ + return (util_err(errno, NULL)); + + clp->list[clp->entry++] = val; + clp->list[clp->entry] = NULL; + return (0); +} + +/* + * config_list_free -- + * Free the list and any of its entries. + */ +void +config_list_free(CONFIG_LIST *clp) +{ + char **entry; + + if (clp->list != NULL) + for (entry = &clp->list[0]; *entry != NULL; entry++) + free(*entry); + free(clp->list); + clp->list = NULL; +} + +/* + * config_read -- + * Read the config lines and do some basic validation. + */ +static int +config_read(char ***listp, int *hexp) +{ + ULINE l; + WT_DECL_RET; + int entry, eof, max_entry; + const char *s; + char **list, **tlist; + + list = NULL; + memset(&l, 0, sizeof(l)); + + /* Header line #1: "WiredTiger Dump" and a WiredTiger version. */ + if (util_read_line(&l, 0, &eof)) + return (1); + s = "WiredTiger Dump "; + if (strncmp(l.mem, s, strlen(s)) != 0) + return (format()); + + /* Header line #2: "Format={hex,print}". */ + if (util_read_line(&l, 0, &eof)) + return (1); + if (strcmp(l.mem, "Format=print") == 0) + *hexp = 0; + else if (strcmp(l.mem, "Format=hex") == 0) + *hexp = 1; + else + return (format()); + + /* Header line #3: "Header". */ + if (util_read_line(&l, 0, &eof)) + return (1); + if (strcmp(l.mem, "Header") != 0) + return (format()); + + /* Now, read in lines until we get to the end of the headers. 
*/ + for (entry = max_entry = 0, list = NULL;; ++entry) { + if ((ret = util_read_line(&l, 0, &eof)) != 0) + goto err; + if (strcmp(l.mem, "Data") == 0) + break; + + /* + * Grow the array of header lines as necessary -- we need an + * extra slot for NULL termination. + */ + if (entry + 1 >= max_entry) { + if ((tlist = realloc(list, (size_t) + (max_entry += 100) * sizeof(char *))) == NULL) { + ret = util_err(errno, NULL); + + /* + * List already freed by realloc, still use err + * label for consistency. + */ + list = NULL; + goto err; + } + list = tlist; + } + if ((list[entry] = strdup(l.mem)) == NULL) { + ret = util_err(errno, NULL); + goto err; + } + list[entry + 1] = NULL; + } + + /* Headers are required, and they're supposed to be in pairs. */ + if (list == NULL || entry % 2 != 0) { + ret = format(); + goto err; + } + *listp = list; + return (0); + +err: if (list != NULL) { + for (tlist = list; *tlist != NULL; ++tlist) + free(*tlist); + free(list); + } + return (ret); +} + +/* + * config_reorder -- + * For table dumps, reorder the list so tables are first. + * For other dumps, make any needed checks. + */ +int +config_reorder(char **list) +{ + char **entry, *p; + + /* + * Search for a table name -- if we find one, then it's table dump, + * otherwise, it's a single file dump. + */ + for (entry = list; *entry != NULL; ++entry) + if (WT_PREFIX_MATCH(*entry, "table:")) + break; + if (*entry == NULL) { + /* + * Single file dumps can only have two lines, the file name and + * the configuration information. + */ + if ((list[0] == NULL || list[1] == NULL || list[2] != NULL) || + (WT_PREFIX_MATCH(list[0], "file:") && + WT_PREFIX_MATCH(list[0], "lsm:"))) + return (format()); + + entry = list; + } + + /* + * Make sure the table key/value pair comes first, then we can just + * run through the array in order. (We already checked that we had + * a multiple of 2 entries, so this is safe.) 
+ */ + if (entry != list) { + p = list[0]; list[0] = entry[0]; entry[0] = p; + p = list[1]; list[1] = entry[1]; entry[1] = p; + } + return (0); +} + +/* + * config_update -- + * Reconcile and update the command line configuration against the + * config we found. + */ +int +config_update(WT_SESSION *session, char **list) +{ + int found; + const char *cfg[] = { NULL, NULL, NULL }; + char **configp, **listp; + const char **rm; + static const char *rmnames[] = { + "filename", "id", "checkpoint", "checkpoint_lsn", + "version", "source", NULL }; + + /* + * If the object has been renamed, replace all of the column group, + * index, file and table names with the new name. + */ + if (cmdname != NULL) { + for (listp = list; *listp != NULL; listp += 2) + if (WT_PREFIX_MATCH(*listp, "colgroup:") || + WT_PREFIX_MATCH(*listp, "file:") || + WT_PREFIX_MATCH(*listp, "index:") || + WT_PREFIX_MATCH(*listp, "table:")) + if (config_rename(listp, cmdname)) + return (1); + + /* + * If the object was renamed, and there are configuration pairs, + * rename the configuration pairs as well, because we don't know + * if the user used the old or new names for the pair's URI. + */ + for (configp = cmdconfig; + cmdconfig != NULL && *configp != NULL; configp += 2) + if (config_rename(configp, cmdname)) + return (1); + } + + /* + * Remove all "filename=", "source=" and other configurations + * that foil loading from the values. New filenames are chosen + * as part of table load. + */ + for (listp = list; *listp != NULL; listp += 2) + for (rm = rmnames; *rm != NULL; rm++) + if (strstr(listp[1], *rm) != NULL) + config_remove(listp[1], *rm); + + /* + * It's possible to update everything except the key/value formats. + * If there were command-line configuration pairs, walk the list of + * command-line configuration strings, and check. 
+ */ + for (configp = cmdconfig; + cmdconfig != NULL && *configp != NULL; configp += 2) + if (strstr(configp[1], "key_format=") || + strstr(configp[1], "value_format=")) + return (util_err(0, + "the command line configuration string may not " + "modify the object's key or value format")); + + /* + * If there were command-line configuration pairs, walk the list of + * command-line URIs and find a matching dump URI. For each match, + * rewrite the dump configuration as described by the command-line + * configuration. It is an error if a command-line URI doesn't find + * a single, exact match, that's likely a mistake. + */ + for (configp = cmdconfig; + cmdconfig != NULL && *configp != NULL; configp += 2) { + found = 0; + for (listp = list; *listp != NULL; listp += 2) { + if (strncmp(*configp, listp[0], strlen(*configp)) != 0) + continue; + /* + * !!! + * We support JSON configuration strings, which leads to + * configuration strings with brackets. Unfortunately, + * that implies we can't simply append new configuration + * strings to existing ones. We call an unpublished + * WiredTiger API to do the concatenation: if anyone + * else ever needs it we can make it public, but I think + * that's unlikely. We're also playing fast and loose + * with types, but it should work. + */ + cfg[0] = listp[1]; + cfg[1] = configp[1]; + if (__wt_config_concat( + (WT_SESSION_IMPL *)session, cfg, + (const char **)&listp[1]) != 0) + return (1); + ++found; + } + switch (found) { + case 0: + return (util_err(0, + "the command line object name %s was not matched " + "by any loaded object name", *configp)); + case 1: + break; + default: + return (util_err(0, + "the command line object name %s was not unique, " + "matching more than a single loaded object name", + *configp)); + } + } + + /* Leak the memory, I don't care. */ + return (0); +} + +/* + * config_rename -- + * Update the URI name. 
+ */ +static int +config_rename(char **urip, const char *name) +{ + size_t len; + char *buf, *p; + + /* Allocate room. */ + len = strlen(*urip) + strlen(name) + 10; + if ((buf = malloc(len)) == NULL) + return (util_err(errno, NULL)); + + /* + * Find the separating colon characters, but not the trailing one may + * not be there. + */ + if ((p = strchr(*urip, ':')) == NULL) { + free(buf); + return (format()); + } + *p = '\0'; + p = strchr(p + 1, ':'); + snprintf(buf, len, "%s:%s%s", *urip, name, p == NULL ? "" : p); + *urip = buf; + + return (0); +} + +/* + * config_remove -- + * Remove a single config key and its value. + */ +static void +config_remove(char *config, const char *ckey) +{ + int parens, quoted; + char *begin, match[100], *next, *p; + + snprintf(match, sizeof(match), "%s=", ckey); + if ((begin = strstr(config, match)) != NULL) { + parens = 0; + quoted = 0; + next = NULL; + for (p = begin + strlen(match); !next && *p; p++) + switch (*p) { + case '(': + if (!quoted) + parens++; + break; + case ')': + if (!quoted) + parens--; + break; + case '"': + quoted = !quoted; + break; + case ',': + if (!quoted && parens == 0) + next = p + 1; + break; + } + if (next) + memmove(begin, next, strlen(next) + 1); + else + *begin = '\0'; + } +} + +/* + * format -- + * The input doesn't match the dump format. + */ +static int +format(void) +{ + return (util_err(0, "input does not match WiredTiger dump format")); +} + +/* + * insert -- + * Read and insert data. + */ +static int +insert(WT_CURSOR *cursor, const char *name) +{ + ULINE key, value; + WT_DECL_RET; + uint64_t insert_count; + int eof; + + memset(&key, 0, sizeof(key)); + memset(&value, 0, sizeof(value)); + + /* Read key/value pairs and insert them into the file. */ + for (insert_count = 0;;) { + /* + * Three modes: in row-store, we always read a key and use it, + * in column-store, we might read it (a dump), we might read + * and ignore it (a dump with "append" set), or not read it at + * all (flat-text load). 
+ */ + if (util_read_line(&key, 1, &eof)) + return (1); + if (eof == 1) + break; + if (!append) + cursor->set_key(cursor, key.mem); + + if (util_read_line(&value, 0, &eof)) + return (1); + cursor->set_value(cursor, value.mem); + + if ((ret = cursor->insert(cursor)) != 0) + return (util_err(ret, "%s: cursor.insert", name)); + + /* Report on progress every 100 inserts. */ + if (verbose && ++insert_count % 100 == 0) { + printf("\r\t%s: %" PRIu64, name, insert_count); + fflush(stdout); + } + } + + if (verbose) + printf("\r\t%s: %" PRIu64 "\n", name, insert_count); + + return (0); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "load [-as] [-f input-file] [-r name] [object configuration ...]\n", + progname, usage_prefix); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_load.h b/src/third_party/wiredtiger/src/utilities/util_load.h new file mode 100644 index 00000000000..7bca677e178 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_load.h @@ -0,0 +1,27 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * A list of configuration strings. 
 */
typedef struct {
	char **list;	/* array of alternating (uri, config) values */
	int entry;	/* next entry available in list */
	int max_entry;	/* how many allocated in list */
} CONFIG_LIST;

int config_exec(WT_SESSION *, char **);
int config_list_add(CONFIG_LIST *, char *);
void config_list_free(CONFIG_LIST *);
int config_reorder(char **);
int config_update(WT_SESSION *, char **);

/* Flags for util_load_json */
#define LOAD_JSON_APPEND	0x0001	/* append (ignore record number keys) */
#define LOAD_JSON_NO_OVERWRITE	0x0002	/* don't overwrite existing data */

int util_load_json(WT_SESSION *, const char *, uint32_t);
diff --git a/src/third_party/wiredtiger/src/utilities/util_load_json.c b/src/third_party/wiredtiger/src/utilities/util_load_json.c
new file mode 100644
index 00000000000..fb61df9ab16
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_load_json.c
@@ -0,0 +1,573 @@
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "util.h"
#include "util_load.h"

/*
 * Encapsulates the input state for parsing JSON.
 *
 * At any time, we may be peeking at an unconsumed token; this is
 * indicated by 'peeking' as true.  toktype, tokstart, toklen will be
 * set in this case.
 *
 * Generally we are collecting and processing tokens one by one.
 * In JSON, tokens never span lines so this makes processing easy.
 * The exception is that a JSON dump cursor takes the complete
 * set of keys or values during cursor->set_key/set_value calls,
 * which may contain many tokens and span lines.  E.g.
 *   cursor->set_value("\"name\" : \"John\", \"phone\" : 2348765");
 * The raw key/value string is collected in the kvraw field.
 */
typedef struct {
	WT_SESSION *session;	/* associated session */
	ULINE line;		/* current line */
	const char *p;		/* points to cur position in line.mem */
	int ateof;		/* current token is EOF */
	int peeking;		/* peeking at next token */
	int toktype;		/* next token, defined by __wt_json_token() */
	const char *tokstart;	/* next token start (points into line.mem) */
	size_t toklen;		/* next token length */
	char *kvraw;		/* multiple line raw content collected so far */
	size_t kvrawstart;	/* pos on cur line that JSON key/value starts */
	const char *filename;	/* filename for error reporting */
	int linenum;		/* line number for error reporting */
} JSON_INPUT_STATE;

static int json_column_group_index(WT_SESSION *, JSON_INPUT_STATE *,
    CONFIG_LIST *, int);
static int json_data(WT_SESSION *, JSON_INPUT_STATE *, CONFIG_LIST *, uint32_t);
static int json_expect(WT_SESSION *, JSON_INPUT_STATE *, int);
static int json_peek(WT_SESSION *, JSON_INPUT_STATE *);
static int json_skip(WT_SESSION *, JSON_INPUT_STATE *, const char **);
static int json_kvraw_append(JSON_INPUT_STATE *, const char *, size_t);
static int json_strdup(JSON_INPUT_STATE *, char **);
static int json_top_level(WT_SESSION *, JSON_INPUT_STATE *, uint32_t);

/*
 * True if the current string token (still carrying its surrounding quotes)
 * is exactly equal to "match".
 */
#define JSON_STRING_MATCH(ins, match)					\
	((ins)->toklen - 2 == strlen(match) &&				\
	strncmp((ins)->tokstart + 1, (match), (ins)->toklen - 2) == 0)

/* Byte offset of the parse position within the current line buffer. */
#define JSON_INPUT_POS(ins)						\
	((size_t)((ins)->p - (const char *)(ins)->line.mem))

/* Consume a token of the given type, or jump to the local err label. */
#define JSON_EXPECT(session, ins, tok) do {				\
	if (json_expect(session, ins, tok))				\
		goto err;						\
} while (0)

/*
 * json_column_group_index --
 *	Parse a column group or index entry from JSON input.
 */
static int
json_column_group_index(WT_SESSION *session, JSON_INPUT_STATE *ins,
    CONFIG_LIST *clp, int idx)
{
	WT_DECL_RET;
	char *config, *p, *uri;
	int isconfig;

	uri = NULL;
	config = NULL;

	/* Each entry is a {"uri": ..., "config": ...} object, either order. */
	while (json_peek(session, ins) == '{') {
		JSON_EXPECT(session, ins, '{');
		JSON_EXPECT(session, ins, 's');
		isconfig = JSON_STRING_MATCH(ins, "config");
		if (!isconfig && !JSON_STRING_MATCH(ins, "uri"))
			goto err;
		JSON_EXPECT(session, ins, ':');
		JSON_EXPECT(session, ins, 's');

		if ((ret = json_strdup(ins, &p)) != 0) {
			ret = util_err(ret, NULL);
			goto err;
		}
		if (isconfig)
			config = p;
		else
			uri = p;

		/* The second key must be the one we didn't just consume. */
		isconfig = !isconfig;
		JSON_EXPECT(session, ins, ',');
		JSON_EXPECT(session, ins, 's');
		if (!JSON_STRING_MATCH(ins, isconfig ? "config" : "uri"))
			goto err;
		JSON_EXPECT(session, ins, ':');
		JSON_EXPECT(session, ins, 's');

		if ((ret = json_strdup(ins, &p)) != 0) {
			ret = util_err(ret, NULL);
			goto err;
		}
		if (isconfig)
			config = p;
		else
			uri = p;
		JSON_EXPECT(session, ins, '}');
		if ((idx && strncmp(uri, "index:", 6) != 0) ||
		    (!idx && strncmp(uri, "colgroup:", 9) != 0)) {
			ret = util_err(EINVAL,
			    "%s: misplaced colgroup or index", uri);
			goto err;
		}
		/* On success the config list takes ownership of both strings. */
		if ((ret = config_list_add(clp, uri)) != 0 ||
		    (ret = config_list_add(clp, config)) != 0)
			goto err;

		if (json_peek(session, ins) != ',')
			break;
		JSON_EXPECT(session, ins, ',');
		if (json_peek(session, ins) != '{')
			goto err;
	}
	if (0) {
		/*
		 * NOTE(review): error paths can leak the most recently
		 * duplicated uri/config strings; presumably acceptable in a
		 * short-lived command-line utility -- confirm if this code is
		 * ever reused in a long-running context.
		 */
err:		if (ret == 0)
			ret = EINVAL;
	}
	return (ret);
}

/*
 * json_kvraw_append --
 *	Append to the kvraw buffer, which is used to collect all the
 * raw key/value pairs from JSON input.
 */
static int json_kvraw_append(JSON_INPUT_STATE *ins, const char *str, size_t len)
{
	char *tmp;
	size_t needsize;

	if (len > 0) {
		/* +2: one for the joining space, one for the terminator. */
		needsize = strlen(ins->kvraw) + len + 2;
		if ((tmp = malloc(needsize)) == NULL)
			return (util_err(errno, NULL));
		snprintf(tmp, needsize, "%s %.*s", ins->kvraw, (int)len, str);
		free(ins->kvraw);
		ins->kvraw = tmp;
	}
	return (0);
}

/*
 * json_strdup --
 *	Return a string, with no escapes or other JSON-isms, from the
 * JSON string at the current input position.
 */
static int
json_strdup(JSON_INPUT_STATE *ins, char **resultp)
{
	WT_DECL_RET;
	char *result, *resultcpy;
	const char *src;
	ssize_t resultlen;
	size_t srclen;

	result = NULL;
	src = ins->tokstart + 1;	/* strip "" from token */
	srclen = ins->toklen - 2;
	if ((resultlen = __wt_json_strlen(src, srclen)) < 0) {
		ret = util_err(EINVAL, "Invalid config string");
		goto err;
	}
	resultlen += 1;			/* room for the terminating NUL */
	if ((result = (char *)malloc((size_t)resultlen)) == NULL) {
		ret = util_err(errno, NULL);
		goto err;
	}
	*resultp = result;
	resultcpy = result;
	if ((ret = __wt_json_strncpy(&resultcpy, (size_t)resultlen, src,
	    srclen))
	    != 0) {
		ret = util_err(ret, NULL);
		goto err;
	}

	if (0) {
err:		if (ret == 0)
			ret = EINVAL;
		if (result != NULL)
			free(result);
		*resultp = NULL;
	}
	return (ret);
}

/*
 * json_data --
 *	Parse the data portion of the JSON input, and insert all
 * values.
 */
static int
json_data(WT_SESSION *session, JSON_INPUT_STATE *ins, CONFIG_LIST *clp,
    uint32_t flags)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;
	char config[64], *endp, *uri;
	const char *keyformat;
	int isrec, nfield, nkeys, toktype, tret;
	size_t keystrlen;
	ssize_t gotnolen;
	uint64_t gotno, recno;

	cursor = NULL;
	uri = NULL;

	/* Reorder and check the list. */
	if ((ret = config_reorder(clp->list)) != 0)
		goto err;

	/* Update config based on command-line configuration. */
	if ((ret = config_update(session, clp->list)) != 0)
		goto err;

	/* Create the items collected. */
	if ((ret = config_exec(session, clp->list)) != 0)
		goto err;

	uri = clp->list[0];
	(void)snprintf(config, sizeof(config),
	    "dump=json%s%s",
	    LF_ISSET(LOAD_JSON_APPEND) ? ",append" : "",
	    LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? ",overwrite=false" : "");
	if ((ret = session->open_cursor(
	    session, uri, NULL, config, &cursor)) != 0) {
		ret = util_err(ret, "%s: session.open", uri);
		goto err;
	}
	/*
	 * Count the key columns from the cursor's key format: every
	 * non-digit character is a type code, hence one key column.
	 */
	keyformat = cursor->key_format;
	isrec = (strcmp(keyformat, "r") == 0);
	for (nkeys = 0; *keyformat; keyformat++)
		if (!isdigit(*keyformat))
			nkeys++;

	recno = 0;
	while (json_peek(session, ins) == '{') {
		nfield = 0;
		JSON_EXPECT(session, ins, '{');
		if (ins->kvraw == NULL) {
			if ((ins->kvraw = (char *)malloc(1)) == NULL) {
				ret = util_err(errno, NULL);
				goto err;
			}
		}
		ins->kvraw[0] = '\0';
		ins->kvrawstart = JSON_INPUT_POS(ins);
		keystrlen = 0;
		while (json_peek(session, ins) == 's') {
			JSON_EXPECT(session, ins, 's');
			JSON_EXPECT(session, ins, ':');
			toktype = json_peek(session, ins);
			JSON_EXPECT(session, ins, toktype);
			if (isrec && nfield == 0) {
				/* Verify the dump has recnos in order. */
				recno++;
				gotno = __wt_strtouq(ins->tokstart, &endp, 0);
				gotnolen = (endp - ins->tokstart);
				if (recno != gotno ||
				    ins->toklen != (size_t)gotnolen) {
					ret = util_err(0,
					    "%s: recno out of order", uri);
					goto err;
				}
			}
			/*
			 * Once the key columns have been consumed, remember
			 * where the key text ends within kvraw.
			 */
			if (++nfield == nkeys) {
				size_t curpos = JSON_INPUT_POS(ins);
				if ((ret = json_kvraw_append(ins,
				    (char *)ins->line.mem + ins->kvrawstart,
				    curpos - ins->kvrawstart)) != 0)
					goto err;
				ins->kvrawstart = curpos;
				keystrlen = strlen(ins->kvraw);
			}
			if (json_peek(session, ins) != ',')
				break;
			JSON_EXPECT(session, ins, ',');
			if (json_peek(session, ins) != 's')
				goto err;
		}
		if (json_kvraw_append(ins, ins->line.mem, JSON_INPUT_POS(ins)))
			goto err;

		/* Split kvraw into the key part and the value part. */
		ins->kvraw[keystrlen] = '\0';
		if (!LF_ISSET(LOAD_JSON_APPEND))
			cursor->set_key(cursor, ins->kvraw);
		/* skip over inserted space and comma */
		cursor->set_value(cursor, &ins->kvraw[keystrlen+2]);
		if ((ret = cursor->insert(cursor)) != 0) {
			ret = util_err(ret, "%s: cursor.insert", uri);
			goto err;
		}

		JSON_EXPECT(session, ins, '}');
		if (json_peek(session, ins) != ',')
			break;
		JSON_EXPECT(session, ins, ',');
		if (json_peek(session, ins) != '{')
			goto err;
	}
	if (0) {
err:		if (ret == 0)
			ret = EINVAL;
	}
	/*
	 * Technically, we don't have to close the cursor because the session
	 * handle will do it for us, but I'd like to see the flush to disk and
	 * the close succeed, it's better to fail early when loading files.
	 */
	if (cursor != NULL && (tret = cursor->close(cursor)) != 0) {
		tret = util_err(tret, "%s: cursor.close", uri);
		if (ret == 0)
			ret = tret;
	}
	if (ret == 0)
		ret = util_flush(session, uri);
	return (ret);
}

/*
 * json_top_level --
 *	Parse the top level JSON input.
+ */ +static int +json_top_level(WT_SESSION *session, JSON_INPUT_STATE *ins, uint32_t flags) +{ + CONFIG_LIST cl; + WT_DECL_RET; + char *config, *tableuri; + int toktype; + static const char *json_markers[] = { + "\"config\"", "\"colgroups\"", "\"indices\"", "\"data\"", NULL }; + + memset(&cl, 0, sizeof(cl)); + tableuri = NULL; + JSON_EXPECT(session, ins, '{'); + while (json_peek(session, ins) == 's') { + JSON_EXPECT(session, ins, 's'); + tableuri = realloc(tableuri, ins->toklen); + snprintf(tableuri, ins->toklen, "%.*s", + (int)(ins->toklen - 2), ins->tokstart + 1); + JSON_EXPECT(session, ins, ':'); + + /* + * Allow any ordering of 'config', 'colgroups', + * 'indices' before 'data', which must appear last. + * The non-'data' items build up a list of entries + * that created in our session before the data is + * inserted. + */ + for (;;) { + if (json_skip(session, ins, json_markers) != 0) + goto err; + JSON_EXPECT(session, ins, 's'); + if (JSON_STRING_MATCH(ins, "config")) { + JSON_EXPECT(session, ins, ':'); + JSON_EXPECT(session, ins, 's'); + if ((ret = json_strdup(ins, &config)) != 0) { + ret = util_err(ret, NULL); + goto err; + } + if ((ret = config_list_add(&cl, tableuri)) != 0) + goto err; + if ((ret = config_list_add(&cl, config)) != 0) + goto err; + tableuri = NULL; + } else if (JSON_STRING_MATCH(ins, "colgroups")) { + JSON_EXPECT(session, ins, ':'); + JSON_EXPECT(session, ins, '['); + if ((ret = json_column_group_index( + session, ins, &cl, 0)) != 0) + goto err; + JSON_EXPECT(session, ins, ']'); + } else if (JSON_STRING_MATCH(ins, "indices")) { + JSON_EXPECT(session, ins, ':'); + JSON_EXPECT(session, ins, '['); + if ((ret = json_column_group_index( + session, ins, &cl, 1)) != 0) + goto err; + JSON_EXPECT(session, ins, ']'); + } else if (JSON_STRING_MATCH(ins, "data")) { + JSON_EXPECT(session, ins, ':'); + JSON_EXPECT(session, ins, '['); + if ((ret = json_data(session, ins, &cl, + flags)) != 0) + goto err; + config_list_free(&cl); + break; + } + else + goto 
err; + } + + while ((toktype = json_peek(session, ins)) == '}' || + toktype == ']') + JSON_EXPECT(session, ins, toktype); + if (toktype == 0) /* Check EOF. */ + break; + if (toktype == ',') { + JSON_EXPECT(session, ins, ','); + if (json_peek(session, ins) != 's') + goto err; + continue; + } + } + JSON_EXPECT(session, ins, 0); + + if (0) { +err: if (ret == 0) + ret = EINVAL; + } + config_list_free(&cl); + if (tableuri != NULL) + free(tableuri); + return (ret); +} + +/* + * json_peek -- + * Set the input state to the next available token in the input + * and return its token type, a code defined by __wt_json_token(). + */ +static int +json_peek(WT_SESSION *session, JSON_INPUT_STATE *ins) +{ + WT_DECL_RET; + + if (!ins->peeking) { + while (!ins->ateof) { + while (isspace(*ins->p)) + ins->p++; + if (*ins->p) + break; + if (ins->kvraw != NULL) { + if (json_kvraw_append(ins, + (char *)ins->line.mem + ins->kvrawstart, + strlen(ins->line.mem) - ins->kvrawstart)) { + ret = -1; + goto err; + } + ins->kvrawstart = 0; + } + if (util_read_line(&ins->line, 1, + &ins->ateof)) { + ins->toktype = -1; + ret = -1; + goto err; + } + ins->linenum++; + ins->p = (const char *)ins->line.mem; + } + if (ins->ateof) + ins->toktype = 0; + else if (__wt_json_token(session, ins->p, + &ins->toktype, &ins->tokstart, + &ins->toklen) != 0) + ins->toktype = -1; + ins->peeking = 1; + } + if (0) { + err: if (ret == 0) + ret = -1; + } + return (ret == 0 ? ins->toktype : -1); +} + +/* + * json_expect -- + * Ensure that the type of the next token in the input matches + * the wanted value, and advance past it. The values of the + * input state will be set so specific string or integer values + * can be pulled out after this call. 
+ */ +static int +json_expect(WT_SESSION *session, JSON_INPUT_STATE *ins, int wanttok) +{ + if (json_peek(session, ins) < 0) + return (1); + ins->p += ins->toklen; + ins->peeking = 0; + if (ins->toktype != wanttok) { + fprintf(stderr, + "%s: %d: %" WT_SIZET_FMT ": expected %s, got %s\n", + ins->filename, + ins->linenum, + JSON_INPUT_POS(ins) + 1, + __wt_json_tokname(wanttok), + __wt_json_tokname(ins->toktype)); + return (1); + } + return (0); +} + +/* + * json_skip -- + * Skip over JSON input until one of the specified strings appears. + * The tokenizer will be set to point to the beginning of + * that string. + */ +static int +json_skip(WT_SESSION *session, JSON_INPUT_STATE *ins, const char **matches) +{ + const char *hit; + const char **match; + + if (ins->kvraw != NULL) + return (1); + + hit = NULL; + while (!ins->ateof) { + for (match = matches; *match != NULL; match++) + if ((hit = strstr(ins->p, *match)) != NULL) + goto out; + if (util_read_line(&ins->line, 1, &ins->ateof)) { + ins->toktype = -1; + return (1); + } + ins->linenum++; + ins->p = (const char *)ins->line.mem; + } +out: + if (hit == NULL) + return (1); + + /* Set to this token. */ + ins->p = hit; + ins->peeking = 0; + ins->toktype = 0; + (void)json_peek(session, ins); + return (0); +} + +/* + * load_json -- + * Load from the JSON format produced by 'wt dump -j'. 
+ */ +int +util_load_json(WT_SESSION *session, const char *filename, uint32_t flags) +{ + JSON_INPUT_STATE instate; + WT_DECL_RET; + + memset(&instate, 0, sizeof(instate)); + instate.session = session; + if (util_read_line(&instate.line, 0, &instate.ateof)) + return (1); + instate.p = (const char *)instate.line.mem; + instate.linenum = 1; + instate.filename = filename; + + if ((ret = json_top_level(session, &instate, flags)) != 0) + goto err; + +err: if (instate.line.mem != NULL) + free(instate.line.mem); + free(instate.kvraw); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_loadtext.c b/src/third_party/wiredtiger/src/utilities/util_loadtext.c new file mode 100644 index 00000000000..27c4c23b50c --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_loadtext.c @@ -0,0 +1,157 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +static int insert(WT_CURSOR *, const char *, int); +static int text(WT_SESSION *, const char *); +static int usage(void); + +int +util_loadtext(WT_SESSION *session, int argc, char *argv[]) +{ + int ch; + const char *uri; + + while ((ch = __wt_getopt(progname, argc, argv, "f:")) != EOF) + switch (ch) { + case 'f': /* input file */ + if (freopen(__wt_optarg, "r", stdin) == NULL) + return ( + util_err(errno, "%s: reopen", __wt_optarg)); + break; + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + /* The remaining argument is the uri. */ + if (argc != 1) + return (usage()); + if ((uri = util_name(*argv, "table")) == NULL) + return (1); + + return (text(session, uri)); +} + +/* + * text -- + * Load flat-text into a file/table. 
+ */ +static int +text(WT_SESSION *session, const char *uri) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + int readkey, tret; + + /* + * Open the cursor, configured to append new records (in the case of + * column-store objects), or to overwrite existing strings (in the + * case of row-store objects). The two flags are mutually exclusive, + * but the library doesn't currently care that we set both of them. + */ + if ((ret = session->open_cursor( + session, uri, NULL, "append,overwrite", &cursor)) != 0) + return (util_err(ret, "%s: session.open", uri)); + + /* + * We're about to load strings, make sure the formats match. + * + * Row-store tables have key/value pairs, column-store tables only have + * values. + */ + if (strcmp(cursor->value_format, "S") != 0 || + (strcmp(cursor->key_format, "S") != 0 && + strcmp(cursor->key_format, "r") != 0)) + return (util_err(EINVAL, + "the loadtext command can only load objects configured " + "for record number or string keys, and string values")); + readkey = strcmp(cursor->key_format, "r") == 0 ? 0 : 1; + + /* Insert the records */ + ret = insert(cursor, uri, readkey); + + /* + * Technically, we don't have to close the cursor because the session + * handle will do it for us, but I'd like to see the flush to disk and + * the close succeed, it's better to fail early when loading files. + */ + if ((tret = cursor->close(cursor)) != 0) { + tret = util_err(tret, "%s: cursor.close", uri); + if (ret == 0) + ret = tret; + } + if (ret == 0) + ret = util_flush(session, uri); + + return (ret == 0 ? 0 : 1); +} + +/* + * insert -- + * Read and insert data. + */ +static int +insert(WT_CURSOR *cursor, const char *name, int readkey) +{ + ULINE key, value; + WT_DECL_RET; + uint64_t insert_count; + int eof; + + memset(&key, 0, sizeof(key)); + memset(&value, 0, sizeof(value)); + + /* Read key/value pairs and insert them into the file. 
*/ + for (insert_count = 0;;) { + /* + * Three modes: in row-store, we always read a key and use it, + * in column-store, we might read it (a dump), we might read + * and ignore it (a dump with "append" set), or not read it at + * all (flat-text load). + */ + if (readkey) { + if (util_read_line(&key, 1, &eof)) + return (1); + if (eof == 1) + break; + cursor->set_key(cursor, key.mem); + } + if (util_read_line(&value, readkey ? 0 : 1, &eof)) + return (1); + if (eof == 1) + break; + cursor->set_value(cursor, value.mem); + + if ((ret = cursor->insert(cursor)) != 0) + return (util_err(ret, "%s: cursor.insert", name)); + + /* Report on progress every 100 inserts. */ + if (verbose && ++insert_count % 100 == 0) { + printf("\r\t%s: %" PRIu64, name, insert_count); + fflush(stdout); + } + } + + if (verbose) + printf("\r\t%s: %" PRIu64 "\n", name, insert_count); + + return (0); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "loadtext [-f input-file] uri\n", + progname, usage_prefix); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_main.c b/src/third_party/wiredtiger/src/utilities/util_main.c new file mode 100644 index 00000000000..04ab59f1ca9 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_main.c @@ -0,0 +1,262 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "util.h" + +const char *home = "."; /* Home directory */ +const char *progname; /* Program name */ + /* Global arguments */ +const char *usage_prefix = "[-Vv] [-C config] [-h home]"; +int verbose; /* Verbose flag */ + +static const char *command; /* Command name */ + +static int usage(void); + +int +main(int argc, char *argv[]) +{ + WT_CONNECTION *conn; + WT_DECL_RET; + WT_SESSION *session; + size_t len; + int ch, major_v, minor_v, tret, (*func)(WT_SESSION *, int, char *[]); + char *p; + const char *cmd_config, *config; + + conn = NULL; + p = NULL; + + /* Get the program name. */ + if ((progname = strrchr(argv[0], '/')) == NULL) + progname = argv[0]; + else + ++progname; + command = ""; + + /* Check the version against the library build. */ + (void)wiredtiger_version(&major_v, & minor_v, NULL); + if (major_v != WIREDTIGER_VERSION_MAJOR || + minor_v != WIREDTIGER_VERSION_MINOR) { + fprintf(stderr, + "%s: program build version %d.%d does not match " + "library build version %d.%d\n", + progname, + WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR, + major_v, minor_v); + return (EXIT_FAILURE); + } + + /* Check for standard options. */ + cmd_config = config = NULL; + while ((ch = __wt_getopt(progname, argc, argv, "C:h:Vv")) != EOF) + switch (ch) { + case 'C': /* wiredtiger_open config */ + cmd_config = __wt_optarg; + break; + case 'h': /* home directory */ + home = __wt_optarg; + break; + case 'V': /* version */ + printf("%s\n", wiredtiger_version(NULL, NULL, NULL)); + return (EXIT_SUCCESS); + case 'v': /* verbose */ + verbose = 1; + break; + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + /* The next argument is the command name. */ + if (argc < 1) + return (usage()); + command = argv[0]; + + /* Reset getopt. 
*/ + __wt_optreset = __wt_optind = 1; + + func = NULL; + switch (command[0]) { + case 'b': + if (strcmp(command, "backup") == 0) + func = util_backup; + break; + case 'c': + if (strcmp(command, "compact") == 0) + func = util_compact; + else if (strcmp(command, "copyright") == 0) { + util_copyright(); + return (EXIT_SUCCESS); + } else if (strcmp(command, "create") == 0) { + func = util_create; + config = "create"; + } + break; + case 'd': + if (strcmp(command, "drop") == 0) + func = util_drop; + else if (strcmp(command, "dump") == 0) + func = util_dump; + break; + case 'l': + if (strcmp(command, "list") == 0) + func = util_list; + else if (strcmp(command, "load") == 0) { + func = util_load; + config = "create"; + } else if (strcmp(command, "loadtext") == 0) { + func = util_loadtext; + config = "create"; + } + break; + case 'p': + if (strcmp(command, "printlog") == 0) + func = util_printlog; + break; + case 'r': + if (strcmp(command, "read") == 0) + func = util_read; + else if (strcmp(command, "rename") == 0) + func = util_rename; + break; + case 's': + if (strcmp(command, "salvage") == 0) + func = util_salvage; + else if (strcmp(command, "stat") == 0) { + func = util_stat; + config = "statistics=(all)"; + } + break; + case 'u': + if (strcmp(command, "upgrade") == 0) + func = util_upgrade; + break; + case 'v': + if (strcmp(command, "verify") == 0) + func = util_verify; + break; + case 'w': + if (strcmp(command, "write") == 0) + func = util_write; + break; + default: + break; + } + if (func == NULL) + return (usage()); + + /* Build the configuration string, as necessary. */ + if (config == NULL) + config = cmd_config; + else if (cmd_config != NULL) { + len = strlen(cmd_config) + strlen(config) + 10; + if ((p = malloc(len)) == NULL) { + ret = util_err(errno, NULL); + goto err; + } + (void)snprintf(p, len, "%s,%s", config, cmd_config); + config = p; + } + + /* Open the database and a session. */ + if ((ret = wiredtiger_open(home, + verbose ? 
verbose_handler : NULL, config, &conn)) != 0) { + ret = util_err(ret, NULL); + goto err; + } + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) { + ret = util_err(ret, NULL); + goto err; + } + + /* Call the function. */ + ret = func(session, argc, argv); + + /* Close the database. */ + +err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0) + ret = tret; + + if (p != NULL) + free(p); + + return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE); +} + +static int +usage(void) +{ + fprintf(stderr, + "WiredTiger Data Engine (version %d.%d)\n", + WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR); + fprintf(stderr, + "global options:\n" + "\t" "-C\twiredtiger_open configuration\n" + "\t" "-h\tdatabase directory\n" + "\t" "-V\tdisplay library version and exit\n" + "\t" "-v\tverbose\n"); + fprintf(stderr, + "commands:\n" + "\t" "backup\t database backup\n" + "\t" "compact\t compact an object\n" + "\t" "copyright copyright information\n" + "\t" "create\t create an object\n" + "\t" "drop\t drop an object\n" + "\t" "dump\t dump an object\n" + "\t" "list\t list database objects\n" + "\t" "load\t load an object\n" + "\t" "loadtext\t load an object from a text file\n" + "\t" "printlog display the database log\n" + "\t" "read\t read values from an object\n" + "\t" "rename\t rename an object\n" + "\t" "salvage\t salvage a file\n" + "\t" "stat\t display statistics for an object\n" + "\t" "upgrade\t upgrade an object\n" + "\t" "verify\t verify an object\n" + "\t" "write\t write values to an object\n"); + + return (EXIT_FAILURE); +} + +/* + * util_name -- + * Build a name. 
+ */ +char * +util_name(const char *s, const char *type) +{ + size_t len; + char *name; + + if (WT_PREFIX_MATCH(s, "backup:") || + WT_PREFIX_MATCH(s, "config:") || + WT_PREFIX_MATCH(s, "statistics:")) { + fprintf(stderr, + "%s: %s: unsupported object type: %s\n", + progname, command, s); + return (NULL); + } + + len = strlen(type) + strlen(s) + 2; + if ((name = calloc(len, 1)) == NULL) { + (void)util_err(errno, NULL); + return (NULL); + } + + /* + * If the string has a URI prefix, use it verbatim, otherwise prepend + * the default type for the operation. + */ + if (strchr(s, ':') != NULL) + strcpy(name, s); + else + snprintf(name, len, "%s:%s", type, s); + return (name); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_misc.c b/src/third_party/wiredtiger/src/utilities/util_misc.c new file mode 100644 index 00000000000..71e307a2e0e --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_misc.c @@ -0,0 +1,146 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +int +util_cerr(const char *uri, const char *op, int ret) +{ + return (util_err(ret, "%s: cursor.%s", uri, op)); +} + +/* + * util_err -- + * Report an error. + */ +int +util_err(int e, const char *fmt, ...) +{ + va_list ap; + + (void)fprintf(stderr, "%s: ", progname); + if (fmt != NULL) { + va_start(ap, fmt); + (void)vfprintf(stderr, fmt, ap); + va_end(ap); + if (e != 0) + (void)fprintf(stderr, ": "); + } + if (e != 0) + (void)fprintf(stderr, "%s", wiredtiger_strerror(e)); + (void)fprintf(stderr, "\n"); + return (1); +} + +/* + * util_read_line -- + * Read a line from stdin into a ULINE. 
+ */ +int +util_read_line(ULINE *l, int eof_expected, int *eofp) +{ + static uint64_t line = 0; + size_t len; + int ch; + + ++line; + *eofp = 0; + + if (l->memsize == 0) { + if ((l->mem = realloc(l->mem, l->memsize + 1024)) == NULL) + return (util_err(errno, NULL)); + l->memsize = 1024; + } + for (len = 0;; ++len) { + if ((ch = getchar()) == EOF) { + if (len == 0) { + if (eof_expected) { + *eofp = 1; + return (0); + } + return (util_err(0, + "line %" PRIu64 ": unexpected end-of-file", + line)); + } + return (util_err(0, + "line %" PRIu64 ": no newline terminator", line)); + } + if (ch == '\n') + break; + /* + * We nul-terminate the string so it's easier to convert the + * line into a record number, that means we always need one + * extra byte at the end. + */ + if (len >= l->memsize - 1) { + if ((l->mem = + realloc(l->mem, l->memsize + 1024)) == NULL) + return (util_err(errno, NULL)); + l->memsize += 1024; + } + ((uint8_t *)l->mem)[len] = (uint8_t)ch; + } + + ((uint8_t *)l->mem)[len] = '\0'; /* nul-terminate */ + + return (0); +} + +/* + * util_str2recno -- + * Convert a string to a record number. + */ +int +util_str2recno(const char *p, uint64_t *recnop) +{ + uint64_t recno; + char *endptr; + + /* + * strtouq takes lots of things like hex values, signs and so on and so + * forth -- none of them are OK with us. Check the string starts with + * digit, that turns off the special processing. + */ + if (!isdigit(p[0])) + goto format; + + errno = 0; + recno = __wt_strtouq(p, &endptr, 0); + if (recno == ULLONG_MAX && errno == ERANGE) + return (util_err(ERANGE, "%s: invalid record number", p)); + + if (endptr[0] != '\0') +format: return (util_err(EINVAL, "%s: invalid record number", p)); + + *recnop = recno; + return (0); +} + +/* + * util_flush -- + * Flush the file successfully, or drop it. 
+ */ +int +util_flush(WT_SESSION *session, const char *uri) +{ + WT_DECL_RET; + size_t len; + char *buf; + + len = strlen(uri) + 100; + if ((buf = malloc(len)) == NULL) + return (util_err(errno, NULL)); + + (void)snprintf(buf, len, "target=(\"%s\")", uri); + if ((ret = session->checkpoint(session, buf)) != 0) { + ret = util_err(ret, "%s: session.checkpoint", uri); + (void)session->drop(session, uri, NULL); + } + + free(buf); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_printlog.c b/src/third_party/wiredtiger/src/utilities/util_printlog.c new file mode 100644 index 00000000000..7fc9bfa39b0 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_printlog.c @@ -0,0 +1,65 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +static int usage(void); + +int +util_printlog(WT_SESSION *session, int argc, char *argv[]) +{ + WT_DECL_RET; + int ch, printable; + + printable = 0; + while ((ch = __wt_getopt(progname, argc, argv, "f:p")) != EOF) + switch (ch) { + case 'f': /* output file */ + if (freopen(__wt_optarg, "w", stdout) == NULL) { + fprintf(stderr, "%s: %s: reopen: %s\n", + progname, __wt_optarg, strerror(errno)); + return (1); + } + break; + case 'p': + printable = 1; + break; + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + /* There should not be any more arguments. 
*/ + if (argc != 0) + return (usage()); + + WT_UNUSED(printable); + ret = __wt_txn_printlog(session, stdout); + + if (ret != 0) { + fprintf(stderr, "%s: printlog failed: %s\n", + progname, wiredtiger_strerror(ret)); + goto err; + } + + if (0) { +err: ret = 1; + } + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "printlog [-p] [-f output-file]\n", + progname, usage_prefix); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_read.c b/src/third_party/wiredtiger/src/utilities/util_read.c new file mode 100644 index 00000000000..d9a629e40e2 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_read.c @@ -0,0 +1,101 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +static int usage(void); + +int +util_read(WT_SESSION *session, int argc, char *argv[]) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + uint64_t recno; + int ch, rkey, rval; + const char *uri, *value; + + while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) + switch (ch) { + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + /* The remaining arguments are a uri followed by a list of keys. */ + if (argc < 2) + return (usage()); + if ((uri = util_name(*argv, "table")) == NULL) + return (1); + + /* Open the object. */ + if ((ret = session->open_cursor( + session, uri, NULL, NULL, &cursor)) != 0) + return (util_err(ret, "%s: session.open", uri)); + + /* + * A simple search only makes sense if the key format is a string or a + * record number, and the value format is a single string. + */ + if (strcmp(cursor->key_format, "r") != 0 && + strcmp(cursor->key_format, "S") != 0) { + fprintf(stderr, + "%s: read command only possible when the key format is " + "a record number or string\n", + progname); + return (1); + } + rkey = strcmp(cursor->key_format, "r") == 0 ? 
1 : 0; + if (strcmp(cursor->value_format, "S") != 0) { + fprintf(stderr, + "%s: read command only possible when the value format is " + "a string\n", + progname); + return (1); + } + + /* + * Run through the keys, returning non-zero on error or if any requested + * key isn't found. + */ + for (rval = 0; *++argv != NULL;) { + if (rkey) { + if (util_str2recno(*argv, &recno)) + return (1); + cursor->set_key(cursor, recno); + } else + cursor->set_key(cursor, *argv); + + switch (ret = cursor->search(cursor)) { + case 0: + if ((ret = cursor->get_value(cursor, &value)) != 0) + return (util_cerr(uri, "get_value", ret)); + if (printf("%s\n", value) < 0) + return (util_err(EIO, NULL)); + break; + case WT_NOTFOUND: + (void)util_err(0, "%s: not found", *argv); + rval = 1; + break; + default: + return (util_cerr(uri, "search", ret)); + } + } + + return (rval); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "read uri key ...\n", + progname, usage_prefix); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_rename.c b/src/third_party/wiredtiger/src/utilities/util_rename.c new file mode 100644 index 00000000000..8c2aeb30c59 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_rename.c @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +static int usage(void); + +int +util_rename(WT_SESSION *session, int argc, char *argv[]) +{ + WT_DECL_RET; + int ch; + char *uri, *newuri; + + uri = NULL; + while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) + switch (ch) { + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + /* The remaining arguments are the object uri and new name. 
*/ + if (argc != 2) + return (usage()); + if ((uri = util_name(*argv, "table")) == NULL) + return (1); + newuri = argv[1]; + + if ((ret = session->rename(session, uri, newuri, NULL)) != 0) { + fprintf(stderr, "%s: rename %s to %s: %s\n", + progname, uri, newuri, wiredtiger_strerror(ret)); + goto err; + } + + if (0) { +err: ret = 1; + } + + if (uri != NULL) + free(uri); + + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "rename uri newuri\n", + progname, usage_prefix); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_salvage.c b/src/third_party/wiredtiger/src/utilities/util_salvage.c new file mode 100644 index 00000000000..386365d8875 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_salvage.c @@ -0,0 +1,68 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +static int usage(void); + +int +util_salvage(WT_SESSION *session, int argc, char *argv[]) +{ + WT_DECL_RET; + int ch; + const char *force; + char *name; + + force = NULL; + name = NULL; + while ((ch = __wt_getopt(progname, argc, argv, "F")) != EOF) + switch (ch) { + case 'F': + force = "force"; + break; + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + /* The remaining argument is the file name. */ + if (argc != 1) + return (usage()); + if ((name = util_name(*argv, "file")) == NULL) + return (1); + + if ((ret = session->salvage(session, name, force)) != 0) { + fprintf(stderr, "%s: salvage(%s): %s\n", + progname, name, wiredtiger_strerror(ret)); + goto err; + } + + /* Verbose configures a progress counter, move to the next line. 
*/ + if (verbose) + printf("\n"); + + if (0) { +err: ret = 1; + } + + if (name != NULL) + free(name); + + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "salvage [-F] uri\n", + progname, usage_prefix); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_stat.c b/src/third_party/wiredtiger/src/utilities/util_stat.c new file mode 100644 index 00000000000..caac560e839 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_stat.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +static int usage(void); + +int +util_stat(WT_SESSION *session, int argc, char *argv[]) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + size_t urilen; + int all, ch, objname_free; + const char *pval, *desc; + char *objname, *uri; + + all = objname_free = 0; + objname = uri = NULL; + while ((ch = __wt_getopt(progname, argc, argv, "a")) != EOF) + switch (ch) { + case 'a': + all = 1; + break; + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + /* + * If there are no arguments, the statistics cursor operates on the + * connection, otherwise, the optional remaining argument is a file + * or LSM name. + */ + switch (argc) { + case 0: + objname = (char *)""; + break; + case 1: + if ((objname = util_name(*argv, "table")) == NULL) + return (1); + objname_free = 1; + break; + default: + return (usage()); + } + + urilen = strlen("statistics:") + strlen(objname) + 1; + if ((uri = calloc(urilen, 1)) == NULL) { + fprintf(stderr, "%s: %s\n", progname, strerror(errno)); + goto err; + } + snprintf(uri, urilen, "statistics:%s", objname); + + if ((ret = session->open_cursor(session, uri, NULL, + all ? 
"statistics=(all)" : NULL, &cursor)) != 0) { + fprintf(stderr, "%s: cursor open(%s) failed: %s\n", + progname, uri, wiredtiger_strerror(ret)); + goto err; + } + + /* List the statistics. */ + while ( + (ret = cursor->next(cursor)) == 0 && + (ret = cursor->get_value(cursor, &desc, &pval, NULL)) == 0) + if (printf("%s=%s\n", desc, pval) < 0) { + ret = errno; + break; + } + if (ret == WT_NOTFOUND) + ret = 0; + + if (ret != 0) { + fprintf(stderr, "%s: cursor get(%s) failed: %s\n", + progname, objname, wiredtiger_strerror(ret)); + goto err; + } + + if (0) { +err: ret = 1; + } + if (objname_free) + free(objname); + free(uri); + + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "stat -a [uri]\n", + progname, usage_prefix); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_upgrade.c b/src/third_party/wiredtiger/src/utilities/util_upgrade.c new file mode 100644 index 00000000000..b56caca2ccd --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_upgrade.c @@ -0,0 +1,63 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +static int usage(void); + +int +util_upgrade(WT_SESSION *session, int argc, char *argv[]) +{ + WT_DECL_RET; + int ch; + char *name; + + name = NULL; + while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) + switch (ch) { + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + /* The remaining argument is the table name. */ + if (argc != 1) + return (usage()); + if ((name = util_name(*argv, "table")) == NULL) + return (1); + + if ((ret = session->upgrade(session, name, NULL)) != 0) { + fprintf(stderr, "%s: upgrade(%s): %s\n", + progname, name, wiredtiger_strerror(ret)); + goto err; + } + + /* Verbose configures a progress counter, move to the next line. 
*/ + if (verbose) + printf("\n"); + + if (0) { +err: ret = 1; + } + + if (name != NULL) + free(name); + + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "upgrade uri\n", + progname, usage_prefix); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_verbose.c b/src/third_party/wiredtiger/src/utilities/util_verbose.c new file mode 100644 index 00000000000..12ff1c5463c --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_verbose.c @@ -0,0 +1,62 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +/* + * __handle_error_verbose -- + * Verbose WT_EVENT_HANDLER->handle_error implementation: send to stderr. + */ +static int +__handle_error_verbose(WT_EVENT_HANDLER *handler, + WT_SESSION *session, int error, const char *errmsg) +{ + WT_UNUSED(handler); + WT_UNUSED(session); + WT_UNUSED(error); + + return (fprintf(stderr, "%s\n", errmsg) < 0 ? EIO : 0); +} + +/* + * __handle_message_verbose -- + * Verbose WT_EVENT_HANDLER->handle_message implementation: send to stdout. + */ +static int +__handle_message_verbose(WT_EVENT_HANDLER *handler, + WT_SESSION *session, const char *message) +{ + WT_UNUSED(handler); + WT_UNUSED(session); + + return (printf("%s\n", message) < 0 ? EIO : 0); +} + +/* + * __handle_progress_verbose -- + * Default WT_EVENT_HANDLER->handle_progress implementation: ignore. + */ +static int +__handle_progress_verbose(WT_EVENT_HANDLER *handler, + WT_SESSION *session, const char *operation, uint64_t progress) +{ + WT_UNUSED(handler); + WT_UNUSED(session); + + return ( + printf("\r\t%s %-20" PRIu64, operation, progress) < 0 ? EIO : 0); +} + +static WT_EVENT_HANDLER __event_handler_verbose = { + __handle_error_verbose, + __handle_message_verbose, + __handle_progress_verbose, + NULL /* Close handler. 
*/ + +}; + +WT_EVENT_HANDLER *verbose_handler = &__event_handler_verbose; diff --git a/src/third_party/wiredtiger/src/utilities/util_verify.c b/src/third_party/wiredtiger/src/utilities/util_verify.c new file mode 100644 index 00000000000..6ae5fdeec26 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_verify.c @@ -0,0 +1,119 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +static int usage(void); + +#undef OPT_ARGS +#undef USAGE_ARGS +#ifdef HAVE_DIAGNOSTIC +#define OPT_ARGS "d:" +#define USAGE_ARGS \ + "[-d dump_address | dump_blocks | dump_offsets=#,# | dump_pages] uri" +#else +#define OPT_ARGS "" +#define USAGE_ARGS "uri" +#endif + +int +util_verify(WT_SESSION *session, int argc, char *argv[]) +{ + WT_DECL_RET; + size_t size; + int ch, dump_address, dump_blocks, dump_pages; + char *config, *dump_offsets, *name; + + dump_address = dump_blocks = dump_pages = 0; + config = dump_offsets = name = NULL; + while ((ch = __wt_getopt(progname, argc, argv, OPT_ARGS)) != EOF) + switch (ch) { + case 'd': + if (strcmp(__wt_optarg, "dump_address") == 0) + dump_address = 1; + else if (strcmp(__wt_optarg, "dump_blocks") == 0) + dump_blocks = 1; + else if ( + WT_PREFIX_MATCH(__wt_optarg, "dump_offsets=")) { + if (dump_offsets != NULL) { + fprintf(stderr, + "%s: only a single 'dump_offsets' " + "argument supported\n", progname); + return (usage()); + } + dump_offsets = + __wt_optarg + strlen("dump_offsets="); + } else if (strcmp(__wt_optarg, "dump_pages") == 0) + dump_pages = 1; + else + return (usage()); + break; + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + /* The remaining argument is the table name. */ + if (argc != 1) + return (usage()); + if ((name = util_name(*argv, "table")) == NULL) + return (1); + + /* Build the configuration string as necessary. 
*/ + if (dump_address || dump_blocks || dump_offsets != NULL || dump_pages) { + size = + strlen("dump_address,") + + strlen("dump_blocks,") + + strlen("dump_pages,") + + strlen("dump_offsets[],") + + (dump_offsets == NULL ? 0 : strlen(dump_offsets)) + 20; + if ((config = malloc(size)) == NULL) { + (void)util_err(errno, NULL); + goto err; + } + snprintf(config, size, + "%s%s%s%s%s%s", + dump_address ? "dump_address," : "", + dump_blocks ? "dump_blocks," : "", + dump_offsets != NULL ? "dump_offsets=[" : "", + dump_offsets != NULL ? dump_offsets : "", + dump_offsets != NULL ? "]," : "", + dump_pages ? "dump_pages" : ""); + } + if ((ret = session->verify(session, name, config)) != 0) { + fprintf(stderr, "%s: verify(%s): %s\n", + progname, name, wiredtiger_strerror(ret)); + goto err; + } + + /* Verbose configures a progress counter, move to the next line. */ + if (verbose) + printf("\n"); + + if (0) { +err: ret = 1; + } + + if (config != NULL) + free(config); + if (name != NULL) + free(name); + + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "verify %s\n", + progname, usage_prefix, USAGE_ARGS); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_write.c b/src/third_party/wiredtiger/src/utilities/util_write.c new file mode 100644 index 00000000000..067b951c0cc --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_write.c @@ -0,0 +1,107 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "util.h" + +static int usage(void); + +int +util_write(WT_SESSION *session, int argc, char *argv[]) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + uint64_t recno; + int append, ch, overwrite, rkey; + const char *uri; + char config[100]; + + append = overwrite = 0; + while ((ch = __wt_getopt(progname, argc, argv, "ao")) != EOF) + switch (ch) { + case 'a': + append = 1; + break; + case 'o': + overwrite = 1; + break; + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + /* + * The remaining arguments are a uri followed by a list of values (if + * append is set), or key/value pairs (if append is not set). + */ + if (append) { + if (argc < 2) + return (usage()); + } else + if (argc < 3 || ((argc - 1) % 2 != 0)) + return (usage()); + if ((uri = util_name(*argv, "table")) == NULL) + return (1); + + /* Open the object. */ + (void)snprintf(config, sizeof(config), "%s,%s", + append ? "append=true" : "", overwrite ? "overwrite=true" : ""); + if ((ret = session->open_cursor( + session, uri, NULL, config, &cursor)) != 0) + return (util_err(ret, "%s: session.open", uri)); + + /* + * A simple search only makes sense if the key format is a string or a + * record number, and the value format is a single string. + */ + if (strcmp(cursor->key_format, "r") != 0 && + strcmp(cursor->key_format, "S") != 0) { + fprintf(stderr, + "%s: write command only possible when the key format is " + "a record number or string\n", + progname); + return (1); + } + rkey = strcmp(cursor->key_format, "r") == 0 ? 1 : 0; + if (strcmp(cursor->value_format, "S") != 0) { + fprintf(stderr, + "%s: write command only possible when the value format is " + "a string\n", + progname); + return (1); + } + + /* Run through the values or key/value pairs. 
*/ + while (*++argv != NULL) { + if (!append) { + if (rkey) { + if (util_str2recno(*argv, &recno)) + return (1); + cursor->set_key(cursor, recno); + } else + cursor->set_key(cursor, *argv); + ++argv; + } + cursor->set_value(cursor, *argv); + + if ((ret = cursor->insert(cursor)) != 0) + return (util_cerr(uri, "search", ret)); + } + + return (0); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "write [-ao] uri key ...\n", + progname, usage_prefix); + return (1); +} |