/*-
 * Copyright (c) 2014-2015 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

/*
 * Statistics counters:
 *
 * We use an array of statistics structures; threads write different structures
 * to avoid writing the same cache line and incurring cache coherency overheads,
 * which can dramatically slow fast and otherwise read-mostly workloads.
 *
 * With an 8B statistics value and 64B cache-line alignment, 8 values share the
 * same cache line. There are collisions when different threads choose the same
 * statistics structure and update values that live on the same cache line.
 * There is likely some locality, however: a thread updating the cursor search
 * statistic is likely to update other cursor statistics with a chance of
 * hitting already cached values.
 *
 * The actual statistic value must be signed, because one thread might increment
 * the value in its structure, and then another thread might decrement the same
 * value in another structure (where the value was initially zero), so the value
 * in the second thread's slot will go negative.
 *
 * When reading a statistics value, the array values are summed and returned to
 * the caller. The summation is performed without locking, so the value read
 * may be inconsistent (and might be negative, if increments/decrements race
 * with the reader).
 *
 * Choosing how many structures isn't easy: obviously, a smaller number creates
 * more conflicts while a larger number uses more memory.
 *
 * Ideally, if the application running on the system is CPU-intensive, and using
 * all CPUs on the system, we want to use the same number of slots as there are
 * CPUs (because their L1 caches are the units of coherency). However, in
 * practice we cannot easily determine how many CPUs are actually available to
 * the application.
 *
 * Our next best option is to use the number of threads in the application as a
 * heuristic for the number of CPUs (presumably, the application architect has
 * figured out how many CPUs are available). However, inside WiredTiger we don't
 * know when the application creates its threads.
 *
 * For now, we use a fixed number of slots. Ideally, we would approximate the
 * largest number of cores we expect on any machine where WiredTiger might be
 * run, however, we don't want to waste that much memory on smaller machines.
 * As of 2015, machines with more than 24 CPUs are relatively rare.
 *
 * Default number of slots. Session IDs are mapped to slots by a simple modulo,
 * so use a prime number of slots rather than assuming the IDs hash well
 * (Reference Sedgewick, Algorithms in C, "Hash Functions").
 */
/*
 * Number of statistics structures per array: WT_STATS_SLOT_ID reduces the
 * session ID modulo this value, and the aggregate/clear helpers below loop
 * over exactly this many structures.
 */
#define	WT_COUNTER_SLOTS	23

/*
 * WT_STATS_SLOT_ID is the thread's slot ID for the array of structures.
 *
 * Ideally, we want a slot per CPU, and we want each thread to index the slot
 * corresponding to the CPU it runs on. Unfortunately, getting the ID of the
 * current CPU is difficult: some operating systems provide a system call to
 * acquire a CPU ID, but not all (regardless, making a system call to increment
 * a statistics value is far too expensive).
 *
 * Our second-best option is to use the thread ID. Unfortunately, there is no
 * portable way to obtain a unique thread ID that's a small-enough number to
 * be used as an array index (portable thread IDs are usually a pointer or an
 * opaque chunk, not a simple integer).
 *
 * Our solution is to use the session ID; there is normally a session per thread
 * and the session ID is a small, monotonically increasing number.
 */
/*
 * The expansion is fully parenthesized so the modulo is applied to the session
 * ID alone, whatever operators surround the macro at the point of use.
 */
#define	WT_STATS_SLOT_ID(session)					\
	(((session)->id) % WT_COUNTER_SLOTS)

/*
 * Statistic structures are arrays of int64_t's. We have functions to read/write
 * those structures regardless of the specific statistic structure we're working
 * with, by translating statistics structure field names to structure offsets.
 *
 * Translate a statistic's value name to an offset.
 */
/*
 * The result is the field's index when the structure is viewed as an array of
 * int64_t. The whole expansion is parenthesized so the cast and subtraction
 * stay a single operand wherever the macro is used.
 */
#define	WT_STATS_FIELD_TO_SLOT(stats, fld)				\
	((int)(&(stats)[0]->fld - (int64_t *)(stats)[0]))

/*
 * __wt_stats_aggregate --
 *	Return the total of one statistic, summed over every structure in the
 * array; negative totals are reported as zero.
 */
static inline int64_t
__wt_stats_aggregate(void *stats_arg, int slot)
{
	int64_t **slots, total;
	int n;

	slots = stats_arg;
	total = 0;
	n = 0;
	while (n < WT_COUNTER_SLOTS) {
		total += slots[n][slot];
		++n;
	}

	/*
	 * Nothing serializes this walk against concurrent updates, so the total
	 * can race. A single shared counter has the same exposure (different
	 * threads could set it simultaneously); splitting the counter across
	 * slots makes races more likely without fundamentally weakening the
	 * isolation semantics of updating one value.
	 *
	 * The total can also come out negative: one thread may increment a
	 * slot the walk has already passed while another decrements a slot
	 * the walk has not yet reached.
	 *
	 * For historic API compatibility the external type is a uint64_t, so
	 * clamp negative totals to zero rather than letting them appear as
	 * very large values.
	 */
	return (total < 0 ? 0 : total);
}

/*
 * __wt_stats_clear --
 *	Zero one statistic in every structure in the array.
 */
static inline void
__wt_stats_clear(void *stats_arg, int slot)
{
	int64_t **cursor, **end;

	cursor = stats_arg;
	end = cursor + WT_COUNTER_SLOTS;
	while (cursor < end) {
		(*cursor)[slot] = 0;
		++cursor;
	}
}

/*
 * Read/write statistics without any test for statistics configuration. Reading
 * and writing the field requires different actions: reading sums the values
 * across the array of structures, writing updates a single structure's value.
 */
/* Sum the named field across every structure in the array. */
#define	WT_STAT_READ(stats, fld)					\
	__wt_stats_aggregate(stats, WT_STATS_FIELD_TO_SLOT(stats, fld))
/*
 * Store a value into the named field of a single structure (here "stats" is a
 * pointer to one structure, not the array). The assignment is parenthesized so
 * the macro remains one operand inside a larger expression.
 */
#define	WT_STAT_WRITE(stats, fld, v)					\
	((stats)->fld = (int64_t)(v))

/*
 * Increment/decrement a field in the structure selected by the session's slot
 * ID. The compound assignments are parenthesized so each macro remains one
 * operand inside a larger expression.
 */
#define	WT_STAT_DECRV(session, stats, fld, value)			\
	((stats)[WT_STATS_SLOT_ID(session)]->fld -= (int64_t)(value))
#define	WT_STAT_DECR(session, stats, fld)				\
	WT_STAT_DECRV(session, stats, fld, 1)
#define	WT_STAT_INCRV(session, stats, fld, value)			\
	((stats)[WT_STATS_SLOT_ID(session)]->fld += (int64_t)(value))
#define	WT_STAT_INCR(session, stats, fld)				\
	WT_STAT_INCRV(session, stats, fld, 1)
/*
 * Set a field: zero it in every structure, then store the value in the first
 * structure so the sum across the array is the value. The session argument is
 * not used. The clear and the store are separate steps, and nothing in this
 * macro prevents a concurrent update or read from running between them.
 */
#define	WT_STAT_SET(session, stats, fld, value) do {			\
	__wt_stats_clear(stats, WT_STATS_FIELD_TO_SLOT(stats, fld));	\
	(stats)[0]->fld = (int64_t)(value);				\
} while (0)

/*
 * Update statistics if "fast" statistics are configured.
 */
#define	WT_STAT_FAST_DECRV(session, stats, fld, value) do {		\
	if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST))	\
		WT_STAT_DECRV(session, stats, fld, value);		\
} while (0)
#define	WT_STAT_FAST_DECR(session, stats, fld)				\
	WT_STAT_FAST_DECRV(session, stats, fld, 1)
#define	WT_STAT_FAST_INCRV(session, stats, fld, value) do {		\
	if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST))	\
		WT_STAT_INCRV(session, stats, fld, value);		\
} while (0)
#define	WT_STAT_FAST_INCR(session, stats, fld)				\
	WT_STAT_FAST_INCRV(session, stats, fld, 1)
#define	WT_STAT_FAST_SET(session, stats, fld, value) do {		\
	if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST))	\
		WT_STAT_SET(session, stats, fld, value);		\
} while (0)

/*
 * Update connection handle statistics if "fast" statistics are configured.
 */
/*
 * Convenience forms of the WT_STAT_FAST_* macros that always pass
 * S2C(session)->stats as the statistics array, so callers supply only the
 * session, the field name and (where applicable) a value.
 */
#define	WT_STAT_FAST_CONN_DECR(session, fld)				\
	WT_STAT_FAST_DECR(session, S2C(session)->stats, fld)
#define	WT_STAT_FAST_CONN_DECRV(session, fld, value)			\
	WT_STAT_FAST_DECRV(session, S2C(session)->stats, fld, value)
#define	WT_STAT_FAST_CONN_INCR(session, fld)				\
	WT_STAT_FAST_INCR(session, S2C(session)->stats, fld)
#define	WT_STAT_FAST_CONN_INCRV(session, fld, value)			\
	WT_STAT_FAST_INCRV(session, S2C(session)->stats, fld, value)
#define	WT_STAT_FAST_CONN_SET(session, fld, value)			\
	WT_STAT_FAST_SET(session, S2C(session)->stats, fld, value)

/*
 * Update data-source handle statistics if "fast" statistics are configured
 * and the data-source handle is set.
 *
 * XXX
 * We shouldn't have to check if the data-source handle is NULL, but it's
 * necessary until everything is converted to using data-source handles.
 */
#define	WT_STAT_FAST_DATA_DECRV(session, fld, value) do {		\
	if ((session)->dhandle != NULL)					\
		WT_STAT_FAST_DECRV(					\
		    session, (session)->dhandle->stats, fld, value);	\
} while (0)
#define	WT_STAT_FAST_DATA_DECR(session, fld)				\
	WT_STAT_FAST_DATA_DECRV(session, fld, 1)
#define	WT_STAT_FAST_DATA_INCRV(session, fld, value) do {		\
	if ((session)->dhandle != NULL)					\
		WT_STAT_FAST_INCRV(					\
		    session, (session)->dhandle->stats, fld, value);	\
} while (0)
#define	WT_STAT_FAST_DATA_INCR(session, fld)				\
	WT_STAT_FAST_DATA_INCRV(session, fld, 1)
#define	WT_STAT_FAST_DATA_SET(session, fld, value) do {			\
	if ((session)->dhandle != NULL)					\
		WT_STAT_FAST_SET(					\
		    session, (session)->dhandle->stats, fld, value);	\
} while (0)

/*
 * DO NOT EDIT: automatically built by dist/stat.py.
 */
/* Statistics section: BEGIN */

/*
 * Statistics entries for connections.
 *
 * Every member must be an int64_t: WT_STATS_FIELD_TO_SLOT converts a member
 * name into an index by treating the structure as an array of int64_t, and
 * __wt_stats_aggregate and __wt_stats_clear index the structure the same way.
 *
 * NOTE(review): this section is marked as generated by dist/stat.py, so
 * lasting changes presumably belong in that script's input -- confirm.
 * NOTE(review): WT_CONNECTION_STATS_BASE is not used in this section;
 * presumably it is the first public key value for these statistics -- confirm
 * against dist/stat.py.
 */
#define	WT_CONNECTION_STATS_BASE	1000
struct __wt_connection_stats {
	int64_t async_alloc_race;
	int64_t async_alloc_view;
	int64_t async_cur_queue;
	int64_t async_flush;
	int64_t async_full;
	int64_t async_max_queue;
	int64_t async_nowork;
	int64_t async_op_alloc;
	int64_t async_op_compact;
	int64_t async_op_insert;
	int64_t async_op_remove;
	int64_t async_op_search;
	int64_t async_op_update;
	int64_t block_byte_map_read;
	int64_t block_byte_read;
	int64_t block_byte_write;
	int64_t block_map_read;
	int64_t block_preload;
	int64_t block_read;
	int64_t block_write;
	int64_t cache_bytes_dirty;
	int64_t cache_bytes_internal;
	int64_t cache_bytes_inuse;
	int64_t cache_bytes_leaf;
	int64_t cache_bytes_max;
	int64_t cache_bytes_overflow;
	int64_t cache_bytes_read;
	int64_t cache_bytes_write;
	int64_t cache_eviction_app;
	int64_t cache_eviction_checkpoint;
	int64_t cache_eviction_clean;
	int64_t cache_eviction_deepen;
	int64_t cache_eviction_dirty;
	int64_t cache_eviction_fail;
	int64_t cache_eviction_force;
	int64_t cache_eviction_force_delete;
	int64_t cache_eviction_force_fail;
	int64_t cache_eviction_hazard;
	int64_t cache_eviction_internal;
	int64_t cache_eviction_maximum_page_size;
	int64_t cache_eviction_queue_empty;
	int64_t cache_eviction_queue_not_empty;
	int64_t cache_eviction_server_evicting;
	int64_t cache_eviction_server_not_evicting;
	int64_t cache_eviction_slow;
	int64_t cache_eviction_split_internal;
	int64_t cache_eviction_split_leaf;
	int64_t cache_eviction_walk;
	int64_t cache_eviction_worker_evicting;
	int64_t cache_inmem_split;
	int64_t cache_inmem_splittable;
	int64_t cache_lookaside_insert;
	int64_t cache_lookaside_remove;
	int64_t cache_overhead;
	int64_t cache_pages_dirty;
	int64_t cache_pages_inuse;
	int64_t cache_read;
	int64_t cache_read_lookaside;
	int64_t cache_write;
	int64_t cache_write_lookaside;
	int64_t cache_write_restore;
	int64_t cond_wait;
	int64_t cursor_create;
	int64_t cursor_insert;
	int64_t cursor_next;
	int64_t cursor_prev;
	int64_t cursor_remove;
	int64_t cursor_reset;
	int64_t cursor_restart;
	int64_t cursor_search;
	int64_t cursor_search_near;
	int64_t cursor_truncate;
	int64_t cursor_update;
	int64_t dh_conn_handle_count;
	int64_t dh_session_handles;
	int64_t dh_session_sweeps;
	int64_t dh_sweep_close;
	int64_t dh_sweep_ref;
	int64_t dh_sweep_remove;
	int64_t dh_sweep_tod;
	int64_t dh_sweeps;
	int64_t file_open;
	int64_t log_buffer_size;
	int64_t log_bytes_payload;
	int64_t log_bytes_written;
	int64_t log_close_yields;
	int64_t log_compress_len;
	int64_t log_compress_mem;
	int64_t log_compress_small;
	int64_t log_compress_write_fails;
	int64_t log_compress_writes;
	int64_t log_flush;
	int64_t log_max_filesize;
	int64_t log_prealloc_files;
	int64_t log_prealloc_max;
	int64_t log_prealloc_missed;
	int64_t log_prealloc_used;
	int64_t log_release_write_lsn;
	int64_t log_scan_records;
	int64_t log_scan_rereads;
	int64_t log_scans;
	int64_t log_slot_closes;
	int64_t log_slot_coalesced;
	int64_t log_slot_consolidated;
	int64_t log_slot_joins;
	int64_t log_slot_races;
	int64_t log_slot_switch_busy;
	int64_t log_slot_transitions;
	int64_t log_slot_unbuffered;
	int64_t log_sync;
	int64_t log_sync_dir;
	int64_t log_write_lsn;
	int64_t log_writes;
	int64_t log_zero_fills;
	int64_t lsm_checkpoint_throttle;
	int64_t lsm_merge_throttle;
	int64_t lsm_rows_merged;
	int64_t lsm_work_queue_app;
	int64_t lsm_work_queue_manager;
	int64_t lsm_work_queue_max;
	int64_t lsm_work_queue_switch;
	int64_t lsm_work_units_created;
	int64_t lsm_work_units_discarded;
	int64_t lsm_work_units_done;
	int64_t memory_allocation;
	int64_t memory_free;
	int64_t memory_grow;
	int64_t page_busy_blocked;
	int64_t page_forcible_evict_blocked;
	int64_t page_locked_blocked;
	int64_t page_read_blocked;
	int64_t page_sleep;
	int64_t read_io;
	int64_t rec_page_delete;
	int64_t rec_page_delete_fast;
	int64_t rec_pages;
	int64_t rec_pages_eviction;
	int64_t rec_split_stashed_bytes;
	int64_t rec_split_stashed_objects;
	int64_t rwlock_read;
	int64_t rwlock_write;
	int64_t session_cursor_open;
	int64_t session_open;
	int64_t txn_begin;
	int64_t txn_checkpoint;
	int64_t txn_checkpoint_generation;
	int64_t txn_checkpoint_running;
	int64_t txn_checkpoint_time_max;
	int64_t txn_checkpoint_time_min;
	int64_t txn_checkpoint_time_recent;
	int64_t txn_checkpoint_time_total;
	int64_t txn_commit;
	int64_t txn_fail_cache;
	int64_t txn_pinned_checkpoint_range;
	int64_t txn_pinned_range;
	int64_t txn_pinned_snapshot_range;
	int64_t txn_rollback;
	int64_t txn_snapshots_created;
	int64_t txn_snapshots_dropped;
	int64_t txn_sync;
	int64_t write_io;
};

/*
 * Statistics entries for data sources.
 *
 * Every member must be an int64_t: WT_STATS_FIELD_TO_SLOT converts a member
 * name into an index by treating the structure as an array of int64_t, and
 * __wt_stats_aggregate and __wt_stats_clear index the structure the same way.
 *
 * NOTE(review): this section is marked as generated by dist/stat.py, so
 * lasting changes presumably belong in that script's input -- confirm.
 * NOTE(review): WT_DSRC_STATS_BASE is not used in this section; presumably it
 * is the first public key value for these statistics -- confirm against
 * dist/stat.py.
 */
#define	WT_DSRC_STATS_BASE	2000
struct __wt_dsrc_stats {
	int64_t allocation_size;
	int64_t block_alloc;
	int64_t block_checkpoint_size;
	int64_t block_extension;
	int64_t block_free;
	int64_t block_magic;
	int64_t block_major;
	int64_t block_minor;
	int64_t block_reuse_bytes;
	int64_t block_size;
	int64_t bloom_count;
	int64_t bloom_false_positive;
	int64_t bloom_hit;
	int64_t bloom_miss;
	int64_t bloom_page_evict;
	int64_t bloom_page_read;
	int64_t bloom_size;
	int64_t btree_checkpoint_generation;
	int64_t btree_column_deleted;
	int64_t btree_column_fix;
	int64_t btree_column_internal;
	int64_t btree_column_rle;
	int64_t btree_column_variable;
	int64_t btree_compact_rewrite;
	int64_t btree_entries;
	int64_t btree_fixed_len;
	int64_t btree_maximum_depth;
	int64_t btree_maxintlkey;
	int64_t btree_maxintlpage;
	int64_t btree_maxleafkey;
	int64_t btree_maxleafpage;
	int64_t btree_maxleafvalue;
	int64_t btree_overflow;
	int64_t btree_row_internal;
	int64_t btree_row_leaf;
	int64_t cache_bytes_read;
	int64_t cache_bytes_write;
	int64_t cache_eviction_checkpoint;
	int64_t cache_eviction_clean;
	int64_t cache_eviction_deepen;
	int64_t cache_eviction_dirty;
	int64_t cache_eviction_fail;
	int64_t cache_eviction_hazard;
	int64_t cache_eviction_internal;
	int64_t cache_eviction_split_internal;
	int64_t cache_eviction_split_leaf;
	int64_t cache_inmem_split;
	int64_t cache_inmem_splittable;
	int64_t cache_overflow_value;
	int64_t cache_read;
	int64_t cache_read_lookaside;
	int64_t cache_read_overflow;
	int64_t cache_write;
	int64_t cache_write_lookaside;
	int64_t cache_write_restore;
	int64_t compress_raw_fail;
	int64_t compress_raw_fail_temporary;
	int64_t compress_raw_ok;
	int64_t compress_read;
	int64_t compress_write;
	int64_t compress_write_fail;
	int64_t compress_write_too_small;
	int64_t cursor_create;
	int64_t cursor_insert;
	int64_t cursor_insert_bulk;
	int64_t cursor_insert_bytes;
	int64_t cursor_next;
	int64_t cursor_prev;
	int64_t cursor_remove;
	int64_t cursor_remove_bytes;
	int64_t cursor_reset;
	int64_t cursor_restart;
	int64_t cursor_search;
	int64_t cursor_search_near;
	int64_t cursor_truncate;
	int64_t cursor_update;
	int64_t cursor_update_bytes;
	int64_t lsm_checkpoint_throttle;
	int64_t lsm_chunk_count;
	int64_t lsm_generation_max;
	int64_t lsm_lookup_no_bloom;
	int64_t lsm_merge_throttle;
	int64_t rec_dictionary;
	int64_t rec_multiblock_internal;
	int64_t rec_multiblock_leaf;
	int64_t rec_multiblock_max;
	int64_t rec_overflow_key_internal;
	int64_t rec_overflow_key_leaf;
	int64_t rec_overflow_value;
	int64_t rec_page_delete;
	int64_t rec_page_delete_fast;
	int64_t rec_page_match;
	int64_t rec_pages;
	int64_t rec_pages_eviction;
	int64_t rec_prefix_compression;
	int64_t rec_suffix_compression;
	int64_t session_compact;
	int64_t session_cursor_open;
	int64_t txn_update_conflict;
};

/*
 * Statistics entries for join cursors.
 *
 * Every member must be an int64_t: WT_STATS_FIELD_TO_SLOT converts a member
 * name into an index by treating the structure as an array of int64_t, and
 * __wt_stats_aggregate and __wt_stats_clear index the structure the same way.
 *
 * NOTE(review): this section is marked as generated by dist/stat.py, so
 * lasting changes presumably belong in that script's input -- confirm.
 * NOTE(review): WT_JOIN_STATS_BASE is not used in this section; presumably it
 * is the first public key value for these statistics -- confirm against
 * dist/stat.py.
 */
#define	WT_JOIN_STATS_BASE	3000
struct __wt_join_stats {
	int64_t accesses;
	int64_t actual_count;
	int64_t bloom_false_positive;
};

/* Statistics section: END */