diff options
author | Sergei Golubchik <serg@mariadb.org> | 2015-10-26 12:48:26 +0100 |
---|---|---|
committer | Sergei Golubchik <serg@mariadb.org> | 2015-10-26 12:57:57 +0100 |
commit | 2c8c65297865d9f8da501761f46e2a34e29af603 (patch) | |
tree | 3fdf4a00f8537bb3564827884f923ac56966e778 /storage/tokudb/PerconaFT/ft/serialize | |
download | mariadb-git-2c8c65297865d9f8da501761f46e2a34e29af603.tar.gz |
5.6.26-74.0
Diffstat (limited to 'storage/tokudb/PerconaFT/ft/serialize')
21 files changed, 8792 insertions, 0 deletions
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc new file mode 100644 index 00000000000..1355f3739ee --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc @@ -0,0 +1,460 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
+ +#include <algorithm> + +#include <string.h> + +#include "portability/memory.h" +#include "portability/toku_assert.h" +#include "portability/toku_stdint.h" +#include "portability/toku_stdlib.h" + +#include "ft/serialize/block_allocator.h" +#include "ft/serialize/block_allocator_strategy.h" + +#if TOKU_DEBUG_PARANOID +#define VALIDATE() validate() +#else +#define VALIDATE() +#endif + +static FILE *ba_trace_file = nullptr; + +void block_allocator::maybe_initialize_trace(void) { + const char *ba_trace_path = getenv("TOKU_BA_TRACE_PATH"); + if (ba_trace_path != nullptr) { + ba_trace_file = toku_os_fopen(ba_trace_path, "w"); + if (ba_trace_file == nullptr) { + fprintf(stderr, "tokuft: error: block allocator trace path found in environment (%s), " + "but it could not be opened for writing (errno %d)\n", + ba_trace_path, get_maybe_error_errno()); + } else { + fprintf(stderr, "tokuft: block allocator tracing enabled, path: %s\n", ba_trace_path); + } + } +} + +void block_allocator::maybe_close_trace() { + if (ba_trace_file != nullptr) { + int r = toku_os_fclose(ba_trace_file); + if (r != 0) { + fprintf(stderr, "tokuft: error: block allocator trace file did not close properly (r %d, errno %d)\n", + r, get_maybe_error_errno()); + } else { + fprintf(stderr, "tokuft: block allocator tracing finished, file closed successfully\n"); + } + } +} + +void block_allocator::_create_internal(uint64_t reserve_at_beginning, uint64_t alignment) { + // the alignment must be at least 512 and aligned with 512 to work with direct I/O + assert(alignment >= 512 && (alignment % 512) == 0); + + _reserve_at_beginning = reserve_at_beginning; + _alignment = alignment; + _n_blocks = 0; + _blocks_array_size = 1; + XMALLOC_N(_blocks_array_size, _blocks_array); + _n_bytes_in_use = reserve_at_beginning; + _strategy = BA_STRATEGY_FIRST_FIT; + + memset(&_trace_lock, 0, sizeof(toku_mutex_t)); + toku_mutex_init(&_trace_lock, nullptr); + + VALIDATE(); +} + +void block_allocator::create(uint64_t 
reserve_at_beginning, uint64_t alignment) { + _create_internal(reserve_at_beginning, alignment); + _trace_create(); +} + +void block_allocator::destroy() { + toku_free(_blocks_array); + _trace_destroy(); + toku_mutex_destroy(&_trace_lock); +} + +void block_allocator::set_strategy(enum allocation_strategy strategy) { + _strategy = strategy; +} + +void block_allocator::grow_blocks_array_by(uint64_t n_to_add) { + if (_n_blocks + n_to_add > _blocks_array_size) { + uint64_t new_size = _n_blocks + n_to_add; + uint64_t at_least = _blocks_array_size * 2; + if (at_least > new_size) { + new_size = at_least; + } + _blocks_array_size = new_size; + XREALLOC_N(_blocks_array_size, _blocks_array); + } +} + +void block_allocator::grow_blocks_array() { + grow_blocks_array_by(1); +} + +void block_allocator::create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment, + struct blockpair *pairs, uint64_t n_blocks) { + _create_internal(reserve_at_beginning, alignment); + + _n_blocks = n_blocks; + grow_blocks_array_by(_n_blocks); + memcpy(_blocks_array, pairs, _n_blocks * sizeof(struct blockpair)); + std::sort(_blocks_array, _blocks_array + _n_blocks); + for (uint64_t i = 0; i < _n_blocks; i++) { + // Allocator does not support size 0 blocks. See block_allocator_free_block. + invariant(_blocks_array[i].size > 0); + invariant(_blocks_array[i].offset >= _reserve_at_beginning); + invariant(_blocks_array[i].offset % _alignment == 0); + + _n_bytes_in_use += _blocks_array[i].size; + } + + VALIDATE(); + + _trace_create_from_blockpairs(); +} + +// Effect: align a value by rounding up. 
+static inline uint64_t align(uint64_t value, uint64_t ba_alignment) { + return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment; +} + +struct block_allocator::blockpair * +block_allocator::choose_block_to_alloc_after(size_t size, uint64_t heat) { + switch (_strategy) { + case BA_STRATEGY_FIRST_FIT: + return block_allocator_strategy::first_fit(_blocks_array, _n_blocks, size, _alignment); + case BA_STRATEGY_BEST_FIT: + return block_allocator_strategy::best_fit(_blocks_array, _n_blocks, size, _alignment); + case BA_STRATEGY_HEAT_ZONE: + return block_allocator_strategy::heat_zone(_blocks_array, _n_blocks, size, _alignment, heat); + case BA_STRATEGY_PADDED_FIT: + return block_allocator_strategy::padded_fit(_blocks_array, _n_blocks, size, _alignment); + default: + abort(); + } +} + +// Effect: Allocate a block. The resulting block must be aligned on the ba->alignment (which to make direct_io happy must be a positive multiple of 512). +void block_allocator::alloc_block(uint64_t size, uint64_t heat, uint64_t *offset) { + struct blockpair *bp; + + // Allocator does not support size 0 blocks. See block_allocator_free_block. + invariant(size > 0); + + grow_blocks_array(); + _n_bytes_in_use += size; + + uint64_t end_of_reserve = align(_reserve_at_beginning, _alignment); + + if (_n_blocks == 0) { + // First and only block + assert(_n_bytes_in_use == _reserve_at_beginning + size); // we know exactly how many are in use + _blocks_array[0].offset = align(_reserve_at_beginning, _alignment); + _blocks_array[0].size = size; + *offset = _blocks_array[0].offset; + goto done; + } else if (end_of_reserve + size <= _blocks_array[0].offset ) { + // Check to see if the space immediately after the reserve is big enough to hold the new block. 
+ bp = &_blocks_array[0]; + memmove(bp + 1, bp, _n_blocks * sizeof(*bp)); + bp[0].offset = end_of_reserve; + bp[0].size = size; + *offset = end_of_reserve; + goto done; + } + + bp = choose_block_to_alloc_after(size, heat); + if (bp != nullptr) { + // our allocation strategy chose the space after `bp' to fit the new block + uint64_t answer_offset = align(bp->offset + bp->size, _alignment); + uint64_t blocknum = bp - _blocks_array; + invariant(&_blocks_array[blocknum] == bp); + invariant(blocknum < _n_blocks); + memmove(bp + 2, bp + 1, (_n_blocks - blocknum - 1) * sizeof(*bp)); + bp[1].offset = answer_offset; + bp[1].size = size; + *offset = answer_offset; + } else { + // It didn't fit anywhere, so fit it on the end. + assert(_n_blocks < _blocks_array_size); + bp = &_blocks_array[_n_blocks]; + uint64_t answer_offset = align(bp[-1].offset + bp[-1].size, _alignment); + bp->offset = answer_offset; + bp->size = size; + *offset = answer_offset; + } + +done: + _n_blocks++; + VALIDATE(); + + _trace_alloc(size, heat, *offset); +} + +// Find the index in the blocks array that has a particular offset. Requires that the block exist. +// Use binary search so it runs fast. +int64_t block_allocator::find_block(uint64_t offset) { + VALIDATE(); + if (_n_blocks == 1) { + assert(_blocks_array[0].offset == offset); + return 0; + } + + uint64_t lo = 0; + uint64_t hi = _n_blocks; + while (1) { + assert(lo < hi); // otherwise no such block exists. + uint64_t mid = (lo + hi) / 2; + uint64_t thisoff = _blocks_array[mid].offset; + if (thisoff < offset) { + lo = mid + 1; + } else if (thisoff > offset) { + hi = mid; + } else { + return mid; + } + } +} + +// To support 0-sized blocks, we need to include size as an input to this function. +// All 0-sized blocks at the same offset can be considered identical, but +// a 0-sized block can share offset with a non-zero sized block. 
+// The non-zero sized block is not exchangable with a zero sized block (or vice versa), +// so inserting 0-sized blocks can cause corruption here. +void block_allocator::free_block(uint64_t offset) { + VALIDATE(); + int64_t bn = find_block(offset); + assert(bn >= 0); // we require that there is a block with that offset. + _n_bytes_in_use -= _blocks_array[bn].size; + memmove(&_blocks_array[bn], &_blocks_array[bn + 1], + (_n_blocks - bn - 1) * sizeof(struct blockpair)); + _n_blocks--; + VALIDATE(); + + _trace_free(offset); +} + +uint64_t block_allocator::block_size(uint64_t offset) { + int64_t bn = find_block(offset); + assert(bn >=0); // we require that there is a block with that offset. + return _blocks_array[bn].size; +} + +uint64_t block_allocator::allocated_limit() const { + if (_n_blocks == 0) { + return _reserve_at_beginning; + } else { + struct blockpair *last = &_blocks_array[_n_blocks - 1]; + return last->offset + last->size; + } +} + +// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth. +// Return the offset and size of the block with that number. +// Return 0 if there is a block that big, return nonzero if b is too big. 
+int block_allocator::get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size) { + if (b ==0 ) { + *offset = 0; + *size = _reserve_at_beginning; + return 0; + } else if (b > _n_blocks) { + return -1; + } else { + *offset =_blocks_array[b - 1].offset; + *size =_blocks_array[b - 1].size; + return 0; + } +} + +// Requires: report->file_size_bytes is filled in +// Requires: report->data_bytes is filled in +// Requires: report->checkpoint_bytes_additional is filled in +void block_allocator::get_unused_statistics(TOKU_DB_FRAGMENTATION report) { + assert(_n_bytes_in_use == report->data_bytes + report->checkpoint_bytes_additional); + + report->unused_bytes = 0; + report->unused_blocks = 0; + report->largest_unused_block = 0; + if (_n_blocks > 0) { + //Deal with space before block 0 and after reserve: + { + struct blockpair *bp = &_blocks_array[0]; + assert(bp->offset >= align(_reserve_at_beginning, _alignment)); + uint64_t free_space = bp->offset - align(_reserve_at_beginning, _alignment); + if (free_space > 0) { + report->unused_bytes += free_space; + report->unused_blocks++; + if (free_space > report->largest_unused_block) { + report->largest_unused_block = free_space; + } + } + } + + //Deal with space between blocks: + for (uint64_t blocknum = 0; blocknum +1 < _n_blocks; blocknum ++) { + // Consider the space after blocknum + struct blockpair *bp = &_blocks_array[blocknum]; + uint64_t this_offset = bp[0].offset; + uint64_t this_size = bp[0].size; + uint64_t end_of_this_block = align(this_offset+this_size, _alignment); + uint64_t next_offset = bp[1].offset; + uint64_t free_space = next_offset - end_of_this_block; + if (free_space > 0) { + report->unused_bytes += free_space; + report->unused_blocks++; + if (free_space > report->largest_unused_block) { + report->largest_unused_block = free_space; + } + } + } + + //Deal with space after last block + { + struct blockpair *bp = &_blocks_array[_n_blocks-1]; + uint64_t this_offset = bp[0].offset; + uint64_t 
this_size = bp[0].size; + uint64_t end_of_this_block = align(this_offset+this_size, _alignment); + if (end_of_this_block < report->file_size_bytes) { + uint64_t free_space = report->file_size_bytes - end_of_this_block; + assert(free_space > 0); + report->unused_bytes += free_space; + report->unused_blocks++; + if (free_space > report->largest_unused_block) { + report->largest_unused_block = free_space; + } + } + } + } else { + // No blocks. Just the reserve. + uint64_t end_of_this_block = align(_reserve_at_beginning, _alignment); + if (end_of_this_block < report->file_size_bytes) { + uint64_t free_space = report->file_size_bytes - end_of_this_block; + assert(free_space > 0); + report->unused_bytes += free_space; + report->unused_blocks++; + if (free_space > report->largest_unused_block) { + report->largest_unused_block = free_space; + } + } + } +} + +void block_allocator::get_statistics(TOKU_DB_FRAGMENTATION report) { + report->data_bytes = _n_bytes_in_use; + report->data_blocks = _n_blocks; + report->file_size_bytes = 0; + report->checkpoint_bytes_additional = 0; + get_unused_statistics(report); +} + +void block_allocator::validate() const { + uint64_t n_bytes_in_use = _reserve_at_beginning; + for (uint64_t i = 0; i < _n_blocks; i++) { + n_bytes_in_use += _blocks_array[i].size; + if (i > 0) { + assert(_blocks_array[i].offset > _blocks_array[i - 1].offset); + assert(_blocks_array[i].offset >= _blocks_array[i - 1].offset + _blocks_array[i - 1].size ); + } + } + assert(n_bytes_in_use == _n_bytes_in_use); +} + +// Tracing + +void block_allocator::_trace_create(void) { + if (ba_trace_file != nullptr) { + toku_mutex_lock(&_trace_lock); + fprintf(ba_trace_file, "ba_trace_create %p %" PRIu64 " %" PRIu64 "\n", + this, _reserve_at_beginning, _alignment); + toku_mutex_unlock(&_trace_lock); + + fflush(ba_trace_file); + } +} + +void block_allocator::_trace_create_from_blockpairs(void) { + if (ba_trace_file != nullptr) { + toku_mutex_lock(&_trace_lock); + fprintf(ba_trace_file, 
"ba_trace_create_from_blockpairs %p %" PRIu64 " %" PRIu64 " ", + this, _reserve_at_beginning, _alignment); + for (uint64_t i = 0; i < _n_blocks; i++) { + fprintf(ba_trace_file, "[%" PRIu64 " %" PRIu64 "] ", + _blocks_array[i].offset, _blocks_array[i].size); + } + fprintf(ba_trace_file, "\n"); + toku_mutex_unlock(&_trace_lock); + + fflush(ba_trace_file); + } +} + +void block_allocator::_trace_destroy(void) { + if (ba_trace_file != nullptr) { + toku_mutex_lock(&_trace_lock); + fprintf(ba_trace_file, "ba_trace_destroy %p\n", this); + toku_mutex_unlock(&_trace_lock); + + fflush(ba_trace_file); + } +} + +void block_allocator::_trace_alloc(uint64_t size, uint64_t heat, uint64_t offset) { + if (ba_trace_file != nullptr) { + toku_mutex_lock(&_trace_lock); + fprintf(ba_trace_file, "ba_trace_alloc %p %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", + this, size, heat, offset); + toku_mutex_unlock(&_trace_lock); + + fflush(ba_trace_file); + } +} + +void block_allocator::_trace_free(uint64_t offset) { + if (ba_trace_file != nullptr) { + toku_mutex_lock(&_trace_lock); + fprintf(ba_trace_file, "ba_trace_free %p %" PRIu64 "\n", this, offset); + toku_mutex_unlock(&_trace_lock); + + fflush(ba_trace_file); + } +} diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h new file mode 100644 index 00000000000..9b2c1553e7f --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h @@ -0,0 +1,214 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include <db.h> + +#include "portability/toku_pthread.h" +#include "portability/toku_stdint.h" +#include "portability/toku_stdlib.h" + +// Block allocator. +// +// A block allocator manages the allocation of variable-sized blocks. +// The translation of block numbers to addresses is handled elsewhere. +// The allocation of block numbers is handled elsewhere. +// +// When creating a block allocator we also specify a certain-sized +// block at the beginning that is preallocated (and cannot be allocated or freed) +// +// We can allocate blocks of a particular size at a particular location. +// We can allocate blocks of a particular size at a location chosen by the allocator. +// We can free blocks. +// We can determine the size of a block. 
+ +class block_allocator { +public: + static const size_t BLOCK_ALLOCATOR_ALIGNMENT = 4096; + + // How much must be reserved at the beginning for the block? + // The actual header is 8+4+4+8+8_4+8+ the length of the db names + 1 pointer for each root. + // So 4096 should be enough. + static const size_t BLOCK_ALLOCATOR_HEADER_RESERVE = 4096; + + static_assert(BLOCK_ALLOCATOR_HEADER_RESERVE % BLOCK_ALLOCATOR_ALIGNMENT == 0, + "block allocator header must have proper alignment"); + + static const size_t BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE = BLOCK_ALLOCATOR_HEADER_RESERVE * 2; + + enum allocation_strategy { + BA_STRATEGY_FIRST_FIT = 1, + BA_STRATEGY_BEST_FIT, + BA_STRATEGY_PADDED_FIT, + BA_STRATEGY_HEAT_ZONE + }; + + struct blockpair { + uint64_t offset; + uint64_t size; + blockpair(uint64_t o, uint64_t s) : + offset(o), size(s) { + } + int operator<(const struct blockpair &rhs) const { + return offset < rhs.offset; + } + int operator<(const uint64_t &o) const { + return offset < o; + } + }; + + // Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block. + // The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT) + // All blocks be start on a multiple of ALIGNMENT. + // Aborts if we run out of memory. + // Parameters + // reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned. + // alignment (IN) Block alignment. + void create(uint64_t reserve_at_beginning, uint64_t alignment); + + // Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block. + // The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT) + // The allocator is initialized to contain `n_blocks' of blockpairs, taken from `pairs' + // All blocks be start on a multiple of ALIGNMENT. + // Aborts if we run out of memory. 
+ // Parameters + // pairs, unowned array of pairs to copy + // n_blocks, Size of pairs array + // reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned. + // alignment (IN) Block alignment. + void create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment, + struct blockpair *pairs, uint64_t n_blocks); + + // Effect: Destroy this block allocator + void destroy(); + + // Effect: Set the allocation strategy that the allocator should use + // Requires: No other threads are operating on this block allocator + void set_strategy(enum allocation_strategy strategy); + + // Effect: Allocate a block of the specified size at an address chosen by the allocator. + // Aborts if anything goes wrong. + // The block address will be a multiple of the alignment. + // Parameters: + // size (IN): The size of the block. (The size does not have to be aligned.) + // offset (OUT): The location of the block. + // heat (IN): A higher heat means we should be prepared to free this block soon (perhaps in the next checkpoint) + // Heat values are lexiographically ordered (like integers), but their specific values are arbitrary + void alloc_block(uint64_t size, uint64_t heat, uint64_t *offset); + + // Effect: Free the block at offset. + // Requires: There must be a block currently allocated at that offset. + // Parameters: + // offset (IN): The offset of the block. + void free_block(uint64_t offset); + + // Effect: Return the size of the block that starts at offset. + // Requires: There must be a block currently allocated at that offset. + // Parameters: + // offset (IN): The offset of the block. + uint64_t block_size(uint64_t offset); + + // Effect: Check to see if the block allocator is OK. This may take a long time. + // Usage Hints: Probably only use this for unit tests. + // TODO: Private? + void validate() const; + + // Effect: Return the unallocated block address of "infinite" size. 
+ // That is, return the smallest address that is above all the allocated blocks. + uint64_t allocated_limit() const; + + // Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth. + // Return the offset and size of the block with that number. + // Return 0 if there is a block that big, return nonzero if b is too big. + // Rationale: This is probably useful only for tests. + int get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size); + + // Effect: Fill in report to indicate how the file is used. + // Requires: + // report->file_size_bytes is filled in + // report->data_bytes is filled in + // report->checkpoint_bytes_additional is filled in + void get_unused_statistics(TOKU_DB_FRAGMENTATION report); + + // Effect: Fill in report->data_bytes with the number of bytes in use + // Fill in report->data_blocks with the number of blockpairs in use + // Fill in unused statistics using this->get_unused_statistics() + // Requires: + // report->file_size is ignored on return + // report->checkpoint_bytes_additional is ignored on return + void get_statistics(TOKU_DB_FRAGMENTATION report); + + // Block allocator tracing. + // - Enabled by setting TOKU_BA_TRACE_PATH to the file that the trace file + // should be written to. 
+ // - Trace may be replayed by ba_trace_replay tool in tools/ directory + // eg: "cat mytracefile | ba_trace_replay" + static void maybe_initialize_trace(); + static void maybe_close_trace(); + +private: + void _create_internal(uint64_t reserve_at_beginning, uint64_t alignment); + void grow_blocks_array_by(uint64_t n_to_add); + void grow_blocks_array(); + int64_t find_block(uint64_t offset); + struct blockpair *choose_block_to_alloc_after(size_t size, uint64_t heat); + + // Tracing + toku_mutex_t _trace_lock; + void _trace_create(void); + void _trace_create_from_blockpairs(void); + void _trace_destroy(void); + void _trace_alloc(uint64_t size, uint64_t heat, uint64_t offset); + void _trace_free(uint64_t offset); + + // How much to reserve at the beginning + uint64_t _reserve_at_beginning; + // Block alignment + uint64_t _alignment; + // How many blocks + uint64_t _n_blocks; + // How big is the blocks_array. Must be >= n_blocks. + uint64_t _blocks_array_size; + // These blocks are sorted by address. + struct blockpair *_blocks_array; + // Including the reserve_at_beginning + uint64_t _n_bytes_in_use; + // The allocation strategy are we using + enum allocation_strategy _strategy; +}; diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.cc b/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.cc new file mode 100644 index 00000000000..62bb8fc4a87 --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.cc @@ -0,0 +1,224 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
+ +#include <algorithm> + +#include <string.h> + +#include "portability/toku_assert.h" + +#include "ft/serialize/block_allocator_strategy.h" + +static uint64_t _align(uint64_t value, uint64_t ba_alignment) { + return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment; +} + +static uint64_t _roundup_to_power_of_two(uint64_t value) { + uint64_t r = 4096; + while (r < value) { + r *= 2; + invariant(r > 0); + } + return r; +} + +// First fit block allocation +static struct block_allocator::blockpair * +_first_fit(struct block_allocator::blockpair *blocks_array, + uint64_t n_blocks, uint64_t size, uint64_t alignment, + uint64_t max_padding) { + if (n_blocks == 1) { + // won't enter loop, can't underflow the direction < 0 case + return nullptr; + } + + struct block_allocator::blockpair *bp = &blocks_array[0]; + for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0; + n_spaces_to_check--, bp++) { + // Consider the space after bp + uint64_t padded_alignment = max_padding != 0 ? _align(max_padding, alignment) : alignment; + uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment); + if (possible_offset + size <= bp[1].offset) { // bp[1] is always valid since bp < &blocks_array[n_blocks-1] + invariant(bp - blocks_array < (int64_t) n_blocks); + return bp; + } + } + return nullptr; +} + +static struct block_allocator::blockpair * +_first_fit_bw(struct block_allocator::blockpair *blocks_array, + uint64_t n_blocks, uint64_t size, uint64_t alignment, + uint64_t max_padding, struct block_allocator::blockpair *blocks_array_limit) { + if (n_blocks == 1) { + // won't enter loop, can't underflow the direction < 0 case + return nullptr; + } + + struct block_allocator::blockpair *bp = &blocks_array[-1]; + for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0; + n_spaces_to_check--, bp--) { + // Consider the space after bp + uint64_t padded_alignment = max_padding != 0 ? 
_align(max_padding, alignment) : alignment; + uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment); + if (&bp[1] < blocks_array_limit && possible_offset + size <= bp[1].offset) { + invariant(blocks_array - bp < (int64_t) n_blocks); + return bp; + } + } + return nullptr; +} + +struct block_allocator::blockpair * +block_allocator_strategy::first_fit(struct block_allocator::blockpair *blocks_array, + uint64_t n_blocks, uint64_t size, uint64_t alignment) { + return _first_fit(blocks_array, n_blocks, size, alignment, 0); +} + +// Best fit block allocation +struct block_allocator::blockpair * +block_allocator_strategy::best_fit(struct block_allocator::blockpair *blocks_array, + uint64_t n_blocks, uint64_t size, uint64_t alignment) { + struct block_allocator::blockpair *best_bp = nullptr; + uint64_t best_hole_size = 0; + for (uint64_t blocknum = 0; blocknum + 1 < n_blocks; blocknum++) { + // Consider the space after blocknum + struct block_allocator::blockpair *bp = &blocks_array[blocknum]; + uint64_t possible_offset = _align(bp->offset + bp->size, alignment); + uint64_t possible_end_offset = possible_offset + size; + if (possible_end_offset <= bp[1].offset) { + // It fits here. Is it the best fit? + uint64_t hole_size = bp[1].offset - possible_end_offset; + if (best_bp == nullptr || hole_size < best_hole_size) { + best_hole_size = hole_size; + best_bp = bp; + } + } + } + return best_bp; +} + +static uint64_t padded_fit_alignment = 4096; + +// TODO: These compiler specific directives should be abstracted in a portability header +// portability/toku_compiler.h? +__attribute__((__constructor__)) +static void determine_padded_fit_alignment_from_env(void) { + // TODO: Should be in portability as 'toku_os_getenv()?' 
+ const char *s = getenv("TOKU_BA_PADDED_FIT_ALIGNMENT"); + if (s != nullptr && strlen(s) > 0) { + const int64_t alignment = strtoll(s, nullptr, 10); + if (alignment <= 0) { + fprintf(stderr, "tokuft: error: block allocator padded fit alignment found in environment (%s), " + "but it's out of range (should be an integer > 0). defaulting to %" PRIu64 "\n", + s, padded_fit_alignment); + } else { + padded_fit_alignment = _roundup_to_power_of_two(alignment); + fprintf(stderr, "tokuft: setting block allocator padded fit alignment to %" PRIu64 "\n", + padded_fit_alignment); + } + } +} + +// First fit into a block that is oversized by up to max_padding. +// The hope is that if we purposefully waste a bit of space at allocation +// time we'll be more likely to reuse this block later. +struct block_allocator::blockpair * +block_allocator_strategy::padded_fit(struct block_allocator::blockpair *blocks_array, + uint64_t n_blocks, uint64_t size, uint64_t alignment) { + return _first_fit(blocks_array, n_blocks, size, alignment, padded_fit_alignment); +} + +static double hot_zone_threshold = 0.85; + +// TODO: These compiler specific directives should be abstracted in a portability header +// portability/toku_compiler.h? +__attribute__((__constructor__)) +static void determine_hot_zone_threshold_from_env(void) { + // TODO: Should be in portability as 'toku_os_getenv()?' + const char *s = getenv("TOKU_BA_HOT_ZONE_THRESHOLD"); + if (s != nullptr && strlen(s) > 0) { + const double hot_zone = strtod(s, nullptr); + if (hot_zone < 1 || hot_zone > 99) { + fprintf(stderr, "tokuft: error: block allocator hot zone threshold found in environment (%s), " + "but it's out of range (should be an integer 1 through 99). 
defaulting to 85\n", s); + hot_zone_threshold = 85 / 100; + } else { + fprintf(stderr, "tokuft: setting block allocator hot zone threshold to %s\n", s); + hot_zone_threshold = hot_zone / 100; + } + } +} + +struct block_allocator::blockpair * +block_allocator_strategy::heat_zone(struct block_allocator::blockpair *blocks_array, + uint64_t n_blocks, uint64_t size, uint64_t alignment, + uint64_t heat) { + if (heat > 0) { + struct block_allocator::blockpair *bp, *boundary_bp; + + // Hot allocation. Find the beginning of the hot zone. + boundary_bp = &blocks_array[n_blocks - 1]; + uint64_t highest_offset = _align(boundary_bp->offset + boundary_bp->size, alignment); + uint64_t hot_zone_offset = static_cast<uint64_t>(hot_zone_threshold * highest_offset); + + boundary_bp = std::lower_bound(blocks_array, blocks_array + n_blocks, hot_zone_offset); + uint64_t blocks_in_zone = (blocks_array + n_blocks) - boundary_bp; + uint64_t blocks_outside_zone = boundary_bp - blocks_array; + invariant(blocks_in_zone + blocks_outside_zone == n_blocks); + + if (blocks_in_zone > 0) { + // Find the first fit in the hot zone, going forward. + bp = _first_fit(boundary_bp, blocks_in_zone, size, alignment, 0); + if (bp != nullptr) { + return bp; + } + } + if (blocks_outside_zone > 0) { + // Find the first fit in the cold zone, going backwards. + bp = _first_fit_bw(boundary_bp, blocks_outside_zone, size, alignment, 0, &blocks_array[n_blocks]); + if (bp != nullptr) { + return bp; + } + } + } else { + // Cold allocations are simply first-fit from the beginning. 
+ return _first_fit(blocks_array, n_blocks, size, alignment, 0); + } + return nullptr; +} diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.h b/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.h new file mode 100644 index 00000000000..8aded3898c1 --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.h @@ -0,0 +1,65 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
+ +#pragma once + +#include <db.h> + +#include "ft/serialize/block_allocator.h" + +// Block allocation strategy implementations + +class block_allocator_strategy { +public: + static struct block_allocator::blockpair * + first_fit(struct block_allocator::blockpair *blocks_array, + uint64_t n_blocks, uint64_t size, uint64_t alignment); + + static struct block_allocator::blockpair * + best_fit(struct block_allocator::blockpair *blocks_array, + uint64_t n_blocks, uint64_t size, uint64_t alignment); + + static struct block_allocator::blockpair * + padded_fit(struct block_allocator::blockpair *blocks_array, + uint64_t n_blocks, uint64_t size, uint64_t alignment); + + static struct block_allocator::blockpair * + heat_zone(struct block_allocator::blockpair *blocks_array, + uint64_t n_blocks, uint64_t size, uint64_t alignment, + uint64_t heat); +}; diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_table.cc b/storage/tokudb/PerconaFT/ft/serialize/block_table.cc new file mode 100644 index 00000000000..7101ba9f58c --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/block_table.cc @@ -0,0 +1,993 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. 
+ +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "portability/memory.h" +#include "portability/toku_assert.h" +#include "portability/toku_portability.h" +#include "portability/toku_pthread.h" + +// ugly but pragmatic, need access to dirty bits while holding translation lock +// TODO: Refactor this (possibly with FT-301) +#include "ft/ft-internal.h" + +// TODO: reorganize this dependency (FT-303) +#include "ft/ft-ops.h" // for toku_maybe_truncate_file +#include "ft/serialize/block_table.h" +#include "ft/serialize/rbuf.h" +#include "ft/serialize/wbuf.h" +#include "ft/serialize/block_allocator.h" + +#include "util/nb_mutex.h" +#include "util/scoped_malloc.h" + +// indicates the end of a freelist +static const BLOCKNUM freelist_null = { -1 }; + +// value of block_translation_pair.size if blocknum is unused +static const DISKOFF size_is_free = (DISKOFF) -1; + +// value of block_translation_pair.u.diskoff if blocknum is used but does not yet have a diskblock +static const DISKOFF diskoff_unused = (DISKOFF) -2; + +void block_table::_mutex_lock() { + toku_mutex_lock(&_mutex); +} + +void block_table::_mutex_unlock() { + toku_mutex_unlock(&_mutex); +} + +// TODO: Move lock to FT +void toku_ft_lock(FT ft) { + block_table *bt = &ft->blocktable; + bt->_mutex_lock(); +} + +// TODO: Move lock to FT 
+void toku_ft_unlock(FT ft) { + block_table *bt = &ft->blocktable; + toku_mutex_assert_locked(&bt->_mutex); + bt->_mutex_unlock(); +} + +// There are two headers: the reserve must fit them both and be suitably aligned. +static_assert(block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE % + block_allocator::BLOCK_ALLOCATOR_ALIGNMENT == 0, + "Block allocator's header reserve must be suitibly aligned"); +static_assert(block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE * 2 == + block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, + "Block allocator's total header reserve must exactly fit two headers"); + +// does NOT initialize the block allocator: the caller is responsible +void block_table::_create_internal() { + memset(&_current, 0, sizeof(struct translation)); + memset(&_inprogress, 0, sizeof(struct translation)); + memset(&_checkpointed, 0, sizeof(struct translation)); + memset(&_mutex, 0, sizeof(_mutex)); + toku_mutex_init(&_mutex, nullptr); + nb_mutex_init(&_safe_file_size_lock); +} + +// Fill in the checkpointed translation from buffer, and copy checkpointed to current. +// The one read from disk is the last known checkpointed one, so we are keeping it in +// place and then setting current (which is never stored on disk) for current use. +// The translation_buffer has translation only, we create the rest of the block_table. 
+int block_table::create_from_buffer(int fd, + DISKOFF location_on_disk, //Location of translation_buffer + DISKOFF size_on_disk, + unsigned char *translation_buffer) { + // Does not initialize the block allocator + _create_internal(); + + // Deserialize the translation and copy it to current + int r = _translation_deserialize_from_buffer(&_checkpointed, + location_on_disk, size_on_disk, + translation_buffer); + if (r != 0) { + return r; + } + _copy_translation(&_current, &_checkpointed, TRANSLATION_CURRENT); + + // Determine the file size + int64_t file_size; + r = toku_os_get_file_size(fd, &file_size); + lazy_assert_zero(r); + invariant(file_size >= 0); + _safe_file_size = file_size; + + // Gather the non-empty translations and use them to create the block allocator + toku::scoped_malloc pairs_buf(_checkpointed.smallest_never_used_blocknum.b * + sizeof(struct block_allocator::blockpair)); + struct block_allocator::blockpair *CAST_FROM_VOIDP(pairs, pairs_buf.get()); + uint64_t n_pairs = 0; + for (int64_t i = 0; i < _checkpointed.smallest_never_used_blocknum.b; i++) { + struct block_translation_pair pair = _checkpointed.block_translation[i]; + if (pair.size > 0) { + invariant(pair.u.diskoff != diskoff_unused); + pairs[n_pairs++] = block_allocator::blockpair(pair.u.diskoff, pair.size); + } + } + + _bt_block_allocator.create_from_blockpairs(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, + block_allocator::BLOCK_ALLOCATOR_ALIGNMENT, + pairs, n_pairs); + + return 0; +} + +void block_table::create() { + // Does not initialize the block allocator + _create_internal(); + + _checkpointed.type = TRANSLATION_CHECKPOINTED; + _checkpointed.smallest_never_used_blocknum = make_blocknum(RESERVED_BLOCKNUMS); + _checkpointed.length_of_array = _checkpointed.smallest_never_used_blocknum.b; + _checkpointed.blocknum_freelist_head = freelist_null; + XMALLOC_N(_checkpointed.length_of_array, _checkpointed.block_translation); + for (int64_t i = 0; i < _checkpointed.length_of_array; 
i++) { + _checkpointed.block_translation[i].size = 0; + _checkpointed.block_translation[i].u.diskoff = diskoff_unused; + } + + // we just created a default checkpointed, now copy it to current. + _copy_translation(&_current, &_checkpointed, TRANSLATION_CURRENT); + + // Create an empty block allocator. + _bt_block_allocator.create(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, + block_allocator::BLOCK_ALLOCATOR_ALIGNMENT); +} + +// TODO: Refactor with FT-303 +static void ft_set_dirty(FT ft, bool for_checkpoint) { + invariant(ft->h->type == FT_CURRENT); + if (for_checkpoint) { + invariant(ft->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS); + ft->checkpoint_header->dirty = 1; + } else { + ft->h->dirty = 1; + } +} + +void block_table::_maybe_truncate_file(int fd, uint64_t size_needed_before) { + toku_mutex_assert_locked(&_mutex); + uint64_t new_size_needed = _bt_block_allocator.allocated_limit(); + //Save a call to toku_os_get_file_size (kernel call) if unlikely to be useful. + if (new_size_needed < size_needed_before && new_size_needed < _safe_file_size) { + nb_mutex_lock(&_safe_file_size_lock, &_mutex); + + // Must hold _safe_file_size_lock to change _safe_file_size. + if (new_size_needed < _safe_file_size) { + int64_t safe_file_size_before = _safe_file_size; + // Not safe to use the 'to-be-truncated' portion until truncate is done. 
+ _safe_file_size = new_size_needed; + _mutex_unlock(); + + uint64_t size_after; + toku_maybe_truncate_file(fd, new_size_needed, safe_file_size_before, &size_after); + _mutex_lock(); + + _safe_file_size = size_after; + } + nb_mutex_unlock(&_safe_file_size_lock); + } +} + +void block_table::maybe_truncate_file_on_open(int fd) { + _mutex_lock(); + _maybe_truncate_file(fd, _safe_file_size); + _mutex_unlock(); +} + +void block_table::_copy_translation(struct translation *dst, struct translation *src, enum translation_type newtype) { + // We intend to malloc a fresh block, so the incoming translation should be empty + invariant_null(dst->block_translation); + + invariant(src->length_of_array >= src->smallest_never_used_blocknum.b); + invariant(newtype == TRANSLATION_DEBUG || + (src->type == TRANSLATION_CURRENT && newtype == TRANSLATION_INPROGRESS) || + (src->type == TRANSLATION_CHECKPOINTED && newtype == TRANSLATION_CURRENT)); + dst->type = newtype; + dst->smallest_never_used_blocknum = src->smallest_never_used_blocknum; + dst->blocknum_freelist_head = src->blocknum_freelist_head; + + // destination btt is of fixed size. Allocate + memcpy the exact length necessary. + dst->length_of_array = dst->smallest_never_used_blocknum.b; + XMALLOC_N(dst->length_of_array, dst->block_translation); + memcpy(dst->block_translation, src->block_translation, dst->length_of_array * sizeof(*dst->block_translation)); + + // New version of btt is not yet stored on disk. + dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size = 0; + dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff = diskoff_unused; +} + +int64_t block_table::get_blocks_in_use_unlocked() { + BLOCKNUM b; + struct translation *t = &_current; + int64_t num_blocks = 0; + { + //Reserved blocknums do not get upgraded; They are part of the header. 
+ for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b; b.b++) { + if (t->block_translation[b.b].size != size_is_free) { + num_blocks++; + } + } + } + return num_blocks; +} + +void block_table::_maybe_optimize_translation(struct translation *t) { + //Reduce 'smallest_never_used_blocknum.b' (completely free blocknums instead of just + //on a free list. Doing so requires us to regenerate the free list. + //This is O(n) work, so do it only if you're already doing that. + + BLOCKNUM b; + paranoid_invariant(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS); + //Calculate how large the free suffix is. + int64_t freed; + { + for (b.b = t->smallest_never_used_blocknum.b; b.b > RESERVED_BLOCKNUMS; b.b--) { + if (t->block_translation[b.b-1].size != size_is_free) { + break; + } + } + freed = t->smallest_never_used_blocknum.b - b.b; + } + if (freed>0) { + t->smallest_never_used_blocknum.b = b.b; + if (t->length_of_array/4 > t->smallest_never_used_blocknum.b) { + //We're using more memory than necessary to represent this now. Reduce. + uint64_t new_length = t->smallest_never_used_blocknum.b * 2; + XREALLOC_N(new_length, t->block_translation); + t->length_of_array = new_length; + //No need to zero anything out. + } + + //Regenerate free list. + t->blocknum_freelist_head.b = freelist_null.b; + for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b; b.b++) { + if (t->block_translation[b.b].size == size_is_free) { + t->block_translation[b.b].u.next_free_blocknum = t->blocknum_freelist_head; + t->blocknum_freelist_head = b; + } + } + } +} + +// block table must be locked by caller of this function +void block_table::note_start_checkpoint_unlocked() { + toku_mutex_assert_locked(&_mutex); + + // We're going to do O(n) work to copy the translation, so we + // can afford to do O(n) work by optimizing the translation + _maybe_optimize_translation(&_current); + + // Copy current translation to inprogress translation. 
+ _copy_translation(&_inprogress, &_current, TRANSLATION_INPROGRESS); + + _checkpoint_skipped = false; +} + +void block_table::note_skipped_checkpoint() { + //Purpose, alert block translation that the checkpoint was skipped, e.x. for a non-dirty header + _mutex_lock(); + paranoid_invariant_notnull(_inprogress.block_translation); + _checkpoint_skipped = true; + _mutex_unlock(); +} + +// Purpose: free any disk space used by previous checkpoint that isn't in use by either +// - current state +// - in-progress checkpoint +// capture inprogress as new checkpointed. +// For each entry in checkpointBTT +// if offset does not match offset in inprogress +// assert offset does not match offset in current +// free (offset,len) from checkpoint +// move inprogress to checkpoint (resetting type) +// inprogress = NULL +void block_table::note_end_checkpoint(int fd) { + // Free unused blocks + _mutex_lock(); + uint64_t allocated_limit_at_start = _bt_block_allocator.allocated_limit(); + paranoid_invariant_notnull(_inprogress.block_translation); + if (_checkpoint_skipped) { + toku_free(_inprogress.block_translation); + memset(&_inprogress, 0, sizeof(_inprogress)); + goto end; + } + + //Make certain inprogress was allocated space on disk + assert(_inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].size > 0); + assert(_inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff > 0); + + { + struct translation *t = &_checkpointed; + for (int64_t i = 0; i < t->length_of_array; i++) { + struct block_translation_pair *pair = &t->block_translation[i]; + if (pair->size > 0 && !_translation_prevents_freeing(&_inprogress, make_blocknum(i), pair)) { + assert(!_translation_prevents_freeing(&_current, make_blocknum(i), pair)); + _bt_block_allocator.free_block(pair->u.diskoff); + } + } + toku_free(_checkpointed.block_translation); + _checkpointed = _inprogress; + _checkpointed.type = TRANSLATION_CHECKPOINTED; + memset(&_inprogress, 0, sizeof(_inprogress)); + 
_maybe_truncate_file(fd, allocated_limit_at_start); + } +end: + _mutex_unlock(); +} + +bool block_table::_is_valid_blocknum(struct translation *t, BLOCKNUM b) { + invariant(t->length_of_array >= t->smallest_never_used_blocknum.b); + return b.b >= 0 && b.b < t->smallest_never_used_blocknum.b; +} + +void block_table::_verify_valid_blocknum(struct translation *UU(t), BLOCKNUM UU(b)) { + invariant(_is_valid_blocknum(t, b)); +} + +bool block_table::_is_valid_freeable_blocknum(struct translation *t, BLOCKNUM b) { + invariant(t->length_of_array >= t->smallest_never_used_blocknum.b); + return b.b >= RESERVED_BLOCKNUMS && b.b < t->smallest_never_used_blocknum.b; +} + +// should be freeable +void block_table::_verify_valid_freeable_blocknum(struct translation *UU(t), BLOCKNUM UU(b)) { + invariant(_is_valid_freeable_blocknum(t, b)); +} + +// Also used only in ft-serialize-test. +void block_table::block_free(uint64_t offset) { + _mutex_lock(); + _bt_block_allocator.free_block(offset); + _mutex_unlock(); +} + +int64_t block_table::_calculate_size_on_disk(struct translation *t) { + return 8 + // smallest_never_used_blocknum + 8 + // blocknum_freelist_head + t->smallest_never_used_blocknum.b * 16 + // Array + 4; // 4 for checksum +} + +// We cannot free the disk space allocated to this blocknum if it is still in use by the given translation table. 
+bool block_table::_translation_prevents_freeing(struct translation *t, BLOCKNUM b, struct block_translation_pair *old_pair) { + return t->block_translation && + b.b < t->smallest_never_used_blocknum.b && + old_pair->u.diskoff == t->block_translation[b.b].u.diskoff; +} + +void block_table::_realloc_on_disk_internal(BLOCKNUM b, DISKOFF size, DISKOFF *offset, FT ft, bool for_checkpoint, uint64_t heat) { + toku_mutex_assert_locked(&_mutex); + ft_set_dirty(ft, for_checkpoint); + + struct translation *t = &_current; + struct block_translation_pair old_pair = t->block_translation[b.b]; + //Free the old block if it is not still in use by the checkpoint in progress or the previous checkpoint + bool cannot_free = (bool) + ((!for_checkpoint && _translation_prevents_freeing(&_inprogress, b, &old_pair)) || + _translation_prevents_freeing(&_checkpointed, b, &old_pair)); + if (!cannot_free && old_pair.u.diskoff!=diskoff_unused) { + _bt_block_allocator.free_block(old_pair.u.diskoff); + } + + uint64_t allocator_offset = diskoff_unused; + t->block_translation[b.b].size = size; + if (size > 0) { + // Allocate a new block if the size is greater than 0, + // if the size is just 0, offset will be set to diskoff_unused + _bt_block_allocator.alloc_block(size, heat, &allocator_offset); + } + t->block_translation[b.b].u.diskoff = allocator_offset; + *offset = allocator_offset; + + //Update inprogress btt if appropriate (if called because Pending bit is set). + if (for_checkpoint) { + paranoid_invariant(b.b < _inprogress.length_of_array); + _inprogress.block_translation[b.b] = t->block_translation[b.b]; + } +} + +void block_table::_ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOFF block_offset) { + // Requires: holding _mutex + uint64_t size_needed = block_size + block_offset; + if (size_needed > _safe_file_size) { + // Must hold _safe_file_size_lock to change _safe_file_size. 
+ nb_mutex_lock(&_safe_file_size_lock, &_mutex); + if (size_needed > _safe_file_size) { + _mutex_unlock(); + + int64_t size_after; + toku_maybe_preallocate_in_file(fd, size_needed, _safe_file_size, &size_after); + + _mutex_lock(); + _safe_file_size = size_after; + } + nb_mutex_unlock(&_safe_file_size_lock); + } +} + +void block_table::realloc_on_disk(BLOCKNUM b, DISKOFF size, DISKOFF *offset, FT ft, int fd, bool for_checkpoint, uint64_t heat) { + _mutex_lock(); + struct translation *t = &_current; + _verify_valid_freeable_blocknum(t, b); + _realloc_on_disk_internal(b, size, offset, ft, for_checkpoint, heat); + + _ensure_safe_write_unlocked(fd, size, *offset); + _mutex_unlock(); +} + +bool block_table::_pair_is_unallocated(struct block_translation_pair *pair) { + return pair->size == 0 && pair->u.diskoff == diskoff_unused; +} + +// Effect: figure out where to put the inprogress btt on disk, allocate space for it there. +// The space must be 512-byte aligned (both the starting address and the size). +// As a result, the allcoated space may be a little bit bigger (up to the next 512-byte boundary) than the actual btt. +void block_table::_alloc_inprogress_translation_on_disk_unlocked() { + toku_mutex_assert_locked(&_mutex); + + struct translation *t = &_inprogress; + paranoid_invariant_notnull(t->block_translation); + BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION); + //Each inprogress is allocated only once + paranoid_invariant(_pair_is_unallocated(&t->block_translation[b.b])); + + //Allocate a new block + int64_t size = _calculate_size_on_disk(t); + uint64_t offset; + _bt_block_allocator.alloc_block(size, 0, &offset); + t->block_translation[b.b].u.diskoff = offset; + t->block_translation[b.b].size = size; +} + +// Effect: Serializes the blocktable to a wbuf (which starts uninitialized) +// A clean shutdown runs checkpoint start so that current and inprogress are copies. 
+// The resulting wbuf buffer is guaranteed to be be 512-byte aligned and the total length is a multiple of 512 (so we pad with zeros at the end if needd) +// The address is guaranteed to be 512-byte aligned, but the size is not guaranteed. +// It *is* guaranteed that we can read up to the next 512-byte boundary, however +void block_table::serialize_translation_to_wbuf(int fd, struct wbuf *w, + int64_t *address, int64_t *size) { + _mutex_lock(); + struct translation *t = &_inprogress; + + BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION); + _alloc_inprogress_translation_on_disk_unlocked(); // The allocated block must be 512-byte aligned to make O_DIRECT happy. + uint64_t size_translation = _calculate_size_on_disk(t); + uint64_t size_aligned = roundup_to_multiple(512, size_translation); + assert((int64_t)size_translation==t->block_translation[b.b].size); + { + //Init wbuf + if (0) + printf("%s:%d writing translation table of size_translation %" PRIu64 " at %" PRId64 "\n", __FILE__, __LINE__, size_translation, t->block_translation[b.b].u.diskoff); + char *XMALLOC_N_ALIGNED(512, size_aligned, buf); + for (uint64_t i=size_translation; i<size_aligned; i++) buf[i]=0; // fill in the end of the buffer with zeros. 
+ wbuf_init(w, buf, size_aligned); + } + wbuf_BLOCKNUM(w, t->smallest_never_used_blocknum); + wbuf_BLOCKNUM(w, t->blocknum_freelist_head); + int64_t i; + for (i=0; i<t->smallest_never_used_blocknum.b; i++) { + if (0) + printf("%s:%d %" PRId64 ",%" PRId64 "\n", __FILE__, __LINE__, t->block_translation[i].u.diskoff, t->block_translation[i].size); + wbuf_DISKOFF(w, t->block_translation[i].u.diskoff); + wbuf_DISKOFF(w, t->block_translation[i].size); + } + uint32_t checksum = toku_x1764_finish(&w->checksum); + wbuf_int(w, checksum); + *address = t->block_translation[b.b].u.diskoff; + *size = size_translation; + assert((*address)%512 == 0); + + _ensure_safe_write_unlocked(fd, size_aligned, *address); + _mutex_unlock(); +} + +// Perhaps rename: purpose is get disk address of a block, given its blocknum (blockid?) +void block_table::_translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, DISKOFF *offset, DISKOFF *size) { + struct translation *t = &_current; + _verify_valid_blocknum(t, b); + if (offset) { + *offset = t->block_translation[b.b].u.diskoff; + } + if (size) { + *size = t->block_translation[b.b].size; + } +} + +// Perhaps rename: purpose is get disk address of a block, given its blocknum (blockid?) +void block_table::translate_blocknum_to_offset_size(BLOCKNUM b, DISKOFF *offset, DISKOFF *size) { + _mutex_lock(); + _translate_blocknum_to_offset_size_unlocked(b, offset, size); + _mutex_unlock(); +} + +// Only called by toku_allocate_blocknum +// Effect: expand the array to maintain size invariant +// given that one more never-used blocknum will soon be used. 
+void block_table::_maybe_expand_translation(struct translation *t) { + if (t->length_of_array <= t->smallest_never_used_blocknum.b) { + //expansion is necessary + uint64_t new_length = t->smallest_never_used_blocknum.b * 2; + XREALLOC_N(new_length, t->block_translation); + uint64_t i; + for (i = t->length_of_array; i < new_length; i++) { + t->block_translation[i].u.next_free_blocknum = freelist_null; + t->block_translation[i].size = size_is_free; + } + t->length_of_array = new_length; + } +} + +void block_table::_allocate_blocknum_unlocked(BLOCKNUM *res, FT ft) { + toku_mutex_assert_locked(&_mutex); + BLOCKNUM result; + struct translation *t = &_current; + if (t->blocknum_freelist_head.b == freelist_null.b) { + // no previously used blocknums are available + // use a never used blocknum + _maybe_expand_translation(t); //Ensure a never used blocknums is available + result = t->smallest_never_used_blocknum; + t->smallest_never_used_blocknum.b++; + } else { // reuse a previously used blocknum + result = t->blocknum_freelist_head; + BLOCKNUM next = t->block_translation[result.b].u.next_free_blocknum; + t->blocknum_freelist_head = next; + } + //Verify the blocknum is free + paranoid_invariant(t->block_translation[result.b].size == size_is_free); + //blocknum is not free anymore + t->block_translation[result.b].u.diskoff = diskoff_unused; + t->block_translation[result.b].size = 0; + _verify_valid_freeable_blocknum(t, result); + *res = result; + ft_set_dirty(ft, false); +} + +void block_table::allocate_blocknum(BLOCKNUM *res, FT ft) { + _mutex_lock(); + _allocate_blocknum_unlocked(res, ft); + _mutex_unlock(); +} + +void block_table::_free_blocknum_in_translation(struct translation *t, BLOCKNUM b) { + _verify_valid_freeable_blocknum(t, b); + paranoid_invariant(t->block_translation[b.b].size != size_is_free); + + t->block_translation[b.b].size = size_is_free; + t->block_translation[b.b].u.next_free_blocknum = t->blocknum_freelist_head; + t->blocknum_freelist_head = b; +} + 
+// Effect: Free a blocknum. +// If the blocknum holds the only reference to a block on disk, free that block +void block_table::_free_blocknum_unlocked(BLOCKNUM *bp, FT ft, bool for_checkpoint) { + toku_mutex_assert_locked(&_mutex); + BLOCKNUM b = *bp; + bp->b = 0; //Remove caller's reference. + + struct block_translation_pair old_pair = _current.block_translation[b.b]; + + _free_blocknum_in_translation(&_current, b); + if (for_checkpoint) { + paranoid_invariant(ft->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS); + _free_blocknum_in_translation(&_inprogress, b); + } + + //If the size is 0, no disk block has ever been assigned to this blocknum. + if (old_pair.size > 0) { + //Free the old block if it is not still in use by the checkpoint in progress or the previous checkpoint + bool cannot_free = (bool) + (_translation_prevents_freeing(&_inprogress, b, &old_pair) || + _translation_prevents_freeing(&_checkpointed, b, &old_pair)); + if (!cannot_free) { + _bt_block_allocator.free_block(old_pair.u.diskoff); + } + } + else { + paranoid_invariant(old_pair.size==0); + paranoid_invariant(old_pair.u.diskoff == diskoff_unused); + } + ft_set_dirty(ft, for_checkpoint); +} + +void block_table::free_blocknum(BLOCKNUM *bp, FT ft, bool for_checkpoint) { + _mutex_lock(); + _free_blocknum_unlocked(bp, ft, for_checkpoint); + _mutex_unlock(); +} + +// Verify there are no free blocks. 
+void block_table::verify_no_free_blocknums() { + invariant(_current.blocknum_freelist_head.b == freelist_null.b); +} + +// Frees blocknums that have a size of 0 and unused diskoff +// Currently used for eliminating unused cached rollback log nodes +void block_table::free_unused_blocknums(BLOCKNUM root) { + _mutex_lock(); + int64_t smallest = _current.smallest_never_used_blocknum.b; + for (int64_t i=RESERVED_BLOCKNUMS; i < smallest; i++) { + if (i == root.b) { + continue; + } + BLOCKNUM b = make_blocknum(i); + if (_current.block_translation[b.b].size == 0) { + invariant(_current.block_translation[b.b].u.diskoff == diskoff_unused); + _free_blocknum_in_translation(&_current, b); + } + } + _mutex_unlock(); +} + +bool block_table::_no_data_blocks_except_root(BLOCKNUM root) { + bool ok = true; + _mutex_lock(); + int64_t smallest = _current.smallest_never_used_blocknum.b; + if (root.b < RESERVED_BLOCKNUMS) { + ok = false; + goto cleanup; + } + for (int64_t i = RESERVED_BLOCKNUMS; i < smallest; i++) { + if (i == root.b) { + continue; + } + BLOCKNUM b = make_blocknum(i); + if (_current.block_translation[b.b].size != size_is_free) { + ok = false; + goto cleanup; + } + } + cleanup: + _mutex_unlock(); + return ok; +} + +// Verify there are no data blocks except root. +// TODO(leif): This actually takes a lock, but I don't want to fix all the callers right now. +void block_table::verify_no_data_blocks_except_root(BLOCKNUM UU(root)) { + paranoid_invariant(_no_data_blocks_except_root(root)); +} + +bool block_table::_blocknum_allocated(BLOCKNUM b) { + _mutex_lock(); + struct translation *t = &_current; + _verify_valid_blocknum(t, b); + bool ok = t->block_translation[b.b].size != size_is_free; + _mutex_unlock(); + return ok; +} + +// Verify a blocknum is currently allocated. 
+void block_table::verify_blocknum_allocated(BLOCKNUM UU(b)) { + paranoid_invariant(_blocknum_allocated(b)); +} + +// Only used by toku_dump_translation table (debug info) +void block_table::_dump_translation_internal(FILE *f, struct translation *t) { + if (t->block_translation) { + BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION); + fprintf(f, " length_of_array[%" PRId64 "]", t->length_of_array); + fprintf(f, " smallest_never_used_blocknum[%" PRId64 "]", t->smallest_never_used_blocknum.b); + fprintf(f, " blocknum_free_list_head[%" PRId64 "]", t->blocknum_freelist_head.b); + fprintf(f, " size_on_disk[%" PRId64 "]", t->block_translation[b.b].size); + fprintf(f, " location_on_disk[%" PRId64 "]\n", t->block_translation[b.b].u.diskoff); + int64_t i; + for (i=0; i<t->length_of_array; i++) { + fprintf(f, " %" PRId64 ": %" PRId64 " %" PRId64 "\n", i, t->block_translation[i].u.diskoff, t->block_translation[i].size); + } + fprintf(f, "\n"); + } else { + fprintf(f, " does not exist\n"); + } +} + +// Only used by toku_ft_dump which is only for debugging purposes +// "pretty" just means we use tabs so we can parse output easier later +void block_table::dump_translation_table_pretty(FILE *f) { + _mutex_lock(); + struct translation *t = &_checkpointed; + assert(t->block_translation != nullptr); + for (int64_t i = 0; i < t->length_of_array; ++i) { + fprintf(f, "%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", i, t->block_translation[i].u.diskoff, t->block_translation[i].size); + } + _mutex_unlock(); +} + +// Only used by toku_ft_dump which is only for debugging purposes +void block_table::dump_translation_table(FILE *f) { + _mutex_lock(); + fprintf(f, "Current block translation:"); + _dump_translation_internal(f, &_current); + fprintf(f, "Checkpoint in progress block translation:"); + _dump_translation_internal(f, &_inprogress); + fprintf(f, "Checkpointed block translation:"); + _dump_translation_internal(f, &_checkpointed); + _mutex_unlock(); +} + +// Only used by ftdump +void 
block_table::blocknum_dump_translation(BLOCKNUM b) { + _mutex_lock(); + + struct translation *t = &_current; + if (b.b < t->length_of_array) { + struct block_translation_pair *bx = &t->block_translation[b.b]; + printf("%" PRId64 ": %" PRId64 " %" PRId64 "\n", b.b, bx->u.diskoff, bx->size); + } + _mutex_unlock(); +} + +// Must not call this function when anything else is using the blocktable. +// No one may use the blocktable afterwards. +void block_table::destroy(void) { + // TODO: translation.destroy(); + toku_free(_current.block_translation); + toku_free(_inprogress.block_translation); + toku_free(_checkpointed.block_translation); + + _bt_block_allocator.destroy(); + toku_mutex_destroy(&_mutex); + nb_mutex_destroy(&_safe_file_size_lock); +} + +int block_table::_translation_deserialize_from_buffer(struct translation *t, + DISKOFF location_on_disk, + uint64_t size_on_disk, + // out: buffer with serialized translation + unsigned char *translation_buffer) { + int r = 0; + assert(location_on_disk != 0); + t->type = TRANSLATION_CHECKPOINTED; + + // check the checksum + uint32_t x1764 = toku_x1764_memory(translation_buffer, size_on_disk - 4); + uint64_t offset = size_on_disk - 4; + uint32_t stored_x1764 = toku_dtoh32(*(int*)(translation_buffer + offset)); + if (x1764 != stored_x1764) { + fprintf(stderr, "Translation table checksum failure: calc=0x%08x read=0x%08x\n", x1764, stored_x1764); + r = TOKUDB_BAD_CHECKSUM; + goto exit; + } + + struct rbuf rb; + rb.buf = translation_buffer; + rb.ndone = 0; + rb.size = size_on_disk-4;//4==checksum + + t->smallest_never_used_blocknum = rbuf_blocknum(&rb); + t->length_of_array = t->smallest_never_used_blocknum.b; + invariant(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS); + t->blocknum_freelist_head = rbuf_blocknum(&rb); + XMALLOC_N(t->length_of_array, t->block_translation); + for (int64_t i = 0; i < t->length_of_array; i++) { + t->block_translation[i].u.diskoff = rbuf_DISKOFF(&rb); + t->block_translation[i].size = 
rbuf_DISKOFF(&rb); + } + invariant(_calculate_size_on_disk(t) == (int64_t) size_on_disk); + invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size == (int64_t) size_on_disk); + invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff == location_on_disk); + +exit: + return r; +} + +int block_table::iterate(enum translation_type type, + BLOCKTABLE_CALLBACK f, void *extra, bool data_only, bool used_only) { + struct translation *src; + + int r = 0; + switch (type) { + case TRANSLATION_CURRENT: + src = &_current; + break; + case TRANSLATION_INPROGRESS: + src = &_inprogress; + break; + case TRANSLATION_CHECKPOINTED: + src = &_checkpointed; + break; + default: + r = EINVAL; + } + + struct translation fakecurrent; + memset(&fakecurrent, 0, sizeof(struct translation)); + + struct translation *t = &fakecurrent; + if (r == 0) { + _mutex_lock(); + _copy_translation(t, src, TRANSLATION_DEBUG); + t->block_translation[RESERVED_BLOCKNUM_TRANSLATION] = + src->block_translation[RESERVED_BLOCKNUM_TRANSLATION]; + _mutex_unlock(); + int64_t i; + for (i=0; i<t->smallest_never_used_blocknum.b; i++) { + struct block_translation_pair pair = t->block_translation[i]; + if (data_only && i< RESERVED_BLOCKNUMS) continue; + if (used_only && pair.size <= 0) continue; + r = f(make_blocknum(i), pair.size, pair.u.diskoff, extra); + if (r!=0) break; + } + toku_free(t->block_translation); + } + return r; +} + +typedef struct { + int64_t used_space; + int64_t total_space; +} frag_extra; + +static int frag_helper(BLOCKNUM UU(b), int64_t size, int64_t address, void *extra) { + frag_extra *info = (frag_extra *) extra; + + if (size + address > info->total_space) + info->total_space = size + address; + info->used_space += size; + return 0; +} + +void block_table::internal_fragmentation(int64_t *total_sizep, int64_t *used_sizep) { + frag_extra info = { 0, 0 }; + int r = iterate(TRANSLATION_CHECKPOINTED, frag_helper, &info, false, true); + assert_zero(r); + + if (total_sizep) 
*total_sizep = info.total_space; + if (used_sizep) *used_sizep = info.used_space; +} + +void block_table::_realloc_descriptor_on_disk_unlocked(DISKOFF size, DISKOFF *offset, FT ft) { + toku_mutex_assert_locked(&_mutex); + BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_DESCRIPTOR); + _realloc_on_disk_internal(b, size, offset, ft, false, 0); +} + +void block_table::realloc_descriptor_on_disk(DISKOFF size, DISKOFF *offset, FT ft, int fd) { + _mutex_lock(); + _realloc_descriptor_on_disk_unlocked(size, offset, ft); + _ensure_safe_write_unlocked(fd, size, *offset); + _mutex_unlock(); +} + +void block_table::get_descriptor_offset_size(DISKOFF *offset, DISKOFF *size) { + _mutex_lock(); + BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_DESCRIPTOR); + _translate_blocknum_to_offset_size_unlocked(b, offset, size); + _mutex_unlock(); +} + +void block_table::get_fragmentation_unlocked(TOKU_DB_FRAGMENTATION report) { + // Requires: blocktable lock is held. + // Requires: report->file_size_bytes is already filled in. + + // Count the headers. 
+ report->data_bytes = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE; + report->data_blocks = 1; + report->checkpoint_bytes_additional = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE; + report->checkpoint_blocks_additional = 1; + + struct translation *current = &_current; + for (int64_t i = 0; i < current->length_of_array; i++) { + struct block_translation_pair *pair = ¤t->block_translation[i]; + if (pair->size > 0) { + report->data_bytes += pair->size; + report->data_blocks++; + } + } + + struct translation *checkpointed = &_checkpointed; + for (int64_t i = 0; i < checkpointed->length_of_array; i++) { + struct block_translation_pair *pair = &checkpointed->block_translation[i]; + if (pair->size > 0 && !(i < current->length_of_array && + current->block_translation[i].size > 0 && + current->block_translation[i].u.diskoff == pair->u.diskoff)) { + report->checkpoint_bytes_additional += pair->size; + report->checkpoint_blocks_additional++; + } + } + + struct translation *inprogress = &_inprogress; + for (int64_t i = 0; i < inprogress->length_of_array; i++) { + struct block_translation_pair *pair = &inprogress->block_translation[i]; + if (pair->size > 0 && !(i < current->length_of_array && + current->block_translation[i].size > 0 && + current->block_translation[i].u.diskoff == pair->u.diskoff) && + !(i < checkpointed->length_of_array && + checkpointed->block_translation[i].size > 0 && + checkpointed->block_translation[i].u.diskoff == pair->u.diskoff)) { + report->checkpoint_bytes_additional += pair->size; + report->checkpoint_blocks_additional++; + } + } + + _bt_block_allocator.get_unused_statistics(report); +} + +void block_table::get_info64(struct ftinfo64 *s) { + _mutex_lock(); + + struct translation *current = &_current; + s->num_blocks_allocated = current->length_of_array; + s->num_blocks_in_use = 0; + s->size_allocated = 0; + s->size_in_use = 0; + + for (int64_t i = 0; i < current->length_of_array; ++i) { + struct block_translation_pair *block = 
¤t->block_translation[i]; + if (block->size != size_is_free) { + ++s->num_blocks_in_use; + s->size_in_use += block->size; + if (block->u.diskoff != diskoff_unused) { + uint64_t limit = block->u.diskoff + block->size; + if (limit > s->size_allocated) { + s->size_allocated = limit; + } + } + } + } + + _mutex_unlock(); +} + +int block_table::iterate_translation_tables(uint64_t checkpoint_count, + int (*iter)(uint64_t checkpoint_count, + int64_t total_num_rows, + int64_t blocknum, + int64_t diskoff, + int64_t size, + void *extra), + void *iter_extra) { + int error = 0; + _mutex_lock(); + + int64_t total_num_rows = _current.length_of_array + _checkpointed.length_of_array; + for (int64_t i = 0; error == 0 && i < _current.length_of_array; ++i) { + struct block_translation_pair *block = &_current.block_translation[i]; + error = iter(checkpoint_count, total_num_rows, i, block->u.diskoff, block->size, iter_extra); + } + for (int64_t i = 0; error == 0 && i < _checkpointed.length_of_array; ++i) { + struct block_translation_pair *block = &_checkpointed.block_translation[i]; + error = iter(checkpoint_count - 1, total_num_rows, i, block->u.diskoff, block->size, iter_extra); + } + + _mutex_unlock(); + return error; +} diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_table.h b/storage/tokudb/PerconaFT/ft/serialize/block_table.h new file mode 100644 index 00000000000..8d391674540 --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/block_table.h @@ -0,0 +1,285 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include <db.h> + +#include "portability/toku_stdint.h" +#include "portability/toku_pthread.h" + +#include "ft/serialize/block_allocator.h" +#include "util/nb_mutex.h" + +struct ft; + +typedef struct blocknum_s { int64_t b; } BLOCKNUM; + +// Offset in a disk. -1 is the 'null' pointer. +typedef int64_t DISKOFF; + +// Unmovable reserved first, then reallocable. +// We reserve one blocknum for the translation table itself. 
+enum { + RESERVED_BLOCKNUM_NULL = 0, + RESERVED_BLOCKNUM_TRANSLATION = 1, + RESERVED_BLOCKNUM_DESCRIPTOR = 2, + RESERVED_BLOCKNUMS +}; + +typedef int (*BLOCKTABLE_CALLBACK)(BLOCKNUM b, int64_t size, int64_t address, void *extra); + +static inline BLOCKNUM make_blocknum(int64_t b) { + BLOCKNUM result = { .b = b }; + return result; +} +static const BLOCKNUM ROLLBACK_NONE = { .b = 0 }; + +/** + * There are three copies of the translation table (btt) in the block table: + * + * checkpointed Is initialized by deserializing from disk, + * and is the only version ever read from disk. + * When read from disk it is copied to current. + * It is immutable. It can be replaced by an inprogress btt. + * + * inprogress Is only filled by copying from current, + * and is the only version ever serialized to disk. + * (It is serialized to disk on checkpoint and clean shutdown.) + * At end of checkpoint it replaces 'checkpointed'. + * During a checkpoint, any 'pending' dirty writes will update + * inprogress. + * + * current Is initialized by copying from checkpointed, + * is the only version ever modified while the database is in use, + * and is the only version ever copied to inprogress. + * It is never stored on disk. 
+ */ +class block_table { +public: + enum translation_type { + TRANSLATION_NONE = 0, + TRANSLATION_CURRENT, + TRANSLATION_INPROGRESS, + TRANSLATION_CHECKPOINTED, + TRANSLATION_DEBUG + }; + + void create(); + + int create_from_buffer(int fd, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer); + + void destroy(); + + // Checkpointing + void note_start_checkpoint_unlocked(); + void note_end_checkpoint(int fd); + void note_skipped_checkpoint(); + void maybe_truncate_file_on_open(int fd); + + // Blocknums + void allocate_blocknum(BLOCKNUM *res, struct ft *ft); + void realloc_on_disk(BLOCKNUM b, DISKOFF size, DISKOFF *offset, struct ft *ft, int fd, bool for_checkpoint, uint64_t heat); + void free_blocknum(BLOCKNUM *b, struct ft *ft, bool for_checkpoint); + void translate_blocknum_to_offset_size(BLOCKNUM b, DISKOFF *offset, DISKOFF *size); + void free_unused_blocknums(BLOCKNUM root); + void realloc_descriptor_on_disk(DISKOFF size, DISKOFF *offset, struct ft *ft, int fd); + void get_descriptor_offset_size(DISKOFF *offset, DISKOFF *size); + + // External verfication + void verify_blocknum_allocated(BLOCKNUM b); + void verify_no_data_blocks_except_root(BLOCKNUM root); + void verify_no_free_blocknums(); + + // Serialization + void serialize_translation_to_wbuf(int fd, struct wbuf *w, int64_t *address, int64_t *size); + + // DEBUG ONLY (ftdump included), tests included + void blocknum_dump_translation(BLOCKNUM b); + void dump_translation_table_pretty(FILE *f); + void dump_translation_table(FILE *f); + void block_free(uint64_t offset); + + int iterate(enum translation_type type, BLOCKTABLE_CALLBACK f, void *extra, bool data_only, bool used_only); + void internal_fragmentation(int64_t *total_sizep, int64_t *used_sizep); + + // Requires: blocktable lock is held. + // Requires: report->file_size_bytes is already filled in. 
+ void get_fragmentation_unlocked(TOKU_DB_FRAGMENTATION report); + + int64_t get_blocks_in_use_unlocked(); + + void get_info64(struct ftinfo64 *); + + int iterate_translation_tables(uint64_t, int (*)(uint64_t, int64_t, int64_t, int64_t, int64_t, void *), void *); + +private: + struct block_translation_pair { + // If in the freelist, use next_free_blocknum, otherwise diskoff. + union { + DISKOFF diskoff; + BLOCKNUM next_free_blocknum; + } u; + + // Set to 0xFFFFFFFFFFFFFFFF for free + DISKOFF size; + }; + + // This is the BTT (block translation table) + // When the translation (btt) is stored on disk: + // In Header: + // size_on_disk + // location_on_disk + // In block translation table (in order): + // smallest_never_used_blocknum + // blocknum_freelist_head + // array + // a checksum + struct translation { + enum translation_type type; + + // Number of elements in array (block_translation). always >= smallest_never_used_blocknum + int64_t length_of_array; + BLOCKNUM smallest_never_used_blocknum; + + // Next (previously used) unused blocknum (free list) + BLOCKNUM blocknum_freelist_head; + struct block_translation_pair *block_translation; + + // size_on_disk is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].size + // location_on is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff + }; + + void _create_internal(); + int _translation_deserialize_from_buffer(struct translation *t, // destination into which to deserialize + DISKOFF location_on_disk, // location of translation_buffer + uint64_t size_on_disk, + unsigned char * translation_buffer); // buffer with serialized translation + + void _copy_translation(struct translation *dst, struct translation *src, enum translation_type newtype); + void _maybe_optimize_translation(struct translation *t); + void _maybe_expand_translation(struct translation *t); + bool _translation_prevents_freeing(struct translation *t, BLOCKNUM b, struct block_translation_pair *old_pair); + void 
_free_blocknum_in_translation(struct translation *t, BLOCKNUM b); + int64_t _calculate_size_on_disk(struct translation *t); + bool _pair_is_unallocated(struct block_translation_pair *pair); + void _alloc_inprogress_translation_on_disk_unlocked(); + void _dump_translation_internal(FILE *f, struct translation *t); + + // Blocknum management + void _allocate_blocknum_unlocked(BLOCKNUM *res, struct ft *ft); + void _free_blocknum_unlocked(BLOCKNUM *bp, struct ft *ft, bool for_checkpoint); + void _realloc_descriptor_on_disk_unlocked(DISKOFF size, DISKOFF *offset, struct ft *ft); + void _realloc_on_disk_internal(BLOCKNUM b, DISKOFF size, DISKOFF *offset, struct ft *ft, bool for_checkpoint, uint64_t heat); + void _translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, DISKOFF *offset, DISKOFF *size); + + // File management + void _maybe_truncate_file(int fd, uint64_t size_needed_before); + void _ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOFF block_offset); + + // Verification + bool _is_valid_blocknum(struct translation *t, BLOCKNUM b); + void _verify_valid_blocknum(struct translation *t, BLOCKNUM b); + bool _is_valid_freeable_blocknum(struct translation *t, BLOCKNUM b); + void _verify_valid_freeable_blocknum(struct translation *t, BLOCKNUM b); + bool _no_data_blocks_except_root(BLOCKNUM root); + bool _blocknum_allocated(BLOCKNUM b); + + // Locking + // + // TODO: Move the lock to the FT + void _mutex_lock(); + void _mutex_unlock(); + + // The current translation is the one used by client threads. + // It is not represented on disk. + struct translation _current; + + // The translation used by the checkpoint currently in progress. + // If the checkpoint thread allocates a block, it must also update the current translation. + struct translation _inprogress; + + // The translation for the data that shall remain inviolate on disk until the next checkpoint finishes, + // after which any blocks used only in this translation can be freed. 
+ struct translation _checkpointed; + + // The in-memory data structure for block allocation. + // There is no on-disk data structure for block allocation. + // Note: This is *allocation* not *translation* - the block allocator is unaware of which + // blocks are used for which translation, but simply allocates and deallocates blocks. + block_allocator _bt_block_allocator; + toku_mutex_t _mutex; + struct nb_mutex _safe_file_size_lock; + bool _checkpoint_skipped; + uint64_t _safe_file_size; + + // Because the lock is in a weird place right now + friend void toku_ft_lock(struct ft *ft); + friend void toku_ft_unlock(struct ft *ft); +}; + +// For serialize / deserialize + +#include "ft/serialize/wbuf.h" + +static inline void wbuf_BLOCKNUM (struct wbuf *w, BLOCKNUM b) { + wbuf_ulonglong(w, b.b); +} + +static inline void wbuf_nocrc_BLOCKNUM (struct wbuf *w, BLOCKNUM b) { + wbuf_nocrc_ulonglong(w, b.b); +} + +static inline void wbuf_DISKOFF(struct wbuf *wb, DISKOFF off) { + wbuf_ulonglong(wb, (uint64_t) off); +} + +#include "ft/serialize/rbuf.h" + +static inline DISKOFF rbuf_DISKOFF(struct rbuf *rb) { + return rbuf_ulonglong(rb); +} + +static inline BLOCKNUM rbuf_blocknum(struct rbuf *rb) { + BLOCKNUM result = make_blocknum(rbuf_longlong(rb)); + return result; +} + +static inline void rbuf_ma_BLOCKNUM(struct rbuf *rb, memarena *UU(ma), BLOCKNUM *blocknum) { + *blocknum = rbuf_blocknum(rb); +} diff --git a/storage/tokudb/PerconaFT/ft/serialize/compress.cc b/storage/tokudb/PerconaFT/ft/serialize/compress.cc new file mode 100644 index 00000000000..1719b6b7cb5 --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/compress.cc @@ -0,0 +1,257 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
+ + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include <toku_portability.h> +#include <util/scoped_malloc.h> + +#include <zlib.h> +#include <lzma.h> +#include <snappy.h> + +#include "compress.h" +#include "memory.h" +#include "quicklz.h" +#include "toku_assert.h" + +static inline enum toku_compression_method +normalize_compression_method(enum toku_compression_method method) +// Effect: resolve "friendly" names like "fast" and "small" into their real values. 
+{ + switch (method) { + case TOKU_DEFAULT_COMPRESSION_METHOD: + case TOKU_FAST_COMPRESSION_METHOD: + return TOKU_QUICKLZ_METHOD; + case TOKU_SMALL_COMPRESSION_METHOD: + return TOKU_LZMA_METHOD; + default: + return method; // everything else is fine + } +} + +size_t toku_compress_bound (enum toku_compression_method a, size_t size) +// See compress.h for the specification of this function. +{ + a = normalize_compression_method(a); + switch (a) { + case TOKU_NO_COMPRESSION: + return size + 1; + case TOKU_LZMA_METHOD: + return 1+lzma_stream_buffer_bound(size); // We need one extra for the rfc1950-style header byte (bits -03 are TOKU_LZMA_METHOD (1), bits 4-7 are the compression level) + case TOKU_QUICKLZ_METHOD: + return size+400 + 1; // quicklz manual says 400 bytes is enough. We need one more byte for the rfc1950-style header byte. bits 0-3 are 9, bits 4-7 are the QLZ_COMPRESSION_LEVEL. + case TOKU_ZLIB_METHOD: + return compressBound (size); + case TOKU_ZLIB_WITHOUT_CHECKSUM_METHOD: + return 2+deflateBound(nullptr, size); // We need one extra for the rfc1950-style header byte, and one extra to store windowBits (a bit over cautious about future upgrades maybe). + case TOKU_SNAPPY_METHOD: + return (1 + snappy::MaxCompressedLength(size)); + default: + break; + } + // fall through for bad enum (thus compiler can warn us if we didn't use all the enums + assert(0); return 0; +} + +void toku_compress (enum toku_compression_method a, + // the following types and naming conventions come from zlib.h + Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen) +// See compress.h for the specification of this function. 
+{ + static const int zlib_compression_level = 5; + static const int zlib_without_checksum_windowbits = -15; + + a = normalize_compression_method(a); + assert(sourceLen < (1LL << 32)); + switch (a) { + case TOKU_NO_COMPRESSION: + dest[0] = TOKU_NO_COMPRESSION; + memcpy(dest + 1, source, sourceLen); + *destLen = sourceLen + 1; + return; + case TOKU_ZLIB_METHOD: { + int r = compress2(dest, destLen, source, sourceLen, zlib_compression_level); + assert(r == Z_OK); + assert((dest[0]&0xF) == TOKU_ZLIB_METHOD); + return; + } + case TOKU_QUICKLZ_METHOD: { + if (sourceLen==0) { + // quicklz requires at least one byte, so we handle this ourselves + assert(1 <= *destLen); + *destLen = 1; + } else { + toku::scoped_calloc qsc_buf(sizeof(qlz_state_compress)); + qlz_state_compress *qsc = reinterpret_cast<qlz_state_compress *>(qsc_buf.get()); + size_t actual_destlen = qlz_compress(source, (char*)(dest+1), sourceLen, qsc); + assert(actual_destlen + 1 <= *destLen); + // add one for the rfc1950-style header byte. + *destLen = actual_destlen + 1; + } + // Fill in that first byte + dest[0] = TOKU_QUICKLZ_METHOD + (QLZ_COMPRESSION_LEVEL << 4); + return; + } + case TOKU_LZMA_METHOD: { + const int lzma_compression_level = 2; + if (sourceLen==0) { + // lzma version 4.999 requires at least one byte, so we'll do it ourselves. 
+ assert(1<=*destLen); + *destLen = 1; + } else { + size_t out_pos = 1; + lzma_ret r = lzma_easy_buffer_encode(lzma_compression_level, + LZMA_CHECK_NONE, NULL, + source, sourceLen, + dest, &out_pos, *destLen); + assert(out_pos < *destLen); + if (r != LZMA_OK) { + fprintf(stderr, "lzma_easy_buffer_encode() returned %d\n", (int) r); + } + assert(r==LZMA_OK); + *destLen = out_pos; + } + dest[0] = TOKU_LZMA_METHOD + (lzma_compression_level << 4); + return; + } + case TOKU_ZLIB_WITHOUT_CHECKSUM_METHOD: { + z_stream strm; + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; + strm.next_in = const_cast<Bytef *>(source); + strm.avail_in = sourceLen; + int r = deflateInit2(&strm, zlib_compression_level, Z_DEFLATED, + zlib_without_checksum_windowbits, 8, Z_DEFAULT_STRATEGY); + lazy_assert(r == Z_OK); + strm.next_out = dest + 2; + strm.avail_out = *destLen - 2; + r = deflate(&strm, Z_FINISH); + lazy_assert(r == Z_STREAM_END); + r = deflateEnd(&strm); + lazy_assert(r == Z_OK); + *destLen = strm.total_out + 2; + dest[0] = TOKU_ZLIB_WITHOUT_CHECKSUM_METHOD + (zlib_compression_level << 4); + dest[1] = zlib_without_checksum_windowbits; + return; + } + case TOKU_SNAPPY_METHOD: { + snappy::RawCompress((char*)source, sourceLen, (char*)dest + 1, destLen); + *destLen += 1; + dest[0] = TOKU_SNAPPY_METHOD; + return; + } + default: + break; + } + // default fall through to error. + assert(0); +} + +void toku_decompress (Bytef *dest, uLongf destLen, + const Bytef *source, uLongf sourceLen) +// See compress.h for the specification of this function. +{ + assert(sourceLen>=1); // need at least one byte for the RFC header. 
+ switch (source[0] & 0xF) { + case TOKU_NO_COMPRESSION: + memcpy(dest, source + 1, sourceLen - 1); + return; + case TOKU_ZLIB_METHOD: { + uLongf actual_destlen = destLen; + int r = uncompress(dest, &actual_destlen, source, sourceLen); + assert(r == Z_OK); + assert(actual_destlen == destLen); + return; + } + case TOKU_QUICKLZ_METHOD: + if (sourceLen>1) { + toku::scoped_calloc state_buf(sizeof(qlz_state_decompress)); + qlz_state_decompress *qsd = reinterpret_cast<qlz_state_decompress *>(state_buf.get()); + uLongf actual_destlen = qlz_decompress((char*)source+1, dest, qsd); + assert(actual_destlen == destLen); + } else { + // length 1 means there is no data, so do nothing. + assert(destLen==0); + } + return; + case TOKU_LZMA_METHOD: { + if (sourceLen>1) { + uint64_t memlimit = UINT64_MAX; + size_t out_pos = 0; + size_t in_pos = 1; + lzma_ret r = lzma_stream_buffer_decode(&memlimit, // memlimit, use UINT64_MAX to disable this check + 0, // flags + NULL, // allocator + source, &in_pos, sourceLen, + dest, &out_pos, destLen); + assert(r==LZMA_OK); + assert(out_pos == destLen); + } else { + // length 1 means there is no data, so do nothing. + assert(destLen==0); + } + return; + } + case TOKU_ZLIB_WITHOUT_CHECKSUM_METHOD: { + z_stream strm; + strm.next_in = const_cast<Bytef *>(source + 2); + strm.avail_in = sourceLen - 2; + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; + char windowBits = source[1]; + int r = inflateInit2(&strm, windowBits); + lazy_assert(r == Z_OK); + strm.next_out = dest; + strm.avail_out = destLen; + r = inflate(&strm, Z_FINISH); + lazy_assert(r == Z_STREAM_END); + r = inflateEnd(&strm); + lazy_assert(r == Z_OK); + return; + } + case TOKU_SNAPPY_METHOD: { + bool r = snappy::RawUncompress((char*)source + 1, sourceLen - 1, (char*)dest); + assert(r); + return; + } + } + // default fall through to error. 
+ assert(0); +} diff --git a/storage/tokudb/PerconaFT/ft/serialize/compress.h b/storage/tokudb/PerconaFT/ft/serialize/compress.h new file mode 100644 index 00000000000..74307985e75 --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/compress.h @@ -0,0 +1,78 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include <zlib.h> +#include <db.h> + +// The following provides an abstraction of quicklz and zlib. 
+// We offer three compression methods: ZLIB, QUICKLZ, and LZMA, as well as a "no compression" option. These options are declared in make_tdb.c. +// The resulting byte string includes enough information for us to decompress it. That is, we can tell whether it's z-compressed or qz-compressed or xz-compressed. + +size_t toku_compress_bound (enum toku_compression_method a, size_t size); +// Effect: Return the number of bytes needed to compress a buffer of size SIZE using compression method A. +// Typically, the result is a little bit larger than SIZE, since some data cannot be compressed. +// Usage note: It may help to know roughly how much space is involved. +// zlib's bound is something like (size + (size>>12) + (size>>14) + (size>>25) + 13. +// quicklz's bound is something like size+400. + +void toku_compress (enum toku_compression_method a, + // the following types and naming conventions come from zlib.h + Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen); +// Effect: Using compression method A, compress SOURCE into DEST. The number of bytes to compress is passed in SOURCELEN. +// On input: *destLen is the size of the buffer. +// On output: *destLen is the size of the actual compressed data. +// Usage note: sourceLen may be be zero (unlike for quicklz, which requires sourceLen>0). +// Requires: The buffer must be big enough to hold the compressed data. (That is *destLen >= compressBound(a, sourceLen)) +// Requires: sourceLen < 2^32. +// Usage note: Although we *try* to assert if the DESTLEN isn't big enough, it's possible that it's too late by then (in the case of quicklz which offers +// no way to avoid a buffer overrun.) So we require that that DESTLEN is big enough. +// Rationale: zlib's argument order is DEST then SOURCE with the size of the buffer passed in *destLen, and the size of the result returned in *destLen. 
+// quicklz's argument order is SOURCE then DEST with the size returned (and it has no way to verify that an overright didn't happen). +// We use zlib's calling conventions partly because it is safer, and partly because it is more established. +// We also use zlib's ugly camel case convention for destLen and sourceLen. +// Unlike zlib, we return no error codes. Instead, we require that the data be OK and the size of the buffers is OK, and assert if there's a problem. + +void toku_decompress (Bytef *dest, uLongf destLen, + const Bytef *source, uLongf sourceLen); +// Effect: Decompress source (length sourceLen) into dest (length destLen) +// This function can decompress data compressed with either zlib or quicklz compression methods (calling toku_compress(), which puts an appropriate header on so we know which it is.) +// Requires: destLen is equal to the actual decompressed size of the data. +// Requires: The source must have been properly compressed. diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft-node-deserialize.cc b/storage/tokudb/PerconaFT/ft/serialize/ft-node-deserialize.cc new file mode 100644 index 00000000000..02a9dfd085c --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/ft-node-deserialize.cc @@ -0,0 +1,186 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "ft/node.h" +#include "ft/ft-internal.h" +#include "ft/serialize/ft_node-serialize.h" + +/* + * ft-node-deserialize.c - + * This file contains functions used by deserializtion + * code paths in and out of the engine. The functions can, + * essentially, be broken up into two types. Some of these + * functions return error codes based expected values inside + * the fractal tree node, others merely read the specific + * quantities of bytes out of the buffer. It is expeceted + * that these will be called in the correct order by users + * of these functions/this API. + * + */ + +// Sets initial values for the given fractal tree node to be +// deserialized +void +initialize_ftnode(FTNODE node, BLOCKNUM blocknum) +{ + node->fullhash = 0xDEADBEEF; // <CER> Is this 'spoof' ok? + node->blocknum = blocknum; + node->dirty = 0; + node->bp = NULL; + // <CER> Can we use this initialization as a correctness assert in + // a later function? + node->layout_version_read_from_disk = 0; +} + +/************************ + * TODO: In other deserialization code, we check the rb size member. 
We + * verify that it is greater than or equal to 24. Ignoring this magic + * number for a moment, should we put this check in its own function? * +*************************/ + + +// Read and check the 'magic' bytes on disk. Returns an error if +// the magic does not match. +int +read_and_check_magic(struct rbuf *rb) +{ + int r = 0; + const void *magic; + rbuf_literal_bytes(rb, &magic, 8); + if (memcmp(magic, "tokuleaf", 8)!=0 && + memcmp(magic, "tokunode", 8)!=0) { + r = DB_BADFORMAT; // TODO: Return more meaningful error. + } + + return r; +} + +// Read the version number from the given buffer +// and returns an error if the version is too old. +int +read_and_check_version(FTNODE node, struct rbuf *rb) +{ + int r = 0; + int version = rbuf_int(rb); + node->layout_version_read_from_disk = version; + if (version < FT_LAYOUT_MIN_SUPPORTED_VERSION) { + r = 1; // TODO: Better error reporting. + } + + return r; +} + +// Reads the basic version, build, and child info from +// the given buffer. +void +read_node_info(FTNODE node, struct rbuf *rb, int version) +{ + node->layout_version = version; + node->layout_version_original = rbuf_int(rb); + node->build_id = rbuf_int(rb); + node->n_children = rbuf_int(rb); +} + +// Allocates the partitions based on the given node's nubmer +// of children. It then reads, out of the given buffer, +// the start and size of each child partition. +// TODO: Should these be two seperate functions? +void +allocate_and_read_partition_offsets(FTNODE node, struct rbuf *rb, FTNODE_DISK_DATA *ndd) +{ + XMALLOC_N(node->n_children, node->bp); + // TODO: Fix this to use xmalloc_n + XMALLOC_N(node->n_children, *ndd); + // Read the partition locations. + for (int i = 0; i < node->n_children; i++) { + BP_START(*ndd, i) = rbuf_int(rb); + BP_SIZE (*ndd, i) = rbuf_int(rb); + } +} + +// Compares checksum of stored (in the given buffer) checksum +// and the checksum of the buffer itself. 
If these are NOT +// equal, this function returns an appropriate error code. +int +check_node_info_checksum(struct rbuf *rb) +{ + int r = 0; + // Verify checksum of header stored. + uint32_t checksum = toku_x1764_memory(rb->buf, rb->ndone); + uint32_t stored_checksum = rbuf_int(rb); + + if (stored_checksum != checksum) { + // TODO: dump_bad_block(rb->buf, rb->size); + r = TOKUDB_BAD_CHECKSUM; + } + + return r; +} + +// Reads node info from older (13 and 14) fractal tree nodes +// out of the given buffer. +void +read_legacy_node_info(FTNODE node, struct rbuf *rb, int version) +{ + (void)rbuf_int(rb); // 1. nodesize + node->flags = rbuf_int(rb); // 2. flags + node->height = rbuf_int(rb); // 3. height + + // If the version is less than 14, there are two extra ints here. + // we would need to ignore them if they are there. + if (version == FT_LAYOUT_VERSION_13) { + (void) rbuf_int(rb); // 4. rand4 + (void) rbuf_int(rb); // 5. local + } +} + +// Assuming the given buffer is in the correct position, +// this checks to see if the stored checksum matches the +// checksum of the entire buffer. +int +check_legacy_end_checksum(struct rbuf *rb) +{ + int r = 0; + uint32_t expected_xsum = rbuf_int(rb); + uint32_t actual_xsum = toku_x1764_memory(rb->buf, rb->size - 4); + if (expected_xsum != actual_xsum) { + r = TOKUDB_BAD_CHECKSUM; + } + + return r; +} diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc b/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc new file mode 100644 index 00000000000..a7bc2949276 --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc @@ -0,0 +1,812 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
+ + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "ft/ft.h" +#include "ft/ft-internal.h" +#include "ft/msg.h" +#include "ft/serialize/block_allocator.h" +#include "ft/serialize/block_table.h" +#include "ft/serialize/compress.h" +#include "ft/serialize/ft-serialize.h" + +// not version-sensitive because we only serialize a descriptor using the current layout_version +uint32_t +toku_serialize_descriptor_size(DESCRIPTOR desc) { + //Checksum NOT included in this. Checksum only exists in header's version. + uint32_t size = 4; // four bytes for size of descriptor + size += desc->dbt.size; + return size; +} + +static uint32_t +deserialize_descriptor_size(DESCRIPTOR desc, int layout_version) { + //Checksum NOT included in this. 
Checksum only exists in header's version. + uint32_t size = 4; // four bytes for size of descriptor + if (layout_version == FT_LAYOUT_VERSION_13) + size += 4; // for version 13, include four bytes of "version" + size += desc->dbt.size; + return size; +} + +void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, DESCRIPTOR desc) { + wbuf_bytes(wb, desc->dbt.data, desc->dbt.size); +} + +//Descriptor is written to disk during toku_ft_handle_open iff we have a new (or changed) +//descriptor. +//Descriptors are NOT written during the header checkpoint process. +void +toku_serialize_descriptor_contents_to_fd(int fd, DESCRIPTOR desc, DISKOFF offset) { + // make the checksum + int64_t size = toku_serialize_descriptor_size(desc)+4; //4 for checksum + int64_t size_aligned = roundup_to_multiple(512, size); + struct wbuf w; + char *XMALLOC_N_ALIGNED(512, size_aligned, aligned_buf); + for (int64_t i=size; i<size_aligned; i++) aligned_buf[i] = 0; + wbuf_init(&w, aligned_buf, size); + toku_serialize_descriptor_contents_to_wbuf(&w, desc); + { + //Add checksum + uint32_t checksum = toku_x1764_finish(&w.checksum); + wbuf_int(&w, checksum); + } + lazy_assert(w.ndone==w.size); + { + //Actual Write translation table + toku_os_full_pwrite(fd, w.buf, size_aligned, offset); + } + toku_free(w.buf); +} + +static void +deserialize_descriptor_from_rbuf(struct rbuf *rb, DESCRIPTOR desc, int layout_version) { + if (layout_version <= FT_LAYOUT_VERSION_13) { + // in older versions of tokuft, the descriptor had a 4 byte + // version, which we skip over + (void) rbuf_int(rb); + } + + uint32_t size; + const void *data; + rbuf_bytes(rb, &data, &size); + toku_memdup_dbt(&desc->dbt, data, size); +} + +static int +deserialize_descriptor_from(int fd, block_table *bt, DESCRIPTOR desc, int layout_version) { + int r = 0; + DISKOFF offset; + DISKOFF size; + unsigned char *dbuf = nullptr; + bt->get_descriptor_offset_size(&offset, &size); + memset(desc, 0, sizeof(*desc)); + if (size > 0) { + 
lazy_assert(size>=4); //4 for checksum + { + ssize_t size_to_malloc = roundup_to_multiple(512, size); + XMALLOC_N_ALIGNED(512, size_to_malloc, dbuf); + { + + ssize_t sz_read = toku_os_pread(fd, dbuf, size_to_malloc, offset); + lazy_assert(sz_read==size_to_malloc); + } + { + // check the checksum + uint32_t x1764 = toku_x1764_memory(dbuf, size-4); + //printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk); + uint32_t stored_x1764 = toku_dtoh32(*(int*)(dbuf + size-4)); + if (x1764 != stored_x1764) { + fprintf(stderr, "Descriptor checksum failure: calc=0x%08x read=0x%08x\n", x1764, stored_x1764); + r = TOKUDB_BAD_CHECKSUM; + toku_free(dbuf); + goto exit; + } + } + + struct rbuf rb = { .buf = dbuf, .size = (unsigned int) size, .ndone = 0 }; + deserialize_descriptor_from_rbuf(&rb, desc, layout_version); + lazy_assert(deserialize_descriptor_size(desc, layout_version) + 4 == size); + toku_free(dbuf); + } + } +exit: + return r; +} + +int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version) +// Effect: Deserialize the ft header. +// We deserialize ft_header only once and then share everything with all the FTs. +{ + int r; + FT ft = NULL; + paranoid_invariant(version >= FT_LAYOUT_MIN_SUPPORTED_VERSION); + paranoid_invariant(version <= FT_LAYOUT_VERSION); + // We already know: + // we have an rbuf representing the header. + // The checksum has been validated + + //Verification of initial elements. 
+ //Check magic number + const void *magic; + rbuf_literal_bytes(rb, &magic, 8); + lazy_assert(memcmp(magic,"tokudata",8)==0); + + XCALLOC(ft); + ft->checkpoint_header = NULL; + toku_list_init(&ft->live_ft_handles); + + //version MUST be in network order on disk regardless of disk order + ft->layout_version_read_from_disk = rbuf_network_int(rb); + invariant(ft->layout_version_read_from_disk >= FT_LAYOUT_MIN_SUPPORTED_VERSION); + invariant(ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION); + + //build_id MUST be in network order on disk regardless of disk order + uint32_t build_id; + build_id = rbuf_network_int(rb); + + //Size MUST be in network order regardless of disk order. + uint32_t size; + size = rbuf_network_int(rb); + lazy_assert(size == rb->size); + + const void *tmp_byte_order_check; + lazy_assert((sizeof tmp_byte_order_check) >= 8); + rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order + int64_t byte_order_stored; + byte_order_stored = *(int64_t*)tmp_byte_order_check; + lazy_assert(byte_order_stored == toku_byte_order_host); + + uint64_t checkpoint_count; + checkpoint_count = rbuf_ulonglong(rb); + LSN checkpoint_lsn; + checkpoint_lsn = rbuf_LSN(rb); + unsigned nodesize; + nodesize = rbuf_int(rb); + DISKOFF translation_address_on_disk; + translation_address_on_disk = rbuf_DISKOFF(rb); + DISKOFF translation_size_on_disk; + translation_size_on_disk = rbuf_DISKOFF(rb); + lazy_assert(translation_address_on_disk > 0); + lazy_assert(translation_size_on_disk > 0); + + // initialize the tree lock + toku_ft_init_reflock(ft); + + //Load translation table + { + size_t size_to_read = roundup_to_multiple(512, translation_size_on_disk); + unsigned char *XMALLOC_N_ALIGNED(512, size_to_read, tbuf); + { + // This cast is messed up in 32-bits if the block translation + // table is ever more than 4GB. But in that case, the + // translation table itself won't fit in main memory. 
+ ssize_t readsz = toku_os_pread(fd, tbuf, size_to_read, + translation_address_on_disk); + assert(readsz >= translation_size_on_disk); + assert(readsz <= (ssize_t)size_to_read); + } + // Create table and read in data. + r = ft->blocktable.create_from_buffer(fd, + translation_address_on_disk, + translation_size_on_disk, + tbuf); + toku_free(tbuf); + if (r != 0) { + goto exit; + } + } + + BLOCKNUM root_blocknum; + root_blocknum = rbuf_blocknum(rb); + unsigned flags; + flags = rbuf_int(rb); + if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_13) { + // deprecate 'TOKU_DB_VALCMP_BUILTIN'. just remove the flag + flags &= ~TOKU_DB_VALCMP_BUILTIN_13; + } + int layout_version_original; + layout_version_original = rbuf_int(rb); + uint32_t build_id_original; + build_id_original = rbuf_int(rb); + uint64_t time_of_creation; + time_of_creation = rbuf_ulonglong(rb); + uint64_t time_of_last_modification; + time_of_last_modification = rbuf_ulonglong(rb); + + if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_18) { + // 17 was the last version with these fields, we no longer store + // them, so read and discard them + (void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_13 + if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) { + (void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_14 + } + } + + // fake creation during the last checkpoint + TXNID root_xid_that_created; + root_xid_that_created = checkpoint_lsn.lsn; + if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_14) { + rbuf_TXNID(rb, &root_xid_that_created); + } + + // TODO(leif): get this to default to what's specified, not the + // hard-coded default + unsigned basementnodesize; + basementnodesize = FT_DEFAULT_BASEMENT_NODE_SIZE; + uint64_t time_of_last_verification; + time_of_last_verification = 0; + if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) { + basementnodesize = rbuf_int(rb); + time_of_last_verification = rbuf_ulonglong(rb); + } + + STAT64INFO_S on_disk_stats; + 
on_disk_stats = ZEROSTATS; + uint64_t time_of_last_optimize_begin; + time_of_last_optimize_begin = 0; + uint64_t time_of_last_optimize_end; + time_of_last_optimize_end = 0; + uint32_t count_of_optimize_in_progress; + count_of_optimize_in_progress = 0; + MSN msn_at_start_of_last_completed_optimize; + msn_at_start_of_last_completed_optimize = ZERO_MSN; + if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_18) { + on_disk_stats.numrows = rbuf_ulonglong(rb); + on_disk_stats.numbytes = rbuf_ulonglong(rb); + ft->in_memory_stats = on_disk_stats; + time_of_last_optimize_begin = rbuf_ulonglong(rb); + time_of_last_optimize_end = rbuf_ulonglong(rb); + count_of_optimize_in_progress = rbuf_int(rb); + msn_at_start_of_last_completed_optimize = rbuf_MSN(rb); + } + + enum toku_compression_method compression_method; + MSN highest_unused_msn_for_upgrade; + highest_unused_msn_for_upgrade.msn = (MIN_MSN.msn - 1); + if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_19) { + unsigned char method = rbuf_char(rb); + compression_method = (enum toku_compression_method) method; + highest_unused_msn_for_upgrade = rbuf_MSN(rb); + } else { + // we hard coded zlib until 5.2, then quicklz in 5.2 + if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) { + compression_method = TOKU_ZLIB_METHOD; + } else { + compression_method = TOKU_QUICKLZ_METHOD; + } + } + + MSN max_msn_in_ft; + max_msn_in_ft = ZERO_MSN; // We'll upgrade it from the root node later if necessary + if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_21) { + max_msn_in_ft = rbuf_MSN(rb); + } + + unsigned fanout; + fanout = FT_DEFAULT_FANOUT; + if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_28) { + fanout = rbuf_int(rb); + } + + (void) rbuf_int(rb); //Read in checksum and ignore (already verified). 
+ if (rb->ndone != rb->size) { + fprintf(stderr, "Header size did not match contents.\n"); + r = EINVAL; + goto exit; + } + + { + struct ft_header h = { + .type = FT_CURRENT, + .dirty = 0, + .checkpoint_count = checkpoint_count, + .checkpoint_lsn = checkpoint_lsn, + .layout_version = FT_LAYOUT_VERSION, + .layout_version_original = layout_version_original, + .build_id = build_id, + .build_id_original = build_id_original, + .time_of_creation = time_of_creation, + .root_xid_that_created = root_xid_that_created, + .time_of_last_modification = time_of_last_modification, + .time_of_last_verification = time_of_last_verification, + .root_blocknum = root_blocknum, + .flags = flags, + .nodesize = nodesize, + .basementnodesize = basementnodesize, + .compression_method = compression_method, + .fanout = fanout, + .highest_unused_msn_for_upgrade = highest_unused_msn_for_upgrade, + .max_msn_in_ft = max_msn_in_ft, + .time_of_last_optimize_begin = time_of_last_optimize_begin, + .time_of_last_optimize_end = time_of_last_optimize_end, + .count_of_optimize_in_progress = count_of_optimize_in_progress, + .count_of_optimize_in_progress_read_from_disk = count_of_optimize_in_progress, + .msn_at_start_of_last_completed_optimize = msn_at_start_of_last_completed_optimize, + .on_disk_stats = on_disk_stats + }; + XMEMDUP(ft->h, &h); + } + + if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) { + // This needs ft->h to be non-null, so we have to do it after we + // read everything else. 
+ r = toku_upgrade_subtree_estimates_to_stat64info(fd, ft); + if (r != 0) { + goto exit; + } + } + if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_21) { + r = toku_upgrade_msn_from_root_to_header(fd, ft); + if (r != 0) { + goto exit; + } + } + + invariant((uint32_t) ft->layout_version_read_from_disk == version); + r = deserialize_descriptor_from(fd, &ft->blocktable, &ft->descriptor, version); + if (r != 0) { + goto exit; + } + + // initialize for svn #4541 + toku_clone_dbt(&ft->cmp_descriptor.dbt, ft->descriptor.dbt); + + // Version 13 descriptors had an extra 4 bytes that we don't read + // anymore. Since the header is going to think it's the current + // version if it gets written out, we need to write the descriptor in + // the new format (without those bytes) before that happens. + if (version <= FT_LAYOUT_VERSION_13) { + toku_ft_update_descriptor_with_fd(ft, &ft->cmp_descriptor, fd); + } + r = 0; +exit: + if (r != 0 && ft != NULL) { + toku_free(ft); + ft = NULL; + } + *ftp = ft; + return r; +} + +static size_t +serialize_ft_min_size (uint32_t version) { + size_t size = 0; + + switch(version) { + case FT_LAYOUT_VERSION_28: + size += sizeof(uint32_t); // fanout in ft + case FT_LAYOUT_VERSION_27: + case FT_LAYOUT_VERSION_26: + case FT_LAYOUT_VERSION_25: + case FT_LAYOUT_VERSION_24: + case FT_LAYOUT_VERSION_23: + case FT_LAYOUT_VERSION_22: + case FT_LAYOUT_VERSION_21: + size += sizeof(MSN); // max_msn_in_ft + case FT_LAYOUT_VERSION_20: + case FT_LAYOUT_VERSION_19: + size += 1; // compression method + size += sizeof(MSN); // highest_unused_msn_for_upgrade + case FT_LAYOUT_VERSION_18: + size += sizeof(uint64_t); // time_of_last_optimize_begin + size += sizeof(uint64_t); // time_of_last_optimize_end + size += sizeof(uint32_t); // count_of_optimize_in_progress + size += sizeof(MSN); // msn_at_start_of_last_completed_optimize + size -= 8; // removed num_blocks_to_upgrade_14 + size -= 8; // removed num_blocks_to_upgrade_13 + case FT_LAYOUT_VERSION_17: + size 
+= 16; + invariant(sizeof(STAT64INFO_S) == 16); + case FT_LAYOUT_VERSION_16: + case FT_LAYOUT_VERSION_15: + size += 4; // basement node size + size += 8; // num_blocks_to_upgrade_14 (previously num_blocks_to_upgrade, now one int each for upgrade from 13, 14 + size += 8; // time of last verification + case FT_LAYOUT_VERSION_14: + size += 8; //TXNID that created + case FT_LAYOUT_VERSION_13: + size += ( 4 // build_id + +4 // build_id_original + +8 // time_of_creation + +8 // time_of_last_modification + ); + // fall through + case FT_LAYOUT_VERSION_12: + size += (+8 // "tokudata" + +4 // version + +4 // original_version + +4 // size + +8 // byte order verification + +8 // checkpoint_count + +8 // checkpoint_lsn + +4 // tree's nodesize + +8 // translation_size_on_disk + +8 // translation_address_on_disk + +4 // checksum + +8 // Number of blocks in old version. + +8 // diskoff + +4 // flags + ); + break; + default: + abort(); + } + + lazy_assert(size <= block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE); + return size; +} + +int deserialize_ft_from_fd_into_rbuf(int fd, + toku_off_t offset_of_header, + struct rbuf *rb, + uint64_t *checkpoint_count, + LSN *checkpoint_lsn, + uint32_t * version_p) +// Effect: Read and parse the header of a fractalal tree +// +// Simply reading the raw bytes of the header into an rbuf is insensitive +// to disk format version. If that ever changes, then modify this. 
+// +// TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the +// file AND the header is useless +{ + int r = 0; + const int64_t prefix_size = 8 + // magic ("tokudata") + 4 + // version + 4 + // build_id + 4; // size + const int64_t read_size = roundup_to_multiple(512, prefix_size); + unsigned char *XMALLOC_N_ALIGNED(512, read_size, prefix); + rb->buf = NULL; + int64_t n = toku_os_pread(fd, prefix, read_size, offset_of_header); + if (n != read_size) { + if (n==0) { + r = TOKUDB_DICTIONARY_NO_HEADER; + } else if (n<0) { + r = get_error_errno(); + } else { + r = EINVAL; + } + toku_free(prefix); + goto exit; + } + + rbuf_init(rb, prefix, prefix_size); + + //Check magic number + const void *magic; + rbuf_literal_bytes(rb, &magic, 8); + if (memcmp(magic,"tokudata",8)!=0) { + if ((*(uint64_t*)magic) == 0) { + r = TOKUDB_DICTIONARY_NO_HEADER; + } else { + r = EINVAL; //Not a tokudb file! Do not use. + } + goto exit; + } + + //Version MUST be in network order regardless of disk order. + uint32_t version; + version = rbuf_network_int(rb); + *version_p = version; + if (version < FT_LAYOUT_MIN_SUPPORTED_VERSION) { + r = TOKUDB_DICTIONARY_TOO_OLD; //Cannot use + goto exit; + } else if (version > FT_LAYOUT_VERSION) { + r = TOKUDB_DICTIONARY_TOO_NEW; //Cannot use + goto exit; + } + + //build_id MUST be in network order regardless of disk order. + uint32_t build_id __attribute__((__unused__)); + build_id = rbuf_network_int(rb); + int64_t min_header_size; + min_header_size = serialize_ft_min_size(version); + + //Size MUST be in network order regardless of disk order. + uint32_t size; + size = rbuf_network_int(rb); + //If too big, it is corrupt. We would probably notice during checksum + //but may have to do a multi-gigabyte malloc+read to find out. + //If its too small reading rbuf would crash, so verify. 
+ if (size > block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE || size < min_header_size) { + r = TOKUDB_DICTIONARY_NO_HEADER; + goto exit; + } + + lazy_assert(rb->ndone==prefix_size); + rb->size = size; + { + toku_free(rb->buf); + uint32_t size_to_read = roundup_to_multiple(512, size); + XMALLOC_N_ALIGNED(512, size_to_read, rb->buf); + + assert(offset_of_header%512==0); + n = toku_os_pread(fd, rb->buf, size_to_read, offset_of_header); + if (n != size_to_read) { + if (n < 0) { + r = get_error_errno(); + } else { + r = EINVAL; //Header might be useless (wrong size) or could be a disk read error. + } + goto exit; + } + } + //It's version 14 or later. Magic looks OK. + //We have an rbuf that represents the header. + //Size is within acceptable bounds. + + //Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function changed) + uint32_t calculated_x1764; + calculated_x1764 = toku_x1764_memory(rb->buf, rb->size-4); + uint32_t stored_x1764; + stored_x1764 = toku_dtoh32(*(int*)(rb->buf+rb->size-4)); + if (calculated_x1764 != stored_x1764) { + r = TOKUDB_BAD_CHECKSUM; //Header useless + fprintf(stderr, "Header checksum failure: calc=0x%08x read=0x%08x\n", calculated_x1764, stored_x1764); + goto exit; + } + + //Verify byte order + const void *tmp_byte_order_check; + lazy_assert((sizeof toku_byte_order_host) == 8); + rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order + int64_t byte_order_stored; + byte_order_stored = *(int64_t*)tmp_byte_order_check; + if (byte_order_stored != toku_byte_order_host) { + r = TOKUDB_DICTIONARY_NO_HEADER; //Cannot use dictionary + goto exit; + } + + //Load checkpoint count + *checkpoint_count = rbuf_ulonglong(rb); + *checkpoint_lsn = rbuf_LSN(rb); + //Restart at beginning during regular deserialization + rb->ndone = 0; + +exit: + if (r != 0 && rb->buf != NULL) { + toku_free(rb->buf); + rb->buf = NULL; + } + return r; +} + +// Read ft from file into struct. Read both headers and use one. 
+// We want the latest acceptable header whose checkpoint_lsn is no later +// than max_acceptable_lsn. +int +toku_deserialize_ft_from(int fd, + LSN max_acceptable_lsn, + FT *ft) +{ + struct rbuf rb_0; + struct rbuf rb_1; + uint64_t checkpoint_count_0 = 0; + uint64_t checkpoint_count_1 = 0; + LSN checkpoint_lsn_0; + LSN checkpoint_lsn_1; + uint32_t version_0 = 0, version_1 = 0, version = 0; + bool h0_acceptable = false; + bool h1_acceptable = false; + struct rbuf *rb = NULL; + int r0, r1, r; + + toku_off_t header_0_off = 0; + r0 = deserialize_ft_from_fd_into_rbuf(fd, header_0_off, &rb_0, &checkpoint_count_0, &checkpoint_lsn_0, &version_0); + if (r0 == 0 && checkpoint_lsn_0.lsn <= max_acceptable_lsn.lsn) { + h0_acceptable = true; + } + + toku_off_t header_1_off = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE; + r1 = deserialize_ft_from_fd_into_rbuf(fd, header_1_off, &rb_1, &checkpoint_count_1, &checkpoint_lsn_1, &version_1); + if (r1 == 0 && checkpoint_lsn_1.lsn <= max_acceptable_lsn.lsn) { + h1_acceptable = true; + } + + // if either header is too new, the dictionary is unreadable + if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW || + !(h0_acceptable || h1_acceptable)) { + // We were unable to read either header or at least one is too + // new. Certain errors are higher priority than others. Order of + // these if/else if is important. + if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW) { + r = TOKUDB_DICTIONARY_TOO_NEW; + } else if (r0 == TOKUDB_DICTIONARY_TOO_OLD || r1 == TOKUDB_DICTIONARY_TOO_OLD) { + r = TOKUDB_DICTIONARY_TOO_OLD; + } else if (r0 == TOKUDB_BAD_CHECKSUM && r1 == TOKUDB_BAD_CHECKSUM) { + fprintf(stderr, "Both header checksums failed.\n"); + r = TOKUDB_BAD_CHECKSUM; + } else if (r0 == TOKUDB_DICTIONARY_NO_HEADER || r1 == TOKUDB_DICTIONARY_NO_HEADER) { + r = TOKUDB_DICTIONARY_NO_HEADER; + } else { + r = r0 ? 
r0 : r1; //Arbitrarily report the error from the + //first header, unless it's readable + } + + // it should not be possible for both headers to be later than the max_acceptable_lsn + invariant(!((r0==0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) && + (r1==0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn))); + invariant(r!=0); + goto exit; + } + + if (h0_acceptable && h1_acceptable) { + if (checkpoint_count_0 > checkpoint_count_1) { + invariant(checkpoint_count_0 == checkpoint_count_1 + 1); + invariant(version_0 >= version_1); + rb = &rb_0; + version = version_0; + } + else { + invariant(checkpoint_count_1 == checkpoint_count_0 + 1); + invariant(version_1 >= version_0); + rb = &rb_1; + version = version_1; + } + } else if (h0_acceptable) { + if (r1 == TOKUDB_BAD_CHECKSUM) { + // print something reassuring + fprintf(stderr, "Header 2 checksum failed, but header 1 ok. Proceeding.\n"); + } + rb = &rb_0; + version = version_0; + } else if (h1_acceptable) { + if (r0 == TOKUDB_BAD_CHECKSUM) { + // print something reassuring + fprintf(stderr, "Header 1 checksum failed, but header 2 ok. Proceeding.\n"); + } + rb = &rb_1; + version = version_1; + } + + paranoid_invariant(rb); + r = deserialize_ft_versioned(fd, rb, ft, version); + +exit: + if (rb_0.buf) { + toku_free(rb_0.buf); + } + if (rb_1.buf) { + toku_free(rb_1.buf); + } + return r; +} + + +size_t toku_serialize_ft_size (FT_HEADER h) { + size_t size = serialize_ft_min_size(h->layout_version); + //There is no dynamic data. 
+ lazy_assert(size <= block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE); + return size; +} + + +void toku_serialize_ft_to_wbuf ( + struct wbuf *wbuf, + FT_HEADER h, + DISKOFF translation_location_on_disk, + DISKOFF translation_size_on_disk + ) +{ + wbuf_literal_bytes(wbuf, "tokudata", 8); + wbuf_network_int (wbuf, h->layout_version); //MUST be in network order regardless of disk order + wbuf_network_int (wbuf, BUILD_ID); //MUST be in network order regardless of disk order + wbuf_network_int (wbuf, wbuf->size); //MUST be in network order regardless of disk order + wbuf_literal_bytes(wbuf, &toku_byte_order_host, 8); //Must not translate byte order + wbuf_ulonglong(wbuf, h->checkpoint_count); + wbuf_LSN (wbuf, h->checkpoint_lsn); + wbuf_int (wbuf, h->nodesize); + + wbuf_DISKOFF(wbuf, translation_location_on_disk); + wbuf_DISKOFF(wbuf, translation_size_on_disk); + wbuf_BLOCKNUM(wbuf, h->root_blocknum); + wbuf_int(wbuf, h->flags); + wbuf_int(wbuf, h->layout_version_original); + wbuf_int(wbuf, h->build_id_original); + wbuf_ulonglong(wbuf, h->time_of_creation); + wbuf_ulonglong(wbuf, h->time_of_last_modification); + wbuf_TXNID(wbuf, h->root_xid_that_created); + wbuf_int(wbuf, h->basementnodesize); + wbuf_ulonglong(wbuf, h->time_of_last_verification); + wbuf_ulonglong(wbuf, h->on_disk_stats.numrows); + wbuf_ulonglong(wbuf, h->on_disk_stats.numbytes); + wbuf_ulonglong(wbuf, h->time_of_last_optimize_begin); + wbuf_ulonglong(wbuf, h->time_of_last_optimize_end); + wbuf_int(wbuf, h->count_of_optimize_in_progress); + wbuf_MSN(wbuf, h->msn_at_start_of_last_completed_optimize); + wbuf_char(wbuf, (unsigned char) h->compression_method); + wbuf_MSN(wbuf, h->highest_unused_msn_for_upgrade); + wbuf_MSN(wbuf, h->max_msn_in_ft); + wbuf_int(wbuf, h->fanout); + uint32_t checksum = toku_x1764_finish(&wbuf->checksum); + wbuf_int(wbuf, checksum); + lazy_assert(wbuf->ndone == wbuf->size); +} + +void toku_serialize_ft_to(int fd, FT_HEADER h, block_table *bt, CACHEFILE cf) { + 
lazy_assert(h->type==FT_CHECKPOINT_INPROGRESS); + struct wbuf w_translation; + int64_t size_translation; + int64_t address_translation; + + // Must serialize translation first, to get address,size for header. + bt->serialize_translation_to_wbuf(fd, &w_translation, + &address_translation, + &size_translation); + assert(size_translation == w_translation.ndone); + + // the number of bytes available in the buffer is 0 mod 512, and those last bytes are all initialized. + assert(w_translation.size % 512 == 0); + + struct wbuf w_main; + size_t size_main = toku_serialize_ft_size(h); + size_t size_main_aligned = roundup_to_multiple(512, size_main); + assert(size_main_aligned<block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE); + char *XMALLOC_N_ALIGNED(512, size_main_aligned, mainbuf); + for (size_t i=size_main; i<size_main_aligned; i++) mainbuf[i]=0; // initialize the end of the buffer with zeros + wbuf_init(&w_main, mainbuf, size_main); + toku_serialize_ft_to_wbuf(&w_main, h, address_translation, size_translation); + lazy_assert(w_main.ndone == size_main); + + // Actually write translation table + // This write is guaranteed to read good data at the end of the buffer, since the + // w_translation.buf is padded with zeros to a 512-byte boundary. + toku_os_full_pwrite(fd, w_translation.buf, roundup_to_multiple(512, size_translation), address_translation); + + //Everything but the header MUST be on disk before header starts. + //Otherwise we will think the header is good and some blocks might not + //yet be on disk. + //If the header has a cachefile we need to do cachefile fsync (to + //prevent crash if we redirected to dev null) + //If there is no cachefile we still need to do an fsync. + if (cf) { + toku_cachefile_fsync(cf); + } + else { + toku_file_fsync(fd); + } + + //Alternate writing header to two locations: + // Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE + toku_off_t main_offset; + main_offset = (h->checkpoint_count & 0x1) ? 
0 : block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE; + toku_os_full_pwrite(fd, w_main.buf, size_main_aligned, main_offset); + toku_free(w_main.buf); + toku_free(w_translation.buf); +} diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.h b/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.h new file mode 100644 index 00000000000..fe31ff7c5fd --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.h @@ -0,0 +1,62 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
+ +#pragma once + +#include "ft/ft.h" +#include "ft/serialize/block_table.h" + +size_t toku_serialize_ft_size(struct ft_header *h); +void toku_serialize_ft_to(int fd, struct ft_header *h, block_table *bt, CACHEFILE cf); +void toku_serialize_ft_to_wbuf(struct wbuf *wbuf, struct ft_header *h, DISKOFF translation_location_on_disk, DISKOFF translation_size_on_disk); +void toku_serialize_descriptor_contents_to_fd(int fd, DESCRIPTOR desc, DISKOFF offset); +void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, DESCRIPTOR desc); + +int toku_deserialize_ft_from(int fd, LSN max_acceptable_lsn, FT *ft); + +// TODO rename +int deserialize_ft_from_fd_into_rbuf(int fd, + toku_off_t offset_of_header, + struct rbuf *rb, + uint64_t *checkpoint_count, + LSN *checkpoint_lsn, + uint32_t *version_p); + +// used by verify +// TODO rename +int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ft, uint32_t version); diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft_layout_version.h b/storage/tokudb/PerconaFT/ft/serialize/ft_layout_version.h new file mode 100644 index 00000000000..72b6882bc06 --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/ft_layout_version.h @@ -0,0 +1,79 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. 
If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +//Must be defined before other recursive headers could include logger/recover.h +enum ft_layout_version_e { + FT_LAYOUT_VERSION_5 = 5, + FT_LAYOUT_VERSION_6 = 6, // Diff from 5 to 6: Add leafentry_estimate + FT_LAYOUT_VERSION_7 = 7, // Diff from 6 to 7: Add exact-bit to leafentry_estimate #818, add magic to header #22, add per-subdatase flags #333 + FT_LAYOUT_VERSION_8 = 8, // Diff from 7 to 8: Use murmur instead of crc32. We are going to make a simplification and stop supporting version 7 and before. Current As of Beta 1.0.6 + FT_LAYOUT_VERSION_9 = 9, // Diff from 8 to 9: Variable-sized blocks and compression. + FT_LAYOUT_VERSION_10 = 10, // Diff from 9 to 10: Variable number of compressed sub-blocks per block, disk byte order == intel byte order, Subtree estimates instead of just leafentry estimates, translation table, dictionary descriptors, checksum in header, subdb support removed from ft layer + FT_LAYOUT_VERSION_11 = 11, // Diff from 10 to 11: Nested transaction leafentries (completely redesigned). FT_CMDs on disk now support XIDS (multiple txnids) instead of exactly one. 
    // --- middle of enum ft_layout_version_e (declaration opens/closes outside
    // this chunk).  One entry per historical on-disk layout revision; the
    // per-entry comments record what changed at each step. ---
    FT_LAYOUT_VERSION_12 = 12, // Diff from 11 to 12: Added FT_CMD 'FT_INSERT_NO_OVERWRITE', compressed block format, num old blocks
    FT_LAYOUT_VERSION_13 = 13, // Diff from 12 to 13: Fixed loader pivot bug, added build_id to every node, timestamps to ft
    FT_LAYOUT_VERSION_14 = 14, // Diff from 13 to 14: Added MVCC; deprecated TOKU_DB_VALCMP_BUILTIN(_13); Remove fingerprints; Support QUICKLZ; add end-to-end checksum on uncompressed data.
    FT_LAYOUT_VERSION_15 = 15, // Diff from 14 to 15: basement nodes, last verification time
    FT_LAYOUT_VERSION_16 = 16, // Dr. No: No subtree estimates, partition layout information represented more transparently.
                               // ALERT ALERT ALERT: version 16 never released to customers, internal and beta use only
    FT_LAYOUT_VERSION_17 = 17, // Dr. No: Add STAT64INFO_S to ft header
    FT_LAYOUT_VERSION_18 = 18, // Dr. No: Add HOT info to ft header
    FT_LAYOUT_VERSION_19 = 19, // Doofenshmirtz: Add compression method, highest_unused_msn_for_upgrade
    FT_LAYOUT_VERSION_20 = 20, // Deadshot: Add compression method to log_fcreate,
                               //           mgr_last_xid after begin checkpoint,
                               //           last_xid to shutdown
    FT_LAYOUT_VERSION_21 = 21, // Ming: Add max_msn_in_ft to header,
                               //       Removed log suppression logentry
    FT_LAYOUT_VERSION_22 = 22, // Ming: Add oldest known referenced xid to each ftnode, for better garbage collection
    FT_LAYOUT_VERSION_23 = 23, // Ming: Fix upgrade path #5902
    FT_LAYOUT_VERSION_24 = 24, // Riddler: change logentries that log transactions to store TXNID_PAIRs instead of TXNIDs
    FT_LAYOUT_VERSION_25 = 25, // SecretSquirrel: ROLLBACK_LOG_NODES (on disk and in memory) now just use blocknum (instead of blocknum + hash) to point to other log nodes.
same for xstillopen log entry + FT_LAYOUT_VERSION_26 = 26, // Hojo: basements store key/vals separately on disk for fixed klpair length BNs + FT_LAYOUT_VERSION_27 = 27, // serialize message trees with nonleaf buffers to avoid key, msn sort on deserialize + FT_LAYOUT_VERSION_28 = 28, // Add fanout to ft_header + FT_NEXT_VERSION, // the version after the current version + FT_LAYOUT_VERSION = FT_NEXT_VERSION-1, // A hack so I don't have to change this line. + FT_LAYOUT_MIN_SUPPORTED_VERSION = FT_LAYOUT_VERSION_13, // Minimum version supported + + // Define these symbolically so the knowledge of exactly which layout version got rid of fingerprints isn't spread all over the code. + FT_LAST_LAYOUT_VERSION_WITH_FINGERPRINT = FT_LAYOUT_VERSION_13, + FT_FIRST_LAYOUT_VERSION_WITH_END_TO_END_CHECKSUM = FT_LAYOUT_VERSION_14, + FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES = FT_LAYOUT_VERSION_15, +}; diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc b/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc new file mode 100644 index 00000000000..c4f4886b6a0 --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc @@ -0,0 +1,2872 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. 
If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "portability/toku_atomic.h" + +#include "ft/cachetable/cachetable.h" +#include "ft/ft.h" +#include "ft/ft-internal.h" +#include "ft/node.h" +#include "ft/logger/log-internal.h" +#include "ft/txn/rollback.h" +#include "ft/serialize/block_allocator.h" +#include "ft/serialize/block_table.h" +#include "ft/serialize/compress.h" +#include "ft/serialize/ft_node-serialize.h" +#include "ft/serialize/sub_block.h" +#include "util/sort.h" +#include "util/threadpool.h" +#include "util/status.h" +#include "util/scoped_malloc.h" + +static FT_UPGRADE_STATUS_S ft_upgrade_status; + +#define STATUS_INIT(k,c,t,l,inc) TOKUFT_STATUS_INIT(ft_upgrade_status, k, c, t, "ft upgrade: " l, inc) + +static void +status_init(void) +{ + // Note, this function initializes the keyname, type, and legend fields. + // Value fields are initialized to zero by compiler. 
    // --- tail of status_init (definition begins before this chunk) ---
    STATUS_INIT(FT_UPGRADE_FOOTPRINT,            nullptr, UINT64, "footprint", TOKU_ENGINE_STATUS);
    ft_upgrade_status.initialized = true;
}
#undef STATUS_INIT

#define UPGRADE_STATUS_VALUE(x) ft_upgrade_status.status[x].value.num

// Copy the current ft-upgrade status into *s, lazily initializing the
// status table and refreshing the footprint counter first.
void
toku_ft_upgrade_get_status(FT_UPGRADE_STATUS s) {
    if (!ft_upgrade_status.initialized) {
        status_init();
    }
    UPGRADE_STATUS_VALUE(FT_UPGRADE_FOOTPRINT) = toku_log_upgrade_get_footprint();
    *s = ft_upgrade_status;
}

static int num_cores = 0; // cache the number of cores for the parallelization
static struct toku_thread_pool *ft_pool = NULL; // shared pool used by parallel serialization
bool toku_serialize_in_parallel;

// Accessor for the cached active-processor count (set in layer_init).
int get_num_cores(void) {
    return num_cores;
}

// Accessor for the shared serialization thread pool.
struct toku_thread_pool *get_ft_pool(void) {
    return ft_pool;
}

// Toggle parallel serialization; uses an unsafe (unsynchronized) store.
void toku_serialize_set_parallel(bool in_parallel) {
    toku_unsafe_set(&toku_serialize_in_parallel, in_parallel);
}

// One-time setup for the serialize layer: create the worker pool sized to
// the machine's core count and (optionally) open the allocator trace.
void toku_ft_serialize_layer_init(void) {
    num_cores = toku_os_get_number_active_processors();
    int r = toku_thread_pool_create(&ft_pool, num_cores);
    lazy_assert_zero(r);
    block_allocator::maybe_initialize_trace();
    toku_serialize_in_parallel = false;
}

// Tear down what layer_init created.
void toku_ft_serialize_layer_destroy(void) {
    toku_thread_pool_destroy(&ft_pool);
    block_allocator::maybe_close_trace();
}

enum { FILE_CHANGE_INCREMENT = (16 << 20) }; // files grow/shrink in 16MiB steps

// Round a up to the next multiple of b.
static inline uint64_t
alignup64(uint64_t a, uint64_t b) {
    return ((a+b-1)/b)*b;
}

// safe_file_size_lock must be held.
void
toku_maybe_truncate_file (int fd, uint64_t size_used, uint64_t expected_size, uint64_t *new_sizep)
// Effect: If file size >= SIZE+32MiB, reduce file size.
// (32 instead of 16.. hysteresis).
// NOTE(review): returns void and reports the resulting size via *new_sizep;
// the historical "Return 0 on success" wording is stale — confirm.
{
    int64_t file_size;
    {
        int r = toku_os_get_file_size(fd, &file_size);
        lazy_assert_zero(r);
        invariant(file_size >= 0);
    }
    invariant(expected_size == (uint64_t)file_size);
    // If file space is overallocated by at least 32M
    if ((uint64_t)file_size >= size_used + (2*FILE_CHANGE_INCREMENT)) {
        toku_off_t new_size = alignup64(size_used, (2*FILE_CHANGE_INCREMENT)); //Truncate to new size_used.
        invariant(new_size < file_size);
        invariant(new_size >= 0);
        int r = ftruncate(fd, new_size);
        lazy_assert_zero(r);
        *new_sizep = new_size;
    }
    else {
        *new_sizep = file_size;
    }
    return;
}

// Minimum of two signed 64-bit values.
static int64_t
min64(int64_t a, int64_t b) {
    if (a<b) return a;
    return b;
}

void
toku_maybe_preallocate_in_file (int fd, int64_t size, int64_t expected_size, int64_t *new_size)
// Effect: make the file bigger by either doubling it or growing by 16MiB whichever is less, until it is at least size
// NOTE(review): returns void and reports the resulting size via *new_size;
// the historical "Return 0 on success" wording is stale — confirm.
{
    int64_t file_size = 0;
    //TODO(yoni): Allow variable stripe_width (perhaps from ft) for larger raids
    const uint64_t stripe_width = 4096;
    {
        int r = toku_os_get_file_size(fd, &file_size);
        if (r != 0) { // debug #2463
            int the_errno = get_maybe_error_errno();
            fprintf(stderr, "%s:%d fd=%d size=%" PRIu64 " r=%d errno=%d\n", __FUNCTION__, __LINE__, fd, size, r, the_errno); fflush(stderr);
        }
        lazy_assert_zero(r);
    }
    invariant(file_size >= 0);
    invariant(expected_size == file_size);
    // We want to double the size of the file, or add 16MiB, whichever is less.
    // We emulate calling this function repeatedly until it satisfies the request.
    int64_t to_write = 0;
    if (file_size == 0) {
        // Prevent infinite loop by starting with stripe_width as a base case.
        to_write = stripe_width;
    }
    while (file_size + to_write < size) {
        to_write += alignup64(min64(file_size + to_write, FILE_CHANGE_INCREMENT), stripe_width);
    }
    if (to_write > 0) {
        assert(to_write%512==0);
        // Zero-filled, 512-aligned buffer so the extension is O_DIRECT-safe.
        toku::scoped_malloc_aligned wbuf_aligned(to_write, 512);
        char *wbuf = reinterpret_cast<char *>(wbuf_aligned.get());
        memset(wbuf, 0, to_write);
        toku_off_t start_write = alignup64(file_size, stripe_width);
        invariant(start_write >= file_size);
        toku_os_full_pwrite(fd, wbuf, to_write, start_write);
        *new_size = start_write + to_write;
    }
    else {
        *new_size = file_size;
    }
}

// Don't include the sub_block header
// Overhead calculated in same order fields are written to wbuf
enum {
    node_header_overhead = (8+   // magic "tokunode" or "tokuleaf" or "tokuroll"
                            4+   // layout_version
                            4+   // layout_version_original
                            4),  // build_id
};

// uncompressed header offsets
enum {
    uncompressed_magic_offset = 0,
    uncompressed_version_offset = 8,
};

// Number of bytes the uncompressed node header occupies on disk.
// Must stay in sync with serialize_node_header below.
static uint32_t
serialize_node_header_size(FTNODE node) {
    uint32_t retval = 0;
    retval += 8; // magic
    retval += sizeof(node->layout_version);
    retval += sizeof(node->layout_version_original);
    retval += 4; // BUILD_ID
    retval += 4; // n_children
    retval += node->n_children*8; // encode start offset and length of each partition
    retval += 4; // checksum
    return retval;
}

// Write the uncompressed node header: magic, versions, build id, and the
// (start, size) pair for every partition, followed by a checksum.
static void
serialize_node_header(FTNODE node, FTNODE_DISK_DATA ndd, struct wbuf *wbuf) {
    if (node->height == 0)
        wbuf_nocrc_literal_bytes(wbuf, "tokuleaf", 8);
    else
        wbuf_nocrc_literal_bytes(wbuf, "tokunode", 8);
    paranoid_invariant(node->layout_version == FT_LAYOUT_VERSION);
    wbuf_nocrc_int(wbuf, node->layout_version);
    wbuf_nocrc_int(wbuf, node->layout_version_original);
    wbuf_nocrc_uint(wbuf, BUILD_ID);
    wbuf_nocrc_int (wbuf, node->n_children);
    for (int i=0; i<node->n_children; i++) {
        assert(BP_SIZE(ndd,i)>0);
        wbuf_nocrc_int(wbuf, BP_START(ndd, i)); // save the beginning of the partition
        wbuf_nocrc_int(wbuf, BP_SIZE (ndd, i));         // and the size
    }
    // checksum the header
    uint32_t end_to_end_checksum = toku_x1764_memory(wbuf->buf, wbuf_get_woffset(wbuf));
    wbuf_nocrc_int(wbuf, end_to_end_checksum);
    invariant(wbuf->ndone == wbuf->size);
}

// Serialized size of partition i.  Must stay in sync with
// serialize_ftnode_partition / serialize_child_buffer below.
static uint32_t
serialize_ftnode_partition_size (FTNODE node, int i)
{
    uint32_t result = 0;
    paranoid_invariant(node->bp[i].state == PT_AVAIL);
    result++; // Byte that states what the partition is
    if (node->height > 0) {
        NONLEAF_CHILDINFO bnc = BNC(node, i);
        // number of messages (4 bytes) plus size of the buffer
        result += (4 + toku_bnc_nbytesinbuf(bnc));
        // number of offsets (4 bytes) plus an array of 4 byte offsets, for each message tree
        result += (4 + (4 * bnc->fresh_message_tree.size()));
        result += (4 + (4 * bnc->stale_message_tree.size()));
        result += (4 + (4 * bnc->broadcast_list.size()));
    }
    else {
        result += 4 + bn_data::HEADER_LENGTH; // n_entries in buffer table + basement header
        result += BLB_NBYTESINDATA(node, i);
    }
    result += 4; // checksum
    return result;
}

// Tag byte identifying what kind of partition follows on disk.
#define FTNODE_PARTITION_DMT_LEAVES 0xaa
#define FTNODE_PARTITION_MSG_BUFFER 0xbb

// iterate() callback: assert every offset in the fresh tree really is fresh.
UU() static int
assert_fresh(const int32_t &offset, const uint32_t UU(idx), message_buffer *const msg_buffer) {
    bool is_fresh = msg_buffer->get_freshness(offset);
    assert(is_fresh);
    return 0;
}

// iterate() callback: assert every offset in the stale tree really is stale.
UU() static int
assert_stale(const int32_t &offset, const uint32_t UU(idx), message_buffer *const msg_buffer) {
    bool is_fresh = msg_buffer->get_freshness(offset);
    assert(!is_fresh);
    return 0;
}

// Paranoid-build-only consistency check of the fresh/stale trees against
// the message buffer's freshness bits; a no-op otherwise.
static void bnc_verify_message_trees(NONLEAF_CHILDINFO UU(bnc)) {
#ifdef TOKU_DEBUG_PARANOID
    bnc->fresh_message_tree.iterate<message_buffer, assert_fresh>(&bnc->msg_buffer);
    bnc->stale_message_tree.iterate<message_buffer, assert_stale>(&bnc->msg_buffer);
#endif
}

// iterate() callback: write one message-buffer offset into the wbuf.
static int
wbuf_write_offset(const int32_t &offset, const uint32_t UU(idx), struct wbuf *const wb) {
    wbuf_nocrc_int(wb, offset);
    return 0;
}

// Serialize a nonleaf partition: tag byte, message buffer, then the three
// offset trees (fresh, stale, broadcast), each as count + offsets.
static void serialize_child_buffer(NONLEAF_CHILDINFO bnc, struct wbuf *wb) {
    unsigned char ch = FTNODE_PARTITION_MSG_BUFFER;
    wbuf_nocrc_char(wb, ch);

    // serialize the message buffer
    bnc->msg_buffer.serialize_to_wbuf(wb);

    // serialize the message trees (num entries, offsets array):
    // first, verify their contents are consistent with the message buffer
    bnc_verify_message_trees(bnc);

    // fresh
    wbuf_nocrc_int(wb, bnc->fresh_message_tree.size());
    bnc->fresh_message_tree.iterate<struct wbuf, wbuf_write_offset>(wb);

    // stale
    wbuf_nocrc_int(wb, bnc->stale_message_tree.size());
    bnc->stale_message_tree.iterate<struct wbuf, wbuf_write_offset>(wb);

    // broadcast
    wbuf_nocrc_int(wb, bnc->broadcast_list.size());
    bnc->broadcast_list.iterate<struct wbuf, wbuf_write_offset>(wb);
}

//
// Serialize the i'th partition of node into sb
// For leaf nodes, this would be the i'th basement node
// For internal nodes, this would be the i'th internal node
//
static void
serialize_ftnode_partition(FTNODE node, int i, struct sub_block *sb) {
    // Caller should have allocated memory.
    invariant_notnull(sb->uncompressed_ptr);
    invariant(sb->uncompressed_size > 0);
    paranoid_invariant(sb->uncompressed_size == serialize_ftnode_partition_size(node, i));

    //
    // Now put the data into sb->uncompressed_ptr
    //
    struct wbuf wb;
    wbuf_init(&wb, sb->uncompressed_ptr, sb->uncompressed_size);
    if (node->height > 0) {
        // TODO: (Zardosht) possibly exit early if there are no messages
        serialize_child_buffer(BNC(node, i), &wb);
    }
    else {
        unsigned char ch = FTNODE_PARTITION_DMT_LEAVES;
        bn_data* bd = BLB_DATA(node, i);

        wbuf_nocrc_char(&wb, ch);
        wbuf_nocrc_uint(&wb, bd->num_klpairs());

        bd->serialize_to_wbuf(&wb);
    }
    // Checksum covers everything serialized above; it is the last 4 bytes.
    uint32_t end_to_end_checksum = toku_x1764_memory(sb->uncompressed_ptr, wbuf_get_woffset(&wb));
    wbuf_nocrc_int(&wb, end_to_end_checksum);
    invariant(wb.ndone == wb.size);
    invariant(sb->uncompressed_size==wb.ndone);
}

//
// Takes the data in sb->uncompressed_ptr, and compresses it
// into a newly allocated buffer sb->compressed_ptr
//
static void
compress_ftnode_sub_block(struct sub_block *sb, enum toku_compression_method method) {
    invariant(sb->compressed_ptr != nullptr);
    invariant(sb->compressed_size_bound > 0);
    paranoid_invariant(sb->compressed_size_bound == toku_compress_bound(method, sb->uncompressed_size));

    //
    // This probably seems a bit complicated. Here is what is going on.
    // In PerconaFT 5.0, sub_blocks were compressed and the compressed data
    // was checksummed. The checksum did NOT include the size of the compressed data
    // and the size of the uncompressed data. The fields of sub_block only reference the
    // compressed data, and it is the responsibility of the user of the sub_block
    // to write the length
    //
    // For Dr. No, we want the checksum to also include the size of the compressed data, and the
    // size of the decompressed data, because this data
    // may be read off of disk alone, so it must be verifiable alone.
    //
    // So, we pass in a buffer to compress_nocrc_sub_block that starts 8 bytes after the beginning
    // of sb->compressed_ptr, so we have space to put in the sizes, and then run the checksum.
    //
    sb->compressed_size = compress_nocrc_sub_block(
        sb,
        (char *)sb->compressed_ptr + 8,
        sb->compressed_size_bound,
        method
        );

    uint32_t* extra = (uint32_t *)(sb->compressed_ptr);
    // store the compressed and uncompressed size at the beginning
    extra[0] = toku_htod32(sb->compressed_size);
    extra[1] = toku_htod32(sb->uncompressed_size);
    // now checksum the entire thing
    sb->compressed_size += 8; // now add the eight bytes that we saved for the sizes
    sb->xsum = toku_x1764_memory(sb->compressed_ptr,sb->compressed_size);

    //
    // This is the end result for Dr. No and forward. For ftnodes, sb->compressed_ptr contains
    // two integers at the beginning, the size and uncompressed size, and then the compressed
    // data. sb->xsum contains the checksum of this entire thing.
    //
    // In PerconaFT 5.0, sb->compressed_ptr only contained the compressed data, sb->xsum
    // checksummed only the compressed data, and the checksumming of the sizes were not
    // done here.
    //
}

//
// Returns the size needed to serialize the ftnode info
// Does not include header information that is common with rollback logs
// such as the magic, layout_version, and build_id
// Includes only node specific info such as pivot information, n_children, and so on
//
static uint32_t
serialize_ftnode_info_size(FTNODE node)
{
    uint32_t retval = 0;
    retval += 8; // max_msn_applied_to_node_on_disk
    retval += 4; // nodesize
    retval += 4; // flags
    retval += 4; // height;
    retval += 8; // oldest_referenced_xid_known
    retval += node->pivotkeys.serialized_size();
    retval += (node->n_children-1)*4; // encode length of each pivot
    if (node->height > 0) {
        retval += node->n_children*8; // child blocknum's
    }
    retval += 4; // checksum
    return retval;
}

// Serialize the node-info sub block (see serialize_ftnode_info_size for the
// field inventory).  Field order is the on-disk format; do not reorder.
static void serialize_ftnode_info(FTNODE node, SUB_BLOCK sb) {
    // Memory must have been allocated by our caller.
    invariant(sb->uncompressed_size > 0);
    invariant_notnull(sb->uncompressed_ptr);
    paranoid_invariant(sb->uncompressed_size == serialize_ftnode_info_size(node));

    struct wbuf wb;
    wbuf_init(&wb, sb->uncompressed_ptr, sb->uncompressed_size);

    wbuf_MSN(&wb, node->max_msn_applied_to_node_on_disk);
    wbuf_nocrc_uint(&wb, 0); // write a dummy value for where node->nodesize used to be
    wbuf_nocrc_uint(&wb, node->flags);
    wbuf_nocrc_int (&wb, node->height);
    wbuf_TXNID(&wb, node->oldest_referenced_xid_known);
    node->pivotkeys.serialize_to_wbuf(&wb);

    // child blocks, only for internal nodes
    if (node->height > 0) {
        for (int i = 0; i < node->n_children; i++) {
            wbuf_nocrc_BLOCKNUM(&wb, BP_BLOCKNUM(node,i));
        }
    }

    uint32_t end_to_end_checksum = toku_x1764_memory(sb->uncompressed_ptr, wbuf_get_woffset(&wb));
    wbuf_nocrc_int(&wb, end_to_end_checksum);
    invariant(wb.ndone == wb.size);
    invariant(sb->uncompressed_size==wb.ndone);
}

// This is the size of the uncompressed data, not including the compression headers
unsigned int
toku_serialize_ftnode_size
(FTNODE node) {
    unsigned int result = 0;
    //
    // As of now, this seems to be called if and only if the entire node is supposed
    // to be in memory, so we will assert it.
    //
    toku_ftnode_assert_fully_in_memory(node);
    result += serialize_node_header_size(node);
    result += serialize_ftnode_info_size(node);
    for (int i = 0; i < node->n_children; i++) {
        result += serialize_ftnode_partition_size(node,i);
    }
    return result;
}

// Accumulated wall-clock time spent serializing vs. compressing.
struct serialize_times {
    tokutime_t serialize_time;
    tokutime_t compress_time;
};

// Serialize then compress one partition into sb, accumulating timings in st.
static void
serialize_and_compress_partition(FTNODE node,
                                 int childnum,
                                 enum toku_compression_method compression_method,
                                 SUB_BLOCK sb,
                                 struct serialize_times *st)
{
    // serialize, compress, update status
    tokutime_t t0 = toku_time_now();
    serialize_ftnode_partition(node, childnum, sb);
    tokutime_t t1 = toku_time_now();
    compress_ftnode_sub_block(sb, compression_method);
    tokutime_t t2 = toku_time_now();

    st->serialize_time += t1 - t0;
    st->compress_time += t2 - t1;
}

// Produce a compressed copy of one partition into sb (compressed data only:
// no size prefix, no checksum).  Also updates the node's serialize-time stats.
void
toku_create_compressed_partition_from_available(
    FTNODE node,
    int childnum,
    enum toku_compression_method compression_method,
    SUB_BLOCK sb
    )
{
    tokutime_t t0 = toku_time_now();

    // serialize
    sb->uncompressed_size = serialize_ftnode_partition_size(node, childnum);
    toku::scoped_malloc uncompressed_buf(sb->uncompressed_size);
    sb->uncompressed_ptr = uncompressed_buf.get();
    serialize_ftnode_partition(node, childnum, sb);

    tokutime_t t1 = toku_time_now();

    // compress. no need to pad with extra bytes for sizes/xsum - we're not storing them
    set_compressed_size_bound(sb, compression_method);
    sb->compressed_ptr = toku_xmalloc(sb->compressed_size_bound);
    sb->compressed_size = compress_nocrc_sub_block(
        sb,
        sb->compressed_ptr,
        sb->compressed_size_bound,
        compression_method
        );
    // uncompressed_ptr pointed into the scoped buffer; NULL it out so the
    // caller never sees a dangling pointer after the buffer is freed.
    sb->uncompressed_ptr = NULL;

    tokutime_t t2 = toku_time_now();

    toku_ft_status_update_serialize_times(node, t1 - t0, t2 - t1);
}

// Serialize+compress all partitions on the calling thread.
static void
serialize_and_compress_serially(FTNODE node,
                                int npartitions,
                                enum toku_compression_method compression_method,
                                struct sub_block sb[],
                                struct serialize_times *st) {
    for (int i = 0; i < npartitions; i++) {
        serialize_and_compress_partition(node, i, compression_method, &sb[i], st);
    }
}

// One work item: serialize+compress partition i of node; each item keeps its
// own timing struct so worker threads never share a timings accumulator.
struct serialize_compress_work {
    struct work base;
    FTNODE node;
    int i;
    enum toku_compression_method compression_method;
    struct sub_block *sb;
    struct serialize_times st;
};

// Thread-pool worker: drain work items from the workset until it is empty.
static void *
serialize_and_compress_worker(void *arg) {
    struct workset *ws = (struct workset *) arg;
    while (1) {
        struct serialize_compress_work *w = (struct serialize_compress_work *) workset_get(ws);
        if (w == NULL)
            break;
        int i = w->i;
        serialize_and_compress_partition(w->node, i, w->compression_method, &w->sb[i], &w->st);
    }
    workset_release_ref(ws);
    return arg;
}

// Serialize+compress all partitions using up to num_cores-1 pool threads plus
// the calling thread; falls back to the serial path for a single partition.
static void
serialize_and_compress_in_parallel(FTNODE node,
                                   int npartitions,
                                   enum toku_compression_method compression_method,
                                   struct sub_block sb[],
                                   struct serialize_times *st) {
    if (npartitions == 1) {
        serialize_and_compress_partition(node, 0, compression_method, &sb[0], st);
    } else {
        // Use at most npartitions threads, reserving one slot for this thread.
        int T = num_cores;
        if (T > npartitions)
            T = npartitions;
        if (T > 0)
            T = T - 1;
        struct workset ws;
        ZERO_STRUCT(ws);
        workset_init(&ws);
        struct serialize_compress_work work[npartitions];
        workset_lock(&ws);
        for (int i = 0; i < npartitions; i++) {
            work[i] = (struct serialize_compress_work) { .base = {{NULL, NULL}},
                                                         .node = node,
                                                         .i = i,
                                                         .compression_method = compression_method,
                                                         .sb = sb,
                                                         .st = { .serialize_time = 0, .compress_time = 0} };
            workset_put_locked(&ws, &work[i].base);
        }
        workset_unlock(&ws);
        toku_thread_pool_run(ft_pool, 0, &T, serialize_and_compress_worker, &ws);
        workset_add_ref(&ws, T);
        serialize_and_compress_worker(&ws);
        workset_join(&ws);
        workset_destroy(&ws);

        // gather up the statistics from each thread's work item
        for (int i = 0; i < npartitions; i++) {
            st->serialize_time += work[i].st.serialize_time;
            st->compress_time += work[i].st.compress_time;
        }
    }
}

// Serialize+compress the node-info sub block, accumulating timings in st.
static void
serialize_and_compress_sb_node_info(FTNODE node, struct sub_block *sb,
                                    enum toku_compression_method compression_method, struct serialize_times *st) {
    // serialize, compress, update serialize times.
    tokutime_t t0 = toku_time_now();
    serialize_ftnode_info(node, sb);
    tokutime_t t1 = toku_time_now();
    compress_ftnode_sub_block(sb, compression_method);
    tokutime_t t2 = toku_time_now();

    st->serialize_time += t1 - t0;
    st->compress_time += t2 - t1;
}

int toku_serialize_ftnode_to_memory(FTNODE node,
                                    FTNODE_DISK_DATA* ndd,
                                    unsigned int basementnodesize,
                                    enum toku_compression_method compression_method,
                                    bool do_rebalancing,
                                    bool in_parallel, // for loader is true, for toku_ftnode_flush_callback, is false
                            /*out*/ size_t *n_bytes_to_write,
                            /*out*/ size_t *n_uncompressed_bytes,
                            /*out*/ char  **bytes_to_write)
// Effect: Writes out each child to a separate malloc'd buffer, then compresses
//   all of them, and writes the uncompressed header, to bytes_to_write,
//   which is malloc'd.
//
//   The resulting buffer is guaranteed to be 512-byte aligned and the total length is a multiple of 512 (so we pad with zeros at the end if needed).
//   512-byte padding is for O_DIRECT to work.
{
    toku_ftnode_assert_fully_in_memory(node);

    if (do_rebalancing && node->height == 0) {
        toku_ftnode_leaf_rebalance(node, basementnodesize);
    }
    const int npartitions = node->n_children;

    // Each partition represents a compressed sub block
    // For internal nodes, a sub block is a message buffer
    // For leaf nodes, a sub block is a basement node
    toku::scoped_calloc sb_buf(sizeof(struct sub_block) * npartitions);
    struct sub_block *sb = reinterpret_cast<struct sub_block *>(sb_buf.get());
    XREALLOC_N(npartitions, *ndd);

    //
    // First, let's serialize and compress the individual sub blocks
    //

    // determine how large our serialization and compression buffers need to be.
    size_t serialize_buf_size = 0, compression_buf_size = 0;
    for (int i = 0; i < node->n_children; i++) {
        sb[i].uncompressed_size = serialize_ftnode_partition_size(node, i);
        sb[i].compressed_size_bound = toku_compress_bound(compression_method, sb[i].uncompressed_size);
        serialize_buf_size += sb[i].uncompressed_size;
        compression_buf_size += sb[i].compressed_size_bound + 8; // add 8 extra bytes, 4 for compressed size, 4 for decompressed size
    }

    // give each sub block a base pointer to enough buffer space for serialization and compression
    // (two shared arenas rather than per-partition allocations)
    toku::scoped_malloc serialize_buf(serialize_buf_size);
    toku::scoped_malloc compression_buf(compression_buf_size);
    for (size_t i = 0, uncompressed_offset = 0, compressed_offset = 0; i < (size_t) node->n_children; i++) {
        sb[i].uncompressed_ptr = reinterpret_cast<char *>(serialize_buf.get()) + uncompressed_offset;
        sb[i].compressed_ptr = reinterpret_cast<char *>(compression_buf.get()) + compressed_offset;
        uncompressed_offset += sb[i].uncompressed_size;
        compressed_offset += sb[i].compressed_size_bound + 8; // add 8 extra bytes, 4 for compressed size, 4 for decompressed size
        invariant(uncompressed_offset <= serialize_buf_size);
        invariant(compressed_offset <= compression_buf_size);
    }

    // do the actual serialization now that we have buffer space
    struct serialize_times st = { 0, 0 };
    if (in_parallel) {
        serialize_and_compress_in_parallel(node, npartitions, compression_method, sb, &st);
    } else {
        serialize_and_compress_serially(node, npartitions, compression_method, sb, &st);
    }

    //
    // Now lets create a sub-block that has the common node information,
    // This does NOT include the header
    //

    // determine how large our serialization and copmression buffers need to be
    struct sub_block sb_node_info;
    sub_block_init(&sb_node_info);
    size_t sb_node_info_uncompressed_size = serialize_ftnode_info_size(node);
    size_t sb_node_info_compressed_size_bound = toku_compress_bound(compression_method, sb_node_info_uncompressed_size);
    toku::scoped_malloc sb_node_info_uncompressed_buf(sb_node_info_uncompressed_size);
    toku::scoped_malloc sb_node_info_compressed_buf(sb_node_info_compressed_size_bound + 8); // add 8 extra bytes, 4 for compressed size, 4 for decompressed size
    sb_node_info.uncompressed_size = sb_node_info_uncompressed_size;
    sb_node_info.uncompressed_ptr = sb_node_info_uncompressed_buf.get();
    sb_node_info.compressed_size_bound = sb_node_info_compressed_size_bound;
    sb_node_info.compressed_ptr = sb_node_info_compressed_buf.get();

    // do the actual serialization now that we have buffer space
    serialize_and_compress_sb_node_info(node, &sb_node_info, compression_method, &st);

    //
    // At this point, we have compressed each of our pieces into individual sub_blocks,
    // we can put the header and all the subblocks into a single buffer and return it.
    //

    // update the serialize times, ignore the header for simplicity. we captured all
    // of the partitions' serialize times so that's probably good enough.
    toku_ft_status_update_serialize_times(node, st.serialize_time, st.compress_time);

    // The total size of the node is:
    // size of header + disk size of the n+1 sub_block's created above
    uint32_t total_node_size = (serialize_node_header_size(node) // uncompressed header
                                 + sb_node_info.compressed_size  // compressed nodeinfo (without its checksum)
                                 + 4);                           // nodeinfo's checksum
    uint32_t total_uncompressed_size = (serialize_node_header_size(node) // uncompressed header
                                 + sb_node_info.uncompressed_size        // uncompressed nodeinfo (without its checksum)
                                 + 4);                                   // nodeinfo's checksum
    // store the BP_SIZESs
    for (int i = 0; i < node->n_children; i++) {
        uint32_t len         = sb[i].compressed_size + 4; // data and checksum
        BP_SIZE (*ndd,i) = len;
        BP_START(*ndd,i) = total_node_size;
        total_node_size += sb[i].compressed_size + 4;
        total_uncompressed_size += sb[i].uncompressed_size + 4;
    }

    // now create the final serialized node
    // (function continues past this chunk)
    uint32_t total_buffer_size = roundup_to_multiple(512, total_node_size); // round up to a 512-byte multiple for O_DIRECT
+ char *XMALLOC_N_ALIGNED(512, total_buffer_size, data); + char *curr_ptr = data; + + // write the header + struct wbuf wb; + wbuf_init(&wb, curr_ptr, serialize_node_header_size(node)); + serialize_node_header(node, *ndd, &wb); + assert(wb.ndone == wb.size); + curr_ptr += serialize_node_header_size(node); + + // now write sb_node_info + memcpy(curr_ptr, sb_node_info.compressed_ptr, sb_node_info.compressed_size); + curr_ptr += sb_node_info.compressed_size; + // write the checksum + *(uint32_t *)curr_ptr = toku_htod32(sb_node_info.xsum); + curr_ptr += sizeof(sb_node_info.xsum); + + for (int i = 0; i < npartitions; i++) { + memcpy(curr_ptr, sb[i].compressed_ptr, sb[i].compressed_size); + curr_ptr += sb[i].compressed_size; + // write the checksum + *(uint32_t *)curr_ptr = toku_htod32(sb[i].xsum); + curr_ptr += sizeof(sb[i].xsum); + } + // Zero the rest of the buffer + memset(data + total_node_size, 0, total_buffer_size - total_node_size); + + assert(curr_ptr - data == total_node_size); + *bytes_to_write = data; + *n_bytes_to_write = total_buffer_size; + *n_uncompressed_bytes = total_uncompressed_size; + + invariant(*n_bytes_to_write % 512 == 0); + invariant(reinterpret_cast<unsigned long long>(*bytes_to_write) % 512 == 0); + return 0; +} + +int +toku_serialize_ftnode_to (int fd, BLOCKNUM blocknum, FTNODE node, FTNODE_DISK_DATA* ndd, bool do_rebalancing, FT ft, bool for_checkpoint) { + + size_t n_to_write; + size_t n_uncompressed_bytes; + char *compressed_buf = nullptr; + + // because toku_serialize_ftnode_to is only called for + // in toku_ftnode_flush_callback, we pass false + // for in_parallel. 
The reasoning is that when we write + // nodes to disk via toku_ftnode_flush_callback, we + // assume that it is being done on a non-critical + // background thread (probably for checkpointing), and therefore + // should not hog CPU, + // + // Should the above facts change, we may want to revisit + // passing false for in_parallel here + // + // alternatively, we could have made in_parallel a parameter + // for toku_serialize_ftnode_to, but instead we did this. + int r = toku_serialize_ftnode_to_memory( + node, + ndd, + ft->h->basementnodesize, + ft->h->compression_method, + do_rebalancing, + toku_unsafe_fetch(&toku_serialize_in_parallel), + &n_to_write, + &n_uncompressed_bytes, + &compressed_buf + ); + if (r != 0) { + return r; + } + + // If the node has never been written, then write the whole buffer, including the zeros + invariant(blocknum.b>=0); + DISKOFF offset; + + // Dirties the ft + ft->blocktable.realloc_on_disk(blocknum, n_to_write, &offset, + ft, fd, for_checkpoint, + // Allocations for nodes high in the tree are considered 'hot', + // as they are likely to move again in the next checkpoint. + node->height); + + tokutime_t t0 = toku_time_now(); + toku_os_full_pwrite(fd, compressed_buf, n_to_write, offset); + tokutime_t t1 = toku_time_now(); + + tokutime_t io_time = t1 - t0; + toku_ft_status_update_flush_reason(node, n_uncompressed_bytes, n_to_write, io_time, for_checkpoint); + + toku_free(compressed_buf); + node->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction. + return 0; +} + +static void +sort_and_steal_offset_arrays(NONLEAF_CHILDINFO bnc, + const toku::comparator &cmp, + int32_t **fresh_offsets, int32_t nfresh, + int32_t **stale_offsets, int32_t nstale, + int32_t **broadcast_offsets, int32_t nbroadcast) { + // We always have fresh / broadcast offsets (even if they are empty) + // but we may not have stale offsets, in the case of v13 upgrade. 
    // Fresh and broadcast offset arrays must always be supplied; stale
    // offsets may be absent (v13 upgrade has no stale tree).
    invariant(fresh_offsets != nullptr);
    invariant(broadcast_offsets != nullptr);
    invariant(cmp.valid());

    typedef toku::sort<int32_t, const struct toku_msg_buffer_key_msn_cmp_extra, toku_msg_buffer_key_msn_cmp> msn_sort;

    const int32_t n_in_this_buffer = nfresh + nstale + nbroadcast;
    struct toku_msg_buffer_key_msn_cmp_extra extra(cmp, &bnc->msg_buffer);
    // Sort each offset array by (key, msn), then hand ownership of the
    // sorted array to the corresponding message tree ("steal" - the tree
    // frees it, the caller must not).
    msn_sort::mergesort_r(*fresh_offsets, nfresh, extra);
    bnc->fresh_message_tree.destroy();
    bnc->fresh_message_tree.create_steal_sorted_array(fresh_offsets, nfresh, n_in_this_buffer);
    if (stale_offsets) {
        msn_sort::mergesort_r(*stale_offsets, nstale, extra);
        bnc->stale_message_tree.destroy();
        bnc->stale_message_tree.create_steal_sorted_array(stale_offsets, nstale, n_in_this_buffer);
    }
    bnc->broadcast_list.destroy();
    bnc->broadcast_list.create_steal_sorted_array(broadcast_offsets, nbroadcast, n_in_this_buffer);
}

// Deserialize a layout-version-13 message buffer into 'bnc' and rebuild
// its message trees.  Returns the highest MSN found in this buffer
// (v13 messages get MSNs assigned during upgrade).
static MSN
deserialize_child_buffer_v13(FT ft, NONLEAF_CHILDINFO bnc, struct rbuf *rb) {
    // We skip 'stale' offsets for upgraded nodes.
    int32_t nfresh = 0, nbroadcast = 0;
    int32_t *fresh_offsets = nullptr, *broadcast_offsets = nullptr;

    // Only sort buffers if we have a valid comparison function. In certain scenarios,
    // like deserialize_ft_versioned() or tokuftdump, we'll need to deserialize ftnodes
    // for simple inspection and don't actually require that the message buffers are
    // properly sorted. This is very ugly, but correct.
    const bool sort = ft->cmp.valid();

    MSN highest_msn_in_this_buffer =
        bnc->msg_buffer.deserialize_from_rbuf_v13(rb, &ft->h->highest_unused_msn_for_upgrade,
                                                  sort ? &fresh_offsets : nullptr, &nfresh,
                                                  sort ? &broadcast_offsets : nullptr, &nbroadcast);

    if (sort) {
        sort_and_steal_offset_arrays(bnc, ft->cmp,
                                     &fresh_offsets, nfresh,
                                     nullptr, 0, // no stale offsets
                                     &broadcast_offsets, nbroadcast);
    }

    return highest_msn_in_this_buffer;
}

// Deserialize a layout-version<=26 message buffer into 'bnc'.  Versions
// <= 26 did not persist the sorted message trees, so they are rebuilt
// here by sorting the offsets read from the buffer.
static void
deserialize_child_buffer_v26(NONLEAF_CHILDINFO bnc, struct rbuf *rb, const toku::comparator &cmp) {
    int32_t nfresh = 0, nstale = 0, nbroadcast = 0;
    int32_t *fresh_offsets, *stale_offsets, *broadcast_offsets;

    // Only sort buffers if we have a valid comparison function. In certain scenarios,
    // like deserialize_ft_versioned() or tokuftdump, we'll need to deserialize ftnodes
    // for simple inspection and don't actually require that the message buffers are
    // properly sorted. This is very ugly, but correct.
    const bool sort = cmp.valid();

    // read in the message buffer
    bnc->msg_buffer.deserialize_from_rbuf(rb,
                                          sort ? &fresh_offsets : nullptr, &nfresh,
                                          sort ? &stale_offsets : nullptr, &nstale,
                                          sort ? &broadcast_offsets : nullptr, &nbroadcast);

    if (sort) {
        sort_and_steal_offset_arrays(bnc, cmp,
                                     &fresh_offsets, nfresh,
                                     &stale_offsets, nstale,
                                     &broadcast_offsets, nbroadcast);
    }
}

// Deserialize a current-layout message buffer into 'bnc'.  The sorted
// message trees were serialized to disk, so no sorting is needed here:
// the offset arrays are read back verbatim and handed to the trees.
static void
deserialize_child_buffer(NONLEAF_CHILDINFO bnc, struct rbuf *rb) {
    // read in the message buffer
    bnc->msg_buffer.deserialize_from_rbuf(rb,
                                          nullptr, nullptr,  // fresh_offsets, nfresh,
                                          nullptr, nullptr,  // stale_offsets, nstale,
                                          nullptr, nullptr); // broadcast_offsets, nbroadcast

    // read in each message tree (fresh, stale, broadcast)
    int32_t nfresh = rbuf_int(rb);
    int32_t *XMALLOC_N(nfresh, fresh_offsets);
    for (int i = 0; i < nfresh; i++) {
        fresh_offsets[i] = rbuf_int(rb);
    }

    int32_t nstale = rbuf_int(rb);
    int32_t *XMALLOC_N(nstale, stale_offsets);
    for (int i = 0; i < nstale; i++) {
        stale_offsets[i] = rbuf_int(rb);
    }

    int32_t nbroadcast = rbuf_int(rb);
    int32_t *XMALLOC_N(nbroadcast, broadcast_offsets);
    for (int i = 0; i < nbroadcast; i++) {
        broadcast_offsets[i] = rbuf_int(rb);
    }

    // build OMTs out of each offset array; the trees take ownership of
    // ("steal") the malloc'd arrays, so no frees here
    bnc->fresh_message_tree.destroy();
    bnc->fresh_message_tree.create_steal_sorted_array(&fresh_offsets, nfresh, nfresh);
    bnc->stale_message_tree.destroy();
    bnc->stale_message_tree.create_steal_sorted_array(&stale_offsets, nstale, nstale);
    bnc->broadcast_list.destroy();
    bnc->broadcast_list.create_steal_sorted_array(&broadcast_offsets, nbroadcast, nbroadcast);
}

// dump a buffer to stderr as hex, 64 bytes per line
// no locking around this for now (output from concurrent callers may interleave)
void
dump_bad_block(unsigned char *vp, uint64_t size) {
    const uint64_t linesize = 64;
    uint64_t n = size / linesize;
    // full lines first
    for (uint64_t i = 0; i < n; i++) {
        fprintf(stderr, "%p: ", vp);
        for (uint64_t j = 0; j < linesize; j++) {
            unsigned char c = vp[j];
            fprintf(stderr, "%2.2X", c);
        }
        fprintf(stderr, "\n");
        vp += linesize;
    }
    // then the trailing partial line, if any
    size = size % linesize;
    for (uint64_t i=0; i<size; i++) {
        if ((i % linesize) == 0)
            fprintf(stderr, "%p: ", vp+i);
        fprintf(stderr, "%2.2X", vp[i]);
        if (((i+1) % linesize) == 0)
            fprintf(stderr, "\n");
    }
    fprintf(stderr, "\n");
}

////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////

// Allocate a basement node with an initialized (empty) data buffer.
BASEMENTNODE toku_create_empty_bn(void) {
    BASEMENTNODE bn = toku_create_empty_bn_no_buffer();
    bn->data_buffer.initialize_empty();
    return bn;
}

// Deep-copy a basement node, including a clone of its data buffer.
BASEMENTNODE toku_clone_bn(BASEMENTNODE orig_bn) {
    BASEMENTNODE bn = toku_create_empty_bn_no_buffer();
    bn->max_msn_applied = orig_bn->max_msn_applied;
    bn->seqinsert = orig_bn->seqinsert;
    bn->stale_ancestor_messages_applied = orig_bn->stale_ancestor_messages_applied;
    bn->stat64_delta = orig_bn->stat64_delta;
    bn->data_buffer.clone(&orig_bn->data_buffer);
    return bn;
}

// Allocate a basement node whose data buffer is zero-initialized but
// not yet usable; callers must initialize or clone into it.
BASEMENTNODE toku_create_empty_bn_no_buffer(void) {
    BASEMENTNODE XMALLOC(bn);
    bn->max_msn_applied.msn = 0;
    bn->seqinsert = 0;
    bn->stale_ancestor_messages_applied = false;
    bn->stat64_delta = ZEROSTATS;
    bn->data_buffer.init_zero();
    return bn;
}

// Allocate an empty nonleaf childinfo (message buffer + empty trees).
NONLEAF_CHILDINFO toku_create_empty_nl(void) {
    NONLEAF_CHILDINFO XMALLOC(cn);
    cn->msg_buffer.create();
    cn->fresh_message_tree.create_no_array();
    cn->stale_message_tree.create_no_array();
    cn->broadcast_list.create_no_array();
    memset(cn->flow, 0, sizeof cn->flow);
    return cn;
}

// must clone the OMTs, since we serialize them along with the message buffer
NONLEAF_CHILDINFO toku_clone_nl(NONLEAF_CHILDINFO orig_childinfo) {
    NONLEAF_CHILDINFO XMALLOC(cn);
    cn->msg_buffer.clone(&orig_childinfo->msg_buffer);
    cn->fresh_message_tree.create_no_array();
    cn->fresh_message_tree.clone(orig_childinfo->fresh_message_tree);
    cn->stale_message_tree.create_no_array();
    cn->stale_message_tree.clone(orig_childinfo->stale_message_tree);
    cn->broadcast_list.create_no_array();
    cn->broadcast_list.clone(orig_childinfo->broadcast_list);
    memset(cn->flow, 0, sizeof cn->flow);
    return cn;
}

// Free a basement node and its data buffer.
void destroy_basement_node (BASEMENTNODE bn)
{
    bn->data_buffer.destroy();
    toku_free(bn);
}

// Free a nonleaf childinfo, its message buffer, and its message trees.
void destroy_nonleaf_childinfo (NONLEAF_CHILDINFO nl)
{
    nl->msg_buffer.destroy();
    nl->fresh_message_tree.destroy();
    nl->stale_message_tree.destroy();
    nl->broadcast_list.destroy();
    toku_free(nl);
}

// Read an entire on-disk block into a freshly allocated, 512-byte
// aligned buffer and initialize 'rb' over it.  The rbuf owns the
// allocation after this call (freed by whoever consumes the rbuf --
// not freed here).
void read_block_from_fd_into_rbuf(
    int fd,
    BLOCKNUM blocknum,
    FT ft,
    struct rbuf *rb
    )
{
    // get the file offset and block size for the block
    DISKOFF offset, size;
    ft->blocktable.translate_blocknum_to_offset_size(blocknum, &offset, &size);
    DISKOFF size_aligned = roundup_to_multiple(512, size);
    uint8_t *XMALLOC_N_ALIGNED(512, size_aligned, raw_block);
    // the rbuf exposes only the logical 'size' bytes, though we read
    // the aligned length from disk
    rbuf_init(rb, raw_block, size);
    // read the block
    ssize_t rlen = toku_os_pread(fd, raw_block, size_aligned, offset);
    assert((DISKOFF)rlen >= size);
    assert((DISKOFF)rlen <= size_aligned);
}

static const int read_header_heuristic_max = 32*1024;

#ifndef MIN
#define MIN(a,b) (((a)>(b)) ? (b) : (a))
#endif

// Effect: If the header part of the node is small enough, then read it into the rbuf. The rbuf will be allocated to be big enough in any case.
static void read_ftnode_header_from_fd_into_rbuf_if_small_enough(int fd, BLOCKNUM blocknum,
                                                                 FT ft, struct rbuf *rb,
                                                                 ftnode_fetch_extra *bfe) {
    DISKOFF offset, size;
    ft->blocktable.translate_blocknum_to_offset_size(blocknum, &offset, &size);
    // read at most 32KB (rounded up to a 512-byte multiple), but size the
    // buffer for the whole block so a later full read can reuse it
    DISKOFF read_size = roundup_to_multiple(512, MIN(read_header_heuristic_max, size));
    uint8_t *XMALLOC_N_ALIGNED(512, roundup_to_multiple(512, size), raw_block);
    rbuf_init(rb, raw_block, read_size);

    // read the block
    tokutime_t t0 = toku_time_now();
    ssize_t rlen = toku_os_pread(fd, raw_block, read_size, offset);
    tokutime_t t1 = toku_time_now();

    assert(rlen >= 0);
    // re-init the rbuf to the number of bytes actually read
    rbuf_init(rb, raw_block, rlen);

    bfe->bytes_read = rlen;
    bfe->io_time = t1 - t0;
    toku_ft_status_update_pivot_fetch_reason(bfe);
}

//
// read the compressed partition into the sub_block,
// validate the checksum of the compressed data
//
int
read_compressed_sub_block(struct rbuf *rb, struct sub_block *sb)
{
    int r = 0;
    sb->compressed_size = rbuf_int(rb);
    sb->uncompressed_size = rbuf_int(rb);
    // compressed_ptr points into rb's buffer -- no copy is made here
    const void **cp = (const void **) &sb->compressed_ptr;
    rbuf_literal_bytes(rb, cp, sb->compressed_size);
    sb->xsum = rbuf_int(rb);
    // let's check the checksum; it covers the two size fields (the 8
    // bytes immediately preceding the compressed data) plus the data
    uint32_t actual_xsum = toku_x1764_memory((char *)sb->compressed_ptr-8, 8+sb->compressed_size);
    if (sb->xsum != actual_xsum) {
        r = TOKUDB_BAD_CHECKSUM;
    }
    return r;
}

static int
read_and_decompress_sub_block(struct rbuf *rb, struct sub_block *sb)
{
    // Read + checksum-verify the compressed sub block, then decompress
    // it into a freshly allocated sb->uncompressed_ptr.  On checksum
    // failure, returns TOKUDB_BAD_CHECKSUM without allocating.
    int r = 0;
    r = read_compressed_sub_block(rb, sb);
    if (r != 0) {
        goto exit;
    }

    just_decompress_sub_block(sb);
exit:
    return r;
}

// Allocates space for the sub-block and de-compresses the data from
// the supplied compressed pointer..
void
just_decompress_sub_block(struct sub_block *sb)
{
    // <CER> TODO: Add assert that the subblock was read in.
    // Caller owns (and must free) the buffer allocated here.
    sb->uncompressed_ptr = toku_xmalloc(sb->uncompressed_size);

    toku_decompress(
        (Bytef *) sb->uncompressed_ptr,
        sb->uncompressed_size,
        (Bytef *) sb->compressed_ptr,
        sb->compressed_size
        );
}

// verify the checksum of an uncompressed sub block; the x1764 checksum
// is stored in the last 4 bytes of the uncompressed data.
// Returns 0 on success, TOKUDB_BAD_CHECKSUM (after dumping the block to
// stderr) on mismatch.
int
verify_ftnode_sub_block (struct sub_block *sb)
{
    int r = 0;
    // first verify the checksum
    uint32_t data_size = sb->uncompressed_size - 4; // checksum is 4 bytes at end
    uint32_t stored_xsum = toku_dtoh32(*((uint32_t *)((char *)sb->uncompressed_ptr + data_size)));
    uint32_t actual_xsum = toku_x1764_memory(sb->uncompressed_ptr, data_size);
    if (stored_xsum != actual_xsum) {
        dump_bad_block((Bytef *) sb->uncompressed_ptr, sb->uncompressed_size);
        r = TOKUDB_BAD_CHECKSUM;
    }
    return r;
}

// This function deserializes the data stored by serialize_ftnode_info
static int
deserialize_ftnode_info(
    struct sub_block *sb,
    FTNODE node
    )
{
    // sb_node_info->uncompressed_ptr stores the serialized node information
    // this function puts that information into node

    // first verify the checksum
    int r = 0;
    r = verify_ftnode_sub_block(sb);
    if (r != 0) {
        goto exit;
    }

    uint32_t data_size;
    data_size = sb->uncompressed_size - 4; // checksum is 4 bytes at end

    // now with the data verified, we can read the information into the node
    struct rbuf rb;
    rbuf_init(&rb, (unsigned char *) sb->uncompressed_ptr, data_size);

    node->max_msn_applied_to_node_on_disk = rbuf_MSN(&rb);
    (void)rbuf_int(&rb);  // skipped field (serialized but unused here)
    node->flags = rbuf_int(&rb);
    node->height = rbuf_int(&rb);
    if (node->layout_version_read_from_disk < FT_LAYOUT_VERSION_19) {
        (void) rbuf_int(&rb); // optimized_for_upgrade
    }
    if (node->layout_version_read_from_disk >= FT_LAYOUT_VERSION_22) {
        rbuf_TXNID(&rb, &node->oldest_referenced_xid_known);
    }

    // now create the basement nodes or childinfos, depending on whether this is a
    // leaf node or internal node
    // now the subtree_estimates

    // n_children is now in the header, and the allocation of node->bp is in deserialize_ftnode_from_rbuf.

    // now the pivots: n_children-1 pivot keys separate the children
    if (node->n_children > 1) {
        node->pivotkeys.deserialize_from_rbuf(&rb, node->n_children - 1);
    } else {
        node->pivotkeys.create_empty();
    }

    // if this is an internal node, unpack the block nums, and fill in necessary fields
    // of childinfo
    if (node->height > 0) {
        for (int i = 0; i < node->n_children; i++) {
            BP_BLOCKNUM(node,i) = rbuf_blocknum(&rb);
            BP_WORKDONE(node, i) = 0;
        }
    }

    // make sure that all the data was read; a leftover means the on-disk
    // layout disagrees with what we just parsed
    if (data_size != rb.ndone) {
        dump_bad_block(rb.buf, rb.size);
        abort();
    }
exit:
    return r;
}

// Create an in-memory (PT_AVAIL) partition for child i: a basement node
// for leaves, a message-buffer childinfo for internal nodes.
static void
setup_available_ftnode_partition(FTNODE node, int i) {
    if (node->height == 0) {
        set_BLB(node, i, toku_create_empty_bn());
        BLB_MAX_MSN_APPLIED(node,i) = node->max_msn_applied_to_node_on_disk;
    }
    else {
        set_BNC(node, i, toku_create_empty_nl());
    }
}

// Assign the child_to_read member of the bfe from the given ftnode
// that has been brought into memory.
static void
update_bfe_using_ftnode(FTNODE node, ftnode_fetch_extra *bfe)
{
    if (bfe->type == ftnode_fetch_subset && bfe->search != NULL) {
        // we do not take into account prefetching yet
        // as of now, if we need a subset, the only thing
        // we can possibly require is a single basement node
        // we find out what basement node the query cares about
        // and check if it is available
        bfe->child_to_read = toku_ft_search_which_child(
            bfe->ft->cmp,
            node,
            bfe->search
            );
    } else if (bfe->type == ftnode_fetch_keymatch) {
        // we do not take into account prefetching yet
        // as of now, if we need a subset, the only thing
        // we can possibly require is a single basement node
        // we find out what basement node the query cares about
        // and check if it is available
        if (node->height == 0) {
            int left_child = bfe->leftmost_child_wanted(node);
            int right_child = bfe->rightmost_child_wanted(node);
            // only narrow to one child when the key range maps to a
            // single basement node
            if (left_child == right_child) {
                bfe->child_to_read = left_child;
            }
        }
    }
}

// Using the search parameters in the bfe, this function will
// initialize all of the given ftnode's partitions.
static void
setup_partitions_using_bfe(FTNODE node,
                           ftnode_fetch_extra *bfe,
                           bool data_in_memory)
{
    // Leftmost and Rightmost Child bounds.
    int lc, rc;
    if (bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_prefetch) {
        lc = bfe->leftmost_child_wanted(node);
        rc = bfe->rightmost_child_wanted(node);
    } else {
        // -1 sentinels: no child falls in [lc, rc], so only
        // wants_child_available() can make a partition PT_AVAIL
        lc = -1;
        rc = -1;
    }

    //
    // setup memory needed for the node
    //
    //printf("node height %d, blocknum %" PRId64 ", type %d lc %d rc %d\n", node->height, node->blocknum.b, bfe->type, lc, rc);
    for (int i = 0; i < node->n_children; i++) {
        BP_INIT_UNTOUCHED_CLOCK(node,i);
        // wanted partitions become PT_AVAIL; the rest stay compressed in
        // memory, or on disk if we don't have the data at all
        if (data_in_memory) {
            BP_STATE(node, i) = ((bfe->wants_child_available(i) || (lc <= i && i <= rc))
                                 ? PT_AVAIL : PT_COMPRESSED);
        } else {
            BP_STATE(node, i) = PT_ON_DISK;
        }
        BP_WORKDONE(node,i) = 0;

        switch (BP_STATE(node,i)) {
        case PT_AVAIL:
            setup_available_ftnode_partition(node, i);
            BP_TOUCH_CLOCK(node,i);
            break;
        case PT_COMPRESSED:
            set_BSB(node, i, sub_block_creat());
            break;
        case PT_ON_DISK:
            set_BNULL(node, i);
            break;
        case PT_INVALID:
            abort();
        }
    }
}

static void setup_ftnode_partitions(FTNODE node, ftnode_fetch_extra *bfe, bool data_in_memory)
// Effect: Used when reading a ftnode into main memory, this sets up the partitions.
//   We set bfe->child_to_read as well as the BP_STATE and the data pointers (e.g., with set_BSB or set_BNULL or other set_ operations).
// Arguments:  Node: the node to set up.
//             bfe:  Describes the key range needed.
//             data_in_memory: true if we have all the data (in which case we set the BP_STATE to be either PT_AVAIL or PT_COMPRESSED depending on the bfe.
//                             false if we don't have the partitions in main memory (in which case we set the state to PT_ON_DISK.
{
    // Set bfe->child_to_read.
    update_bfe_using_ftnode(node, bfe);

    // Setup the partitions.
    setup_partitions_using_bfe(node, bfe, data_in_memory);
}

/* deserialize the partition from the sub-block's uncompressed buffer
 * and destroy the uncompressed buffer
 */
static int
deserialize_ftnode_partition(
    struct sub_block *sb,
    FTNODE node,
    int childnum,      // which partition to deserialize
    const toku::comparator &cmp
    )
{
    // Verify the sub block checksum first; a corrupt partition returns
    // TOKUDB_BAD_CHECKSUM without touching the node.
    int r = 0;
    r = verify_ftnode_sub_block(sb);
    if (r != 0) {
        goto exit;
    }
    uint32_t data_size;
    data_size = sb->uncompressed_size - 4; // checksum is 4 bytes at end

    // now with the data verified, we can read the information into the node
    struct rbuf rb;
    rbuf_init(&rb, (unsigned char *) sb->uncompressed_ptr, data_size);
    // one-byte tag identifies the partition type (message buffer vs leaves)
    unsigned char ch;
    ch = rbuf_char(&rb);

    if (node->height > 0) {
        assert(ch == FTNODE_PARTITION_MSG_BUFFER);
        NONLEAF_CHILDINFO bnc = BNC(node, childnum);
        if (node->layout_version_read_from_disk <= FT_LAYOUT_VERSION_26) {
            // Layout version <= 26 did not serialize sorted message trees to disk.
            deserialize_child_buffer_v26(bnc, &rb, cmp);
        } else {
            deserialize_child_buffer(bnc, &rb);
        }
        BP_WORKDONE(node, childnum) = 0;
    }
    else {
        assert(ch == FTNODE_PARTITION_DMT_LEAVES);
        BLB_SEQINSERT(node, childnum) = 0;
        uint32_t num_entries = rbuf_int(&rb);
        // we are now at the first byte of first leafentry
        data_size -= rb.ndone; // remaining bytes of leafentry data

        BASEMENTNODE bn = BLB(node, childnum);
        bn->data_buffer.deserialize_from_rbuf(num_entries, &rb, data_size, node->layout_version_read_from_disk);
    }
    // every byte of the partition must have been consumed
    assert(rb.ndone == rb.size);
exit:
    return r;
}

// Decompress one partition's sub block and deserialize it into the
// node.  rbuf/sub_block are taken by value - each (possibly parallel)
// worker operates on its own copies.  Frees the uncompressed buffer
// before returning; reports the decompression time via *decompress_time.
static int
decompress_and_deserialize_worker(struct rbuf curr_rbuf, struct sub_block curr_sb, FTNODE node, int child,
                                  const toku::comparator &cmp, tokutime_t *decompress_time)
{
    int r = 0;
    tokutime_t t0 = toku_time_now();
    r = read_and_decompress_sub_block(&curr_rbuf, &curr_sb);
    tokutime_t t1 = toku_time_now();
    if (r == 0) {
        // at this point, sb->uncompressed_ptr stores the serialized node partition
        r = deserialize_ftnode_partition(&curr_sb, node, child, cmp);
    }
    *decompress_time = t1 - t0;

    toku_free(curr_sb.uncompressed_ptr);
    return r;
}

// Checksum-verify one partition's compressed sub block and copy the
// compressed bytes into the node's PT_COMPRESSED partition, without
// decompressing.
static int
check_and_copy_compressed_sub_block_worker(struct rbuf curr_rbuf, struct sub_block curr_sb, FTNODE node, int child)
{
    int r = 0;
    r = read_compressed_sub_block(&curr_rbuf, &curr_sb);
    if (r != 0) {
        goto exit;
    }

    SUB_BLOCK bp_sb;
    bp_sb = BSB(node, child);
    bp_sb->compressed_size = curr_sb.compressed_size;
    bp_sb->uncompressed_size = curr_sb.uncompressed_size;
    // copy out of the rbuf, since curr_sb.compressed_ptr aliases it
    bp_sb->compressed_ptr = toku_xmalloc(bp_sb->compressed_size);
    memcpy(bp_sb->compressed_ptr, curr_sb.compressed_ptr, bp_sb->compressed_size);
exit:
    return r;
}

static FTNODE alloc_ftnode_for_deserialize(uint32_t fullhash, BLOCKNUM blocknum) {
// Effect: Allocate an FTNODE and fill in the values that are not read from
// disk (identity, dirty flag, and safe defaults for pointers).
    FTNODE XMALLOC(node);
    node->fullhash = fullhash;
    node->blocknum = blocknum;
    node->dirty = 0;
    node->bp = nullptr;
    node->oldest_referenced_xid_known = TXNID_NONE;
    return node;
}

static int
deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode,
                                                     FTNODE_DISK_DATA* ndd,
                                                     BLOCKNUM blocknum,
                                                     uint32_t fullhash,
                                                     ftnode_fetch_extra *bfe,
                                                     struct rbuf *rb,
                                                     int fd)
// If we have enough information in the rbuf to construct a header, then do so.
// Also fetch in the basement node if needed.
// Return 0 if it worked. If something goes wrong (including that we are looking at some old data format that doesn't have partitions) then return nonzero.
{
    int r = 0;

    tokutime_t t0, t1;
    tokutime_t decompress_time = 0;
    tokutime_t deserialize_time = 0;

    t0 = toku_time_now();

    FTNODE node = alloc_ftnode_for_deserialize(fullhash, blocknum);

    // 24 bytes = magic (8) + layout version (4) + original version (4)
    // + build id (4) + n_children (4); anything smaller can't be a header
    if (rb->size < 24) {
        // TODO: What error do we return here?
        // Does it even matter?
        r = toku_db_badformat();
        goto cleanup;
    }

    // 8-byte magic distinguishes leaf from internal nodes
    const void *magic;
    rbuf_literal_bytes(rb, &magic, 8);
    if (memcmp(magic, "tokuleaf", 8)!=0 &&
        memcmp(magic, "tokunode", 8)!=0) {
        r = toku_db_badformat();
        goto cleanup;
    }

    node->layout_version_read_from_disk = rbuf_int(rb);
    if (node->layout_version_read_from_disk < FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES) {
        // This code path doesn't have to worry about upgrade.
        r = toku_db_badformat();
        goto cleanup;
    }

    // If we get here, we know the node is at least
    // FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES. We haven't changed
    // the serialization format since then (this comment is correct as of
    // version 20, which is Deadshot) so we can go ahead and say the
    // layout version is current (it will be as soon as we finish
    // deserializing).
    // TODO(leif): remove node->layout_version (#5174)
    node->layout_version = FT_LAYOUT_VERSION;

    node->layout_version_original = rbuf_int(rb);
    node->build_id = rbuf_int(rb);
    node->n_children = rbuf_int(rb);
    // Guaranteed to be have been able to read up to here. If n_children
    // is too big, we may have a problem, so check that we won't overflow
    // while reading the partition locations.
    unsigned int nhsize;
    nhsize = serialize_node_header_size(node); // we can do this because n_children is filled in.
    unsigned int needed_size;
    needed_size = nhsize + 12; // we need 12 more so that we can read the compressed block size information that follows for the nodeinfo.
    if (needed_size > rb->size) {
        r = toku_db_badformat();
        goto cleanup;
    }

    XMALLOC_N(node->n_children, node->bp);
    XMALLOC_N(node->n_children, *ndd);
    // read the partition locations
    for (int i=0; i<node->n_children; i++) {
        BP_START(*ndd,i) = rbuf_int(rb);
        BP_SIZE (*ndd,i) = rbuf_int(rb);
    }

    // header checksum covers everything consumed so far (rb->ndone bytes)
    uint32_t checksum;
    checksum = toku_x1764_memory(rb->buf, rb->ndone);
    uint32_t stored_checksum;
    stored_checksum = rbuf_int(rb);
    if (stored_checksum != checksum) {
        dump_bad_block(rb->buf, rb->size);
        r = TOKUDB_BAD_CHECKSUM;
        goto cleanup;
    }

    // Now we want to read the pivot information.
    struct sub_block sb_node_info;
    sub_block_init(&sb_node_info);
    sb_node_info.compressed_size = rbuf_int(rb); // we'll be able to read these because we checked the size earlier.
    sb_node_info.uncompressed_size = rbuf_int(rb);
    // +8 covers the two size fields included in the nodeinfo's checksum
    if (rb->size-rb->ndone < sb_node_info.compressed_size + 8) {
        r = toku_db_badformat();
        goto cleanup;
    }

    // Finish reading the compressed sub_block
    const void **cp;
    cp = (const void **) &sb_node_info.compressed_ptr;
    rbuf_literal_bytes(rb, cp, sb_node_info.compressed_size);
    sb_node_info.xsum = rbuf_int(rb);
    // let's check the checksum
    uint32_t actual_xsum;
    actual_xsum = toku_x1764_memory((char *)sb_node_info.compressed_ptr-8, 8+sb_node_info.compressed_size);
    if (sb_node_info.xsum != actual_xsum) {
        r = TOKUDB_BAD_CHECKSUM;
        goto cleanup;
    }

    // Now decompress the subblock
    {
        // scoped_malloc frees the uncompressed buffer at the end of this
        // scope, after deserialize_ftnode_info has consumed it
        toku::scoped_malloc sb_node_info_buf(sb_node_info.uncompressed_size);
        sb_node_info.uncompressed_ptr = sb_node_info_buf.get();
        tokutime_t decompress_t0 = toku_time_now();
        toku_decompress(
            (Bytef *) sb_node_info.uncompressed_ptr,
            sb_node_info.uncompressed_size,
            (Bytef *) sb_node_info.compressed_ptr,
            sb_node_info.compressed_size
            );
        tokutime_t decompress_t1 = toku_time_now();
        decompress_time = decompress_t1 - decompress_t0;

        // at this point sb->uncompressed_ptr stores the serialized node info.
        r = deserialize_ftnode_info(&sb_node_info, node);
        if (r != 0) {
            goto cleanup;
        }
    }

    // Now we have the ftnode_info. We have a bunch more stuff in the
    // rbuf, so we might be able to store the compressed data for some
    // objects.
    // We can proceed to deserialize the individual subblocks.

    // setup the memory of the partitions
    // for partitions being decompressed, create either message buffer or basement node
    // for partitions staying compressed, create sub_block
    setup_ftnode_partitions(node, bfe, false);

    // We must capture deserialize and decompression time before
    // the pf_callback, otherwise we would double-count.
    t1 = toku_time_now();
    deserialize_time = (t1 - t0) - decompress_time;

    // do partial fetch if necessary
    if (bfe->type != ftnode_fetch_none) {
        PAIR_ATTR attr;
        r = toku_ftnode_pf_callback(node, *ndd, bfe, fd, &attr);
        if (r != 0) {
            goto cleanup;
        }
    }

    // handle clock: mark the wanted (and now available) partitions as
    // recently used so they survive partial eviction
    for (int i = 0; i < node->n_children; i++) {
        if (bfe->wants_child_available(i)) {
            paranoid_invariant(BP_STATE(node,i) == PT_AVAIL);
            BP_TOUCH_CLOCK(node,i);
        }
    }
    *ftnode = node;
    r = 0;

cleanup:
    if (r == 0) {
        bfe->deserialize_time += deserialize_time;
        bfe->decompress_time += decompress_time;
        toku_ft_status_update_deserialize_times(node, deserialize_time, decompress_time);
    }
    if (r != 0) {
        if (node) {
            toku_free(*ndd);
            toku_free(node->bp);
            toku_free(node);
        }
    }
    return r;
}

// This function takes a deserialized version 13 or 14 buffer and
// constructs the associated internal, non-leaf ftnode object. It
// also creates MSN's for older messages created in older versions
// that did not generate MSN's for messages. These new MSN's are
// generated from the root downwards, counting backwards from MIN_MSN
// and persisted in the ft header.
static int
deserialize_and_upgrade_internal_node(FTNODE node,
                                      struct rbuf *rb,
                                      ftnode_fetch_extra *bfe,
                                      STAT64INFO info)
{
    int version = node->layout_version_read_from_disk;

    if (version == FT_LAST_LAYOUT_VERSION_WITH_FINGERPRINT) {
        (void) rbuf_int(rb); // 10. fingerprint
    }

    node->n_children = rbuf_int(rb); // 11. n_children

    // Sub-tree estimates...
    for (int i = 0; i < node->n_children; ++i) {
        if (version == FT_LAST_LAYOUT_VERSION_WITH_FINGERPRINT) {
            (void) rbuf_int(rb); // 12. fingerprint
        }
        uint64_t nkeys = rbuf_ulonglong(rb); // 13. nkeys
        uint64_t ndata = rbuf_ulonglong(rb); // 14. ndata
        uint64_t dsize = rbuf_ulonglong(rb); // 15. dsize
        (void) rbuf_char(rb);                // 16. exact (char)
        invariant(nkeys == ndata);
        if (info) {
            // info is non-null if we're trying to upgrade old subtree
            // estimates to stat64info
            info->numrows += nkeys;
            info->numbytes += dsize;
        }
    }

    // Pivot keys
    node->pivotkeys.deserialize_from_rbuf(rb, node->n_children - 1);

    // Create space for the child node buffers (a.k.a. partitions).
    XMALLOC_N(node->n_children, node->bp);

    // Set the child blocknums.
    for (int i = 0; i < node->n_children; ++i) {
        BP_BLOCKNUM(node, i) = rbuf_blocknum(rb); // 18. blocknums
        BP_WORKDONE(node, i) = 0;
    }

    // Read in the child buffer maps.
    for (int i = 0; i < node->n_children; ++i) {
        // The following fields were previously used by the `sub_block_map'
        // They include:
        //  - 4 byte index
        (void) rbuf_int(rb);
        //  - 4 byte offset
        (void) rbuf_int(rb);
        //  - 4 byte size
        (void) rbuf_int(rb);
    }

    // We need to setup this node's partitions, but we can't call the
    // existing call (setup_ftnode_partitions) because there are
    // existing optimizations that would prevent us from bringing all
    // of this node's partitions into memory. Instead, we use the
    // existing bfe and node to set the bfe's child_to_search member.
    // Then we create a temporary bfe that needs all the nodes to make
    // sure we properly initialize our partitions before filling them
    // in from our soon-to-be-upgraded node.
    update_bfe_using_ftnode(node, bfe);
    ftnode_fetch_extra temp_bfe;
    temp_bfe.create_for_full_read(nullptr);
    setup_partitions_using_bfe(node, &temp_bfe, true);

    // Cache the highest MSN generated for the message buffers. This
    // will be set in the ftnode.
    //
    // The way we choose MSNs for upgraded messages is delicate. The
    // field `highest_unused_msn_for_upgrade' in the header is always an
    // MSN that no message has yet. So when we have N messages that need
    // MSNs, we decrement it by N, and then use it and the N-1 MSNs less
    // than it, but we do not use the value we decremented it to.
    //
    // In the code below, we initialize `lowest' with the value of
    // `highest_unused_msn_for_upgrade' after it is decremented, so we
    // need to be sure to increment it once before we enqueue our first
    // message.
    MSN highest_msn;
    highest_msn.msn = 0;

    // Deserialize de-compressed buffers.
    for (int i = 0; i < node->n_children; ++i) {
        NONLEAF_CHILDINFO bnc = BNC(node, i);
        MSN highest_msn_in_this_buffer = deserialize_child_buffer_v13(bfe->ft, bnc, rb);
        // Only capture the first buffer's MSN; MSNs are assigned so that
        // the first buffer processed carries the highest one.
        if (highest_msn.msn == 0) {
            highest_msn.msn = highest_msn_in_this_buffer.msn;
        }
    }

    // Assign the highest msn from our upgrade message buffers
    node->max_msn_applied_to_node_on_disk = highest_msn;
    // Since we assigned MSNs to this node's messages, we need to dirty it.
    node->dirty = 1;

    // Must compute the checksum now (rather than at the end, while we
    // still have the pointer to the buffer).
    if (version >= FT_FIRST_LAYOUT_VERSION_WITH_END_TO_END_CHECKSUM) {
        uint32_t expected_xsum = toku_dtoh32(*(uint32_t*)(rb->buf+rb->size-4)); // 27. checksum
        uint32_t actual_xsum = toku_x1764_memory(rb->buf, rb->size-4);
        if (expected_xsum != actual_xsum) {
            fprintf(stderr, "%s:%d: Bad checksum: expected = %" PRIx32 ", actual= %" PRIx32 "\n",
                    __FUNCTION__,
                    __LINE__,
                    expected_xsum,
                    actual_xsum);
            fprintf(stderr,
                    "Checksum failure while reading node in file %s.\n",
                    toku_cachefile_fname_in_env(bfe->ft->cf));
            fflush(stderr);
            return toku_db_badformat();
        }
    }

    return 0;
}

// This function takes a deserialized version 13 or 14 buffer and
// constructs the associated leaf ftnode object.
static int
deserialize_and_upgrade_leaf_node(FTNODE node,
                                  struct rbuf *rb,
                                  ftnode_fetch_extra *bfe,
                                  STAT64INFO info)
{
    int r = 0;
    int version = node->layout_version_read_from_disk;

    // This is a leaf node, so the offsets in the buffer will be
    // different from the internal node offsets above.
    uint64_t nkeys = rbuf_ulonglong(rb); // 10. nkeys
    uint64_t ndata = rbuf_ulonglong(rb); // 11. ndata
    uint64_t dsize = rbuf_ulonglong(rb); // 12. dsize
    invariant(nkeys == ndata);
    if (info) {
        // info is non-null if we're trying to upgrade old subtree
        // estimates to stat64info
        info->numrows += nkeys;
        info->numbytes += dsize;
    }

    // This is the optimized for upgrade field.
    if (version == FT_LAYOUT_VERSION_14) {
        (void) rbuf_int(rb); // 13. optimized
    }

    // npartitions - This is really the number of leaf entries in
    // our single basement node. There should only be 1 (ONE)
    // partition, so there shouldn't be any pivot key stored. This
    // means the loop will not iterate. We could remove the loop and
    // assert that the value is indeed 1.
    int npartitions = rbuf_int(rb); // 14. npartitions
    assert(npartitions == 1);

    // Set number of children to 1, since we will only have one
    // basement node.
+ node->n_children = 1; + XMALLOC_N(node->n_children, node->bp); + node->pivotkeys.create_empty(); + + // Create one basement node to contain all the leaf entries by + // setting up the single partition and updating the bfe. + update_bfe_using_ftnode(node, bfe); + ftnode_fetch_extra temp_bfe; + temp_bfe.create_for_full_read(bfe->ft); + setup_partitions_using_bfe(node, &temp_bfe, true); + + // 11. Deserialize the partition maps, though they are not used in the + // newer versions of ftnodes. + for (int i = 0; i < node->n_children; ++i) { + // The following fields were previously used by the `sub_block_map' + // They include: + // - 4 byte index + (void) rbuf_int(rb); + // - 4 byte offset + (void) rbuf_int(rb); + // - 4 byte size + (void) rbuf_int(rb); + } + + // Copy all of the leaf entries into the single basement node. + + // The number of leaf entries in buffer. + int n_in_buf = rbuf_int(rb); // 15. # of leaves + BLB_SEQINSERT(node,0) = 0; + BASEMENTNODE bn = BLB(node, 0); + + // Read the leaf entries from the buffer, advancing the buffer + // as we go. + bool has_end_to_end_checksum = (version >= FT_FIRST_LAYOUT_VERSION_WITH_END_TO_END_CHECKSUM); + if (version <= FT_LAYOUT_VERSION_13) { + // Create our mempool. + // Loop through + for (int i = 0; i < n_in_buf; ++i) { + LEAFENTRY_13 le = reinterpret_cast<LEAFENTRY_13>(&rb->buf[rb->ndone]); + uint32_t disksize = leafentry_disksize_13(le); + rb->ndone += disksize; // 16. 
leaf entry (13) + invariant(rb->ndone<=rb->size); + LEAFENTRY new_le; + size_t new_le_size; + void* key = NULL; + uint32_t keylen = 0; + r = toku_le_upgrade_13_14(le, + &key, + &keylen, + &new_le_size, + &new_le); + assert_zero(r); + // Copy the pointer value straight into the OMT + LEAFENTRY new_le_in_bn = nullptr; + void *maybe_free; + bn->data_buffer.get_space_for_insert( + i, + key, + keylen, + new_le_size, + &new_le_in_bn, + &maybe_free + ); + if (maybe_free) { + toku_free(maybe_free); + } + memcpy(new_le_in_bn, new_le, new_le_size); + toku_free(new_le); + } + } else { + uint32_t data_size = rb->size - rb->ndone; + if (has_end_to_end_checksum) { + data_size -= sizeof(uint32_t); + } + bn->data_buffer.deserialize_from_rbuf(n_in_buf, rb, data_size, node->layout_version_read_from_disk); + } + + // Whatever this is must be less than the MSNs of every message above + // it, so it's ok to take it here. + bn->max_msn_applied = bfe->ft->h->highest_unused_msn_for_upgrade; + bn->stale_ancestor_messages_applied = false; + node->max_msn_applied_to_node_on_disk = bn->max_msn_applied; + + // Checksum (end to end) is only on version 14 + if (has_end_to_end_checksum) { + uint32_t expected_xsum = rbuf_int(rb); // 17. checksum + uint32_t actual_xsum = toku_x1764_memory(rb->buf, rb->size - 4); + if (expected_xsum != actual_xsum) { + fprintf(stderr, "%s:%d: Bad checksum: expected = %" PRIx32 ", actual= %" PRIx32 "\n", + __FUNCTION__, + __LINE__, + expected_xsum, + actual_xsum); + fprintf(stderr, + "Checksum failure while reading node in file %s.\n", + toku_cachefile_fname_in_env(bfe->ft->cf)); + fflush(stderr); + return toku_db_badformat(); + } + } + + // We should have read the whole block by this point. + if (rb->ndone != rb->size) { + // TODO: Error handling. 
+ return 1; + } + + return r; +} + +static int +read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum, + DISKOFF offset, DISKOFF size, + FT ft, + struct rbuf *rb, + /* out */ int *layout_version_p); + +// This function upgrades a version 14 or 13 ftnode to the current +// verison. NOTE: This code assumes the first field of the rbuf has +// already been read from the buffer (namely the layout_version of the +// ftnode.) +static int +deserialize_and_upgrade_ftnode(FTNODE node, + FTNODE_DISK_DATA* ndd, + BLOCKNUM blocknum, + ftnode_fetch_extra *bfe, + STAT64INFO info, + int fd) +{ + int r = 0; + int version; + + // I. First we need to de-compress the entire node, only then can + // we read the different sub-sections. + // get the file offset and block size for the block + DISKOFF offset, size; + bfe->ft->blocktable.translate_blocknum_to_offset_size(blocknum, &offset, &size); + + struct rbuf rb; + r = read_and_decompress_block_from_fd_into_rbuf(fd, + blocknum, + offset, + size, + bfe->ft, + &rb, + &version); + if (r != 0) { + goto exit; + } + + // Re-read the magic field from the previous call, since we are + // restarting with a fresh rbuf. + { + const void *magic; + rbuf_literal_bytes(&rb, &magic, 8); // 1. magic + } + + // II. Start reading ftnode fields out of the decompressed buffer. + + // Copy over old version info. + node->layout_version_read_from_disk = rbuf_int(&rb); // 2. layout version + version = node->layout_version_read_from_disk; + assert(version <= FT_LAYOUT_VERSION_14); + // Upgrade the current version number to the current version. + node->layout_version = FT_LAYOUT_VERSION; + + node->layout_version_original = rbuf_int(&rb); // 3. original layout + node->build_id = rbuf_int(&rb); // 4. build id + + // The remaining offsets into the rbuf do not map to the current + // version, so we need to fill in the blanks and ignore older + // fields. + (void)rbuf_int(&rb); // 5. nodesize + node->flags = rbuf_int(&rb); // 6. 
flags + node->height = rbuf_int(&rb); // 7. height + + // If the version is less than 14, there are two extra ints here. + // we would need to ignore them if they are there. + // These are the 'fingerprints'. + if (version == FT_LAYOUT_VERSION_13) { + (void) rbuf_int(&rb); // 8. rand4 + (void) rbuf_int(&rb); // 9. local + } + + // The next offsets are dependent on whether this is a leaf node + // or not. + + // III. Read in Leaf and Internal Node specific data. + + // Check height to determine whether this is a leaf node or not. + if (node->height > 0) { + r = deserialize_and_upgrade_internal_node(node, &rb, bfe, info); + } else { + r = deserialize_and_upgrade_leaf_node(node, &rb, bfe, info); + } + + XMALLOC_N(node->n_children, *ndd); + // Initialize the partition locations to zero, because version 14 + // and below have no notion of partitions on disk. + for (int i=0; i<node->n_children; i++) { + BP_START(*ndd,i) = 0; + BP_SIZE (*ndd,i) = 0; + } + + toku_free(rb.buf); +exit: + return r; +} + +static int +deserialize_ftnode_from_rbuf( + FTNODE *ftnode, + FTNODE_DISK_DATA* ndd, + BLOCKNUM blocknum, + uint32_t fullhash, + ftnode_fetch_extra *bfe, + STAT64INFO info, + struct rbuf *rb, + int fd + ) +// Effect: deserializes a ftnode that is in rb (with pointer of rb just past the magic) into a FTNODE. 
+{ + int r = 0; + struct sub_block sb_node_info; + + tokutime_t t0, t1; + tokutime_t decompress_time = 0; + tokutime_t deserialize_time = 0; + + t0 = toku_time_now(); + + FTNODE node = alloc_ftnode_for_deserialize(fullhash, blocknum); + + // now start reading from rbuf + // first thing we do is read the header information + const void *magic; + rbuf_literal_bytes(rb, &magic, 8); + if (memcmp(magic, "tokuleaf", 8)!=0 && + memcmp(magic, "tokunode", 8)!=0) { + r = toku_db_badformat(); + goto cleanup; + } + + node->layout_version_read_from_disk = rbuf_int(rb); + lazy_assert(node->layout_version_read_from_disk >= FT_LAYOUT_MIN_SUPPORTED_VERSION); + + // Check if we are reading in an older node version. + if (node->layout_version_read_from_disk <= FT_LAYOUT_VERSION_14) { + int version = node->layout_version_read_from_disk; + // Perform the upgrade. + r = deserialize_and_upgrade_ftnode(node, ndd, blocknum, bfe, info, fd); + if (r != 0) { + goto cleanup; + } + + if (version <= FT_LAYOUT_VERSION_13) { + // deprecate 'TOKU_DB_VALCMP_BUILTIN'. just remove the flag + node->flags &= ~TOKU_DB_VALCMP_BUILTIN_13; + } + + // If everything is ok, just re-assign the ftnode and retrn. + *ftnode = node; + r = 0; + goto cleanup; + } + + // Upgrade versions after 14 to current. This upgrade is trivial, it + // removes the optimized for upgrade field, which has already been + // removed in the deserialization code (see + // deserialize_ftnode_info()). 
+ node->layout_version = FT_LAYOUT_VERSION; + node->layout_version_original = rbuf_int(rb); + node->build_id = rbuf_int(rb); + node->n_children = rbuf_int(rb); + XMALLOC_N(node->n_children, node->bp); + XMALLOC_N(node->n_children, *ndd); + // read the partition locations + for (int i=0; i<node->n_children; i++) { + BP_START(*ndd,i) = rbuf_int(rb); + BP_SIZE (*ndd,i) = rbuf_int(rb); + } + // verify checksum of header stored + uint32_t checksum; + checksum = toku_x1764_memory(rb->buf, rb->ndone); + uint32_t stored_checksum; + stored_checksum = rbuf_int(rb); + if (stored_checksum != checksum) { + dump_bad_block(rb->buf, rb->size); + invariant(stored_checksum == checksum); + } + + // now we read and decompress the pivot and child information + sub_block_init(&sb_node_info); + { + tokutime_t sb_decompress_t0 = toku_time_now(); + r = read_and_decompress_sub_block(rb, &sb_node_info); + tokutime_t sb_decompress_t1 = toku_time_now(); + decompress_time += sb_decompress_t1 - sb_decompress_t0; + } + if (r != 0) { + goto cleanup; + } + + // at this point, sb->uncompressed_ptr stores the serialized node info + r = deserialize_ftnode_info(&sb_node_info, node); + if (r != 0) { + goto cleanup; + } + toku_free(sb_node_info.uncompressed_ptr); + + // now that the node info has been deserialized, we can proceed to deserialize + // the individual sub blocks + + // setup the memory of the partitions + // for partitions being decompressed, create either message buffer or basement node + // for partitions staying compressed, create sub_block + setup_ftnode_partitions(node, bfe, true); + + // This loop is parallelizeable, since we don't have a dependency on the work done so far. 
+ for (int i = 0; i < node->n_children; i++) { + uint32_t curr_offset = BP_START(*ndd,i); + uint32_t curr_size = BP_SIZE(*ndd,i); + // the compressed, serialized partitions start at where rb is currently pointing, + // which would be rb->buf + rb->ndone + // we need to intialize curr_rbuf to point to this place + struct rbuf curr_rbuf = {.buf = NULL, .size = 0, .ndone = 0}; + rbuf_init(&curr_rbuf, rb->buf + curr_offset, curr_size); + + // + // now we are at the point where we have: + // - read the entire compressed node off of disk, + // - decompressed the pivot and offset information, + // - have arrived at the individual partitions. + // + // Based on the information in bfe, we want to decompress a subset of + // of the compressed partitions (also possibly none or possibly all) + // The partitions that we want to decompress and make available + // to the node, we do, the rest we simply copy in compressed + // form into the node, and set the state of the partition to PT_COMPRESSED + // + + struct sub_block curr_sb; + sub_block_init(&curr_sb); + + // curr_rbuf is passed by value to decompress_and_deserialize_worker, so there's no ugly race condition. + // This would be more obvious if curr_rbuf were an array. 
+ + // deserialize_ftnode_info figures out what the state + // should be and sets up the memory so that we are ready to use it + + switch (BP_STATE(node,i)) { + case PT_AVAIL: { + // case where we read and decompress the partition + tokutime_t partition_decompress_time; + r = decompress_and_deserialize_worker(curr_rbuf, curr_sb, node, i, + bfe->ft->cmp, &partition_decompress_time); + decompress_time += partition_decompress_time; + if (r != 0) { + goto cleanup; + } + break; + } + case PT_COMPRESSED: + // case where we leave the partition in the compressed state + r = check_and_copy_compressed_sub_block_worker(curr_rbuf, curr_sb, node, i); + if (r != 0) { + goto cleanup; + } + break; + case PT_INVALID: // this is really bad + case PT_ON_DISK: // it's supposed to be in memory. + abort(); + } + } + *ftnode = node; + r = 0; + +cleanup: + if (r == 0) { + t1 = toku_time_now(); + deserialize_time = (t1 - t0) - decompress_time; + bfe->deserialize_time += deserialize_time; + bfe->decompress_time += decompress_time; + toku_ft_status_update_deserialize_times(node, deserialize_time, decompress_time); + } + if (r != 0) { + // NOTE: Right now, callers higher in the stack will assert on + // failure, so this is OK for production. However, if we + // create tools that use this function to search for errors in + // the FT, then we will leak memory. 
+ if (node) { + toku_free(node); + } + } + return r; +} + +int +toku_deserialize_bp_from_disk(FTNODE node, FTNODE_DISK_DATA ndd, int childnum, int fd, ftnode_fetch_extra *bfe) { + int r = 0; + assert(BP_STATE(node,childnum) == PT_ON_DISK); + assert(node->bp[childnum].ptr.tag == BCT_NULL); + + // + // setup the partition + // + setup_available_ftnode_partition(node, childnum); + BP_STATE(node,childnum) = PT_AVAIL; + + // + // read off disk and make available in memory + // + // get the file offset and block size for the block + DISKOFF node_offset, total_node_disk_size; + bfe->ft->blocktable.translate_blocknum_to_offset_size(node->blocknum, &node_offset, &total_node_disk_size); + + uint32_t curr_offset = BP_START(ndd, childnum); + uint32_t curr_size = BP_SIZE (ndd, childnum); + + struct rbuf rb; + rbuf_init(&rb, nullptr, 0); + + uint32_t pad_at_beginning = (node_offset+curr_offset)%512; + uint32_t padded_size = roundup_to_multiple(512, pad_at_beginning + curr_size); + + toku::scoped_malloc_aligned raw_block_buf(padded_size, 512); + uint8_t *raw_block = reinterpret_cast<uint8_t *>(raw_block_buf.get()); + rbuf_init(&rb, pad_at_beginning+raw_block, curr_size); + tokutime_t t0 = toku_time_now(); + + // read the block + assert(0==((unsigned long long)raw_block)%512); // for O_DIRECT + assert(0==(padded_size)%512); + assert(0==(node_offset+curr_offset-pad_at_beginning)%512); + ssize_t rlen = toku_os_pread(fd, raw_block, padded_size, node_offset+curr_offset-pad_at_beginning); + assert((DISKOFF)rlen >= pad_at_beginning + curr_size); // we read in at least enough to get what we wanted + assert((DISKOFF)rlen <= padded_size); // we didn't read in too much. 
+ + tokutime_t t1 = toku_time_now(); + + // read sub block + struct sub_block curr_sb; + sub_block_init(&curr_sb); + r = read_compressed_sub_block(&rb, &curr_sb); + if (r != 0) { + return r; + } + invariant(curr_sb.compressed_ptr != NULL); + + // decompress + toku::scoped_malloc uncompressed_buf(curr_sb.uncompressed_size); + curr_sb.uncompressed_ptr = uncompressed_buf.get(); + toku_decompress((Bytef *) curr_sb.uncompressed_ptr, curr_sb.uncompressed_size, + (Bytef *) curr_sb.compressed_ptr, curr_sb.compressed_size); + + // deserialize + tokutime_t t2 = toku_time_now(); + + r = deserialize_ftnode_partition(&curr_sb, node, childnum, bfe->ft->cmp); + + tokutime_t t3 = toku_time_now(); + + // capture stats + tokutime_t io_time = t1 - t0; + tokutime_t decompress_time = t2 - t1; + tokutime_t deserialize_time = t3 - t2; + bfe->deserialize_time += deserialize_time; + bfe->decompress_time += decompress_time; + toku_ft_status_update_deserialize_times(node, deserialize_time, decompress_time); + + bfe->bytes_read = rlen; + bfe->io_time = io_time; + + return r; +} + +// Take a ftnode partition that is in the compressed state, and make it avail +int +toku_deserialize_bp_from_compressed(FTNODE node, int childnum, ftnode_fetch_extra *bfe) { + int r = 0; + assert(BP_STATE(node, childnum) == PT_COMPRESSED); + SUB_BLOCK curr_sb = BSB(node, childnum); + + toku::scoped_malloc uncompressed_buf(curr_sb->uncompressed_size); + assert(curr_sb->uncompressed_ptr == NULL); + curr_sb->uncompressed_ptr = uncompressed_buf.get(); + + setup_available_ftnode_partition(node, childnum); + BP_STATE(node,childnum) = PT_AVAIL; + + // decompress the sub_block + tokutime_t t0 = toku_time_now(); + + toku_decompress( + (Bytef *) curr_sb->uncompressed_ptr, + curr_sb->uncompressed_size, + (Bytef *) curr_sb->compressed_ptr, + curr_sb->compressed_size + ); + + tokutime_t t1 = toku_time_now(); + + r = deserialize_ftnode_partition(curr_sb, node, childnum, bfe->ft->cmp); + + tokutime_t t2 = toku_time_now(); + + 
tokutime_t decompress_time = t1 - t0; + tokutime_t deserialize_time = t2 - t1; + bfe->deserialize_time += deserialize_time; + bfe->decompress_time += decompress_time; + toku_ft_status_update_deserialize_times(node, deserialize_time, decompress_time); + + toku_free(curr_sb->compressed_ptr); + toku_free(curr_sb); + return r; +} + +static int +deserialize_ftnode_from_fd(int fd, + BLOCKNUM blocknum, + uint32_t fullhash, + FTNODE *ftnode, + FTNODE_DISK_DATA *ndd, + ftnode_fetch_extra *bfe, + STAT64INFO info) +{ + struct rbuf rb = RBUF_INITIALIZER; + + tokutime_t t0 = toku_time_now(); + read_block_from_fd_into_rbuf(fd, blocknum, bfe->ft, &rb); + tokutime_t t1 = toku_time_now(); + + // Decompress and deserialize the ftnode. Time statistics + // are taken inside this function. + int r = deserialize_ftnode_from_rbuf(ftnode, ndd, blocknum, fullhash, bfe, info, &rb, fd); + if (r != 0) { + dump_bad_block(rb.buf,rb.size); + } + + bfe->bytes_read = rb.size; + bfe->io_time = t1 - t0; + toku_free(rb.buf); + return r; +} + +// Read ftnode from file into struct. Perform version upgrade if necessary. +int +toku_deserialize_ftnode_from (int fd, + BLOCKNUM blocknum, + uint32_t fullhash, + FTNODE *ftnode, + FTNODE_DISK_DATA* ndd, + ftnode_fetch_extra *bfe + ) +// Effect: Read a node in. If possible, read just the header. +{ + int r = 0; + struct rbuf rb = RBUF_INITIALIZER; + + // each function below takes the appropriate io/decompression/deserialize statistics + + if (!bfe->read_all_partitions) { + read_ftnode_header_from_fd_into_rbuf_if_small_enough(fd, blocknum, bfe->ft, &rb, bfe); + r = deserialize_ftnode_header_from_rbuf_if_small_enough(ftnode, ndd, blocknum, fullhash, bfe, &rb, fd); + } else { + // force us to do it the old way + r = -1; + } + if (r != 0) { + // Something went wrong, go back to doing it the old way. 
+ r = deserialize_ftnode_from_fd(fd, blocknum, fullhash, ftnode, ndd, bfe, NULL); + } + + toku_free(rb.buf); + return r; +} + +void +toku_verify_or_set_counts(FTNODE UU(node)) { +} + +int +toku_db_badformat(void) { + return DB_BADFORMAT; +} + +static size_t +serialize_rollback_log_size(ROLLBACK_LOG_NODE log) { + size_t size = node_header_overhead //8 "tokuroll", 4 version, 4 version_original, 4 build_id + +16 //TXNID_PAIR + +8 //sequence + +8 //blocknum + +8 //previous (blocknum) + +8 //resident_bytecount + +8 //memarena size + +log->rollentry_resident_bytecount; + return size; +} + +static void +serialize_rollback_log_node_to_buf(ROLLBACK_LOG_NODE log, char *buf, size_t calculated_size, int UU(n_sub_blocks), struct sub_block UU(sub_block[])) { + struct wbuf wb; + wbuf_init(&wb, buf, calculated_size); + { //Serialize rollback log to local wbuf + wbuf_nocrc_literal_bytes(&wb, "tokuroll", 8); + lazy_assert(log->layout_version == FT_LAYOUT_VERSION); + wbuf_nocrc_int(&wb, log->layout_version); + wbuf_nocrc_int(&wb, log->layout_version_original); + wbuf_nocrc_uint(&wb, BUILD_ID); + wbuf_nocrc_TXNID_PAIR(&wb, log->txnid); + wbuf_nocrc_ulonglong(&wb, log->sequence); + wbuf_nocrc_BLOCKNUM(&wb, log->blocknum); + wbuf_nocrc_BLOCKNUM(&wb, log->previous); + wbuf_nocrc_ulonglong(&wb, log->rollentry_resident_bytecount); + //Write down memarena size needed to restore + wbuf_nocrc_ulonglong(&wb, log->rollentry_arena.total_size_in_use()); + + { + //Store rollback logs + struct roll_entry *item; + size_t done_before = wb.ndone; + for (item = log->newest_logentry; item; item = item->prev) { + toku_logger_rollback_wbuf_nocrc_write(&wb, item); + } + lazy_assert(done_before + log->rollentry_resident_bytecount == wb.ndone); + } + } + lazy_assert(wb.ndone == wb.size); + lazy_assert(calculated_size==wb.ndone); +} + +static void +serialize_uncompressed_block_to_memory(char * uncompressed_buf, + int n_sub_blocks, + struct sub_block sub_block[/*n_sub_blocks*/], + enum toku_compression_method 
method, + /*out*/ size_t *n_bytes_to_write, + /*out*/ char **bytes_to_write) +// Guarantees that the malloc'd BYTES_TO_WRITE is 512-byte aligned (so that O_DIRECT will work) +{ + // allocate space for the compressed uncompressed_buf + size_t compressed_len = get_sum_compressed_size_bound(n_sub_blocks, sub_block, method); + size_t sub_block_header_len = sub_block_header_size(n_sub_blocks); + size_t header_len = node_header_overhead + sub_block_header_len + sizeof (uint32_t); // node + sub_block + checksum + char *XMALLOC_N_ALIGNED(512, roundup_to_multiple(512, header_len + compressed_len), compressed_buf); + + // copy the header + memcpy(compressed_buf, uncompressed_buf, node_header_overhead); + if (0) printf("First 4 bytes before compressing data are %02x%02x%02x%02x\n", + uncompressed_buf[node_header_overhead], uncompressed_buf[node_header_overhead+1], + uncompressed_buf[node_header_overhead+2], uncompressed_buf[node_header_overhead+3]); + + // compress all of the sub blocks + char *uncompressed_ptr = uncompressed_buf + node_header_overhead; + char *compressed_ptr = compressed_buf + header_len; + compressed_len = compress_all_sub_blocks(n_sub_blocks, sub_block, uncompressed_ptr, compressed_ptr, num_cores, ft_pool, method); + + //if (0) printf("Block %" PRId64 " Size before compressing %u, after compression %" PRIu64 "\n", blocknum.b, calculated_size-node_header_overhead, (uint64_t) compressed_len); + + // serialize the sub block header + uint32_t *ptr = (uint32_t *)(compressed_buf + node_header_overhead); + *ptr++ = toku_htod32(n_sub_blocks); + for (int i=0; i<n_sub_blocks; i++) { + ptr[0] = toku_htod32(sub_block[i].compressed_size); + ptr[1] = toku_htod32(sub_block[i].uncompressed_size); + ptr[2] = toku_htod32(sub_block[i].xsum); + ptr += 3; + } + + // compute the header checksum and serialize it + uint32_t header_length = (char *)ptr - (char *)compressed_buf; + uint32_t xsum = toku_x1764_memory(compressed_buf, header_length); + *ptr = toku_htod32(xsum); + + 
uint32_t padded_len = roundup_to_multiple(512, header_len + compressed_len); + // Zero out padding. + for (uint32_t i = header_len+compressed_len; i < padded_len; i++) { + compressed_buf[i] = 0; + } + *n_bytes_to_write = padded_len; + *bytes_to_write = compressed_buf; +} + +void +toku_serialize_rollback_log_to_memory_uncompressed(ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized) { + // get the size of the serialized node + size_t calculated_size = serialize_rollback_log_size(log); + + serialized->len = calculated_size; + serialized->n_sub_blocks = 0; + // choose sub block parameters + int sub_block_size = 0; + size_t data_size = calculated_size - node_header_overhead; + choose_sub_block_size(data_size, max_sub_blocks, &sub_block_size, &serialized->n_sub_blocks); + lazy_assert(0 < serialized->n_sub_blocks && serialized->n_sub_blocks <= max_sub_blocks); + lazy_assert(sub_block_size > 0); + + // set the initial sub block size for all of the sub blocks + for (int i = 0; i < serialized->n_sub_blocks; i++) + sub_block_init(&serialized->sub_block[i]); + set_all_sub_block_sizes(data_size, sub_block_size, serialized->n_sub_blocks, serialized->sub_block); + + // allocate space for the serialized node + XMALLOC_N(calculated_size, serialized->data); + // serialize the node into buf + serialize_rollback_log_node_to_buf(log, serialized->data, calculated_size, serialized->n_sub_blocks, serialized->sub_block); + serialized->blocknum = log->blocknum; +} + +int +toku_serialize_rollback_log_to (int fd, ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized_log, bool is_serialized, + FT ft, bool for_checkpoint) { + size_t n_to_write; + char *compressed_buf; + struct serialized_rollback_log_node serialized_local; + + if (is_serialized) { + invariant_null(log); + } else { + invariant_null(serialized_log); + serialized_log = &serialized_local; + toku_serialize_rollback_log_to_memory_uncompressed(log, serialized_log); + } + + BLOCKNUM blocknum = 
serialized_log->blocknum; + invariant(blocknum.b >= 0); + + // Compress and malloc buffer to write + serialize_uncompressed_block_to_memory(serialized_log->data, + serialized_log->n_sub_blocks, + serialized_log->sub_block, + ft->h->compression_method, + &n_to_write, &compressed_buf); + + // Dirties the ft + DISKOFF offset; + ft->blocktable.realloc_on_disk(blocknum, n_to_write, &offset, + ft, fd, for_checkpoint, + // We consider rollback log flushing the hottest possible allocation, + // since rollback logs are short-lived compared to FT nodes. + INT_MAX); + + toku_os_full_pwrite(fd, compressed_buf, n_to_write, offset); + toku_free(compressed_buf); + if (!is_serialized) { + toku_static_serialized_rollback_log_destroy(&serialized_local); + log->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction. + } + return 0; +} + +static int +deserialize_rollback_log_from_rbuf (BLOCKNUM blocknum, ROLLBACK_LOG_NODE *log_p, struct rbuf *rb) { + ROLLBACK_LOG_NODE MALLOC(result); + int r; + if (result==NULL) { + r=get_error_errno(); + if (0) { died0: toku_free(result); } + return r; + } + + const void *magic; + rbuf_literal_bytes(rb, &magic, 8); + lazy_assert(!memcmp(magic, "tokuroll", 8)); + + result->layout_version = rbuf_int(rb); + lazy_assert((FT_LAYOUT_VERSION_25 <= result->layout_version && result->layout_version <= FT_LAYOUT_VERSION_27) || + (result->layout_version == FT_LAYOUT_VERSION)); + result->layout_version_original = rbuf_int(rb); + result->layout_version_read_from_disk = result->layout_version; + result->build_id = rbuf_int(rb); + result->dirty = false; + //TODO: Maybe add descriptor (or just descriptor version) here eventually? + //TODO: This is hard.. everything is shared in a single dictionary. 
+ rbuf_TXNID_PAIR(rb, &result->txnid); + result->sequence = rbuf_ulonglong(rb); + result->blocknum = rbuf_blocknum(rb); + if (result->blocknum.b != blocknum.b) { + r = toku_db_badformat(); + goto died0; + } + result->previous = rbuf_blocknum(rb); + result->rollentry_resident_bytecount = rbuf_ulonglong(rb); + + size_t arena_initial_size = rbuf_ulonglong(rb); + result->rollentry_arena.create(arena_initial_size); + if (0) { died1: result->rollentry_arena.destroy(); goto died0; } + + //Load rollback entries + lazy_assert(rb->size > 4); + //Start with empty list + result->oldest_logentry = result->newest_logentry = NULL; + while (rb->ndone < rb->size) { + struct roll_entry *item; + uint32_t rollback_fsize = rbuf_int(rb); //Already read 4. Rest is 4 smaller + const void *item_vec; + rbuf_literal_bytes(rb, &item_vec, rollback_fsize-4); + unsigned char* item_buf = (unsigned char*)item_vec; + r = toku_parse_rollback(item_buf, rollback_fsize-4, &item, &result->rollentry_arena); + if (r!=0) { + r = toku_db_badformat(); + goto died1; + } + //Add to head of list + if (result->oldest_logentry) { + result->oldest_logentry->prev = item; + result->oldest_logentry = item; + item->prev = NULL; + } + else { + result->oldest_logentry = result->newest_logentry = item; + item->prev = NULL; + } + } + + toku_free(rb->buf); + rb->buf = NULL; + *log_p = result; + return 0; +} + +static int +deserialize_rollback_log_from_rbuf_versioned (uint32_t version, BLOCKNUM blocknum, + ROLLBACK_LOG_NODE *log, + struct rbuf *rb) { + int r = 0; + ROLLBACK_LOG_NODE rollback_log_node = NULL; + invariant((FT_LAYOUT_VERSION_25 <= version && version <= FT_LAYOUT_VERSION_27) || version == FT_LAYOUT_VERSION); + r = deserialize_rollback_log_from_rbuf(blocknum, &rollback_log_node, rb); + if (r==0) { + *log = rollback_log_node; + } + return r; +} + +int +decompress_from_raw_block_into_rbuf(uint8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum) { + int r = 0; + // get the number of compressed 
sub blocks + int n_sub_blocks; + n_sub_blocks = toku_dtoh32(*(uint32_t*)(&raw_block[node_header_overhead])); + + // verify the number of sub blocks + invariant(0 <= n_sub_blocks); + invariant(n_sub_blocks <= max_sub_blocks); + + { // verify the header checksum + uint32_t header_length = node_header_overhead + sub_block_header_size(n_sub_blocks); + invariant(header_length <= raw_block_size); + uint32_t xsum = toku_x1764_memory(raw_block, header_length); + uint32_t stored_xsum = toku_dtoh32(*(uint32_t *)(raw_block + header_length)); + if (xsum != stored_xsum) { + r = TOKUDB_BAD_CHECKSUM; + } + } + + // deserialize the sub block header + struct sub_block sub_block[n_sub_blocks]; + uint32_t *sub_block_header = (uint32_t *) &raw_block[node_header_overhead+4]; + for (int i = 0; i < n_sub_blocks; i++) { + sub_block_init(&sub_block[i]); + sub_block[i].compressed_size = toku_dtoh32(sub_block_header[0]); + sub_block[i].uncompressed_size = toku_dtoh32(sub_block_header[1]); + sub_block[i].xsum = toku_dtoh32(sub_block_header[2]); + sub_block_header += 3; + } + + // This predicate needs to be here and instead of where it is set + // for the compiler. 
+ if (r == TOKUDB_BAD_CHECKSUM) { + goto exit; + } + + // verify sub block sizes + for (int i = 0; i < n_sub_blocks; i++) { + uint32_t compressed_size = sub_block[i].compressed_size; + if (compressed_size<=0 || compressed_size>(1<<30)) { + r = toku_db_badformat(); + goto exit; + } + + uint32_t uncompressed_size = sub_block[i].uncompressed_size; + if (0) printf("Block %" PRId64 " Compressed size = %u, uncompressed size=%u\n", blocknum.b, compressed_size, uncompressed_size); + if (uncompressed_size<=0 || uncompressed_size>(1<<30)) { + r = toku_db_badformat(); + goto exit; + } + } + + // sum up the uncompressed size of the sub blocks + size_t uncompressed_size; + uncompressed_size = get_sum_uncompressed_size(n_sub_blocks, sub_block); + + // allocate the uncompressed buffer + size_t size; + size = node_header_overhead + uncompressed_size; + unsigned char *buf; + XMALLOC_N(size, buf); + rbuf_init(rb, buf, size); + + // copy the uncompressed node header to the uncompressed buffer + memcpy(rb->buf, raw_block, node_header_overhead); + + // point at the start of the compressed data (past the node header, the sub block header, and the header checksum) + unsigned char *compressed_data; + compressed_data = raw_block + node_header_overhead + sub_block_header_size(n_sub_blocks) + sizeof (uint32_t); + + // point at the start of the uncompressed data + unsigned char *uncompressed_data; + uncompressed_data = rb->buf + node_header_overhead; + + // decompress all the compressed sub blocks into the uncompressed buffer + r = decompress_all_sub_blocks(n_sub_blocks, sub_block, compressed_data, uncompressed_data, num_cores, ft_pool); + if (r != 0) { + fprintf(stderr, "%s:%d block %" PRId64 " failed %d at %p size %lu\n", __FUNCTION__, __LINE__, blocknum.b, r, raw_block, raw_block_size); + dump_bad_block(raw_block, raw_block_size); + goto exit; + } + + rb->ndone=0; +exit: + return r; +} + +static int decompress_from_raw_block_into_rbuf_versioned(uint32_t version, uint8_t *raw_block, size_t 
raw_block_size, struct rbuf *rb, BLOCKNUM blocknum) { + // This function exists solely to accomodate future changes in compression. + int r = 0; + if ((version == FT_LAYOUT_VERSION_13 || version == FT_LAYOUT_VERSION_14) || + (FT_LAYOUT_VERSION_25 <= version && version <= FT_LAYOUT_VERSION_27) || + version == FT_LAYOUT_VERSION) { + r = decompress_from_raw_block_into_rbuf(raw_block, raw_block_size, rb, blocknum); + } else { + abort(); + } + return r; +} + +static int +read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum, + DISKOFF offset, DISKOFF size, + FT ft, + struct rbuf *rb, + /* out */ int *layout_version_p) { + int r = 0; + if (0) printf("Deserializing Block %" PRId64 "\n", blocknum.b); + + DISKOFF size_aligned = roundup_to_multiple(512, size); + uint8_t *XMALLOC_N_ALIGNED(512, size_aligned, raw_block); + { + // read the (partially compressed) block + ssize_t rlen = toku_os_pread(fd, raw_block, size_aligned, offset); + lazy_assert((DISKOFF)rlen >= size); + lazy_assert((DISKOFF)rlen <= size_aligned); + } + // get the layout_version + int layout_version; + { + uint8_t *magic = raw_block + uncompressed_magic_offset; + if (memcmp(magic, "tokuleaf", 8)!=0 && + memcmp(magic, "tokunode", 8)!=0 && + memcmp(magic, "tokuroll", 8)!=0) { + r = toku_db_badformat(); + goto cleanup; + } + uint8_t *version = raw_block + uncompressed_version_offset; + layout_version = toku_dtoh32(*(uint32_t*)version); + if (layout_version < FT_LAYOUT_MIN_SUPPORTED_VERSION || layout_version > FT_LAYOUT_VERSION) { + r = toku_db_badformat(); + goto cleanup; + } + } + + r = decompress_from_raw_block_into_rbuf_versioned(layout_version, raw_block, size, rb, blocknum); + if (r != 0) { + // We either failed the checksome, or there is a bad format in + // the buffer. 
+ if (r == TOKUDB_BAD_CHECKSUM) { + fprintf(stderr, + "Checksum failure while reading raw block in file %s.\n", + toku_cachefile_fname_in_env(ft->cf)); + abort(); + } else { + r = toku_db_badformat(); + goto cleanup; + } + } + + *layout_version_p = layout_version; +cleanup: + if (r!=0) { + if (rb->buf) toku_free(rb->buf); + rb->buf = NULL; + } + if (raw_block) { + toku_free(raw_block); + } + return r; +} + +// Read rollback log node from file into struct. +// Perform version upgrade if necessary. +int toku_deserialize_rollback_log_from(int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE *logp, FT ft) { + int layout_version = 0; + int r; + + struct rbuf rb; + rbuf_init(&rb, nullptr, 0); + + // get the file offset and block size for the block + DISKOFF offset, size; + ft->blocktable.translate_blocknum_to_offset_size(blocknum, &offset, &size); + + // if the size is 0, then the blocknum is unused + if (size == 0) { + // blocknum is unused, just create an empty one and get out + ROLLBACK_LOG_NODE XMALLOC(log); + rollback_empty_log_init(log); + log->blocknum.b = blocknum.b; + r = 0; + *logp = log; + goto cleanup; + } + + r = read_and_decompress_block_from_fd_into_rbuf(fd, blocknum, offset, size, ft, &rb, &layout_version); + if (r!=0) goto cleanup; + + { + uint8_t *magic = rb.buf + uncompressed_magic_offset; + if (memcmp(magic, "tokuroll", 8)!=0) { + r = toku_db_badformat(); + goto cleanup; + } + } + + r = deserialize_rollback_log_from_rbuf_versioned(layout_version, blocknum, logp, &rb); + +cleanup: + if (rb.buf) { + toku_free(rb.buf); + } + return r; +} + +int +toku_upgrade_subtree_estimates_to_stat64info(int fd, FT ft) +{ + int r = 0; + // 15 was the last version with subtree estimates + invariant(ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_15); + + FTNODE unused_node = NULL; + FTNODE_DISK_DATA unused_ndd = NULL; + ftnode_fetch_extra bfe; + bfe.create_for_min_read(ft); + r = deserialize_ftnode_from_fd(fd, ft->h->root_blocknum, 0, &unused_node, &unused_ndd, + &bfe, 
&ft->h->on_disk_stats); + ft->in_memory_stats = ft->h->on_disk_stats; + + if (unused_node) { + toku_ftnode_free(&unused_node); + } + if (unused_ndd) { + toku_free(unused_ndd); + } + return r; +} + +int +toku_upgrade_msn_from_root_to_header(int fd, FT ft) +{ + int r; + // 21 was the first version with max_msn_in_ft in the header + invariant(ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_20); + + FTNODE node; + FTNODE_DISK_DATA ndd; + ftnode_fetch_extra bfe; + bfe.create_for_min_read(ft); + r = deserialize_ftnode_from_fd(fd, ft->h->root_blocknum, 0, &node, &ndd, &bfe, nullptr); + if (r != 0) { + goto exit; + } + + ft->h->max_msn_in_ft = node->max_msn_applied_to_node_on_disk; + toku_ftnode_free(&node); + toku_free(ndd); + exit: + return r; +} + +#undef UPGRADE_STATUS_VALUE diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.h b/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.h new file mode 100644 index 00000000000..3ad616053e9 --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.h @@ -0,0 +1,92 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. 
+ +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "ft/ft.h" +#include "ft/node.h" +#include "ft/serialize/sub_block.h" +#include "ft/serialize/rbuf.h" +#include "ft/serialize/wbuf.h" +#include "ft/serialize/block_table.h" + +unsigned int toku_serialize_ftnode_size(FTNODE node); +int toku_serialize_ftnode_to_memory(FTNODE node, FTNODE_DISK_DATA *ndd, + unsigned int basementnodesize, + enum toku_compression_method compression_method, + bool do_rebalancing, bool in_parallel, + size_t *n_bytes_to_write, size_t *n_uncompressed_bytes, + char **bytes_to_write); +int toku_serialize_ftnode_to(int fd, BLOCKNUM, FTNODE node, FTNODE_DISK_DATA *ndd, bool do_rebalancing, FT ft, bool for_checkpoint); +int toku_serialize_rollback_log_to(int fd, ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized_log, bool is_serialized, + FT ft, bool for_checkpoint); +void toku_serialize_rollback_log_to_memory_uncompressed(ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized); + +int toku_deserialize_rollback_log_from(int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE *logp, FT ft); +int toku_deserialize_bp_from_disk(FTNODE node, FTNODE_DISK_DATA ndd, int childnum, int fd, ftnode_fetch_extra *bfe); +int toku_deserialize_bp_from_compressed(FTNODE node, int childnum, ftnode_fetch_extra 
*bfe); +int toku_deserialize_ftnode_from(int fd, BLOCKNUM off, uint32_t fullhash, FTNODE *node, FTNODE_DISK_DATA *ndd, ftnode_fetch_extra *bfe); + +void toku_serialize_set_parallel(bool); + +// used by nonleaf node partial eviction +void toku_create_compressed_partition_from_available(FTNODE node, int childnum, + enum toku_compression_method compression_method, SUB_BLOCK sb); + +// <CER> For verifying old, non-upgraded nodes (versions 13 and 14). +int decompress_from_raw_block_into_rbuf(uint8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum); + +// used by verify +int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ft, uint32_t version); +void read_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum, FT ft, struct rbuf *rb); +int read_compressed_sub_block(struct rbuf *rb, struct sub_block *sb); +int verify_ftnode_sub_block(struct sub_block *sb); +void just_decompress_sub_block(struct sub_block *sb); + +// used by ft-node-deserialize.cc +void initialize_ftnode(FTNODE node, BLOCKNUM blocknum); +int read_and_check_magic(struct rbuf *rb); +int read_and_check_version(FTNODE node, struct rbuf *rb); +void read_node_info(FTNODE node, struct rbuf *rb, int version); +void allocate_and_read_partition_offsets(FTNODE node, struct rbuf *rb, FTNODE_DISK_DATA *ndd); +int check_node_info_checksum(struct rbuf *rb); +void read_legacy_node_info(FTNODE node, struct rbuf *rb, int version); +int check_legacy_end_checksum(struct rbuf *rb); + +// exported so the loader can dump bad blocks +void dump_bad_block(unsigned char *vp, uint64_t size); diff --git a/storage/tokudb/PerconaFT/ft/serialize/quicklz.cc b/storage/tokudb/PerconaFT/ft/serialize/quicklz.cc new file mode 100644 index 00000000000..44f084f3475 --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/quicklz.cc @@ -0,0 +1,887 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. 
+ + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +// Fast data compression library +// Copyright (C) 2006-2011 Lasse Mikkel Reinhold +// lar@quicklz.com +// +// QuickLZ can be used for free under the GPL 1, 2 or 3 license (where anything +// released into public must be open source) or under a commercial license if such +// has been acquired (see http://www.quicklz.com/order.html). The commercial license +// does not cover derived or ported versions created by third parties under GPL. 
+ +// 1.5.0 final + +#include "quicklz.h" + +#if QLZ_VERSION_MAJOR != 1 || QLZ_VERSION_MINOR != 5 || QLZ_VERSION_REVISION != 0 + #error quicklz.c and quicklz.h have different versions +#endif + +#if (defined(__X86__) || defined(__i386__) || defined(i386) || defined(_M_IX86) || defined(__386__) || defined(__x86_64__) || defined(_M_X64)) + #define X86X64 +#endif + +#define MINOFFSET 2 +#define UNCONDITIONAL_MATCHLEN 6 +#define UNCOMPRESSED_END 4 +#define CWORD_LEN 4 + +#if QLZ_COMPRESSION_LEVEL == 1 && defined QLZ_PTR_64 && QLZ_STREAMING_BUFFER == 0 + #define OFFSET_BASE source + #define CAST (ui32)(size_t) +#else + #define OFFSET_BASE 0 + #define CAST +#endif + +int qlz_get_setting(int setting) +{ + switch (setting) + { + case 0: return QLZ_COMPRESSION_LEVEL; + case 1: return sizeof(qlz_state_compress); + case 2: return sizeof(qlz_state_decompress); + case 3: return QLZ_STREAMING_BUFFER; +#ifdef QLZ_MEMORY_SAFE + case 6: return 1; +#else + case 6: return 0; +#endif + case 7: return QLZ_VERSION_MAJOR; + case 8: return QLZ_VERSION_MINOR; + case 9: return QLZ_VERSION_REVISION; + } + return -1; +} + +#if QLZ_COMPRESSION_LEVEL == 1 +static int same(const unsigned char *src, size_t n) +{ + while(n > 0 && *(src + n) == *src) + n--; + return n == 0 ? 
1 : 0; +} +#endif + +static void reset_table_compress(qlz_state_compress *state) +{ + int i; + for(i = 0; i < QLZ_HASH_VALUES; i++) + { +#if QLZ_COMPRESSION_LEVEL == 1 + state->hash[i].offset = 0; +#else + state->hash_counter[i] = 0; + state->hash[i].offset[0] = 0; +#endif + } +} + +static void reset_table_decompress(qlz_state_decompress *state) +{ + (void)state; +#if QLZ_COMPRESSION_LEVEL == 2 + for(int i = 0; i < QLZ_HASH_VALUES; i++) + { + state->hash_counter[i] = 0; + } +#endif +} + +static __inline ui32 hash_func(ui32 i) +{ +#if QLZ_COMPRESSION_LEVEL == 2 + return ((i >> 9) ^ (i >> 13) ^ i) & (QLZ_HASH_VALUES - 1); +#else + return ((i >> 12) ^ i) & (QLZ_HASH_VALUES - 1); +#endif +} + +static __inline ui32 fast_read(void const *src, ui32 bytes) +{ +#ifndef X86X64 + unsigned char *p = (unsigned char*)src; + switch (bytes) + { + case 4: + return(*p | *(p + 1) << 8 | *(p + 2) << 16 | *(p + 3) << 24); + case 3: + return(*p | *(p + 1) << 8 | *(p + 2) << 16); + case 2: + return(*p | *(p + 1) << 8); + case 1: + return(*p); + } + return 0; +#else + if (bytes >= 1 && bytes <= 4) + return *((ui32*)src); + else + return 0; +#endif +} + +static __inline ui32 hashat(const unsigned char *src) +{ + ui32 fetch, hash; + fetch = fast_read(src, 3); + hash = hash_func(fetch); + return hash; +} + +static __inline void fast_write(ui32 f, void *dst, size_t bytes) +{ +#ifndef X86X64 + unsigned char *p = (unsigned char*)dst; + + switch (bytes) + { + case 4: + *p = (unsigned char)f; + *(p + 1) = (unsigned char)(f >> 8); + *(p + 2) = (unsigned char)(f >> 16); + *(p + 3) = (unsigned char)(f >> 24); + return; + case 3: + *p = (unsigned char)f; + *(p + 1) = (unsigned char)(f >> 8); + *(p + 2) = (unsigned char)(f >> 16); + return; + case 2: + *p = (unsigned char)f; + *(p + 1) = (unsigned char)(f >> 8); + return; + case 1: + *p = (unsigned char)f; + return; + } +#else + switch (bytes) + { + case 4: + *((ui32*)dst) = f; + return; + case 3: + *((ui32*)dst) = f; + return; + case 2: + *((ui16 
*)dst) = (ui16)f; + return; + case 1: + *((unsigned char*)dst) = (unsigned char)f; + return; + } +#endif +} + + +size_t qlz_size_decompressed(const char *source) +{ + ui32 n, r; + n = (((*source) & 2) == 2) ? 4 : 1; + r = fast_read(source + 1 + n, n); + r = r & (0xffffffff >> ((4 - n)*8)); + return r; +} + +size_t qlz_size_compressed(const char *source) +{ + ui32 n, r; + n = (((*source) & 2) == 2) ? 4 : 1; + r = fast_read(source + 1, n); + r = r & (0xffffffff >> ((4 - n)*8)); + return r; +} + +static +size_t qlz_size_header(const char *source) +{ + size_t n = 2*((((*source) & 2) == 2) ? 4 : 1) + 1; + return n; +} + + +static __inline void memcpy_up(unsigned char *dst, const unsigned char *src, ui32 n) +{ + // Caution if modifying memcpy_up! Overlap of dst and src must be special handled. +#ifndef X86X64 + unsigned char *end = dst + n; + while(dst < end) + { + *dst = *src; + dst++; + src++; + } +#else + ui32 f = 0; + do + { + *(ui32 *)(dst + f) = *(ui32 *)(src + f); + f += MINOFFSET + 1; + } + while (f < n); +#endif +} + +__attribute__((unused)) +static __inline void update_hash(qlz_state_decompress *state, const unsigned char *s) +{ +#if QLZ_COMPRESSION_LEVEL == 1 + ui32 hash; + hash = hashat(s); + state->hash[hash].offset = s; + state->hash_counter[hash] = 1; +#elif QLZ_COMPRESSION_LEVEL == 2 + ui32 hash; + unsigned char c; + hash = hashat(s); + c = state->hash_counter[hash]; + state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = s; + c++; + state->hash_counter[hash] = c; +#endif + (void)state; + (void)s; +} + +#if QLZ_COMPRESSION_LEVEL <= 2 +static void update_hash_upto(qlz_state_decompress *state, unsigned char **lh, const unsigned char *max) +{ + while(*lh < max) + { + (*lh)++; + update_hash(state, *lh); + } +} +#endif + +static size_t qlz_compress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_compress *state) +{ + const unsigned char *last_byte = source + size - 1; + const unsigned char *src = source; + unsigned char 
*cword_ptr = destination; + unsigned char *dst = destination + CWORD_LEN; + ui32 cword_val = 1U << 31; + const unsigned char *last_matchstart = last_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END; + ui32 fetch = 0; + unsigned int lits = 0; + + (void) lits; + + if(src <= last_matchstart) + fetch = fast_read(src, 3); + + while(src <= last_matchstart) + { + if ((cword_val & 1) == 1) + { + // store uncompressed if compression ratio is too low + if (src > source + (size >> 1) && dst - destination > src - source - ((src - source) >> 5)) + return 0; + + fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN); + + cword_ptr = dst; + dst += CWORD_LEN; + cword_val = 1U << 31; + fetch = fast_read(src, 3); + } +#if QLZ_COMPRESSION_LEVEL == 1 + { + const unsigned char *o; + ui32 hash, cached; + + hash = hash_func(fetch); + cached = fetch ^ state->hash[hash].cache; + state->hash[hash].cache = fetch; + + o = state->hash[hash].offset + OFFSET_BASE; + state->hash[hash].offset = CAST(src - OFFSET_BASE); + +#ifdef X86X64 + if ((cached & 0xffffff) == 0 && o != OFFSET_BASE && (src - o > MINOFFSET || (src == o + 1 && lits >= 3 && src > source + 3 && same(src - 3, 6)))) + { + if(cached != 0) + { +#else + if (cached == 0 && o != OFFSET_BASE && (src - o > MINOFFSET || (src == o + 1 && lits >= 3 && src > source + 3 && same(src - 3, 6)))) + { + if (*(o + 3) != *(src + 3)) + { +#endif + hash <<= 4; + cword_val = (cword_val >> 1) | (1U << 31); + fast_write((3 - 2) | hash, dst, 2); + src += 3; + dst += 2; + } + else + { + const unsigned char *old_src = src; + size_t matchlen; + hash <<= 4; + + cword_val = (cword_val >> 1) | (1U << 31); + src += 4; + + if(*(o + (src - old_src)) == *src) + { + src++; + if(*(o + (src - old_src)) == *src) + { + size_t q = last_byte - UNCOMPRESSED_END - (src - 5) + 1; + size_t remaining = q > 255 ? 
255 : q; + src++; + while(*(o + (src - old_src)) == *src && (size_t)(src - old_src) < remaining) + src++; + } + } + + matchlen = src - old_src; + if (matchlen < 18) + { + fast_write((ui32)(matchlen - 2) | hash, dst, 2); + dst += 2; + } + else + { + fast_write((ui32)(matchlen << 16) | hash, dst, 3); + dst += 3; + } + } + fetch = fast_read(src, 3); + lits = 0; + } + else + { + lits++; + *dst = *src; + src++; + dst++; + cword_val = (cword_val >> 1); +#ifdef X86X64 + fetch = fast_read(src, 3); +#else + fetch = (fetch >> 8 & 0xffff) | (*(src + 2) << 16); +#endif + } + } +#elif QLZ_COMPRESSION_LEVEL >= 2 + { + const unsigned char *o, *offset2; + ui32 hash, matchlen, k, m, best_k = 0; + unsigned char c; + size_t remaining = (last_byte - UNCOMPRESSED_END - src + 1) > 255 ? 255 : (last_byte - UNCOMPRESSED_END - src + 1); + (void)best_k; + + + //hash = hashat(src); + fetch = fast_read(src, 3); + hash = hash_func(fetch); + + c = state->hash_counter[hash]; + + offset2 = state->hash[hash].offset[0]; + if(offset2 < src - MINOFFSET && c > 0 && ((fast_read(offset2, 3) ^ fetch) & 0xffffff) == 0) + { + matchlen = 3; + if(*(offset2 + matchlen) == *(src + matchlen)) + { + matchlen = 4; + while(*(offset2 + matchlen) == *(src + matchlen) && matchlen < remaining) + matchlen++; + } + } + else + matchlen = 0; + for(k = 1; k < QLZ_POINTERS && c > k; k++) + { + o = state->hash[hash].offset[k]; +#if QLZ_COMPRESSION_LEVEL == 3 + if(((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET) +#elif QLZ_COMPRESSION_LEVEL == 2 + if(*(src + matchlen) == *(o + matchlen) && ((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET) +#endif + { + m = 3; + while(*(o + m) == *(src + m) && m < remaining) + m++; +#if QLZ_COMPRESSION_LEVEL == 3 + if ((m > matchlen) || (m == matchlen && o > offset2)) +#elif QLZ_COMPRESSION_LEVEL == 2 + if (m > matchlen) +#endif + { + offset2 = o; + matchlen = m; + best_k = k; + } + } + } + o = offset2; + state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = 
src; + c++; + state->hash_counter[hash] = c; + +#if QLZ_COMPRESSION_LEVEL == 3 + if(matchlen > 2 && src - o < 131071) + { + ui32 u; + size_t offset = src - o; + + for(u = 1; u < matchlen; u++) + { + hash = hashat(src + u); + c = state->hash_counter[hash]++; + state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src + u; + } + + cword_val = (cword_val >> 1) | (1U << 31); + src += matchlen; + + if(matchlen == 3 && offset <= 63) + { + *dst = (unsigned char)(offset << 2); + dst++; + } + else if (matchlen == 3 && offset <= 16383) + { + ui32 f = (ui32)((offset << 2) | 1); + fast_write(f, dst, 2); + dst += 2; + } + else if (matchlen <= 18 && offset <= 1023) + { + ui32 f = ((matchlen - 3) << 2) | ((ui32)offset << 6) | 2; + fast_write(f, dst, 2); + dst += 2; + } + + else if(matchlen <= 33) + { + ui32 f = ((matchlen - 2) << 2) | ((ui32)offset << 7) | 3; + fast_write(f, dst, 3); + dst += 3; + } + else + { + ui32 f = ((matchlen - 3) << 7) | ((ui32)offset << 15) | 3; + fast_write(f, dst, 4); + dst += 4; + } + } + else + { + *dst = *src; + src++; + dst++; + cword_val = (cword_val >> 1); + } +#elif QLZ_COMPRESSION_LEVEL == 2 + + if(matchlen > 2) + { + cword_val = (cword_val >> 1) | (1U << 31); + src += matchlen; + + if (matchlen < 10) + { + ui32 f = best_k | ((matchlen - 2) << 2) | (hash << 5); + fast_write(f, dst, 2); + dst += 2; + } + else + { + ui32 f = best_k | (matchlen << 16) | (hash << 5); + fast_write(f, dst, 3); + dst += 3; + } + } + else + { + *dst = *src; + src++; + dst++; + cword_val = (cword_val >> 1); + } +#endif + } +#endif + } + while (src <= last_byte) + { + if ((cword_val & 1) == 1) + { + fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN); + cword_ptr = dst; + dst += CWORD_LEN; + cword_val = 1U << 31; + } +#if QLZ_COMPRESSION_LEVEL < 3 + if (src <= last_byte - 3) + { +#if QLZ_COMPRESSION_LEVEL == 1 + ui32 hash, fetchv; + fetchv = fast_read(src, 3); + hash = hash_func(fetch); + state->hash[hash].offset = CAST(src - OFFSET_BASE); + 
state->hash[hash].cache = fetchv; +#elif QLZ_COMPRESSION_LEVEL == 2 + ui32 hash; + unsigned char c; + hash = hashat(src); + c = state->hash_counter[hash]; + state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src; + c++; + state->hash_counter[hash] = c; +#endif + } +#endif + *dst = *src; + src++; + dst++; + cword_val = (cword_val >> 1); + } + + while((cword_val & 1) != 1) + cword_val = (cword_val >> 1); + + fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN); + + // min. size must be 9 bytes so that the qlz_size functions can take 9 bytes as argument + return dst - destination < 9 ? 9 : dst - destination; +} + +static size_t qlz_decompress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_decompress *state, const unsigned char *history) +{ + const unsigned char *src = source + qlz_size_header((const char *)source); + unsigned char *dst = destination; + const unsigned char *last_destination_byte = destination + size - 1; + ui32 cword_val = 1; + const unsigned char *last_matchstart = last_destination_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END; + unsigned char *last_hashed = destination - 1; + const unsigned char *last_source_byte = source + qlz_size_compressed((const char *)source) - 1; + static const ui32 bitlut[16] = {4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0}; + + (void) last_source_byte; + (void) last_hashed; + (void) state; + (void) history; + + for(;;) + { + ui32 fetch; + + if (cword_val == 1) + { +#ifdef QLZ_MEMORY_SAFE + if(src + CWORD_LEN - 1 > last_source_byte) + return 0; +#endif + cword_val = fast_read(src, CWORD_LEN); + src += CWORD_LEN; + } + +#ifdef QLZ_MEMORY_SAFE + if(src + 4 - 1 > last_source_byte) + return 0; +#endif + + fetch = fast_read(src, 4); + + if ((cword_val & 1) == 1) + { + ui32 matchlen; + const unsigned char *offset2; + +#if QLZ_COMPRESSION_LEVEL == 1 + ui32 hash; + cword_val = cword_val >> 1; + hash = (fetch >> 4) & 0xfff; + offset2 = (const unsigned char 
*)(size_t)state->hash[hash].offset; + + if((fetch & 0xf) != 0) + { + matchlen = (fetch & 0xf) + 2; + src += 2; + } + else + { + matchlen = *(src + 2); + src += 3; + } + +#elif QLZ_COMPRESSION_LEVEL == 2 + ui32 hash; + unsigned char c; + cword_val = cword_val >> 1; + hash = (fetch >> 5) & 0x7ff; + c = (unsigned char)(fetch & 0x3); + offset2 = state->hash[hash].offset[c]; + + if((fetch & (28)) != 0) + { + matchlen = ((fetch >> 2) & 0x7) + 2; + src += 2; + } + else + { + matchlen = *(src + 2); + src += 3; + } + +#elif QLZ_COMPRESSION_LEVEL == 3 + ui32 offset; + cword_val = cword_val >> 1; + if ((fetch & 3) == 0) + { + offset = (fetch & 0xff) >> 2; + matchlen = 3; + src++; + } + else if ((fetch & 2) == 0) + { + offset = (fetch & 0xffff) >> 2; + matchlen = 3; + src += 2; + } + else if ((fetch & 1) == 0) + { + offset = (fetch & 0xffff) >> 6; + matchlen = ((fetch >> 2) & 15) + 3; + src += 2; + } + else if ((fetch & 127) != 3) + { + offset = (fetch >> 7) & 0x1ffff; + matchlen = ((fetch >> 2) & 0x1f) + 2; + src += 3; + } + else + { + offset = (fetch >> 15); + matchlen = ((fetch >> 7) & 255) + 3; + src += 4; + } + + offset2 = dst - offset; +#endif + +#ifdef QLZ_MEMORY_SAFE + if(offset2 < history || offset2 > dst - MINOFFSET - 1) + return 0; + + if(matchlen > (ui32)(last_destination_byte - dst - UNCOMPRESSED_END + 1)) + return 0; +#endif + + memcpy_up(dst, offset2, matchlen); + dst += matchlen; + +#if QLZ_COMPRESSION_LEVEL <= 2 + update_hash_upto(state, &last_hashed, dst - matchlen); + last_hashed = dst - 1; +#endif + } + else + { + if (dst < last_matchstart) + { + unsigned int n = bitlut[cword_val & 0xf]; +#ifdef X86X64 + *(ui32 *)dst = *(ui32 *)src; +#else + memcpy_up(dst, src, 4); +#endif + cword_val = cword_val >> n; + dst += n; + src += n; +#if QLZ_COMPRESSION_LEVEL <= 2 + update_hash_upto(state, &last_hashed, dst - 3); +#endif + } + else + { + while(dst <= last_destination_byte) + { + if (cword_val == 1) + { + src += CWORD_LEN; + cword_val = 1U << 31; + } +#ifdef 
QLZ_MEMORY_SAFE + if(src >= last_source_byte + 1) + return 0; +#endif + *dst = *src; + dst++; + src++; + cword_val = cword_val >> 1; + } + +#if QLZ_COMPRESSION_LEVEL <= 2 + update_hash_upto(state, &last_hashed, last_destination_byte - 3); // todo, use constant +#endif + return size; + } + + } + } +} + +size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state) +{ + size_t r; + ui32 compressed; + size_t base; + + if(size == 0 || size > 0xffffffff - 400) + return 0; + + if(size < 216) + base = 3; + else + base = 9; + +#if QLZ_STREAMING_BUFFER > 0 + if (state->stream_counter + size - 1 >= QLZ_STREAMING_BUFFER) +#endif + { + reset_table_compress(state); + r = base + qlz_compress_core((const unsigned char *)source, (unsigned char*)destination + base, size, state); +#if QLZ_STREAMING_BUFFER > 0 + reset_table_compress(state); +#endif + if(r == base) + { + memcpy(destination + base, source, size); + r = size + base; + compressed = 0; + } + else + { + compressed = 1; + } + state->stream_counter = 0; + } +#if QLZ_STREAMING_BUFFER > 0 + else + { + unsigned char *src = state->stream_buffer + state->stream_counter; + + memcpy(src, source, size); + r = base + qlz_compress_core(src, (unsigned char*)destination + base, size, state); + + if(r == base) + { + memcpy(destination + base, src, size); + r = size + base; + compressed = 0; + reset_table_compress(state); + } + else + { + compressed = 1; + } + state->stream_counter += size; + } +#endif + if(base == 3) + { + *destination = (unsigned char)(0 | compressed); + *(destination + 1) = (unsigned char)r; + *(destination + 2) = (unsigned char)size; + } + else + { + *destination = (unsigned char)(2 | compressed); + fast_write((ui32)r, destination + 1, 4); + fast_write((ui32)size, destination + 5, 4); + } + + *destination |= (QLZ_COMPRESSION_LEVEL << 2); + *destination |= (1 << 6); + *destination |= ((QLZ_STREAMING_BUFFER == 0 ? 0 : (QLZ_STREAMING_BUFFER == 100000 ? 
1 : (QLZ_STREAMING_BUFFER == 1000000 ? 2 : 3))) << 4); + +// 76543210 +// 01SSLLHC + + return r; +} + +size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state) +{ + size_t dsiz = qlz_size_decompressed(source); + +#if QLZ_STREAMING_BUFFER > 0 + if (state->stream_counter + qlz_size_decompressed(source) - 1 >= QLZ_STREAMING_BUFFER) +#endif + { + if((*source & 1) == 1) + { + reset_table_decompress(state); + dsiz = qlz_decompress_core((const unsigned char *)source, (unsigned char *)destination, dsiz, state, (const unsigned char *)destination); + } + else + { + memcpy(destination, source + qlz_size_header(source), dsiz); + } + state->stream_counter = 0; + reset_table_decompress(state); + } +#if QLZ_STREAMING_BUFFER > 0 + else + { + unsigned char *dst = state->stream_buffer + state->stream_counter; + if((*source & 1) == 1) + { + dsiz = qlz_decompress_core((const unsigned char *)source, dst, dsiz, state, (const unsigned char *)state->stream_buffer); + } + else + { + memcpy(dst, source + qlz_size_header(source), dsiz); + reset_table_decompress(state); + } + memcpy(destination, dst, dsiz); + state->stream_counter += dsiz; + } +#endif + return dsiz; +} + diff --git a/storage/tokudb/PerconaFT/ft/serialize/quicklz.h b/storage/tokudb/PerconaFT/ft/serialize/quicklz.h new file mode 100644 index 00000000000..b9ce2f9913c --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/quicklz.h @@ -0,0 +1,177 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// Fast data compression library
+// Copyright (C) 2006-2011 Lasse Mikkel Reinhold
+// lar@quicklz.com
+//
+// QuickLZ can be used for free under the GPL 1, 2 or 3 license (where anything
+// released into public must be open source) or under a commercial license if such
+// has been acquired (see http://www.quicklz.com/order.html). The commercial license
+// does not cover derived or ported versions created by third parties under GPL.
+
+// You can edit following user settings. Data must be decompressed with the same
+// setting of QLZ_COMPRESSION_LEVEL and QLZ_STREAMING_BUFFER as it was compressed
+// (see manual). If QLZ_STREAMING_BUFFER > 0, scratch buffers must be initially
+// zeroed out (see manual). First #ifndef makes it possible to define settings from
+// the outside like the compiler command line.
+
+// 1.5.0 final
+
+// Compile-time configuration.  The #ifndef guard lets a build system override
+// these on the compiler command line; this tree ships with level 3 and no
+// streaming buffer (stateless, whole-buffer compression).
+#ifndef QLZ_COMPRESSION_LEVEL
+	//#define QLZ_COMPRESSION_LEVEL 1
+	//#define QLZ_COMPRESSION_LEVEL 2
+	#define QLZ_COMPRESSION_LEVEL 3
+
+	#define QLZ_STREAMING_BUFFER 0
+	//#define QLZ_STREAMING_BUFFER 100000
+	//#define QLZ_STREAMING_BUFFER 1000000
+
+	//#define QLZ_MEMORY_SAFE
+#endif
+
+#define QLZ_VERSION_MAJOR 1
+#define QLZ_VERSION_MINOR 5
+#define QLZ_VERSION_REVISION 0
+
+// Using size_t, memset() and memcpy()
+#include <string.h>
+
+// Verify compression level
+#if QLZ_COMPRESSION_LEVEL != 1 && QLZ_COMPRESSION_LEVEL != 2 && QLZ_COMPRESSION_LEVEL != 3
+#error QLZ_COMPRESSION_LEVEL must be 1, 2 or 3
+#endif
+
+typedef unsigned int ui32;
+typedef unsigned short int ui16;
+
+// Decrease QLZ_POINTERS for level 3 to increase compression speed. Do not touch any other values!
+// QLZ_POINTERS = number of candidate match offsets kept per hash bucket;
+// QLZ_HASH_VALUES = number of hash buckets.
+#if QLZ_COMPRESSION_LEVEL == 1
+#define QLZ_POINTERS 1
+#define QLZ_HASH_VALUES 4096
+#elif QLZ_COMPRESSION_LEVEL == 2
+#define QLZ_POINTERS 4
+#define QLZ_HASH_VALUES 2048
+#elif QLZ_COMPRESSION_LEVEL == 3
+#define QLZ_POINTERS 16
+#define QLZ_HASH_VALUES 4096
+#endif
+
+// Detect if pointer size is 64-bit. It's not fatal if some 64-bit target is not detected because this is only for adding an optional 64-bit optimization.
+#if defined _LP64 || defined __LP64__ || defined __64BIT__ || _ADDR64 || defined _WIN64 || defined __arch64__ || __WORDSIZE == 64 || (defined __sparc && defined __sparcv9) || defined __x86_64 || defined __amd64 || defined __x86_64__ || defined _M_X64 || defined _M_IA64 || defined __ia64 || defined __IA64__
+	#define QLZ_PTR_64
+#endif
+
+// hash entry
+// Compressor-side hash table entry: the buffer offset(s) of previously seen
+// data hashing to this bucket.  At level 1 with a 64-bit pointer and no
+// streaming buffer, a 32-bit integer offset is stored instead of a pointer.
+typedef struct
+{
+#if QLZ_COMPRESSION_LEVEL == 1
+	ui32 cache;
+#if defined QLZ_PTR_64 && QLZ_STREAMING_BUFFER == 0
+	unsigned int offset;
+#else
+	const unsigned char *offset;
+#endif
+#else
+	const unsigned char *offset[QLZ_POINTERS];
+#endif
+
+} qlz_hash_compress;
+
+// Decompressor-side hash table entry (only used at levels 1 and 2; see
+// qlz_state_decompress below).
+typedef struct
+{
+#if QLZ_COMPRESSION_LEVEL == 1
+	const unsigned char *offset;
+#else
+	const unsigned char *offset[QLZ_POINTERS];
+#endif
+} qlz_hash_decompress;
+
+
+// states
+// Per-stream compressor scratch state.  Callers pass one of these to
+// qlz_compress(); with QLZ_STREAMING_BUFFER > 0 it also carries the
+// history window (stream_buffer/stream_counter).
+typedef struct
+{
+	#if QLZ_STREAMING_BUFFER > 0
+		unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
+	#endif
+	size_t stream_counter;
+	qlz_hash_compress hash[QLZ_HASH_VALUES];
+	unsigned char hash_counter[QLZ_HASH_VALUES];
+} qlz_state_compress;
+
+
+// Decompressor scratch state.  Note: in the level-3 variant the inner
+// "#if QLZ_COMPRESSION_LEVEL <= 2" is always false, so no hash table is
+// carried -- level-3 decompression reads back-references directly.
+#if QLZ_COMPRESSION_LEVEL == 1 || QLZ_COMPRESSION_LEVEL == 2
+	typedef struct
+	{
+#if QLZ_STREAMING_BUFFER > 0
+		unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
+#endif
+		qlz_hash_decompress hash[QLZ_HASH_VALUES];
+		unsigned char hash_counter[QLZ_HASH_VALUES];
+		size_t stream_counter;
+	} qlz_state_decompress;
+#elif QLZ_COMPRESSION_LEVEL == 3
+	typedef struct
+	{
+#if QLZ_STREAMING_BUFFER > 0
+		unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
+#endif
+#if QLZ_COMPRESSION_LEVEL <= 2
+		qlz_hash_decompress hash[QLZ_HASH_VALUES];
+#endif
+		size_t stream_counter;
+	} qlz_state_decompress;
+#endif
+
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+// Public functions of QuickLZ
+size_t qlz_size_decompressed(const char *source);
+size_t qlz_size_compressed(const char *source);
+size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state);
+size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state);
+int qlz_get_setting(int setting);
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/storage/tokudb/PerconaFT/ft/serialize/rbuf.h b/storage/tokudb/PerconaFT/ft/serialize/rbuf.h
new file mode 100644
index 00000000000..c14dedbf992
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/rbuf.h
@@ -0,0 +1,156 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <string.h>
+
+#include "portability/memory.h"
+#include "portability/toku_assert.h"
+#include "portability/toku_htonl.h"
+#include "portability/toku_portability.h"
+#include "util/memarena.h"
+
+// A read-only cursor over a serialized byte buffer.  The rbuf does not own
+// the buffer; it only tracks how far reading has progressed.
+struct rbuf {
+    unsigned char *buf;   // the serialized bytes (not owned)
+    unsigned int size;    // total number of bytes in buf
+    unsigned int ndone;   // number of bytes consumed so far (read offset)
+};
+#define RBUF_INITIALIZER ((struct rbuf){.buf = NULL, .size=0, .ndone=0})
+
+// Point the rbuf at an existing buffer and reset the read offset to 0.
+static inline void rbuf_init(struct rbuf *r, unsigned char *buf, unsigned int size) {
+    r->buf = buf;
+    r->size = size;
+    r->ndone = 0;
+}
+
+// Current read offset (bytes consumed so far).
+static inline unsigned int rbuf_get_roffset(struct rbuf *r) {
+    return r->ndone;
+}
+
+// Consume and return one byte; asserts there is a byte left to read.
+static inline unsigned char rbuf_char (struct rbuf *r) {
+    assert(r->ndone<r->size);
+    return r->buf[r->ndone++];
+}
+
+// The _ma_ variants take a memarena for signature uniformity with
+// rbuf_ma_BYTESTRING; fixed-size reads never allocate, so ma is unused.
+static inline void rbuf_ma_uint8_t (struct rbuf *r, memarena *ma __attribute__((__unused__)), uint8_t *num) {
+    *num = rbuf_char(r);
+}
+
+// Read one byte and interpret any nonzero value as true.
+static inline void rbuf_ma_bool (struct rbuf *r, memarena *ma __attribute__((__unused__)), bool *b) {
+    uint8_t n = rbuf_char(r);
+    *b = (n!=0);
+}
+
+//Read an int that MUST be in network order regardless of disk order
+static unsigned int rbuf_network_int (struct rbuf *r) __attribute__((__unused__));
+static unsigned int rbuf_network_int (struct rbuf *r) {
+    assert(r->ndone+4 <= r->size);
+    uint32_t result = toku_ntohl(*(uint32_t*)(r->buf+r->ndone)); // This only works on machines where unaligned loads are OK.
+    r->ndone+=4;
+    return result;
+}
+
+// Read a 4-byte int stored in disk order (toku_dtoh32 converts to host order).
+// The #else branch is a kept-for-reference byte-at-a-time alternative.
+static unsigned int rbuf_int (struct rbuf *r) {
+#if 1
+    assert(r->ndone+4 <= r->size);
+    uint32_t result = toku_dtoh32(*(uint32_t*)(r->buf+r->ndone)); // This only works on machines where unaligned loads are OK.
+    r->ndone+=4;
+    return result;
+#else
+    unsigned char c0 = rbuf_char(r);
+    unsigned char c1 = rbuf_char(r);
+    unsigned char c2 = rbuf_char(r);
+    unsigned char c3 = rbuf_char(r);
+    return ((c0<<24)|
+            (c1<<16)|
+            (c2<<8)|
+            (c3<<0));
+#endif
+}
+
+// Return a pointer to the next n_bytes in place (no copy) and advance.
+// NOTE(review): the bounds assert runs after the offset is advanced, so the
+// pointer is handed out before validation; safe only because assert aborts.
+static inline void rbuf_literal_bytes (struct rbuf *r, const void **bytes, unsigned int n_bytes) {
+    *bytes = &r->buf[r->ndone];
+    r->ndone+=n_bytes;
+    assert(r->ndone<=r->size);
+}
+
+/* Return a pointer into the middle of the buffer. */
+// Format on disk: 4-byte length prefix followed by that many payload bytes.
+static inline void rbuf_bytes (struct rbuf *r, const void **bytes, unsigned int *n_bytes)
+{
+    *n_bytes = rbuf_int(r);
+    rbuf_literal_bytes(r, bytes, *n_bytes);
+}
+
+// Read an 8-byte unsigned int stored as two 4-byte ints, high word first.
+static inline unsigned long long rbuf_ulonglong (struct rbuf *r) {
+    unsigned i0 = rbuf_int(r);
+    unsigned i1 = rbuf_int(r);
+    return ((unsigned long long)(i0)<<32) | ((unsigned long long)(i1));
+}
+
+static inline signed long long rbuf_longlong (struct rbuf *r) {
+    return (signed long long)rbuf_ulonglong(r);
+}
+
+static inline void rbuf_ma_uint32_t (struct rbuf *r, memarena *ma __attribute__((__unused__)), uint32_t *num) {
+    *num = rbuf_int(r);
+}
+
+static inline void rbuf_ma_uint64_t (struct rbuf *r, memarena *ma __attribute__((__unused__)), uint64_t *num) {
+    *num = rbuf_ulonglong(r);
+}
+
+// Don't try to use the same space, malloc it
+// Reads a length-prefixed byte string into freshly malloc'd memory
+// (toku_memdup); the caller owns bs->data.
+static inline void rbuf_BYTESTRING (struct rbuf *r, BYTESTRING *bs) {
+    bs->len = rbuf_int(r);
+    uint32_t newndone = r->ndone + bs->len;
+    assert(newndone <= r->size);
+    bs->data = (char *) toku_memdup(&r->buf[r->ndone], (size_t)bs->len);
+    assert(bs->data);
+    r->ndone = newndone;
+}
+
+// Same as rbuf_BYTESTRING but the copy is allocated from the given memarena,
+// so its lifetime is tied to the arena rather than to free().
+static inline void rbuf_ma_BYTESTRING (struct rbuf *r, memarena *ma, BYTESTRING *bs) {
+    bs->len = rbuf_int(r);
+    uint32_t newndone = r->ndone + bs->len;
+    assert(newndone <= r->size);
+    bs->data = (char *) ma->malloc_from_arena(bs->len);
+    assert(bs->data);
+    memcpy(bs->data, &r->buf[r->ndone], bs->len);
+    r->ndone = newndone;
+}
diff --git a/storage/tokudb/PerconaFT/ft/serialize/sub_block.cc 
b/storage/tokudb/PerconaFT/ft/serialize/sub_block.cc new file mode 100644 index 00000000000..c967d4b4c1c --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/sub_block.cc @@ -0,0 +1,389 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <zlib.h>
+
+#include "portability/memory.h"
+#include "portability/toku_assert.h"
+#include "portability/toku_portability.h"
+
+#include "ft/serialize/compress.h"
+#include "ft/serialize/sub_block.h"
+#include "ft/serialize/quicklz.h"
+#include "util/threadpool.h"
+#include "util/x1764.h"
+
+// Heap-allocate a sub_block and zero-initialize it via sub_block_init.
+SUB_BLOCK sub_block_creat(void) {
+    SUB_BLOCK XMALLOC(sb);
+    sub_block_init(sb);
+    return sb;
+}
+// Reset all fields of a sub_block: null pointers, zero sizes and checksum.
+void sub_block_init(SUB_BLOCK sub_block) {
+    sub_block->uncompressed_ptr = 0;
+    sub_block->uncompressed_size = 0;
+
+    sub_block->compressed_ptr = 0;
+    sub_block->compressed_size_bound = 0;
+    sub_block->compressed_size = 0;
+
+    sub_block->xsum = 0;
+}
+
+// get the size of the compression header
+// (a 4-byte sub-block count followed by one stored_sub_block per sub-block)
+size_t
+sub_block_header_size(int n_sub_blocks) {
+    return sizeof (uint32_t) + n_sub_blocks * sizeof (struct stored_sub_block);
+}
+
+// Compute the worst-case compressed size for this sub-block's uncompressed
+// size under the given compression method.
+void
+set_compressed_size_bound(struct sub_block *se, enum toku_compression_method method) {
+    se->compressed_size_bound = toku_compress_bound(method, se->uncompressed_size);
+}
+
+// get the sum of the sub block compressed sizes
+// Side effect: also stores each sub-block's compressed_size_bound.
+size_t
+get_sum_compressed_size_bound(int n_sub_blocks, struct sub_block sub_block[], enum toku_compression_method method) {
+    size_t compressed_size_bound = 0;
+    for (int i = 0; i < n_sub_blocks; i++) {
+        sub_block[i].compressed_size_bound = toku_compress_bound(method, sub_block[i].uncompressed_size);
+        compressed_size_bound += sub_block[i].compressed_size_bound;
+    }
+    return compressed_size_bound;
+}
+
+// get the sum of the sub block uncompressed sizes
+size_t
+get_sum_uncompressed_size(int n_sub_blocks, struct sub_block sub_block[]) {
+    size_t uncompressed_size = 0;
+    for (int i = 0; i < n_sub_blocks; i++)
+        uncompressed_size += sub_block[i].uncompressed_size;
+    return uncompressed_size;
+}
+
+// round up n
+// Round a up to the next multiple of b (named for the alignment=32 use below).
+static inline int
+alignup32(int a, int b) {
+    return ((a+b-1) / b) * b;
+}
+
+// Choose n_sub_blocks and sub_block_size such that the product is >= total_size and the sub_block_size is at
+// least >= the target_sub_block_size.
+// Returns 0 on success, EINVAL on a negative size or non-positive limit.
+int
+choose_sub_block_size(int total_size, int n_sub_blocks_limit, int *sub_block_size_ret, int *n_sub_blocks_ret) {
+    if (total_size < 0 || n_sub_blocks_limit < 1)
+        return EINVAL;
+
+    const int alignment = 32;
+
+    int n_sub_blocks, sub_block_size;
+    n_sub_blocks = total_size / target_sub_block_size;
+    if (n_sub_blocks <= 1) {
+        // Small payload: one sub-block holding everything (zero sub-blocks
+        // only when total_size == 0).
+        if (total_size > 0 && n_sub_blocks_limit > 0)
+            n_sub_blocks = 1;
+        sub_block_size = total_size;
+    } else {
+        if (n_sub_blocks > n_sub_blocks_limit) // limit the number of sub-blocks
+            n_sub_blocks = n_sub_blocks_limit;
+        sub_block_size = alignup32(total_size / n_sub_blocks, alignment);
+        while (sub_block_size * n_sub_blocks < total_size) // round up the sub-block size until big enough
+            sub_block_size += alignment;
+    }
+
+    *sub_block_size_ret = sub_block_size;
+    *n_sub_blocks_ret = n_sub_blocks;
+
+    return 0;
+}
+
+// Choose the right size of basement nodes.  For now, just align up to
+// 256k blocks and hope it compresses well enough.
+// Returns 0 on success, EINVAL on a negative total_size.
+int
+choose_basement_node_size(int total_size, int *sub_block_size_ret, int *n_sub_blocks_ret) {
+    if (total_size < 0)
+        return EINVAL;
+
+    *n_sub_blocks_ret = (total_size + max_basement_node_uncompressed_size - 1) / max_basement_node_uncompressed_size;
+    *sub_block_size_ret = max_basement_node_uncompressed_size;
+
+    return 0;
+}
+
+// Give every sub-block except the last sub_block_size bytes; the last one
+// gets the remainder (which may be smaller, or the whole total when there is
+// a single sub-block).
+// NOTE(review): presumably always called with n_sub_blocks >= 1; with 0 the
+// trailing write would touch sub_block[0] -- confirm against callers.
+void
+set_all_sub_block_sizes(int total_size, int sub_block_size, int n_sub_blocks, struct sub_block sub_block[]) {
+    int size_left = total_size;
+    int i;
+    for (i = 0; i < n_sub_blocks-1; i++) {
+        sub_block[i].uncompressed_size = sub_block_size;
+        size_left -= sub_block_size;
+    }
+    if (i == 0 || size_left > 0)
+        sub_block[i].uncompressed_size = size_left;
+}
+
+// find the index of the first sub block that contains offset
+// Returns the sub block index, else returns -1
+int
+get_sub_block_index(int n_sub_blocks, struct sub_block sub_block[], size_t offset) {
+    size_t start_offset = 0;
+    for (int i = 0; i < n_sub_blocks; i++) {
+        size_t size = sub_block[i].uncompressed_size;
+        if (offset < start_offset + size)
+            return i;
+        start_offset += size;
+    }
+    return -1;
+}
+
+#include "workset.h"
+
+// Fill in a unit of compression work for the workset (method + target block).
+void
+compress_work_init(struct compress_work *w, enum toku_compression_method method, struct sub_block *sub_block) {
+    w->method = method;
+    w->sub_block = sub_block;
+}
+
+//
+// takes the uncompressed contents of sub_block
+// and compresses them into sb_compressed_ptr
+// cs_bound is the compressed size bound
+// Returns the size of the compressed data
+//
+uint32_t
+compress_nocrc_sub_block(
+    struct sub_block *sub_block,
+    void* sb_compressed_ptr,
+    uint32_t cs_bound,
+    enum toku_compression_method method
+    )
+{
+    // compress it
+    Bytef *uncompressed_ptr = (Bytef *) sub_block->uncompressed_ptr;
+    Bytef *compressed_ptr = (Bytef *) sb_compressed_ptr;
+    uLongf uncompressed_len = sub_block->uncompressed_size;
+    uLongf real_compressed_len = cs_bound;   // in: capacity, out: actual size
+    toku_compress(method,
+                  compressed_ptr, &real_compressed_len,
+                  uncompressed_ptr, uncompressed_len);
+    return real_compressed_len;
+}
+
+// Compress one sub-block in place (uncompressed_ptr -> compressed_ptr) and
+// record the compressed size and an x1764 checksum of the compressed bytes.
+void
+compress_sub_block(struct sub_block *sub_block, enum toku_compression_method method) {
+    sub_block->compressed_size = compress_nocrc_sub_block(
+        sub_block,
+        sub_block->compressed_ptr,
+        sub_block->compressed_size_bound,
+        method
+        );
+    // checksum it
+    sub_block->xsum = toku_x1764_memory(sub_block->compressed_ptr, sub_block->compressed_size);
+}
+
+// Thread-pool entry point: drain compression work items from the workset
+// until it is empty, then drop this worker's reference.
+void *
+compress_worker(void *arg) {
+    struct workset *ws = (struct workset *) arg;
+    while (1) {
+        struct compress_work *w = (struct compress_work *) workset_get(ws);
+        if (w == NULL)
+            break;
+        compress_sub_block(w->sub_block, w->method);
+    }
+    workset_release_ref(ws);
+    return arg;
+}
+
+// Compress all sub-blocks of a node, possibly in parallel on the thread pool.
+// Each sub-block is first compressed into a slot sized by its compressed
+// bound; afterwards the slots are compacted so the output is contiguous.
+// Returns the total number of compressed bytes written at compressed_ptr.
+size_t
+compress_all_sub_blocks(int n_sub_blocks, struct sub_block sub_block[], char *uncompressed_ptr, char *compressed_ptr, int num_cores, struct toku_thread_pool *pool, enum toku_compression_method method) {
+    char *compressed_base_ptr = compressed_ptr;
+    size_t compressed_len;
+
+    // This is a complex way to write a parallel loop.  Cilk would be better.
+
+    if (n_sub_blocks == 1) {
+        // single sub-block
+        sub_block[0].uncompressed_ptr = uncompressed_ptr;
+        sub_block[0].compressed_ptr = compressed_ptr;
+        compress_sub_block(&sub_block[0], method);
+        compressed_len = sub_block[0].compressed_size;
+    } else {
+        // multiple sub-blocks
+        int T = num_cores; // T = min(num_cores, n_sub_blocks) - 1
+        if (T > n_sub_blocks)
+            T = n_sub_blocks;
+        if (T > 0)
+            T = T - 1;     // threads in addition to the running thread
+
+        struct workset ws;
+        ZERO_STRUCT(ws);
+        workset_init(&ws);
+
+        struct compress_work work[n_sub_blocks];
+        workset_lock(&ws);
+        for (int i = 0; i < n_sub_blocks; i++) {
+            // carve out this sub-block's input and (bound-sized) output slots
+            sub_block[i].uncompressed_ptr = uncompressed_ptr;
+            sub_block[i].compressed_ptr = compressed_ptr;
+            compress_work_init(&work[i], method, &sub_block[i]);
+            workset_put_locked(&ws, &work[i].base);
+            uncompressed_ptr += sub_block[i].uncompressed_size;
+            compressed_ptr += sub_block[i].compressed_size_bound;
+        }
+        workset_unlock(&ws);
+
+        // compress the sub-blocks
+        if (0) printf("%s:%d T=%d N=%d\n", __FUNCTION__, __LINE__, T, n_sub_blocks);
+        toku_thread_pool_run(pool, 0, &T, compress_worker, &ws);
+        workset_add_ref(&ws, T);
+        compress_worker(&ws);   // this thread participates too
+
+        // wait for all of the work to complete
+        workset_join(&ws);
+        workset_destroy(&ws);
+
+        // squeeze out the holes not used by the compress bound
+        compressed_ptr = compressed_base_ptr + sub_block[0].compressed_size;
+        for (int i = 1; i < n_sub_blocks; i++) {
+            memmove(compressed_ptr, sub_block[i].compressed_ptr, sub_block[i].compressed_size);
+            compressed_ptr += sub_block[i].compressed_size;
+        }
+
+        compressed_len = compressed_ptr - compressed_base_ptr;
+    }
+    return compressed_len;
+}
+
+// initialize the decompression work
+void
+decompress_work_init(struct decompress_work *dw,
+                     void *compress_ptr, uint32_t compress_size,
+                     void *uncompress_ptr, uint32_t uncompress_size,
+                     uint32_t xsum) {
+    dw->compress_ptr = compress_ptr;
+    dw->compress_size = compress_size;
+    dw->uncompress_ptr = uncompress_ptr;
+    dw->uncompress_size = uncompress_size;
+    dw->xsum = xsum;
+    dw->error = 0;
+}
+
+// When nonzero, checksum mismatches are reported on stderr.
+int verbose_decompress_sub_block = 1;
+
+// decompress one block
+// Verifies the x1764 checksum of the compressed bytes first; returns EINVAL
+// on mismatch (without decompressing), 0 on success.
+int
+decompress_sub_block(void *compress_ptr, uint32_t compress_size, void *uncompress_ptr, uint32_t uncompress_size, uint32_t expected_xsum) {
+    int result = 0;
+
+    // verify checksum
+    uint32_t xsum = toku_x1764_memory(compress_ptr, compress_size);
+    if (xsum != expected_xsum) {
+        if (verbose_decompress_sub_block) fprintf(stderr, "%s:%d xsum %u expected %u\n", __FUNCTION__, __LINE__, xsum, expected_xsum);
+        result = EINVAL;
+    } else {
+        // decompress
+        toku_decompress((Bytef *) uncompress_ptr, uncompress_size, (Bytef *) compress_ptr, compress_size);
+    }
+    return result;
+}
+
+// decompress blocks until there is no more work to do
+void *
+decompress_worker(void *arg) {
+    struct workset *ws = (struct workset *) arg;
+    while (1) {
+        struct decompress_work *dw = (struct decompress_work *) workset_get(ws);
+        if (dw == NULL)
+            break;
+        dw->error = decompress_sub_block(dw->compress_ptr, dw->compress_size, dw->uncompress_ptr, dw->uncompress_size, dw->xsum);
+    }
+    workset_release_ref(ws);
+    return arg;
+}
+
+// Decompress all sub-blocks, possibly in parallel on the thread pool.
+// Returns 0 on success, or the error of the first failed sub-block.
+int
+decompress_all_sub_blocks(int n_sub_blocks, struct sub_block sub_block[], unsigned char *compressed_data, unsigned char *uncompressed_data, int num_cores, struct toku_thread_pool *pool) {
+    int r;
+
+    if (n_sub_blocks == 1) {
+        r = decompress_sub_block(compressed_data, sub_block[0].compressed_size, uncompressed_data, sub_block[0].uncompressed_size, sub_block[0].xsum);
+    } else {
+        // compute the number of additional threads needed for decompressing this node
+        int T = num_cores; // T = min(#cores, #blocks) - 1
+        if (T > n_sub_blocks)
+            T = n_sub_blocks;
+        if (T > 0)
+            T = T - 1;     // threads in addition to the running thread
+
+        // init the decompression work set
+        struct workset ws;
+        ZERO_STRUCT(ws);
+        workset_init(&ws);
+
+        // initialize the decompression work and add to the work set
+        struct decompress_work decompress_work[n_sub_blocks];
+        workset_lock(&ws);
+        for (int i = 0; i < n_sub_blocks; i++) {
+            decompress_work_init(&decompress_work[i], compressed_data, sub_block[i].compressed_size, uncompressed_data, sub_block[i].uncompressed_size, sub_block[i].xsum);
+            workset_put_locked(&ws, &decompress_work[i].base);
+
+            uncompressed_data += sub_block[i].uncompressed_size;
+            compressed_data += sub_block[i].compressed_size;
+        }
+        workset_unlock(&ws);
+
+        // decompress the sub-blocks
+        if (0) printf("%s:%d Cores=%d Blocks=%d T=%d\n", __FUNCTION__, __LINE__, num_cores, n_sub_blocks, T);
+        toku_thread_pool_run(pool, 0, &T, decompress_worker, &ws);
+        workset_add_ref(&ws, T);
+        decompress_worker(&ws);   // this thread participates too
+
+        // cleanup
+        workset_join(&ws);
+        workset_destroy(&ws);
+
+        // surface the first sub-block error, if any
+        r = 0;
+        for (int i = 0; i < n_sub_blocks; i++) {
+            r = decompress_work[i].error;
+            if (r != 0)
+                break;
+        }
+    }
+
+    return r;
+}
diff --git a/storage/tokudb/PerconaFT/ft/serialize/sub_block.h b/storage/tokudb/PerconaFT/ft/serialize/sub_block.h
new file mode 100644
index 00000000000..2ae8a2a41bb
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/sub_block.h
@@ -0,0 +1,160 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "ft/serialize/compress.h"
+
+// TODO: Clean this abstraction up
+// Sizing policy constants for splitting a serialized node into sub-blocks.
+static const int max_sub_blocks = 8;
+static const int target_sub_block_size = 512 * 1024;
+static const int max_basement_nodes = 32;
+static const int max_basement_node_uncompressed_size = 256 * 1024;
+static const int max_basement_node_compressed_size = 64 * 1024;
+
+// One independently-compressed slice of a serialized node.
+struct sub_block {
+    void *uncompressed_ptr;           // input bytes for this slice
+    uint32_t uncompressed_size;
+
+    void *compressed_ptr;             // output bytes for this slice
+    uint32_t compressed_size;         // real compressed size
+    uint32_t compressed_size_bound;   // estimated compressed size
+
+    uint32_t xsum;                    // sub block checksum
+};
+typedef struct sub_block *SUB_BLOCK;
+
+// On-disk per-sub-block header record (see sub_block_header_size).
+struct stored_sub_block {
+    uint32_t uncompressed_size;
+    uint32_t compressed_size;
+    uint32_t xsum;
+};
+
+void sub_block_init(SUB_BLOCK);
+SUB_BLOCK sub_block_creat(void);
+
+// get the size of the compression header
+size_t
+sub_block_header_size(int n_sub_blocks);
+
+void
+set_compressed_size_bound(struct sub_block *se, enum toku_compression_method method);
+
+// get the sum of the sub block compressed bound sizes
+size_t
+get_sum_compressed_size_bound(int n_sub_blocks, struct sub_block sub_block[], enum toku_compression_method method);
+
+// get the sum of the sub block uncompressed sizes
+size_t
+get_sum_uncompressed_size(int n_sub_blocks, struct sub_block sub_block[]);
+
+// Choose n_sub_blocks and sub_block_size such that the product is >= total_size and the sub_block_size is at
+// least >= the target_sub_block_size.
+int
+choose_sub_block_size(int total_size, int n_sub_blocks_limit, int *sub_block_size_ret, int *n_sub_blocks_ret);
+
+int
+choose_basement_node_size(int total_size, int *sub_block_size_ret, int *n_sub_blocks_ret);
+
+void
+set_all_sub_block_sizes(int total_size, int sub_block_size, int n_sub_blocks, struct sub_block sub_block[]);
+
+// find the index of the first sub block that contains the offset
+// Returns the index if found, else returns -1
+int
+get_sub_block_index(int n_sub_blocks, struct sub_block sub_block[], size_t offset);
+
+#include "workset.h"
+
+// A unit of parallel compression work (queued on a workset).
+struct compress_work {
+    struct work base;
+    enum toku_compression_method method;
+    struct sub_block *sub_block;
+};
+
+void
+compress_work_init(struct compress_work *w, enum toku_compression_method method, struct sub_block *sub_block);
+
+uint32_t
+compress_nocrc_sub_block(
+    struct sub_block *sub_block,
+    void* sb_compressed_ptr,
+    uint32_t cs_bound,
+    enum toku_compression_method method
+    );
+
+void
+compress_sub_block(struct sub_block *sub_block, enum toku_compression_method method);
+
+void *
+compress_worker(void *arg);
+
+size_t
+compress_all_sub_blocks(int n_sub_blocks, struct sub_block sub_block[], char *uncompressed_ptr, char *compressed_ptr, int num_cores, struct toku_thread_pool *pool, enum toku_compression_method method);
+
+// A unit of parallel decompression work; error records the per-block result.
+struct decompress_work {
+    struct work base;
+    void *compress_ptr;
+    void *uncompress_ptr;
+    uint32_t compress_size;
+    uint32_t uncompress_size;
+    uint32_t xsum;
+    int error;
+};
+
+// initialize the decompression work
+void
+decompress_work_init(struct decompress_work *dw,
+                     void *compress_ptr, uint32_t compress_size,
+                     void *uncompress_ptr, uint32_t uncompress_size,
+                     uint32_t xsum);
+
+// decompress one block
+int
+decompress_sub_block(void *compress_ptr, uint32_t compress_size, void *uncompress_ptr, uint32_t uncompress_size, uint32_t expected_xsum);
+
+// decompress blocks until there is no more work to do
+void *
+decompress_worker(void *arg);
+
+// decompress all sub blocks from the compressed_data buffer to the uncompressed_data buffer
+// Returns 0 if success, otherwise an error
+int
+decompress_all_sub_blocks(int n_sub_blocks, struct sub_block sub_block[], unsigned char *compressed_data, unsigned char *uncompressed_data, int num_cores, struct toku_thread_pool *pool);
+
+// When nonzero (the default), checksum mismatches are logged to stderr.
+extern int verbose_decompress_sub_block;
diff --git a/storage/tokudb/PerconaFT/ft/serialize/wbuf.h b/storage/tokudb/PerconaFT/ft/serialize/wbuf.h
new file mode 100644
index 00000000000..062294e2182
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/wbuf.h
@@ -0,0 +1,209 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <memory.h>
+#include <string.h>
+
+#include "portability/toku_htonl.h"
+
+#include "util/bytestring.h"
+#include "util/x1764.h"
+
+/* When serializing a value, write it into a buffer. */
+/* This code requires that the buffer be big enough to hold whatever you put into it. */
+/* This abstraction doesn't do a good job of hiding its internals.
+ * Why? The performance of this code is important, and we want to inline stuff */
+//Why is size here an int instead of DISKOFF like in the initializer?
+struct wbuf {
+    unsigned char *buf;       // destination buffer (not owned by the wbuf)
+    unsigned int size;        // total capacity of buf, in bytes
+    unsigned int ndone;       // number of bytes written so far (also the next write offset)
+    struct x1764 checksum;    // The checksum state
+};
+
+// Initialize w to write into buf[0..size-1], without maintaining a checksum.
+static inline void wbuf_nocrc_init (struct wbuf *w, void *buf, unsigned int size) {
+    w->buf = (unsigned char *) buf;
+    w->size = size;
+    w->ndone = 0;
+}
+
+// Initialize w as above and also reset the running x1764 checksum.
+static inline void wbuf_init (struct wbuf *w, void *buf, unsigned int size) {
+    wbuf_nocrc_init(w, buf, size);
+    toku_x1764_init(&w->checksum);
+}
+
+// Return the current write offset (bytes written so far).
+static inline size_t wbuf_get_woffset(struct wbuf *w) {
+    return w->ndone;
+}
+
+/* Write a character. */
+static inline void wbuf_nocrc_char (struct wbuf *w, unsigned char ch) {
+    assert(w->ndone<w->size);
+    w->buf[w->ndone++]=ch;
+}
+
+/* Write a uint8_t (same as wbuf_nocrc_char, typed for uint8_t callers). */
+static inline void wbuf_nocrc_uint8_t (struct wbuf *w, uint8_t ch) {
+    assert(w->ndone<w->size);
+    w->buf[w->ndone++]=ch;
+}
+
+// Write a character and fold it into the running checksum.
+static inline void wbuf_char (struct wbuf *w, unsigned char ch) {
+    wbuf_nocrc_char (w, ch);
+    toku_x1764_add(&w->checksum, &w->buf[w->ndone-1], 1);
+}
+
+//Write an int that MUST be in network order regardless of disk order
+// NOTE(review): stores through a cast uint32_t* at an arbitrary offset —
+// relies on the platform tolerating unaligned stores; verify on strict-alignment targets.
+static void wbuf_network_int (struct wbuf *w, int32_t i) __attribute__((__unused__));
+static void wbuf_network_int (struct wbuf *w, int32_t i) {
+    assert(w->ndone + 4 <= w->size);
+    *(uint32_t*)(&w->buf[w->ndone]) = toku_htonl(i);
+    toku_x1764_add(&w->checksum, &w->buf[w->ndone], 4);
+    w->ndone += 4;
+}
+
+// Write a 4-byte int in disk order (toku_htod32), without checksumming.
+// The #if 0 branches preserve the older byte-at-a-time implementations for reference.
+static inline void wbuf_nocrc_int (struct wbuf *w, int32_t i) {
+#if 0
+    wbuf_nocrc_char(w, i>>24);
+    wbuf_nocrc_char(w, i>>16);
+    wbuf_nocrc_char(w, i>>8);
+    wbuf_nocrc_char(w, i>>0);
+#else
+    assert(w->ndone + 4 <= w->size);
+ #if 0
+    w->buf[w->ndone+0] = i>>24;
+    w->buf[w->ndone+1] = i>>16;
+    w->buf[w->ndone+2] = i>>8;
+    w->buf[w->ndone+3] = i>>0;
+ #else
+    *(uint32_t*)(&w->buf[w->ndone]) = toku_htod32(i);
+ #endif
+    w->ndone += 4;
+#endif
+}
+
+// Write a 4-byte int and fold the 4 freshly written bytes into the checksum.
+static inline void wbuf_int (struct wbuf *w, int32_t i) {
+    wbuf_nocrc_int(w, i);
+    toku_x1764_add(&w->checksum, &w->buf[w->ndone-4], 4);
+}
+
+static inline void wbuf_nocrc_uint (struct wbuf *w, uint32_t i) {
+    wbuf_nocrc_int(w, (int32_t)i);
+}
+
+static inline void wbuf_uint (struct wbuf *w, uint32_t i) {
+    wbuf_int(w, (int32_t)i);
+}
+
+// Reserve nbytes in the buffer, advancing the write offset, and return a
+// pointer to the reserved region so the caller can fill it in later.
+static inline uint8_t* wbuf_nocrc_reserve_literal_bytes(struct wbuf *w, uint32_t nbytes) {
+    assert(w->ndone + nbytes <= w->size);
+    uint8_t * dest = w->buf + w->ndone;
+    w->ndone += nbytes;
+    return dest;
+}
+
+// Copy nbytes raw bytes into the buffer (no length prefix, no checksum).
+static inline void wbuf_nocrc_literal_bytes(struct wbuf *w, const void *bytes_bv, uint32_t nbytes) {
+    const unsigned char *bytes = (const unsigned char *) bytes_bv;
+#if 0
+    { int i; for (i=0; i<nbytes; i++) wbuf_nocrc_char(w, bytes[i]); }
+#else
+    assert(w->ndone + nbytes <= w->size);
+    memcpy(w->buf + w->ndone, bytes, (size_t)nbytes);
+    w->ndone += nbytes;
+#endif
+}
+
+// Copy nbytes raw bytes and fold them into the checksum.
+static inline void wbuf_literal_bytes(struct wbuf *w, const void *bytes_bv, uint32_t nbytes) {
+    wbuf_nocrc_literal_bytes(w, bytes_bv, nbytes);
+    toku_x1764_add(&w->checksum, &w->buf[w->ndone-nbytes], nbytes);
+}
+
+// Write a length-prefixed byte string (4-byte length, then the bytes), no checksum.
+// NOTE(review): these are `static` (not `static inline`) in a header — may produce
+// unused-function warnings in TUs that include but don't use them.
+static void wbuf_nocrc_bytes (struct wbuf *w, const void *bytes_bv, uint32_t nbytes) {
+    wbuf_nocrc_uint(w, nbytes);
+    wbuf_nocrc_literal_bytes(w, bytes_bv, nbytes);
+}
+
+// Write a length-prefixed byte string, checksummed.
+static void wbuf_bytes (struct wbuf *w, const void *bytes_bv, uint32_t nbytes) {
+    wbuf_uint(w, nbytes);
+    wbuf_literal_bytes(w, bytes_bv, nbytes);
+}
+
+// Write a 64-bit value as two 32-bit halves, high half first, no checksum.
+static void wbuf_nocrc_ulonglong (struct wbuf *w, uint64_t ull) {
+    wbuf_nocrc_uint(w, (uint32_t)(ull>>32));
+    wbuf_nocrc_uint(w, (uint32_t)(ull&0xFFFFFFFF));
+}
+
+// Write a 64-bit value as two 32-bit halves, high half first, checksummed.
+static void wbuf_ulonglong (struct wbuf *w, uint64_t ull) {
+    wbuf_uint(w, (uint32_t)(ull>>32));
+    wbuf_uint(w, (uint32_t)(ull&0xFFFFFFFF));
+}
+
+static inline void wbuf_nocrc_uint64_t(struct wbuf *w, uint64_t ull) {
+    wbuf_nocrc_ulonglong(w, ull);
+}
+
+
+static inline void wbuf_uint64_t(struct wbuf *w, uint64_t ull) {
+    wbuf_ulonglong(w, ull);
+}
+
+// Write a bool as a single byte (1 for true, 0 for false), no checksum.
+static inline void wbuf_nocrc_bool (struct wbuf *w, bool b) {
+    wbuf_nocrc_uint8_t(w, (uint8_t)(b ? 1 : 0));
+}
+
+static inline void wbuf_nocrc_BYTESTRING (struct wbuf *w, BYTESTRING v) {
+    wbuf_nocrc_bytes(w, v.data, v.len);
+}
+
+static inline void wbuf_BYTESTRING (struct wbuf *w, BYTESTRING v) {
+    wbuf_bytes(w, v.data, v.len);
+}
+
+static inline void wbuf_uint8_t (struct wbuf *w, uint8_t v) {
+    wbuf_char(w, v);
+}
+
+static inline void wbuf_nocrc_uint32_t (struct wbuf *w, uint32_t v) {
+    wbuf_nocrc_uint(w, v);
+}
+
+static inline void wbuf_uint32_t (struct wbuf *w, uint32_t v) {
+    wbuf_uint(w, v);
+}
diff --git a/storage/tokudb/PerconaFT/ft/serialize/workset.h b/storage/tokudb/PerconaFT/ft/serialize/workset.h
new file mode 100644
index 00000000000..073741fccb1
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/workset.h
@@ -0,0 +1,135 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <toku_list.h>
+#include <toku_pthread.h>
+
+// The work struct is the base class for work to be done by some threads
+struct work {
+    struct toku_list next;      // intrusive list hook; the workset links work items through this field
+};
+
+// The workset struct contains the set of work to be done by some threads
+struct workset {
+    toku_mutex_t lock;          // protects worklist and refs
+    struct toku_list worklist;  // a list of work
+    int refs;                   // number of workers that have a reference on the workset
+    toku_cond_t worker_wait;    // a condition variable used to wait for all of the worker to release their reference on the workset
+};
+
+// Initialize an empty workset.  The creating thread holds the initial reference,
+// which it gives up by calling workset_release_ref (typically via workset_join's protocol).
+static inline void
+workset_init(struct workset *ws) {
+    toku_mutex_init(&ws->lock, NULL);
+    toku_list_init(&ws->worklist);
+    ws->refs = 1;      // the calling thread gets a reference
+    toku_cond_init(&ws->worker_wait, NULL);
+}
+
+// Destroy the workset.  Requires: the worklist is empty and no thread still
+// holds a reference or the lock.
+static inline void
+workset_destroy(struct workset *ws) {
+    invariant(toku_list_empty(&ws->worklist));
+    toku_cond_destroy(&ws->worker_wait);
+    toku_mutex_destroy(&ws->lock);
+}
+
+static inline void
+workset_lock(struct workset *ws) {
+    toku_mutex_lock(&ws->lock);
+}
+
+static inline void
+workset_unlock(struct workset *ws) {
+    toku_mutex_unlock(&ws->lock);
+}
+
+// Put work in the workset.  Assume the workset is already locked.
+static inline void
+workset_put_locked(struct workset *ws, struct work *w) {
+    toku_list_push(&ws->worklist, &w->next);
+}
+
+// Put work in the workset
+static inline void
+workset_put(struct workset *ws, struct work *w) {
+    workset_lock(ws);
+    workset_put_locked(ws, w);
+    workset_unlock(ws);
+}
+
+// Get work from the workset
+// Returns NULL when the worklist is empty.
+static inline struct work *
+workset_get(struct workset *ws) {
+    workset_lock(ws);
+    struct work *w = NULL;
+    if (!toku_list_empty(&ws->worklist)) {
+        struct toku_list *l = toku_list_pop_head(&ws->worklist);
+        // recover the enclosing work struct from its embedded list node
+        w = toku_list_struct(l, struct work, next);
+    }
+    workset_unlock(ws);
+    return w;
+}
+
+// Add references to the workset
+// (one per worker thread that will process work from it)
+static inline void
+workset_add_ref(struct workset *ws, int refs) {
+    workset_lock(ws);
+    ws->refs += refs;
+    workset_unlock(ws);
+}
+
+// Release a reference on the workset
+// The last release wakes every thread blocked in workset_join.
+static inline void
+workset_release_ref(struct workset *ws) {
+    workset_lock(ws);
+    if (--ws->refs == 0) {
+        toku_cond_broadcast(&ws->worker_wait);
+    }
+    workset_unlock(ws);
+}
+
+// Wait until all of the worker threads have released their reference on the workset
+static inline void
+workset_join(struct workset *ws) {
+    workset_lock(ws);
+    while (ws->refs != 0) {
+        toku_cond_wait(&ws->worker_wait, &ws->lock);
+    }
+    workset_unlock(ws);
+}
 |