Diffstat (limited to 'storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc')
-rw-r--r-- | storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc | 473
1 file changed, 136 insertions, 337 deletions
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc
index 1355f3739ee..19811373d16 100644
--- a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc
+++ b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc
@@ -46,415 +46,214 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 #include "portability/toku_stdlib.h"

 #include "ft/serialize/block_allocator.h"
-#include "ft/serialize/block_allocator_strategy.h"
+#include "ft/serialize/rbtree_mhs.h"

 #if TOKU_DEBUG_PARANOID
-#define VALIDATE() validate()
+#define VALIDATE() Validate()
 #else
 #define VALIDATE()
 #endif

-static FILE *ba_trace_file = nullptr;
-
-void block_allocator::maybe_initialize_trace(void) {
-    const char *ba_trace_path = getenv("TOKU_BA_TRACE_PATH");
-    if (ba_trace_path != nullptr) {
-        ba_trace_file = toku_os_fopen(ba_trace_path, "w");
-        if (ba_trace_file == nullptr) {
-            fprintf(stderr, "tokuft: error: block allocator trace path found in environment (%s), "
-                            "but it could not be opened for writing (errno %d)\n",
-                    ba_trace_path, get_maybe_error_errno());
-        } else {
-            fprintf(stderr, "tokuft: block allocator tracing enabled, path: %s\n", ba_trace_path);
-        }
-    }
-}
-
-void block_allocator::maybe_close_trace() {
-    if (ba_trace_file != nullptr) {
-        int r = toku_os_fclose(ba_trace_file);
-        if (r != 0) {
-            fprintf(stderr, "tokuft: error: block allocator trace file did not close properly (r %d, errno %d)\n",
-                    r, get_maybe_error_errno());
-        } else {
-            fprintf(stderr, "tokuft: block allocator tracing finished, file closed successfully\n");
-        }
-    }
-}
-
-void block_allocator::_create_internal(uint64_t reserve_at_beginning, uint64_t alignment) {
-    // the alignment must be at least 512 and aligned with 512 to work with direct I/O
-    assert(alignment >= 512 && (alignment % 512) == 0);
+void BlockAllocator::CreateInternal(uint64_t reserve_at_beginning,
+                                    uint64_t alignment) {
+    // the alignment must be at least 512 and aligned with 512 to work with
+    // direct I/O
+    invariant(alignment >= 512 && (alignment % 512) == 0);

     _reserve_at_beginning = reserve_at_beginning;
     _alignment = alignment;
     _n_blocks = 0;
-    _blocks_array_size = 1;
-    XMALLOC_N(_blocks_array_size, _blocks_array);
     _n_bytes_in_use = reserve_at_beginning;
-    _strategy = BA_STRATEGY_FIRST_FIT;
-
-    memset(&_trace_lock, 0, sizeof(toku_mutex_t));
-    toku_mutex_init(&_trace_lock, nullptr);
+    _tree = new MhsRbTree::Tree(alignment);
+}

+void BlockAllocator::Create(uint64_t reserve_at_beginning, uint64_t alignment) {
+    CreateInternal(reserve_at_beginning, alignment);
+    _tree->Insert({reserve_at_beginning, MAX_BYTE});
     VALIDATE();
 }

-void block_allocator::create(uint64_t reserve_at_beginning, uint64_t alignment) {
-    _create_internal(reserve_at_beginning, alignment);
-    _trace_create();
+void BlockAllocator::Destroy() {
+    delete _tree;
 }

-void block_allocator::destroy() {
-    toku_free(_blocks_array);
-    _trace_destroy();
-    toku_mutex_destroy(&_trace_lock);
-}
+void BlockAllocator::CreateFromBlockPairs(uint64_t reserve_at_beginning,
+                                          uint64_t alignment,
+                                          struct BlockPair *translation_pairs,
+                                          uint64_t n_blocks) {
+    CreateInternal(reserve_at_beginning, alignment);
+    _n_blocks = n_blocks;

-void block_allocator::set_strategy(enum allocation_strategy strategy) {
-    _strategy = strategy;
-}
+    struct BlockPair *XMALLOC_N(n_blocks, pairs);
+    memcpy(pairs, translation_pairs, n_blocks * sizeof(struct BlockPair));
+    std::sort(pairs, pairs + n_blocks);

-void block_allocator::grow_blocks_array_by(uint64_t n_to_add) {
-    if (_n_blocks + n_to_add > _blocks_array_size) {
-        uint64_t new_size = _n_blocks + n_to_add;
-        uint64_t at_least = _blocks_array_size * 2;
-        if (at_least > new_size) {
-            new_size = at_least;
-        }
-        _blocks_array_size = new_size;
-        XREALLOC_N(_blocks_array_size, _blocks_array);
+    if (pairs[0]._offset > reserve_at_beginning) {
+        _tree->Insert(
+            {reserve_at_beginning, pairs[0]._offset - reserve_at_beginning});
     }
-}
-
-void block_allocator::grow_blocks_array() {
-    grow_blocks_array_by(1);
-}
-
-void block_allocator::create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment,
-                                             struct blockpair *pairs, uint64_t n_blocks) {
-    _create_internal(reserve_at_beginning, alignment);
-
-    _n_blocks = n_blocks;
-    grow_blocks_array_by(_n_blocks);
-    memcpy(_blocks_array, pairs, _n_blocks * sizeof(struct blockpair));
-    std::sort(_blocks_array, _blocks_array + _n_blocks);

     for (uint64_t i = 0; i < _n_blocks; i++) {
-        // Allocator does not support size 0 blocks. See block_allocator_free_block.
-        invariant(_blocks_array[i].size > 0);
-        invariant(_blocks_array[i].offset >= _reserve_at_beginning);
-        invariant(_blocks_array[i].offset % _alignment == 0);
-
-        _n_bytes_in_use += _blocks_array[i].size;
+        // Allocator does not support size 0 blocks. See
+        // block_allocator_free_block.
+        invariant(pairs[i]._size > 0);
+        invariant(pairs[i]._offset >= _reserve_at_beginning);
+        invariant(pairs[i]._offset % _alignment == 0);
+
+        _n_bytes_in_use += pairs[i]._size;
+
+        MhsRbTree::OUUInt64 free_size(MAX_BYTE);
+        MhsRbTree::OUUInt64 free_offset(pairs[i]._offset + pairs[i]._size);
+        if (i < n_blocks - 1) {
+            MhsRbTree::OUUInt64 next_offset(pairs[i + 1]._offset);
+            invariant(next_offset >= free_offset);
+            free_size = next_offset - free_offset;
+            if (free_size == 0)
+                continue;
+        }
+        _tree->Insert({free_offset, free_size});
     }
-
+    toku_free(pairs);
     VALIDATE();
-
-    _trace_create_from_blockpairs();
 }

 // Effect: align a value by rounding up.
-static inline uint64_t align(uint64_t value, uint64_t ba_alignment) {
+static inline uint64_t Align(uint64_t value, uint64_t ba_alignment) {
     return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment;
 }

-struct block_allocator::blockpair *
-block_allocator::choose_block_to_alloc_after(size_t size, uint64_t heat) {
-    switch (_strategy) {
-    case BA_STRATEGY_FIRST_FIT:
-        return block_allocator_strategy::first_fit(_blocks_array, _n_blocks, size, _alignment);
-    case BA_STRATEGY_BEST_FIT:
-        return block_allocator_strategy::best_fit(_blocks_array, _n_blocks, size, _alignment);
-    case BA_STRATEGY_HEAT_ZONE:
-        return block_allocator_strategy::heat_zone(_blocks_array, _n_blocks, size, _alignment, heat);
-    case BA_STRATEGY_PADDED_FIT:
-        return block_allocator_strategy::padded_fit(_blocks_array, _n_blocks, size, _alignment);
-    default:
-        abort();
-    }
-}
-
-// Effect: Allocate a block. The resulting block must be aligned on the ba->alignment (which to make direct_io happy must be a positive multiple of 512).
-void block_allocator::alloc_block(uint64_t size, uint64_t heat, uint64_t *offset) {
-    struct blockpair *bp;
-
+// Effect: Allocate a block. The resulting block must be aligned on the
+// ba->alignment (which to make direct_io happy must be a positive multiple of
+// 512).
+void BlockAllocator::AllocBlock(uint64_t size,
+                                uint64_t *offset) {
     // Allocator does not support size 0 blocks. See block_allocator_free_block.
     invariant(size > 0);

-    grow_blocks_array();
     _n_bytes_in_use += size;
+    *offset = _tree->Remove(size);

-    uint64_t end_of_reserve = align(_reserve_at_beginning, _alignment);
-
-    if (_n_blocks == 0) {
-        // First and only block
-        assert(_n_bytes_in_use == _reserve_at_beginning + size); // we know exactly how many are in use
-        _blocks_array[0].offset = align(_reserve_at_beginning, _alignment);
-        _blocks_array[0].size = size;
-        *offset = _blocks_array[0].offset;
-        goto done;
-    } else if (end_of_reserve + size <= _blocks_array[0].offset ) {
-        // Check to see if the space immediately after the reserve is big enough to hold the new block.
-        bp = &_blocks_array[0];
-        memmove(bp + 1, bp, _n_blocks * sizeof(*bp));
-        bp[0].offset = end_of_reserve;
-        bp[0].size = size;
-        *offset = end_of_reserve;
-        goto done;
-    }
-
-    bp = choose_block_to_alloc_after(size, heat);
-    if (bp != nullptr) {
-        // our allocation strategy chose the space after `bp' to fit the new block
-        uint64_t answer_offset = align(bp->offset + bp->size, _alignment);
-        uint64_t blocknum = bp - _blocks_array;
-        invariant(&_blocks_array[blocknum] == bp);
-        invariant(blocknum < _n_blocks);
-        memmove(bp + 2, bp + 1, (_n_blocks - blocknum - 1) * sizeof(*bp));
-        bp[1].offset = answer_offset;
-        bp[1].size = size;
-        *offset = answer_offset;
-    } else {
-        // It didn't fit anywhere, so fit it on the end.
-        assert(_n_blocks < _blocks_array_size);
-        bp = &_blocks_array[_n_blocks];
-        uint64_t answer_offset = align(bp[-1].offset + bp[-1].size, _alignment);
-        bp->offset = answer_offset;
-        bp->size = size;
-        *offset = answer_offset;
-    }
-
-done:
     _n_blocks++;
     VALIDATE();
-
-    _trace_alloc(size, heat, *offset);
-}
-
-// Find the index in the blocks array that has a particular offset. Requires that the block exist.
-// Use binary search so it runs fast.
-int64_t block_allocator::find_block(uint64_t offset) {
-    VALIDATE();
-    if (_n_blocks == 1) {
-        assert(_blocks_array[0].offset == offset);
-        return 0;
-    }
-
-    uint64_t lo = 0;
-    uint64_t hi = _n_blocks;
-    while (1) {
-        assert(lo < hi); // otherwise no such block exists.
-        uint64_t mid = (lo + hi) / 2;
-        uint64_t thisoff = _blocks_array[mid].offset;
-        if (thisoff < offset) {
-            lo = mid + 1;
-        } else if (thisoff > offset) {
-            hi = mid;
-        } else {
-            return mid;
-        }
-    }
 }

-// To support 0-sized blocks, we need to include size as an input to this function.
+// To support 0-sized blocks, we need to include size as an input to this
+// function.
 // All 0-sized blocks at the same offset can be considered identical, but
 // a 0-sized block can share offset with a non-zero sized block.
-// The non-zero sized block is not exchangable with a zero sized block (or vice versa),
-// so inserting 0-sized blocks can cause corruption here.
-void block_allocator::free_block(uint64_t offset) {
+// The non-zero sized block is not exchangable with a zero sized block (or vice
+// versa), so inserting 0-sized blocks can cause corruption here.
+void BlockAllocator::FreeBlock(uint64_t offset, uint64_t size) {
     VALIDATE();
-    int64_t bn = find_block(offset);
-    assert(bn >= 0); // we require that there is a block with that offset.
-    _n_bytes_in_use -= _blocks_array[bn].size;
-    memmove(&_blocks_array[bn], &_blocks_array[bn + 1],
-            (_n_blocks - bn - 1) * sizeof(struct blockpair));
+    _n_bytes_in_use -= size;
+    _tree->Insert({offset, size});
     _n_blocks--;
     VALIDATE();
-
-    _trace_free(offset);
-}
-
-uint64_t block_allocator::block_size(uint64_t offset) {
-    int64_t bn = find_block(offset);
-    assert(bn >=0); // we require that there is a block with that offset.
-    return _blocks_array[bn].size;
 }

-uint64_t block_allocator::allocated_limit() const {
-    if (_n_blocks == 0) {
-        return _reserve_at_beginning;
-    } else {
-        struct blockpair *last = &_blocks_array[_n_blocks - 1];
-        return last->offset + last->size;
-    }
+uint64_t BlockAllocator::AllocatedLimit() const {
+    MhsRbTree::Node *max_node = _tree->MaxNode();
+    return rbn_offset(max_node).ToInt();
 }

-// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth.
+// Effect: Consider the blocks in sorted order. The reserved block at the
+// beginning is number 0. The next one is number 1 and so forth.
 // Return the offset and size of the block with that number.
 // Return 0 if there is a block that big, return nonzero if b is too big.
-int block_allocator::get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size) {
-    if (b ==0 ) {
+int BlockAllocator::NthBlockInLayoutOrder(uint64_t b,
+                                          uint64_t *offset,
+                                          uint64_t *size) {
+    MhsRbTree::Node *x, *y;
+    if (b == 0) {
         *offset = 0;
         *size = _reserve_at_beginning;
-        return  0;
+        return 0;
     } else if (b > _n_blocks) {
         return -1;
     } else {
-        *offset =_blocks_array[b - 1].offset;
-        *size =_blocks_array[b - 1].size;
+        x = _tree->MinNode();
+        for (uint64_t i = 1; i <= b; i++) {
+            y = x;
+            x = _tree->Successor(x);
+        }
+        *size = (rbn_offset(x) - (rbn_offset(y) + rbn_size(y))).ToInt();
+        *offset = (rbn_offset(y) + rbn_size(y)).ToInt();
         return 0;
     }
 }

+struct VisUnusedExtra {
+    TOKU_DB_FRAGMENTATION _report;
+    uint64_t _align;
+};
+
+static void VisUnusedCollector(void *extra,
+                               MhsRbTree::Node *node,
+                               uint64_t UU(depth)) {
+    struct VisUnusedExtra *v_e = (struct VisUnusedExtra *)extra;
+    TOKU_DB_FRAGMENTATION report = v_e->_report;
+    uint64_t alignm = v_e->_align;
+
+    MhsRbTree::OUUInt64 offset = rbn_offset(node);
+    MhsRbTree::OUUInt64 size = rbn_size(node);
+    MhsRbTree::OUUInt64 answer_offset(Align(offset.ToInt(), alignm));
+    uint64_t free_space = (offset + size - answer_offset).ToInt();
+    if (free_space > 0) {
+        report->unused_bytes += free_space;
+        report->unused_blocks++;
+        if (free_space > report->largest_unused_block) {
+            report->largest_unused_block = free_space;
+        }
+    }
+}
+
 // Requires: report->file_size_bytes is filled in
 // Requires: report->data_bytes is filled in
 // Requires: report->checkpoint_bytes_additional is filled in
-void block_allocator::get_unused_statistics(TOKU_DB_FRAGMENTATION report) {
-    assert(_n_bytes_in_use == report->data_bytes + report->checkpoint_bytes_additional);
+void BlockAllocator::UnusedStatistics(TOKU_DB_FRAGMENTATION report) {
+    invariant(_n_bytes_in_use ==
+              report->data_bytes + report->checkpoint_bytes_additional);

     report->unused_bytes = 0;
     report->unused_blocks = 0;
     report->largest_unused_block = 0;
-    if (_n_blocks > 0) {
-        //Deal with space before block 0 and after reserve:
-        {
-            struct blockpair *bp = &_blocks_array[0];
-            assert(bp->offset >= align(_reserve_at_beginning, _alignment));
-            uint64_t free_space = bp->offset - align(_reserve_at_beginning, _alignment);
-            if (free_space > 0) {
-                report->unused_bytes += free_space;
-                report->unused_blocks++;
-                if (free_space > report->largest_unused_block) {
-                    report->largest_unused_block = free_space;
-                }
-            }
-        }
-
-        //Deal with space between blocks:
-        for (uint64_t blocknum = 0; blocknum +1 < _n_blocks; blocknum ++) {
-            // Consider the space after blocknum
-            struct blockpair *bp = &_blocks_array[blocknum];
-            uint64_t this_offset = bp[0].offset;
-            uint64_t this_size = bp[0].size;
-            uint64_t end_of_this_block = align(this_offset+this_size, _alignment);
-            uint64_t next_offset = bp[1].offset;
-            uint64_t free_space = next_offset - end_of_this_block;
-            if (free_space > 0) {
-                report->unused_bytes += free_space;
-                report->unused_blocks++;
-                if (free_space > report->largest_unused_block) {
-                    report->largest_unused_block = free_space;
-                }
-            }
-        }
-
-        //Deal with space after last block
-        {
-            struct blockpair *bp = &_blocks_array[_n_blocks-1];
-            uint64_t this_offset = bp[0].offset;
-            uint64_t this_size = bp[0].size;
-            uint64_t end_of_this_block = align(this_offset+this_size, _alignment);
-            if (end_of_this_block < report->file_size_bytes) {
-                uint64_t free_space = report->file_size_bytes - end_of_this_block;
-                assert(free_space > 0);
-                report->unused_bytes += free_space;
-                report->unused_blocks++;
-                if (free_space > report->largest_unused_block) {
-                    report->largest_unused_block = free_space;
-                }
-            }
-        }
-    } else {
-        // No blocks. Just the reserve.
-        uint64_t end_of_this_block = align(_reserve_at_beginning, _alignment);
-        if (end_of_this_block < report->file_size_bytes) {
-            uint64_t free_space = report->file_size_bytes - end_of_this_block;
-            assert(free_space > 0);
-            report->unused_bytes += free_space;
-            report->unused_blocks++;
-            if (free_space > report->largest_unused_block) {
-                report->largest_unused_block = free_space;
-            }
-        }
-    }
+    struct VisUnusedExtra extra = {report, _alignment};
+    _tree->InOrderVisitor(VisUnusedCollector, &extra);
 }

-void block_allocator::get_statistics(TOKU_DB_FRAGMENTATION report) {
-    report->data_bytes = _n_bytes_in_use;
-    report->data_blocks = _n_blocks;
+void BlockAllocator::Statistics(TOKU_DB_FRAGMENTATION report) {
+    report->data_bytes = _n_bytes_in_use;
+    report->data_blocks = _n_blocks;
     report->file_size_bytes = 0;
     report->checkpoint_bytes_additional = 0;
-    get_unused_statistics(report);
+    UnusedStatistics(report);
 }

-void block_allocator::validate() const {
-    uint64_t n_bytes_in_use = _reserve_at_beginning;
-    for (uint64_t i = 0; i < _n_blocks; i++) {
-        n_bytes_in_use += _blocks_array[i].size;
-        if (i > 0) {
-            assert(_blocks_array[i].offset > _blocks_array[i - 1].offset);
-            assert(_blocks_array[i].offset >= _blocks_array[i - 1].offset + _blocks_array[i - 1].size );
-        }
-    }
-    assert(n_bytes_in_use == _n_bytes_in_use);
-}
-
-// Tracing
-
-void block_allocator::_trace_create(void) {
-    if (ba_trace_file != nullptr) {
-        toku_mutex_lock(&_trace_lock);
-        fprintf(ba_trace_file, "ba_trace_create %p %" PRIu64 " %" PRIu64 "\n",
-                this, _reserve_at_beginning, _alignment);
-        toku_mutex_unlock(&_trace_lock);
-
-        fflush(ba_trace_file);
-    }
-}
-
-void block_allocator::_trace_create_from_blockpairs(void) {
-    if (ba_trace_file != nullptr) {
-        toku_mutex_lock(&_trace_lock);
-        fprintf(ba_trace_file, "ba_trace_create_from_blockpairs %p %" PRIu64 " %" PRIu64 " ",
-                this, _reserve_at_beginning, _alignment);
-        for (uint64_t i = 0; i < _n_blocks; i++) {
-            fprintf(ba_trace_file, "[%" PRIu64 " %" PRIu64 "] ",
-                    _blocks_array[i].offset, _blocks_array[i].size);
-        }
-        fprintf(ba_trace_file, "\n");
-        toku_mutex_unlock(&_trace_lock);
-
-        fflush(ba_trace_file);
-    }
-}
-
-void block_allocator::_trace_destroy(void) {
-    if (ba_trace_file != nullptr) {
-        toku_mutex_lock(&_trace_lock);
-        fprintf(ba_trace_file, "ba_trace_destroy %p\n", this);
-        toku_mutex_unlock(&_trace_lock);
-
-        fflush(ba_trace_file);
-    }
-}
-
-void block_allocator::_trace_alloc(uint64_t size, uint64_t heat, uint64_t offset) {
-    if (ba_trace_file != nullptr) {
-        toku_mutex_lock(&_trace_lock);
-        fprintf(ba_trace_file, "ba_trace_alloc %p %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
-                this, size, heat, offset);
-        toku_mutex_unlock(&_trace_lock);
-
-        fflush(ba_trace_file);
+struct ValidateExtra {
+    uint64_t _bytes;
+    MhsRbTree::Node *_pre_node;
+};
+
+static void VisUsedBlocksInOrder(void *extra,
+                                 MhsRbTree::Node *cur_node,
+                                 uint64_t UU(depth)) {
+    struct ValidateExtra *v_e = (struct ValidateExtra *)extra;
+    MhsRbTree::Node *pre_node = v_e->_pre_node;
+    // verify no overlaps
+    if (pre_node) {
+        invariant(rbn_size(pre_node) > 0);
+        invariant(rbn_offset(cur_node) >
+                  rbn_offset(pre_node) + rbn_size(pre_node));
+        MhsRbTree::OUUInt64 used_space =
+            rbn_offset(cur_node) - (rbn_offset(pre_node) + rbn_size(pre_node));
+        v_e->_bytes += used_space.ToInt();
+    } else {
+        v_e->_bytes += rbn_offset(cur_node).ToInt();
     }
+    v_e->_pre_node = cur_node;
 }

-void block_allocator::_trace_free(uint64_t offset) {
-    if (ba_trace_file != nullptr) {
-        toku_mutex_lock(&_trace_lock);
-        fprintf(ba_trace_file, "ba_trace_free %p %" PRIu64 "\n", this, offset);
-        toku_mutex_unlock(&_trace_lock);
-
-        fflush(ba_trace_file);
-    }
+void BlockAllocator::Validate() const {
+    _tree->ValidateBalance();
+    _tree->ValidateMhs();
+    struct ValidateExtra extra = {0, nullptr};
+    _tree->InOrderVisitor(VisUsedBlocksInOrder, &extra);
+    invariant(extra._bytes == _n_bytes_in_use);
 }