author    Sergei Golubchik <serg@mariadb.org>  2015-10-26 12:48:26 +0100
committer Sergei Golubchik <serg@mariadb.org>  2015-10-26 12:57:57 +0100
commit    2c8c65297865d9f8da501761f46e2a34e29af603 (patch)
tree      3fdf4a00f8537bb3564827884f923ac56966e778 /storage/tokudb/PerconaFT/ft/serialize
download  mariadb-git-2c8c65297865d9f8da501761f46e2a34e29af603.tar.gz
5.6.26-74.0
Diffstat (limited to 'storage/tokudb/PerconaFT/ft/serialize')
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc            460
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/block_allocator.h             214
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.cc   224
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.h     65
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/block_table.cc                993
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/block_table.h                 285
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/compress.cc                   257
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/compress.h                     78
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/ft-node-deserialize.cc        186
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc               812
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/ft-serialize.h                 62
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/ft_layout_version.h            79
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc         2872
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.h            92
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/quicklz.cc                    887
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/quicklz.h                     177
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/rbuf.h                        156
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/sub_block.cc                  389
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/sub_block.h                   160
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/wbuf.h                        209
-rw-r--r--  storage/tokudb/PerconaFT/ft/serialize/workset.h                     135
21 files changed, 8792 insertions(+), 0 deletions(-)
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc
new file mode 100644
index 00000000000..1355f3739ee
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc
@@ -0,0 +1,460 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <algorithm>
+
+#include <string.h>
+
+#include "portability/memory.h"
+#include "portability/toku_assert.h"
+#include "portability/toku_stdint.h"
+#include "portability/toku_stdlib.h"
+
+#include "ft/serialize/block_allocator.h"
+#include "ft/serialize/block_allocator_strategy.h"
+
+#if TOKU_DEBUG_PARANOID
+#define VALIDATE() validate()
+#else
+#define VALIDATE()
+#endif
+
+static FILE *ba_trace_file = nullptr;
+
+void block_allocator::maybe_initialize_trace(void) {
+ const char *ba_trace_path = getenv("TOKU_BA_TRACE_PATH");
+ if (ba_trace_path != nullptr) {
+ ba_trace_file = toku_os_fopen(ba_trace_path, "w");
+ if (ba_trace_file == nullptr) {
+ fprintf(stderr, "tokuft: error: block allocator trace path found in environment (%s), "
+ "but it could not be opened for writing (errno %d)\n",
+ ba_trace_path, get_maybe_error_errno());
+ } else {
+ fprintf(stderr, "tokuft: block allocator tracing enabled, path: %s\n", ba_trace_path);
+ }
+ }
+}
+
+void block_allocator::maybe_close_trace() {
+ if (ba_trace_file != nullptr) {
+ int r = toku_os_fclose(ba_trace_file);
+ if (r != 0) {
+ fprintf(stderr, "tokuft: error: block allocator trace file did not close properly (r %d, errno %d)\n",
+ r, get_maybe_error_errno());
+ } else {
+ fprintf(stderr, "tokuft: block allocator tracing finished, file closed successfully\n");
+ }
+ }
+}
+
+void block_allocator::_create_internal(uint64_t reserve_at_beginning, uint64_t alignment) {
+ // the alignment must be at least 512 and aligned with 512 to work with direct I/O
+ assert(alignment >= 512 && (alignment % 512) == 0);
+
+ _reserve_at_beginning = reserve_at_beginning;
+ _alignment = alignment;
+ _n_blocks = 0;
+ _blocks_array_size = 1;
+ XMALLOC_N(_blocks_array_size, _blocks_array);
+ _n_bytes_in_use = reserve_at_beginning;
+ _strategy = BA_STRATEGY_FIRST_FIT;
+
+ memset(&_trace_lock, 0, sizeof(toku_mutex_t));
+ toku_mutex_init(&_trace_lock, nullptr);
+
+ VALIDATE();
+}
+
+void block_allocator::create(uint64_t reserve_at_beginning, uint64_t alignment) {
+ _create_internal(reserve_at_beginning, alignment);
+ _trace_create();
+}
+
+void block_allocator::destroy() {
+ toku_free(_blocks_array);
+ _trace_destroy();
+ toku_mutex_destroy(&_trace_lock);
+}
+
+void block_allocator::set_strategy(enum allocation_strategy strategy) {
+ _strategy = strategy;
+}
+
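+// Effect: Ensure the blocks array has room for n_to_add more entries,
+// growing it to at least twice its old capacity when it grows.
+// For example (values made up): with _blocks_array_size == 4 and
+// _n_blocks == 4, asking for one more slot reallocates to 8 entries, not 5.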
+void block_allocator::grow_blocks_array_by(uint64_t n_to_add) {
+ if (_n_blocks + n_to_add > _blocks_array_size) {
+ uint64_t new_size = _n_blocks + n_to_add;
+ uint64_t at_least = _blocks_array_size * 2;
+ if (at_least > new_size) {
+ new_size = at_least;
+ }
+ _blocks_array_size = new_size;
+ XREALLOC_N(_blocks_array_size, _blocks_array);
+ }
+}
+
+void block_allocator::grow_blocks_array() {
+ grow_blocks_array_by(1);
+}
+
+void block_allocator::create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment,
+ struct blockpair *pairs, uint64_t n_blocks) {
+ _create_internal(reserve_at_beginning, alignment);
+
+ _n_blocks = n_blocks;
+ grow_blocks_array_by(_n_blocks);
+ memcpy(_blocks_array, pairs, _n_blocks * sizeof(struct blockpair));
+ std::sort(_blocks_array, _blocks_array + _n_blocks);
+ for (uint64_t i = 0; i < _n_blocks; i++) {
+ // Allocator does not support size 0 blocks. See block_allocator_free_block.
+ invariant(_blocks_array[i].size > 0);
+ invariant(_blocks_array[i].offset >= _reserve_at_beginning);
+ invariant(_blocks_array[i].offset % _alignment == 0);
+
+ _n_bytes_in_use += _blocks_array[i].size;
+ }
+
+ VALIDATE();
+
+ _trace_create_from_blockpairs();
+}
+
+// Effect: align a value by rounding up.
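+// For example, align(1000, 512) == 1024, and align(1024, 512) == 1024.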
+static inline uint64_t align(uint64_t value, uint64_t ba_alignment) {
+ return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment;
+}
+
+struct block_allocator::blockpair *
+block_allocator::choose_block_to_alloc_after(size_t size, uint64_t heat) {
+ switch (_strategy) {
+ case BA_STRATEGY_FIRST_FIT:
+ return block_allocator_strategy::first_fit(_blocks_array, _n_blocks, size, _alignment);
+ case BA_STRATEGY_BEST_FIT:
+ return block_allocator_strategy::best_fit(_blocks_array, _n_blocks, size, _alignment);
+ case BA_STRATEGY_HEAT_ZONE:
+ return block_allocator_strategy::heat_zone(_blocks_array, _n_blocks, size, _alignment, heat);
+ case BA_STRATEGY_PADDED_FIT:
+ return block_allocator_strategy::padded_fit(_blocks_array, _n_blocks, size, _alignment);
+ default:
+ abort();
+ }
+}
+
+// Effect: Allocate a block. The resulting block must be aligned on the ba->alignment (which to make direct_io happy must be a positive multiple of 512).
+void block_allocator::alloc_block(uint64_t size, uint64_t heat, uint64_t *offset) {
+ struct blockpair *bp;
+
+ // Allocator does not support size 0 blocks. See block_allocator_free_block.
+ invariant(size > 0);
+
+ grow_blocks_array();
+ _n_bytes_in_use += size;
+
+ uint64_t end_of_reserve = align(_reserve_at_beginning, _alignment);
+
+ if (_n_blocks == 0) {
+ // First and only block
+ assert(_n_bytes_in_use == _reserve_at_beginning + size); // we know exactly how many are in use
+ _blocks_array[0].offset = align(_reserve_at_beginning, _alignment);
+ _blocks_array[0].size = size;
+ *offset = _blocks_array[0].offset;
+ goto done;
+ } else if (end_of_reserve + size <= _blocks_array[0].offset) {
+ // Check to see if the space immediately after the reserve is big enough to hold the new block.
+ bp = &_blocks_array[0];
+ memmove(bp + 1, bp, _n_blocks * sizeof(*bp));
+ bp[0].offset = end_of_reserve;
+ bp[0].size = size;
+ *offset = end_of_reserve;
+ goto done;
+ }
+
+ bp = choose_block_to_alloc_after(size, heat);
+ if (bp != nullptr) {
+ // our allocation strategy chose the space after `bp' to fit the new block
+ uint64_t answer_offset = align(bp->offset + bp->size, _alignment);
+ uint64_t blocknum = bp - _blocks_array;
+ invariant(&_blocks_array[blocknum] == bp);
+ invariant(blocknum < _n_blocks);
+ memmove(bp + 2, bp + 1, (_n_blocks - blocknum - 1) * sizeof(*bp));
+ bp[1].offset = answer_offset;
+ bp[1].size = size;
+ *offset = answer_offset;
+ } else {
+ // It didn't fit anywhere, so fit it on the end.
+ assert(_n_blocks < _blocks_array_size);
+ bp = &_blocks_array[_n_blocks];
+ uint64_t answer_offset = align(bp[-1].offset + bp[-1].size, _alignment);
+ bp->offset = answer_offset;
+ bp->size = size;
+ *offset = answer_offset;
+ }
+
+done:
+ _n_blocks++;
+ VALIDATE();
+
+ _trace_alloc(size, heat, *offset);
+}
+
+// Find the index in the blocks array that has a particular offset. Requires that the block exist.
+// Use binary search so it runs fast.
+int64_t block_allocator::find_block(uint64_t offset) {
+ VALIDATE();
+ if (_n_blocks == 1) {
+ assert(_blocks_array[0].offset == offset);
+ return 0;
+ }
+
+ uint64_t lo = 0;
+ uint64_t hi = _n_blocks;
+ while (1) {
+ assert(lo < hi); // otherwise no such block exists.
+ uint64_t mid = (lo + hi) / 2;
+ uint64_t thisoff = _blocks_array[mid].offset;
+ if (thisoff < offset) {
+ lo = mid + 1;
+ } else if (thisoff > offset) {
+ hi = mid;
+ } else {
+ return mid;
+ }
+ }
+}
+
+// To support 0-sized blocks, we would need to include the size as an input to this function.
+// All 0-sized blocks at the same offset can be considered identical, but
+// a 0-sized block can share its offset with a non-zero-sized block.
+// The non-zero-sized block is not exchangeable with a zero-sized block (or vice versa),
+// so inserting 0-sized blocks can cause corruption here.
+void block_allocator::free_block(uint64_t offset) {
+ VALIDATE();
+ int64_t bn = find_block(offset);
+ assert(bn >= 0); // we require that there is a block with that offset.
+ _n_bytes_in_use -= _blocks_array[bn].size;
+ memmove(&_blocks_array[bn], &_blocks_array[bn + 1],
+ (_n_blocks - bn - 1) * sizeof(struct blockpair));
+ _n_blocks--;
+ VALIDATE();
+
+ _trace_free(offset);
+}
+
+uint64_t block_allocator::block_size(uint64_t offset) {
+ int64_t bn = find_block(offset);
+ assert(bn >= 0); // we require that there is a block with that offset.
+ return _blocks_array[bn].size;
+}
+
+uint64_t block_allocator::allocated_limit() const {
+ if (_n_blocks == 0) {
+ return _reserve_at_beginning;
+ } else {
+ struct blockpair *last = &_blocks_array[_n_blocks - 1];
+ return last->offset + last->size;
+ }
+}
+
+// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth.
+// Return the offset and size of the block with that number.
+// Return 0 if there is such a block; return nonzero if b is too big.
+int block_allocator::get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size) {
+ if (b == 0) {
+ *offset = 0;
+ *size = _reserve_at_beginning;
+ return 0;
+ } else if (b > _n_blocks) {
+ return -1;
+ } else {
+ *offset = _blocks_array[b - 1].offset;
+ *size = _blocks_array[b - 1].size;
+ return 0;
+ }
+}
+
+// Requires: report->file_size_bytes is filled in
+// Requires: report->data_bytes is filled in
+// Requires: report->checkpoint_bytes_additional is filled in
+void block_allocator::get_unused_statistics(TOKU_DB_FRAGMENTATION report) {
+ assert(_n_bytes_in_use == report->data_bytes + report->checkpoint_bytes_additional);
+
+ report->unused_bytes = 0;
+ report->unused_blocks = 0;
+ report->largest_unused_block = 0;
+ if (_n_blocks > 0) {
+ //Deal with space before block 0 and after reserve:
+ {
+ struct blockpair *bp = &_blocks_array[0];
+ assert(bp->offset >= align(_reserve_at_beginning, _alignment));
+ uint64_t free_space = bp->offset - align(_reserve_at_beginning, _alignment);
+ if (free_space > 0) {
+ report->unused_bytes += free_space;
+ report->unused_blocks++;
+ if (free_space > report->largest_unused_block) {
+ report->largest_unused_block = free_space;
+ }
+ }
+ }
+
+ //Deal with space between blocks:
+ for (uint64_t blocknum = 0; blocknum + 1 < _n_blocks; blocknum++) {
+ // Consider the space after blocknum
+ struct blockpair *bp = &_blocks_array[blocknum];
+ uint64_t this_offset = bp[0].offset;
+ uint64_t this_size = bp[0].size;
+ uint64_t end_of_this_block = align(this_offset+this_size, _alignment);
+ uint64_t next_offset = bp[1].offset;
+ uint64_t free_space = next_offset - end_of_this_block;
+ if (free_space > 0) {
+ report->unused_bytes += free_space;
+ report->unused_blocks++;
+ if (free_space > report->largest_unused_block) {
+ report->largest_unused_block = free_space;
+ }
+ }
+ }
+
+ //Deal with space after last block
+ {
+ struct blockpair *bp = &_blocks_array[_n_blocks-1];
+ uint64_t this_offset = bp[0].offset;
+ uint64_t this_size = bp[0].size;
+ uint64_t end_of_this_block = align(this_offset+this_size, _alignment);
+ if (end_of_this_block < report->file_size_bytes) {
+ uint64_t free_space = report->file_size_bytes - end_of_this_block;
+ assert(free_space > 0);
+ report->unused_bytes += free_space;
+ report->unused_blocks++;
+ if (free_space > report->largest_unused_block) {
+ report->largest_unused_block = free_space;
+ }
+ }
+ }
+ } else {
+ // No blocks. Just the reserve.
+ uint64_t end_of_this_block = align(_reserve_at_beginning, _alignment);
+ if (end_of_this_block < report->file_size_bytes) {
+ uint64_t free_space = report->file_size_bytes - end_of_this_block;
+ assert(free_space > 0);
+ report->unused_bytes += free_space;
+ report->unused_blocks++;
+ if (free_space > report->largest_unused_block) {
+ report->largest_unused_block = free_space;
+ }
+ }
+ }
+}
+
+void block_allocator::get_statistics(TOKU_DB_FRAGMENTATION report) {
+ report->data_bytes = _n_bytes_in_use;
+ report->data_blocks = _n_blocks;
+ report->file_size_bytes = 0;
+ report->checkpoint_bytes_additional = 0;
+ get_unused_statistics(report);
+}
+
+void block_allocator::validate() const {
+ uint64_t n_bytes_in_use = _reserve_at_beginning;
+ for (uint64_t i = 0; i < _n_blocks; i++) {
+ n_bytes_in_use += _blocks_array[i].size;
+ if (i > 0) {
+ assert(_blocks_array[i].offset > _blocks_array[i - 1].offset);
+ assert(_blocks_array[i].offset >= _blocks_array[i - 1].offset + _blocks_array[i - 1].size);
+ }
+ }
+ assert(n_bytes_in_use == _n_bytes_in_use);
+}
+
+// Tracing
+
+void block_allocator::_trace_create(void) {
+ if (ba_trace_file != nullptr) {
+ toku_mutex_lock(&_trace_lock);
+ fprintf(ba_trace_file, "ba_trace_create %p %" PRIu64 " %" PRIu64 "\n",
+ this, _reserve_at_beginning, _alignment);
+ toku_mutex_unlock(&_trace_lock);
+
+ fflush(ba_trace_file);
+ }
+}
+
+void block_allocator::_trace_create_from_blockpairs(void) {
+ if (ba_trace_file != nullptr) {
+ toku_mutex_lock(&_trace_lock);
+ fprintf(ba_trace_file, "ba_trace_create_from_blockpairs %p %" PRIu64 " %" PRIu64 " ",
+ this, _reserve_at_beginning, _alignment);
+ for (uint64_t i = 0; i < _n_blocks; i++) {
+ fprintf(ba_trace_file, "[%" PRIu64 " %" PRIu64 "] ",
+ _blocks_array[i].offset, _blocks_array[i].size);
+ }
+ fprintf(ba_trace_file, "\n");
+ toku_mutex_unlock(&_trace_lock);
+
+ fflush(ba_trace_file);
+ }
+}
+
+void block_allocator::_trace_destroy(void) {
+ if (ba_trace_file != nullptr) {
+ toku_mutex_lock(&_trace_lock);
+ fprintf(ba_trace_file, "ba_trace_destroy %p\n", this);
+ toku_mutex_unlock(&_trace_lock);
+
+ fflush(ba_trace_file);
+ }
+}
+
+void block_allocator::_trace_alloc(uint64_t size, uint64_t heat, uint64_t offset) {
+ if (ba_trace_file != nullptr) {
+ toku_mutex_lock(&_trace_lock);
+ fprintf(ba_trace_file, "ba_trace_alloc %p %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
+ this, size, heat, offset);
+ toku_mutex_unlock(&_trace_lock);
+
+ fflush(ba_trace_file);
+ }
+}
+
+void block_allocator::_trace_free(uint64_t offset) {
+ if (ba_trace_file != nullptr) {
+ toku_mutex_lock(&_trace_lock);
+ fprintf(ba_trace_file, "ba_trace_free %p %" PRIu64 "\n", this, offset);
+ toku_mutex_unlock(&_trace_lock);
+
+ fflush(ba_trace_file);
+ }
+}
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h
new file mode 100644
index 00000000000..9b2c1553e7f
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h
@@ -0,0 +1,214 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <db.h>
+
+#include "portability/toku_pthread.h"
+#include "portability/toku_stdint.h"
+#include "portability/toku_stdlib.h"
+
+// Block allocator.
+//
+// A block allocator manages the allocation of variable-sized blocks.
+// The translation of block numbers to addresses is handled elsewhere.
+// The allocation of block numbers is handled elsewhere.
+//
+// When creating a block allocator we also specify a certain-sized
+// block at the beginning that is preallocated (and cannot be allocated or freed)
+//
+// We can allocate blocks of a particular size at a particular location.
+// We can allocate blocks of a particular size at a location chosen by the allocator.
+// We can free blocks.
+// We can determine the size of a block.
+
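+// A minimal usage sketch (illustrative only, not part of this header;
+// the sizes are made up):
+//
+//   block_allocator ba;
+//   ba.create(4096 /*reserve_at_beginning*/, 4096 /*alignment*/);
+//   uint64_t offset;
+//   ba.alloc_block(1000 /*size*/, 0 /*heat*/, &offset);  // offset % 4096 == 0
+//   invariant(ba.block_size(offset) == 1000);
+//   ba.free_block(offset);
+//   ba.destroy();
+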
+class block_allocator {
+public:
+ static const size_t BLOCK_ALLOCATOR_ALIGNMENT = 4096;
+
+ // How much must be reserved at the beginning for the block?
+ // The actual header is 8+4+4+8+8+4+8 bytes + the length of the db names + 1 pointer for each root.
+ // So 4096 should be enough.
+ static const size_t BLOCK_ALLOCATOR_HEADER_RESERVE = 4096;
+
+ static_assert(BLOCK_ALLOCATOR_HEADER_RESERVE % BLOCK_ALLOCATOR_ALIGNMENT == 0,
+ "block allocator header must have proper alignment");
+
+ static const size_t BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE = BLOCK_ALLOCATOR_HEADER_RESERVE * 2;
+
+ enum allocation_strategy {
+ BA_STRATEGY_FIRST_FIT = 1,
+ BA_STRATEGY_BEST_FIT,
+ BA_STRATEGY_PADDED_FIT,
+ BA_STRATEGY_HEAT_ZONE
+ };
+
+ struct blockpair {
+ uint64_t offset;
+ uint64_t size;
+ blockpair(uint64_t o, uint64_t s) :
+ offset(o), size(s) {
+ }
+ bool operator<(const struct blockpair &rhs) const {
+ return offset < rhs.offset;
+ }
+ bool operator<(const uint64_t &o) const {
+ return offset < o;
+ }
+ };
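+ // These comparators are what allow create_from_blockpairs() to std::sort
+ // the array by offset and heat_zone() to std::lower_bound it by a raw
+ // offset: std::lower_bound(blocks, blocks + n, some_offset) returns the
+ // first blockpair whose offset is >= some_offset.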
+
+ // Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block.
+ // The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT)
+ // All blocks start on a multiple of ALIGNMENT.
+ // Aborts if we run out of memory.
+ // Parameters
+ // reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned.
+ // alignment (IN) Block alignment.
+ void create(uint64_t reserve_at_beginning, uint64_t alignment);
+
+ // Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block.
+ // The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT)
+ // The allocator is initialized to contain `n_blocks' blockpairs, taken from `pairs'.
+ // All blocks start on a multiple of ALIGNMENT.
+ // Aborts if we run out of memory.
+ // Parameters
+ // pairs (IN) Unowned array of blockpairs to copy.
+ // n_blocks (IN) Size of the pairs array.
+ // reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned.
+ // alignment (IN) Block alignment.
+ void create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment,
+ struct blockpair *pairs, uint64_t n_blocks);
+
+ // Effect: Destroy this block allocator
+ void destroy();
+
+ // Effect: Set the allocation strategy that the allocator should use
+ // Requires: No other threads are operating on this block allocator
+ void set_strategy(enum allocation_strategy strategy);
+
+ // Effect: Allocate a block of the specified size at an address chosen by the allocator.
+ // Aborts if anything goes wrong.
+ // The block address will be a multiple of the alignment.
+ // Parameters:
+ // size (IN): The size of the block. (The size does not have to be aligned.)
+ // offset (OUT): The location of the block.
+ // heat (IN): A higher heat means we should be prepared to free this block soon (perhaps in the next checkpoint)
+ // Heat values are lexicographically ordered (like integers), but their specific values are arbitrary.
+ void alloc_block(uint64_t size, uint64_t heat, uint64_t *offset);
+
+ // Effect: Free the block at offset.
+ // Requires: There must be a block currently allocated at that offset.
+ // Parameters:
+ // offset (IN): The offset of the block.
+ void free_block(uint64_t offset);
+
+ // Effect: Return the size of the block that starts at offset.
+ // Requires: There must be a block currently allocated at that offset.
+ // Parameters:
+ // offset (IN): The offset of the block.
+ uint64_t block_size(uint64_t offset);
+
+ // Effect: Check to see if the block allocator is OK. This may take a long time.
+ // Usage Hints: Probably only use this for unit tests.
+ // TODO: Private?
+ void validate() const;
+
+ // Effect: Return the unallocated block address of "infinite" size.
+ // That is, return the smallest address that is above all the allocated blocks.
+ uint64_t allocated_limit() const;
+
+ // Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth.
+ // Return the offset and size of the block with that number.
+ // Return 0 if there is such a block; return nonzero if b is too big.
+ // Rationale: This is probably useful only for tests.
+ int get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size);
+
+ // Effect: Fill in report to indicate how the file is used.
+ // Requires:
+ // report->file_size_bytes is filled in
+ // report->data_bytes is filled in
+ // report->checkpoint_bytes_additional is filled in
+ void get_unused_statistics(TOKU_DB_FRAGMENTATION report);
+
+ // Effect: Fill in report->data_bytes with the number of bytes in use
+ // Fill in report->data_blocks with the number of blockpairs in use
+ // Fill in unused statistics using this->get_unused_statistics()
+ // Requires:
+ // report->file_size is ignored on return
+ // report->checkpoint_bytes_additional is ignored on return
+ void get_statistics(TOKU_DB_FRAGMENTATION report);
+
+ // Block allocator tracing.
+ // - Enabled by setting TOKU_BA_TRACE_PATH to the file that the trace file
+ // should be written to.
+ // - Trace may be replayed by ba_trace_replay tool in tools/ directory
+ // e.g.: "cat mytracefile | ba_trace_replay"
+ static void maybe_initialize_trace();
+ static void maybe_close_trace();
+
+private:
+ void _create_internal(uint64_t reserve_at_beginning, uint64_t alignment);
+ void grow_blocks_array_by(uint64_t n_to_add);
+ void grow_blocks_array();
+ int64_t find_block(uint64_t offset);
+ struct blockpair *choose_block_to_alloc_after(size_t size, uint64_t heat);
+
+ // Tracing
+ toku_mutex_t _trace_lock;
+ void _trace_create(void);
+ void _trace_create_from_blockpairs(void);
+ void _trace_destroy(void);
+ void _trace_alloc(uint64_t size, uint64_t heat, uint64_t offset);
+ void _trace_free(uint64_t offset);
+
+ // How much to reserve at the beginning
+ uint64_t _reserve_at_beginning;
+ // Block alignment
+ uint64_t _alignment;
+ // How many blocks
+ uint64_t _n_blocks;
+ // How big the blocks_array is. Must be >= n_blocks.
+ uint64_t _blocks_array_size;
+ // These blocks are sorted by address.
+ struct blockpair *_blocks_array;
+ // Including the reserve_at_beginning
+ uint64_t _n_bytes_in_use;
+ // The allocation strategy we are using
+ enum allocation_strategy _strategy;
+};
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.cc b/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.cc
new file mode 100644
index 00000000000..62bb8fc4a87
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.cc
@@ -0,0 +1,224 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <algorithm>
+
+#include <string.h>
+
+#include "portability/toku_assert.h"
+
+#include "ft/serialize/block_allocator_strategy.h"
+
+static uint64_t _align(uint64_t value, uint64_t ba_alignment) {
+ return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment;
+}
+
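+// Round up to a power of two, with a floor of 4096. For example,
+// _roundup_to_power_of_two(5000) == 8192, and any value <= 4096 yields 4096.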
+static uint64_t _roundup_to_power_of_two(uint64_t value) {
+ uint64_t r = 4096;
+ while (r < value) {
+ r *= 2;
+ invariant(r > 0);
+ }
+ return r;
+}
+
+// First fit block allocation
+static struct block_allocator::blockpair *
+_first_fit(struct block_allocator::blockpair *blocks_array,
+ uint64_t n_blocks, uint64_t size, uint64_t alignment,
+ uint64_t max_padding) {
+ if (n_blocks == 1) {
+ // with only one block there are no spaces between blocks to check
+ return nullptr;
+ }
+
+ struct block_allocator::blockpair *bp = &blocks_array[0];
+ for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0;
+ n_spaces_to_check--, bp++) {
+ // Consider the space after bp
+ uint64_t padded_alignment = max_padding != 0 ? _align(max_padding, alignment) : alignment;
+ uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment);
+ if (possible_offset + size <= bp[1].offset) { // bp[1] is always valid since bp < &blocks_array[n_blocks-1]
+ invariant(bp - blocks_array < (int64_t) n_blocks);
+ return bp;
+ }
+ }
+ return nullptr;
+}
+
+static struct block_allocator::blockpair *
+_first_fit_bw(struct block_allocator::blockpair *blocks_array,
+ uint64_t n_blocks, uint64_t size, uint64_t alignment,
+ uint64_t max_padding, struct block_allocator::blockpair *blocks_array_limit) {
+ if (n_blocks == 1) {
+ // with only one block there are no spaces between blocks to check
+ return nullptr;
+ }
+
+ struct block_allocator::blockpair *bp = &blocks_array[-1];
+ for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0;
+ n_spaces_to_check--, bp--) {
+ // Consider the space after bp
+ uint64_t padded_alignment = max_padding != 0 ? _align(max_padding, alignment) : alignment;
+ uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment);
+ if (&bp[1] < blocks_array_limit && possible_offset + size <= bp[1].offset) {
+ invariant(blocks_array - bp < (int64_t) n_blocks);
+ return bp;
+ }
+ }
+ return nullptr;
+}
+
+struct block_allocator::blockpair *
+block_allocator_strategy::first_fit(struct block_allocator::blockpair *blocks_array,
+ uint64_t n_blocks, uint64_t size, uint64_t alignment) {
+ return _first_fit(blocks_array, n_blocks, size, alignment, 0);
+}
+
+// Best fit block allocation
+struct block_allocator::blockpair *
+block_allocator_strategy::best_fit(struct block_allocator::blockpair *blocks_array,
+ uint64_t n_blocks, uint64_t size, uint64_t alignment) {
+ struct block_allocator::blockpair *best_bp = nullptr;
+ uint64_t best_hole_size = 0;
+ for (uint64_t blocknum = 0; blocknum + 1 < n_blocks; blocknum++) {
+ // Consider the space after blocknum
+ struct block_allocator::blockpair *bp = &blocks_array[blocknum];
+ uint64_t possible_offset = _align(bp->offset + bp->size, alignment);
+ uint64_t possible_end_offset = possible_offset + size;
+ if (possible_end_offset <= bp[1].offset) {
+ // It fits here. Is it the best fit?
+ uint64_t hole_size = bp[1].offset - possible_end_offset;
+ if (best_bp == nullptr || hole_size < best_hole_size) {
+ best_hole_size = hole_size;
+ best_bp = bp;
+ }
+ }
+ }
+ return best_bp;
+}
+
+static uint64_t padded_fit_alignment = 4096;
+
+// TODO: These compiler specific directives should be abstracted in a portability header
+// portability/toku_compiler.h?
+__attribute__((__constructor__))
+static void determine_padded_fit_alignment_from_env(void) {
+ // TODO: Should be in portability as 'toku_os_getenv()?'
+ const char *s = getenv("TOKU_BA_PADDED_FIT_ALIGNMENT");
+ if (s != nullptr && strlen(s) > 0) {
+ const int64_t alignment = strtoll(s, nullptr, 10);
+ if (alignment <= 0) {
+ fprintf(stderr, "tokuft: error: block allocator padded fit alignment found in environment (%s), "
+ "but it's out of range (should be an integer > 0). defaulting to %" PRIu64 "\n",
+ s, padded_fit_alignment);
+ } else {
+ padded_fit_alignment = _roundup_to_power_of_two(alignment);
+ fprintf(stderr, "tokuft: setting block allocator padded fit alignment to %" PRIu64 "\n",
+ padded_fit_alignment);
+ }
+ }
+}
+
+// First fit into a block that is oversized by up to max_padding.
+// The hope is that if we purposefully waste a bit of space at allocation
+// time we'll be more likely to reuse this block later.
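+// For example, with the default padded_fit_alignment of 4096 and a 512-byte
+// allocator alignment, candidate offsets are rounded up to 4096-byte
+// boundaries instead of 512, deliberately leaving a reusable gap before
+// each newly placed block.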
+struct block_allocator::blockpair *
+block_allocator_strategy::padded_fit(struct block_allocator::blockpair *blocks_array,
+ uint64_t n_blocks, uint64_t size, uint64_t alignment) {
+ return _first_fit(blocks_array, n_blocks, size, alignment, padded_fit_alignment);
+}
+
+static double hot_zone_threshold = 0.85;
+
+// TODO: These compiler specific directives should be abstracted in a portability header
+// portability/toku_compiler.h?
+__attribute__((__constructor__))
+static void determine_hot_zone_threshold_from_env(void) {
+ // TODO: Should be in portability as 'toku_os_getenv()?'
+ const char *s = getenv("TOKU_BA_HOT_ZONE_THRESHOLD");
+ if (s != nullptr && strlen(s) > 0) {
+ const double hot_zone = strtod(s, nullptr);
+ if (hot_zone < 1 || hot_zone > 99) {
+ fprintf(stderr, "tokuft: error: block allocator hot zone threshold found in environment (%s), "
+ "but it's out of range (should be an integer 1 through 99). defaulting to 85\n", s);
+ hot_zone_threshold = 85 / 100;
+ } else {
+ fprintf(stderr, "tokuft: setting block allocator hot zone threshold to %s\n", s);
+ hot_zone_threshold = hot_zone / 100;
+ }
+ }
+}
+
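+// For example (hypothetical invocation): running with
+// TOKU_BA_HOT_ZONE_THRESHOLD=90 in the environment sets hot_zone_threshold
+// to 0.90, so hot (heat > 0) allocations are steered into the top 10% of
+// the allocated file space.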
+struct block_allocator::blockpair *
+block_allocator_strategy::heat_zone(struct block_allocator::blockpair *blocks_array,
+ uint64_t n_blocks, uint64_t size, uint64_t alignment,
+ uint64_t heat) {
+ if (heat > 0) {
+ struct block_allocator::blockpair *bp, *boundary_bp;
+
+ // Hot allocation. Find the beginning of the hot zone.
+ boundary_bp = &blocks_array[n_blocks - 1];
+ uint64_t highest_offset = _align(boundary_bp->offset + boundary_bp->size, alignment);
+ uint64_t hot_zone_offset = static_cast<uint64_t>(hot_zone_threshold * highest_offset);
+
+ boundary_bp = std::lower_bound(blocks_array, blocks_array + n_blocks, hot_zone_offset);
+ uint64_t blocks_in_zone = (blocks_array + n_blocks) - boundary_bp;
+ uint64_t blocks_outside_zone = boundary_bp - blocks_array;
+ invariant(blocks_in_zone + blocks_outside_zone == n_blocks);
+
+ if (blocks_in_zone > 0) {
+ // Find the first fit in the hot zone, going forward.
+ bp = _first_fit(boundary_bp, blocks_in_zone, size, alignment, 0);
+ if (bp != nullptr) {
+ return bp;
+ }
+ }
+ if (blocks_outside_zone > 0) {
+ // Find the first fit in the cold zone, going backwards.
+ bp = _first_fit_bw(boundary_bp, blocks_outside_zone, size, alignment, 0, &blocks_array[n_blocks]);
+ if (bp != nullptr) {
+ return bp;
+ }
+ }
+ } else {
+ // Cold allocations are simply first-fit from the beginning.
+ return _first_fit(blocks_array, n_blocks, size, alignment, 0);
+ }
+ return nullptr;
+}
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.h b/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.h
new file mode 100644
index 00000000000..8aded3898c1
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.h
@@ -0,0 +1,65 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <db.h>
+
+#include "ft/serialize/block_allocator.h"
+
+// Block allocation strategy implementations
+
+class block_allocator_strategy {
+public:
+ static struct block_allocator::blockpair *
+ first_fit(struct block_allocator::blockpair *blocks_array,
+ uint64_t n_blocks, uint64_t size, uint64_t alignment);
+
+ static struct block_allocator::blockpair *
+ best_fit(struct block_allocator::blockpair *blocks_array,
+ uint64_t n_blocks, uint64_t size, uint64_t alignment);
+
+ static struct block_allocator::blockpair *
+ padded_fit(struct block_allocator::blockpair *blocks_array,
+ uint64_t n_blocks, uint64_t size, uint64_t alignment);
+
+ static struct block_allocator::blockpair *
+ heat_zone(struct block_allocator::blockpair *blocks_array,
+ uint64_t n_blocks, uint64_t size, uint64_t alignment,
+ uint64_t heat);
+};
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_table.cc b/storage/tokudb/PerconaFT/ft/serialize/block_table.cc
new file mode 100644
index 00000000000..7101ba9f58c
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/block_table.cc
@@ -0,0 +1,993 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "portability/memory.h"
+#include "portability/toku_assert.h"
+#include "portability/toku_portability.h"
+#include "portability/toku_pthread.h"
+
+// ugly but pragmatic, need access to dirty bits while holding translation lock
+// TODO: Refactor this (possibly with FT-301)
+#include "ft/ft-internal.h"
+
+// TODO: reorganize this dependency (FT-303)
+#include "ft/ft-ops.h" // for toku_maybe_truncate_file
+#include "ft/serialize/block_table.h"
+#include "ft/serialize/rbuf.h"
+#include "ft/serialize/wbuf.h"
+#include "ft/serialize/block_allocator.h"
+
+#include "util/nb_mutex.h"
+#include "util/scoped_malloc.h"
+
+// indicates the end of a freelist
+static const BLOCKNUM freelist_null = { -1 };
+
+// value of block_translation_pair.size if blocknum is unused
+static const DISKOFF size_is_free = (DISKOFF) -1;
+
+// value of block_translation_pair.u.diskoff if blocknum is used but does not yet have a diskblock
+static const DISKOFF diskoff_unused = (DISKOFF) -2;
+
+void block_table::_mutex_lock() {
+ toku_mutex_lock(&_mutex);
+}
+
+void block_table::_mutex_unlock() {
+ toku_mutex_unlock(&_mutex);
+}
+
+// TODO: Move lock to FT
+void toku_ft_lock(FT ft) {
+ block_table *bt = &ft->blocktable;
+ bt->_mutex_lock();
+}
+
+// TODO: Move lock to FT
+void toku_ft_unlock(FT ft) {
+ block_table *bt = &ft->blocktable;
+ toku_mutex_assert_locked(&bt->_mutex);
+ bt->_mutex_unlock();
+}
+
+// There are two headers: the reserve must fit them both and be suitably aligned.
+static_assert(block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE %
+ block_allocator::BLOCK_ALLOCATOR_ALIGNMENT == 0,
+ "Block allocator's header reserve must be suitibly aligned");
+static_assert(block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE * 2 ==
+ block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
+ "Block allocator's total header reserve must exactly fit two headers");
+
+// does NOT initialize the block allocator: the caller is responsible
+void block_table::_create_internal() {
+ memset(&_current, 0, sizeof(struct translation));
+ memset(&_inprogress, 0, sizeof(struct translation));
+ memset(&_checkpointed, 0, sizeof(struct translation));
+ memset(&_mutex, 0, sizeof(_mutex));
+ toku_mutex_init(&_mutex, nullptr);
+ nb_mutex_init(&_safe_file_size_lock);
+}
+
+// Fill in the checkpointed translation from buffer, and copy checkpointed to current.
+// The one read from disk is the last known checkpointed one, so we are keeping it in
+// place and then setting current (which is never stored on disk) for current use.
+// The translation_buffer has translation only, we create the rest of the block_table.
+int block_table::create_from_buffer(int fd,
+ DISKOFF location_on_disk, //Location of translation_buffer
+ DISKOFF size_on_disk,
+ unsigned char *translation_buffer) {
+ // Does not initialize the block allocator
+ _create_internal();
+
+ // Deserialize the translation and copy it to current
+ int r = _translation_deserialize_from_buffer(&_checkpointed,
+ location_on_disk, size_on_disk,
+ translation_buffer);
+ if (r != 0) {
+ return r;
+ }
+ _copy_translation(&_current, &_checkpointed, TRANSLATION_CURRENT);
+
+ // Determine the file size
+ int64_t file_size;
+ r = toku_os_get_file_size(fd, &file_size);
+ lazy_assert_zero(r);
+ invariant(file_size >= 0);
+ _safe_file_size = file_size;
+
+ // Gather the non-empty translations and use them to create the block allocator
+ toku::scoped_malloc pairs_buf(_checkpointed.smallest_never_used_blocknum.b *
+ sizeof(struct block_allocator::blockpair));
+ struct block_allocator::blockpair *CAST_FROM_VOIDP(pairs, pairs_buf.get());
+ uint64_t n_pairs = 0;
+ for (int64_t i = 0; i < _checkpointed.smallest_never_used_blocknum.b; i++) {
+ struct block_translation_pair pair = _checkpointed.block_translation[i];
+ if (pair.size > 0) {
+ invariant(pair.u.diskoff != diskoff_unused);
+ pairs[n_pairs++] = block_allocator::blockpair(pair.u.diskoff, pair.size);
+ }
+ }
+
+ _bt_block_allocator.create_from_blockpairs(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
+ block_allocator::BLOCK_ALLOCATOR_ALIGNMENT,
+ pairs, n_pairs);
+
+ return 0;
+}
+
+void block_table::create() {
+ // Does not initialize the block allocator
+ _create_internal();
+
+ _checkpointed.type = TRANSLATION_CHECKPOINTED;
+ _checkpointed.smallest_never_used_blocknum = make_blocknum(RESERVED_BLOCKNUMS);
+ _checkpointed.length_of_array = _checkpointed.smallest_never_used_blocknum.b;
+ _checkpointed.blocknum_freelist_head = freelist_null;
+ XMALLOC_N(_checkpointed.length_of_array, _checkpointed.block_translation);
+ for (int64_t i = 0; i < _checkpointed.length_of_array; i++) {
+ _checkpointed.block_translation[i].size = 0;
+ _checkpointed.block_translation[i].u.diskoff = diskoff_unused;
+ }
+
+ // we just created a default checkpointed, now copy it to current.
+ _copy_translation(&_current, &_checkpointed, TRANSLATION_CURRENT);
+
+ // Create an empty block allocator.
+ _bt_block_allocator.create(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
+ block_allocator::BLOCK_ALLOCATOR_ALIGNMENT);
+}
+
+// TODO: Refactor with FT-303
+static void ft_set_dirty(FT ft, bool for_checkpoint) {
+ invariant(ft->h->type == FT_CURRENT);
+ if (for_checkpoint) {
+ invariant(ft->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS);
+ ft->checkpoint_header->dirty = 1;
+ } else {
+ ft->h->dirty = 1;
+ }
+}
+
+void block_table::_maybe_truncate_file(int fd, uint64_t size_needed_before) {
+ toku_mutex_assert_locked(&_mutex);
+ uint64_t new_size_needed = _bt_block_allocator.allocated_limit();
+ //Save a call to toku_os_get_file_size (kernel call) if unlikely to be useful.
+ if (new_size_needed < size_needed_before && new_size_needed < _safe_file_size) {
+ nb_mutex_lock(&_safe_file_size_lock, &_mutex);
+
+ // Must hold _safe_file_size_lock to change _safe_file_size.
+ if (new_size_needed < _safe_file_size) {
+ int64_t safe_file_size_before = _safe_file_size;
+ // Not safe to use the 'to-be-truncated' portion until truncate is done.
+ _safe_file_size = new_size_needed;
+ _mutex_unlock();
+
+ uint64_t size_after;
+ toku_maybe_truncate_file(fd, new_size_needed, safe_file_size_before, &size_after);
+ _mutex_lock();
+
+ _safe_file_size = size_after;
+ }
+ nb_mutex_unlock(&_safe_file_size_lock);
+ }
+}
+
+void block_table::maybe_truncate_file_on_open(int fd) {
+ _mutex_lock();
+ _maybe_truncate_file(fd, _safe_file_size);
+ _mutex_unlock();
+}
+
+void block_table::_copy_translation(struct translation *dst, struct translation *src, enum translation_type newtype) {
+ // We intend to malloc a fresh block, so the incoming translation should be empty
+ invariant_null(dst->block_translation);
+
+ invariant(src->length_of_array >= src->smallest_never_used_blocknum.b);
+ invariant(newtype == TRANSLATION_DEBUG ||
+ (src->type == TRANSLATION_CURRENT && newtype == TRANSLATION_INPROGRESS) ||
+ (src->type == TRANSLATION_CHECKPOINTED && newtype == TRANSLATION_CURRENT));
+ dst->type = newtype;
+ dst->smallest_never_used_blocknum = src->smallest_never_used_blocknum;
+ dst->blocknum_freelist_head = src->blocknum_freelist_head;
+
+ // destination btt is of fixed size. Allocate + memcpy the exact length necessary.
+ dst->length_of_array = dst->smallest_never_used_blocknum.b;
+ XMALLOC_N(dst->length_of_array, dst->block_translation);
+ memcpy(dst->block_translation, src->block_translation, dst->length_of_array * sizeof(*dst->block_translation));
+
+ // New version of btt is not yet stored on disk.
+ dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size = 0;
+ dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff = diskoff_unused;
+}
+
+int64_t block_table::get_blocks_in_use_unlocked() {
+ BLOCKNUM b;
+ struct translation *t = &_current;
+ int64_t num_blocks = 0;
+ {
+ //Reserved blocknums do not get upgraded; they are part of the header.
+ for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b; b.b++) {
+ if (t->block_translation[b.b].size != size_is_free) {
+ num_blocks++;
+ }
+ }
+ }
+ return num_blocks;
+}
+
+void block_table::_maybe_optimize_translation(struct translation *t) {
+ //Reduce 'smallest_never_used_blocknum.b' (completely free trailing blocknums instead of just
+ //leaving them on a free list). Doing so requires us to regenerate the free list.
+ //This is O(n) work, so do it only if you're already doing that.
+
+ BLOCKNUM b;
+ paranoid_invariant(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS);
+ //Calculate how large the free suffix is.
+ int64_t freed;
+ {
+ for (b.b = t->smallest_never_used_blocknum.b; b.b > RESERVED_BLOCKNUMS; b.b--) {
+ if (t->block_translation[b.b-1].size != size_is_free) {
+ break;
+ }
+ }
+ freed = t->smallest_never_used_blocknum.b - b.b;
+ }
+ if (freed > 0) {
+ t->smallest_never_used_blocknum.b = b.b;
+ if (t->length_of_array/4 > t->smallest_never_used_blocknum.b) {
+ //We're using more memory than necessary to represent this now. Reduce.
+ uint64_t new_length = t->smallest_never_used_blocknum.b * 2;
+ XREALLOC_N(new_length, t->block_translation);
+ t->length_of_array = new_length;
+ //No need to zero anything out.
+ }
+
+ //Regenerate free list.
+ t->blocknum_freelist_head.b = freelist_null.b;
+ for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b; b.b++) {
+ if (t->block_translation[b.b].size == size_is_free) {
+ t->block_translation[b.b].u.next_free_blocknum = t->blocknum_freelist_head;
+ t->blocknum_freelist_head = b;
+ }
+ }
+ }
+}
+
+// block table must be locked by caller of this function
+void block_table::note_start_checkpoint_unlocked() {
+ toku_mutex_assert_locked(&_mutex);
+
+ // We're going to do O(n) work to copy the translation, so we
+ // can afford to do O(n) work by optimizing the translation
+ _maybe_optimize_translation(&_current);
+
+ // Copy current translation to inprogress translation.
+ _copy_translation(&_inprogress, &_current, TRANSLATION_INPROGRESS);
+
+ _checkpoint_skipped = false;
+}
+
+void block_table::note_skipped_checkpoint() {
+ //Purpose: alert the block translation that the checkpoint was skipped, e.g. for a non-dirty header
+ _mutex_lock();
+ paranoid_invariant_notnull(_inprogress.block_translation);
+ _checkpoint_skipped = true;
+ _mutex_unlock();
+}
+
+// Purpose: free any disk space used by previous checkpoint that isn't in use by either
+// - current state
+// - in-progress checkpoint
+// capture inprogress as new checkpointed.
+// For each entry in checkpointBTT
+// if offset does not match offset in inprogress
+// assert offset does not match offset in current
+// free (offset,len) from checkpoint
+// move inprogress to checkpoint (resetting type)
+// inprogress = NULL
+void block_table::note_end_checkpoint(int fd) {
+ // Free unused blocks
+ _mutex_lock();
+ uint64_t allocated_limit_at_start = _bt_block_allocator.allocated_limit();
+ paranoid_invariant_notnull(_inprogress.block_translation);
+ if (_checkpoint_skipped) {
+ toku_free(_inprogress.block_translation);
+ memset(&_inprogress, 0, sizeof(_inprogress));
+ goto end;
+ }
+
+ //Make certain inprogress was allocated space on disk
+ assert(_inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].size > 0);
+ assert(_inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff > 0);
+
+ {
+ struct translation *t = &_checkpointed;
+ for (int64_t i = 0; i < t->length_of_array; i++) {
+ struct block_translation_pair *pair = &t->block_translation[i];
+ if (pair->size > 0 && !_translation_prevents_freeing(&_inprogress, make_blocknum(i), pair)) {
+ assert(!_translation_prevents_freeing(&_current, make_blocknum(i), pair));
+ _bt_block_allocator.free_block(pair->u.diskoff);
+ }
+ }
+ toku_free(_checkpointed.block_translation);
+ _checkpointed = _inprogress;
+ _checkpointed.type = TRANSLATION_CHECKPOINTED;
+ memset(&_inprogress, 0, sizeof(_inprogress));
+ _maybe_truncate_file(fd, allocated_limit_at_start);
+ }
+end:
+ _mutex_unlock();
+}
+
+bool block_table::_is_valid_blocknum(struct translation *t, BLOCKNUM b) {
+ invariant(t->length_of_array >= t->smallest_never_used_blocknum.b);
+ return b.b >= 0 && b.b < t->smallest_never_used_blocknum.b;
+}
+
+void block_table::_verify_valid_blocknum(struct translation *UU(t), BLOCKNUM UU(b)) {
+ invariant(_is_valid_blocknum(t, b));
+}
+
+bool block_table::_is_valid_freeable_blocknum(struct translation *t, BLOCKNUM b) {
+ invariant(t->length_of_array >= t->smallest_never_used_blocknum.b);
+ return b.b >= RESERVED_BLOCKNUMS && b.b < t->smallest_never_used_blocknum.b;
+}
+
+// should be freeable
+void block_table::_verify_valid_freeable_blocknum(struct translation *UU(t), BLOCKNUM UU(b)) {
+ invariant(_is_valid_freeable_blocknum(t, b));
+}
+
+// Also used only in ft-serialize-test.
+void block_table::block_free(uint64_t offset) {
+ _mutex_lock();
+ _bt_block_allocator.free_block(offset);
+ _mutex_unlock();
+}
+
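+// For example, a translation with smallest_never_used_blocknum.b == 4
+// serializes to 8 + 8 + 4*16 + 4 = 84 bytes (before the caller pads the
+// buffer out to a 512-byte boundary).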
+int64_t block_table::_calculate_size_on_disk(struct translation *t) {
+ return 8 + // smallest_never_used_blocknum
+ 8 + // blocknum_freelist_head
+ t->smallest_never_used_blocknum.b * 16 + // Array
+ 4; // 4 for checksum
+}
+
+// We cannot free the disk space allocated to this blocknum if it is still in use by the given translation table.
+bool block_table::_translation_prevents_freeing(struct translation *t, BLOCKNUM b, struct block_translation_pair *old_pair) {
+ return t->block_translation &&
+ b.b < t->smallest_never_used_blocknum.b &&
+ old_pair->u.diskoff == t->block_translation[b.b].u.diskoff;
+}
+
+void block_table::_realloc_on_disk_internal(BLOCKNUM b, DISKOFF size, DISKOFF *offset, FT ft, bool for_checkpoint, uint64_t heat) {
+ toku_mutex_assert_locked(&_mutex);
+ ft_set_dirty(ft, for_checkpoint);
+
+ struct translation *t = &_current;
+ struct block_translation_pair old_pair = t->block_translation[b.b];
+ //Free the old block if it is not still in use by the checkpoint in progress or the previous checkpoint
+ bool cannot_free = (bool)
+ ((!for_checkpoint && _translation_prevents_freeing(&_inprogress, b, &old_pair)) ||
+ _translation_prevents_freeing(&_checkpointed, b, &old_pair));
+ if (!cannot_free && old_pair.u.diskoff != diskoff_unused) {
+ _bt_block_allocator.free_block(old_pair.u.diskoff);
+ }
+
+ uint64_t allocator_offset = diskoff_unused;
+ t->block_translation[b.b].size = size;
+ if (size > 0) {
+ // Allocate a new block if the size is greater than 0,
+ // if the size is just 0, offset will be set to diskoff_unused
+ _bt_block_allocator.alloc_block(size, heat, &allocator_offset);
+ }
+ t->block_translation[b.b].u.diskoff = allocator_offset;
+ *offset = allocator_offset;
+
+ //Update inprogress btt if appropriate (if called because Pending bit is set).
+ if (for_checkpoint) {
+ paranoid_invariant(b.b < _inprogress.length_of_array);
+ _inprogress.block_translation[b.b] = t->block_translation[b.b];
+ }
+}
+
+void block_table::_ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOFF block_offset) {
+ // Requires: holding _mutex
+ uint64_t size_needed = block_size + block_offset;
+ if (size_needed > _safe_file_size) {
+ // Must hold _safe_file_size_lock to change _safe_file_size.
+ nb_mutex_lock(&_safe_file_size_lock, &_mutex);
+ if (size_needed > _safe_file_size) {
+ _mutex_unlock();
+
+ int64_t size_after;
+ toku_maybe_preallocate_in_file(fd, size_needed, _safe_file_size, &size_after);
+
+ _mutex_lock();
+ _safe_file_size = size_after;
+ }
+ nb_mutex_unlock(&_safe_file_size_lock);
+ }
+}
+
+void block_table::realloc_on_disk(BLOCKNUM b, DISKOFF size, DISKOFF *offset, FT ft, int fd, bool for_checkpoint, uint64_t heat) {
+ _mutex_lock();
+ struct translation *t = &_current;
+ _verify_valid_freeable_blocknum(t, b);
+ _realloc_on_disk_internal(b, size, offset, ft, for_checkpoint, heat);
+
+ _ensure_safe_write_unlocked(fd, size, *offset);
+ _mutex_unlock();
+}
+
+bool block_table::_pair_is_unallocated(struct block_translation_pair *pair) {
+ return pair->size == 0 && pair->u.diskoff == diskoff_unused;
+}
+
+// Effect: figure out where to put the inprogress btt on disk, allocate space for it there.
+// The space must be 512-byte aligned (both the starting address and the size).
+// As a result, the allocated space may be a little bit bigger (up to the next 512-byte boundary) than the actual btt.
+void block_table::_alloc_inprogress_translation_on_disk_unlocked() {
+ toku_mutex_assert_locked(&_mutex);
+
+ struct translation *t = &_inprogress;
+ paranoid_invariant_notnull(t->block_translation);
+ BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
+ //Each inprogress is allocated only once
+ paranoid_invariant(_pair_is_unallocated(&t->block_translation[b.b]));
+
+ //Allocate a new block
+ int64_t size = _calculate_size_on_disk(t);
+ uint64_t offset;
+ _bt_block_allocator.alloc_block(size, 0, &offset);
+ t->block_translation[b.b].u.diskoff = offset;
+ t->block_translation[b.b].size = size;
+}
+
+// Effect: Serializes the blocktable to a wbuf (which starts uninitialized)
+// A clean shutdown runs checkpoint start so that current and inprogress are copies.
+// The resulting wbuf buffer is guaranteed to be 512-byte aligned and the total length is a multiple of 512 (so we pad with zeros at the end if needed).
+// The address is guaranteed to be 512-byte aligned, but the size is not guaranteed.
+// It *is* guaranteed that we can read up to the next 512-byte boundary, however
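+// On-disk btt layout produced below (a sketch, assuming the 8-byte
+// wbuf_BLOCKNUM/wbuf_DISKOFF and 4-byte wbuf_int encodings in block_table.h):
+//   smallest_never_used_blocknum           8 bytes
+//   blocknum_freelist_head                 8 bytes
+//   {diskoff, size} pair per blocknum     16 bytes each
+//   x1764 checksum                         4 bytes
+// so size_translation = 16 + 16 * smallest_never_used_blocknum.b + 4.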
+void block_table::serialize_translation_to_wbuf(int fd, struct wbuf *w,
+ int64_t *address, int64_t *size) {
+ _mutex_lock();
+ struct translation *t = &_inprogress;
+
+ BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
+ _alloc_inprogress_translation_on_disk_unlocked(); // The allocated block must be 512-byte aligned to make O_DIRECT happy.
+ uint64_t size_translation = _calculate_size_on_disk(t);
+ uint64_t size_aligned = roundup_to_multiple(512, size_translation);
+ assert((int64_t)size_translation==t->block_translation[b.b].size);
+ {
+ //Init wbuf
+ if (0)
+ printf("%s:%d writing translation table of size_translation %" PRIu64 " at %" PRId64 "\n", __FILE__, __LINE__, size_translation, t->block_translation[b.b].u.diskoff);
+ char *XMALLOC_N_ALIGNED(512, size_aligned, buf);
+ for (uint64_t i=size_translation; i<size_aligned; i++) buf[i]=0; // fill in the end of the buffer with zeros.
+ wbuf_init(w, buf, size_aligned);
+ }
+ wbuf_BLOCKNUM(w, t->smallest_never_used_blocknum);
+ wbuf_BLOCKNUM(w, t->blocknum_freelist_head);
+ int64_t i;
+ for (i=0; i<t->smallest_never_used_blocknum.b; i++) {
+ if (0)
+ printf("%s:%d %" PRId64 ",%" PRId64 "\n", __FILE__, __LINE__, t->block_translation[i].u.diskoff, t->block_translation[i].size);
+ wbuf_DISKOFF(w, t->block_translation[i].u.diskoff);
+ wbuf_DISKOFF(w, t->block_translation[i].size);
+ }
+ uint32_t checksum = toku_x1764_finish(&w->checksum);
+ wbuf_int(w, checksum);
+ *address = t->block_translation[b.b].u.diskoff;
+ *size = size_translation;
+ assert((*address)%512 == 0);
+
+ _ensure_safe_write_unlocked(fd, size_aligned, *address);
+ _mutex_unlock();
+}
+
+// Perhaps rename: purpose is to get the disk address of a block, given its blocknum (blockid?)
+void block_table::_translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, DISKOFF *offset, DISKOFF *size) {
+ struct translation *t = &_current;
+ _verify_valid_blocknum(t, b);
+ if (offset) {
+ *offset = t->block_translation[b.b].u.diskoff;
+ }
+ if (size) {
+ *size = t->block_translation[b.b].size;
+ }
+}
+
+// Perhaps rename: purpose is to get the disk address of a block, given its blocknum (blockid?)
+void block_table::translate_blocknum_to_offset_size(BLOCKNUM b, DISKOFF *offset, DISKOFF *size) {
+ _mutex_lock();
+ _translate_blocknum_to_offset_size_unlocked(b, offset, size);
+ _mutex_unlock();
+}
+
+// Only called by _allocate_blocknum_unlocked
+// Effect: expand the array to maintain size invariant
+// given that one more never-used blocknum will soon be used.
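+// The array at least doubles on each expansion, so the amortized cost of
+// growing the translation over many blocknum allocations is constant.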
+void block_table::_maybe_expand_translation(struct translation *t) {
+ if (t->length_of_array <= t->smallest_never_used_blocknum.b) {
+ //expansion is necessary
+ uint64_t new_length = t->smallest_never_used_blocknum.b * 2;
+ XREALLOC_N(new_length, t->block_translation);
+ uint64_t i;
+ for (i = t->length_of_array; i < new_length; i++) {
+ t->block_translation[i].u.next_free_blocknum = freelist_null;
+ t->block_translation[i].size = size_is_free;
+ }
+ t->length_of_array = new_length;
+ }
+}
+
+void block_table::_allocate_blocknum_unlocked(BLOCKNUM *res, FT ft) {
+ toku_mutex_assert_locked(&_mutex);
+ BLOCKNUM result;
+ struct translation *t = &_current;
+ if (t->blocknum_freelist_head.b == freelist_null.b) {
+ // no previously used blocknums are available
+ // use a never used blocknum
+        _maybe_expand_translation(t); //Ensure a never-used blocknum is available
+ result = t->smallest_never_used_blocknum;
+ t->smallest_never_used_blocknum.b++;
+ } else { // reuse a previously used blocknum
+ result = t->blocknum_freelist_head;
+ BLOCKNUM next = t->block_translation[result.b].u.next_free_blocknum;
+ t->blocknum_freelist_head = next;
+ }
+ //Verify the blocknum is free
+ paranoid_invariant(t->block_translation[result.b].size == size_is_free);
+ //blocknum is not free anymore
+ t->block_translation[result.b].u.diskoff = diskoff_unused;
+ t->block_translation[result.b].size = 0;
+ _verify_valid_freeable_blocknum(t, result);
+ *res = result;
+ ft_set_dirty(ft, false);
+}
+
+void block_table::allocate_blocknum(BLOCKNUM *res, FT ft) {
+ _mutex_lock();
+ _allocate_blocknum_unlocked(res, ft);
+ _mutex_unlock();
+}
+
+void block_table::_free_blocknum_in_translation(struct translation *t, BLOCKNUM b) {
+ _verify_valid_freeable_blocknum(t, b);
+ paranoid_invariant(t->block_translation[b.b].size != size_is_free);
+
+ t->block_translation[b.b].size = size_is_free;
+ t->block_translation[b.b].u.next_free_blocknum = t->blocknum_freelist_head;
+ t->blocknum_freelist_head = b;
+}
+
+// Effect: Free a blocknum.
+// If the blocknum holds the only reference to a block on disk, free that block
+void block_table::_free_blocknum_unlocked(BLOCKNUM *bp, FT ft, bool for_checkpoint) {
+ toku_mutex_assert_locked(&_mutex);
+ BLOCKNUM b = *bp;
+ bp->b = 0; //Remove caller's reference.
+
+ struct block_translation_pair old_pair = _current.block_translation[b.b];
+
+ _free_blocknum_in_translation(&_current, b);
+ if (for_checkpoint) {
+ paranoid_invariant(ft->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS);
+ _free_blocknum_in_translation(&_inprogress, b);
+ }
+
+ //If the size is 0, no disk block has ever been assigned to this blocknum.
+ if (old_pair.size > 0) {
+ //Free the old block if it is not still in use by the checkpoint in progress or the previous checkpoint
+ bool cannot_free = (bool)
+ (_translation_prevents_freeing(&_inprogress, b, &old_pair) ||
+ _translation_prevents_freeing(&_checkpointed, b, &old_pair));
+ if (!cannot_free) {
+ _bt_block_allocator.free_block(old_pair.u.diskoff);
+ }
+ }
+ else {
+ paranoid_invariant(old_pair.size==0);
+ paranoid_invariant(old_pair.u.diskoff == diskoff_unused);
+ }
+ ft_set_dirty(ft, for_checkpoint);
+}
+
+void block_table::free_blocknum(BLOCKNUM *bp, FT ft, bool for_checkpoint) {
+ _mutex_lock();
+ _free_blocknum_unlocked(bp, ft, for_checkpoint);
+ _mutex_unlock();
+}
+
+// Verify there are no free blocknums.
+void block_table::verify_no_free_blocknums() {
+ invariant(_current.blocknum_freelist_head.b == freelist_null.b);
+}
+
+// Frees blocknums that have a size of 0 and unused diskoff
+// Currently used for eliminating unused cached rollback log nodes
+void block_table::free_unused_blocknums(BLOCKNUM root) {
+ _mutex_lock();
+ int64_t smallest = _current.smallest_never_used_blocknum.b;
+ for (int64_t i=RESERVED_BLOCKNUMS; i < smallest; i++) {
+ if (i == root.b) {
+ continue;
+ }
+ BLOCKNUM b = make_blocknum(i);
+ if (_current.block_translation[b.b].size == 0) {
+ invariant(_current.block_translation[b.b].u.diskoff == diskoff_unused);
+ _free_blocknum_in_translation(&_current, b);
+ }
+ }
+ _mutex_unlock();
+}
+
+bool block_table::_no_data_blocks_except_root(BLOCKNUM root) {
+ bool ok = true;
+ _mutex_lock();
+ int64_t smallest = _current.smallest_never_used_blocknum.b;
+ if (root.b < RESERVED_BLOCKNUMS) {
+ ok = false;
+ goto cleanup;
+ }
+ for (int64_t i = RESERVED_BLOCKNUMS; i < smallest; i++) {
+ if (i == root.b) {
+ continue;
+ }
+ BLOCKNUM b = make_blocknum(i);
+ if (_current.block_translation[b.b].size != size_is_free) {
+ ok = false;
+ goto cleanup;
+ }
+ }
+ cleanup:
+ _mutex_unlock();
+ return ok;
+}
+
+// Verify there are no data blocks except root.
+// TODO(leif): This actually takes a lock, but I don't want to fix all the callers right now.
+void block_table::verify_no_data_blocks_except_root(BLOCKNUM UU(root)) {
+ paranoid_invariant(_no_data_blocks_except_root(root));
+}
+
+bool block_table::_blocknum_allocated(BLOCKNUM b) {
+ _mutex_lock();
+ struct translation *t = &_current;
+ _verify_valid_blocknum(t, b);
+ bool ok = t->block_translation[b.b].size != size_is_free;
+ _mutex_unlock();
+ return ok;
+}
+
+// Verify a blocknum is currently allocated.
+void block_table::verify_blocknum_allocated(BLOCKNUM UU(b)) {
+ paranoid_invariant(_blocknum_allocated(b));
+}
+
+// Only used by toku_dump_translation table (debug info)
+void block_table::_dump_translation_internal(FILE *f, struct translation *t) {
+ if (t->block_translation) {
+ BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
+ fprintf(f, " length_of_array[%" PRId64 "]", t->length_of_array);
+ fprintf(f, " smallest_never_used_blocknum[%" PRId64 "]", t->smallest_never_used_blocknum.b);
+ fprintf(f, " blocknum_free_list_head[%" PRId64 "]", t->blocknum_freelist_head.b);
+ fprintf(f, " size_on_disk[%" PRId64 "]", t->block_translation[b.b].size);
+ fprintf(f, " location_on_disk[%" PRId64 "]\n", t->block_translation[b.b].u.diskoff);
+ int64_t i;
+ for (i=0; i<t->length_of_array; i++) {
+ fprintf(f, " %" PRId64 ": %" PRId64 " %" PRId64 "\n", i, t->block_translation[i].u.diskoff, t->block_translation[i].size);
+ }
+ fprintf(f, "\n");
+ } else {
+ fprintf(f, " does not exist\n");
+ }
+}
+
+// Only used by toku_ft_dump which is only for debugging purposes
+// "pretty" just means we use tabs so we can parse output easier later
+void block_table::dump_translation_table_pretty(FILE *f) {
+ _mutex_lock();
+ struct translation *t = &_checkpointed;
+ assert(t->block_translation != nullptr);
+ for (int64_t i = 0; i < t->length_of_array; ++i) {
+ fprintf(f, "%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", i, t->block_translation[i].u.diskoff, t->block_translation[i].size);
+ }
+ _mutex_unlock();
+}
+
+// Only used by toku_ft_dump which is only for debugging purposes
+void block_table::dump_translation_table(FILE *f) {
+ _mutex_lock();
+ fprintf(f, "Current block translation:");
+ _dump_translation_internal(f, &_current);
+ fprintf(f, "Checkpoint in progress block translation:");
+ _dump_translation_internal(f, &_inprogress);
+ fprintf(f, "Checkpointed block translation:");
+ _dump_translation_internal(f, &_checkpointed);
+ _mutex_unlock();
+}
+
+// Only used by ftdump
+void block_table::blocknum_dump_translation(BLOCKNUM b) {
+ _mutex_lock();
+
+ struct translation *t = &_current;
+ if (b.b < t->length_of_array) {
+ struct block_translation_pair *bx = &t->block_translation[b.b];
+ printf("%" PRId64 ": %" PRId64 " %" PRId64 "\n", b.b, bx->u.diskoff, bx->size);
+ }
+ _mutex_unlock();
+}
+
+// Must not call this function when anything else is using the blocktable.
+// No one may use the blocktable afterwards.
+void block_table::destroy(void) {
+ // TODO: translation.destroy();
+ toku_free(_current.block_translation);
+ toku_free(_inprogress.block_translation);
+ toku_free(_checkpointed.block_translation);
+
+ _bt_block_allocator.destroy();
+ toku_mutex_destroy(&_mutex);
+ nb_mutex_destroy(&_safe_file_size_lock);
+}
+
+int block_table::_translation_deserialize_from_buffer(struct translation *t,
+ DISKOFF location_on_disk,
+ uint64_t size_on_disk,
+                                                      // in: buffer with serialized translation
+ unsigned char *translation_buffer) {
+ int r = 0;
+ assert(location_on_disk != 0);
+ t->type = TRANSLATION_CHECKPOINTED;
+
+ // check the checksum
+ uint32_t x1764 = toku_x1764_memory(translation_buffer, size_on_disk - 4);
+ uint64_t offset = size_on_disk - 4;
+ uint32_t stored_x1764 = toku_dtoh32(*(int*)(translation_buffer + offset));
+ if (x1764 != stored_x1764) {
+ fprintf(stderr, "Translation table checksum failure: calc=0x%08x read=0x%08x\n", x1764, stored_x1764);
+ r = TOKUDB_BAD_CHECKSUM;
+ goto exit;
+ }
+
+ struct rbuf rb;
+ rb.buf = translation_buffer;
+ rb.ndone = 0;
+ rb.size = size_on_disk-4;//4==checksum
+
+ t->smallest_never_used_blocknum = rbuf_blocknum(&rb);
+ t->length_of_array = t->smallest_never_used_blocknum.b;
+ invariant(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS);
+ t->blocknum_freelist_head = rbuf_blocknum(&rb);
+ XMALLOC_N(t->length_of_array, t->block_translation);
+ for (int64_t i = 0; i < t->length_of_array; i++) {
+ t->block_translation[i].u.diskoff = rbuf_DISKOFF(&rb);
+ t->block_translation[i].size = rbuf_DISKOFF(&rb);
+ }
+ invariant(_calculate_size_on_disk(t) == (int64_t) size_on_disk);
+ invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size == (int64_t) size_on_disk);
+ invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff == location_on_disk);
+
+exit:
+ return r;
+}
+
+int block_table::iterate(enum translation_type type,
+ BLOCKTABLE_CALLBACK f, void *extra, bool data_only, bool used_only) {
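+    // Copies the chosen translation into a private snapshot while holding
+    // _mutex, then runs the callback over the snapshot with the lock
+    // released, so a slow callback cannot stall other blocktable users.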
+ struct translation *src;
+
+ int r = 0;
+ switch (type) {
+ case TRANSLATION_CURRENT:
+ src = &_current;
+ break;
+ case TRANSLATION_INPROGRESS:
+ src = &_inprogress;
+ break;
+ case TRANSLATION_CHECKPOINTED:
+ src = &_checkpointed;
+ break;
+ default:
+ r = EINVAL;
+ }
+
+ struct translation fakecurrent;
+ memset(&fakecurrent, 0, sizeof(struct translation));
+
+ struct translation *t = &fakecurrent;
+ if (r == 0) {
+ _mutex_lock();
+ _copy_translation(t, src, TRANSLATION_DEBUG);
+ t->block_translation[RESERVED_BLOCKNUM_TRANSLATION] =
+ src->block_translation[RESERVED_BLOCKNUM_TRANSLATION];
+ _mutex_unlock();
+ int64_t i;
+ for (i=0; i<t->smallest_never_used_blocknum.b; i++) {
+ struct block_translation_pair pair = t->block_translation[i];
+ if (data_only && i< RESERVED_BLOCKNUMS) continue;
+ if (used_only && pair.size <= 0) continue;
+ r = f(make_blocknum(i), pair.size, pair.u.diskoff, extra);
+ if (r!=0) break;
+ }
+ toku_free(t->block_translation);
+ }
+ return r;
+}
+
+typedef struct {
+ int64_t used_space;
+ int64_t total_space;
+} frag_extra;
+
+static int frag_helper(BLOCKNUM UU(b), int64_t size, int64_t address, void *extra) {
+ frag_extra *info = (frag_extra *) extra;
+
+ if (size + address > info->total_space)
+ info->total_space = size + address;
+ info->used_space += size;
+ return 0;
+}
+
+void block_table::internal_fragmentation(int64_t *total_sizep, int64_t *used_sizep) {
+ frag_extra info = { 0, 0 };
+ int r = iterate(TRANSLATION_CHECKPOINTED, frag_helper, &info, false, true);
+ assert_zero(r);
+
+ if (total_sizep) *total_sizep = info.total_space;
+ if (used_sizep) *used_sizep = info.used_space;
+}
+
+void block_table::_realloc_descriptor_on_disk_unlocked(DISKOFF size, DISKOFF *offset, FT ft) {
+ toku_mutex_assert_locked(&_mutex);
+ BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_DESCRIPTOR);
+ _realloc_on_disk_internal(b, size, offset, ft, false, 0);
+}
+
+void block_table::realloc_descriptor_on_disk(DISKOFF size, DISKOFF *offset, FT ft, int fd) {
+ _mutex_lock();
+ _realloc_descriptor_on_disk_unlocked(size, offset, ft);
+ _ensure_safe_write_unlocked(fd, size, *offset);
+ _mutex_unlock();
+}
+
+void block_table::get_descriptor_offset_size(DISKOFF *offset, DISKOFF *size) {
+ _mutex_lock();
+ BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_DESCRIPTOR);
+ _translate_blocknum_to_offset_size_unlocked(b, offset, size);
+ _mutex_unlock();
+}
+
+void block_table::get_fragmentation_unlocked(TOKU_DB_FRAGMENTATION report) {
+ // Requires: blocktable lock is held.
+ // Requires: report->file_size_bytes is already filled in.
+
+ // Count the headers.
+ report->data_bytes = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
+ report->data_blocks = 1;
+ report->checkpoint_bytes_additional = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
+ report->checkpoint_blocks_additional = 1;
+
+ struct translation *current = &_current;
+ for (int64_t i = 0; i < current->length_of_array; i++) {
+ struct block_translation_pair *pair = &current->block_translation[i];
+ if (pair->size > 0) {
+ report->data_bytes += pair->size;
+ report->data_blocks++;
+ }
+ }
+
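+    // A checkpointed or inprogress pair counts as checkpoint overhead only
+    // if no current pair already occupies the same disk offset; the
+    // conditions below avoid double counting blocks shared between
+    // translations.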
+ struct translation *checkpointed = &_checkpointed;
+ for (int64_t i = 0; i < checkpointed->length_of_array; i++) {
+ struct block_translation_pair *pair = &checkpointed->block_translation[i];
+ if (pair->size > 0 && !(i < current->length_of_array &&
+ current->block_translation[i].size > 0 &&
+ current->block_translation[i].u.diskoff == pair->u.diskoff)) {
+ report->checkpoint_bytes_additional += pair->size;
+ report->checkpoint_blocks_additional++;
+ }
+ }
+
+ struct translation *inprogress = &_inprogress;
+ for (int64_t i = 0; i < inprogress->length_of_array; i++) {
+ struct block_translation_pair *pair = &inprogress->block_translation[i];
+ if (pair->size > 0 && !(i < current->length_of_array &&
+ current->block_translation[i].size > 0 &&
+ current->block_translation[i].u.diskoff == pair->u.diskoff) &&
+ !(i < checkpointed->length_of_array &&
+ checkpointed->block_translation[i].size > 0 &&
+ checkpointed->block_translation[i].u.diskoff == pair->u.diskoff)) {
+ report->checkpoint_bytes_additional += pair->size;
+ report->checkpoint_blocks_additional++;
+ }
+ }
+
+ _bt_block_allocator.get_unused_statistics(report);
+}
+
+void block_table::get_info64(struct ftinfo64 *s) {
+ _mutex_lock();
+
+ struct translation *current = &_current;
+ s->num_blocks_allocated = current->length_of_array;
+ s->num_blocks_in_use = 0;
+ s->size_allocated = 0;
+ s->size_in_use = 0;
+
+ for (int64_t i = 0; i < current->length_of_array; ++i) {
+ struct block_translation_pair *block = &current->block_translation[i];
+ if (block->size != size_is_free) {
+ ++s->num_blocks_in_use;
+ s->size_in_use += block->size;
+ if (block->u.diskoff != diskoff_unused) {
+ uint64_t limit = block->u.diskoff + block->size;
+ if (limit > s->size_allocated) {
+ s->size_allocated = limit;
+ }
+ }
+ }
+ }
+
+ _mutex_unlock();
+}
+
+int block_table::iterate_translation_tables(uint64_t checkpoint_count,
+ int (*iter)(uint64_t checkpoint_count,
+ int64_t total_num_rows,
+ int64_t blocknum,
+ int64_t diskoff,
+ int64_t size,
+ void *extra),
+ void *iter_extra) {
+ int error = 0;
+ _mutex_lock();
+
+ int64_t total_num_rows = _current.length_of_array + _checkpointed.length_of_array;
+ for (int64_t i = 0; error == 0 && i < _current.length_of_array; ++i) {
+ struct block_translation_pair *block = &_current.block_translation[i];
+ error = iter(checkpoint_count, total_num_rows, i, block->u.diskoff, block->size, iter_extra);
+ }
+ for (int64_t i = 0; error == 0 && i < _checkpointed.length_of_array; ++i) {
+ struct block_translation_pair *block = &_checkpointed.block_translation[i];
+ error = iter(checkpoint_count - 1, total_num_rows, i, block->u.diskoff, block->size, iter_extra);
+ }
+
+ _mutex_unlock();
+ return error;
+}
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_table.h b/storage/tokudb/PerconaFT/ft/serialize/block_table.h
new file mode 100644
index 00000000000..8d391674540
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/block_table.h
@@ -0,0 +1,285 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <db.h>
+
+#include "portability/toku_stdint.h"
+#include "portability/toku_pthread.h"
+
+#include "ft/serialize/block_allocator.h"
+#include "util/nb_mutex.h"
+
+struct ft;
+
+typedef struct blocknum_s { int64_t b; } BLOCKNUM;
+
+// Offset in a disk file. -1 is the 'null' pointer.
+typedef int64_t DISKOFF;
+
+// Unmovable reserved blocknums come first, then reallocable ones.
+// Blocknums are reserved for the translation table and the descriptor.
+enum {
+ RESERVED_BLOCKNUM_NULL = 0,
+ RESERVED_BLOCKNUM_TRANSLATION = 1,
+ RESERVED_BLOCKNUM_DESCRIPTOR = 2,
+ RESERVED_BLOCKNUMS
+};
+
+typedef int (*BLOCKTABLE_CALLBACK)(BLOCKNUM b, int64_t size, int64_t address, void *extra);
+
+static inline BLOCKNUM make_blocknum(int64_t b) {
+ BLOCKNUM result = { .b = b };
+ return result;
+}
+static const BLOCKNUM ROLLBACK_NONE = { .b = 0 };
+
+/**
+ * There are three copies of the translation table (btt) in the block table:
+ *
+ * checkpointed Is initialized by deserializing from disk,
+ * and is the only version ever read from disk.
+ * When read from disk it is copied to current.
+ * It is immutable. It can be replaced by an inprogress btt.
+ *
+ * inprogress Is only filled by copying from current,
+ * and is the only version ever serialized to disk.
+ * (It is serialized to disk on checkpoint and clean shutdown.)
+ * At end of checkpoint it replaces 'checkpointed'.
+ * During a checkpoint, any 'pending' dirty writes will update
+ * inprogress.
+ *
+ * current Is initialized by copying from checkpointed,
+ * is the only version ever modified while the database is in use,
+ * and is the only version ever copied to inprogress.
+ * It is never stored on disk.
+ */
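+//
+// A sketch of how the three copies move during a checkpoint, inferred from
+// the description above (see note_start_checkpoint_unlocked and
+// note_end_checkpoint):
+//
+//   checkpoint begin:   current  --copy-->  inprogress
+//   during checkpoint:  writes update current (and inprogress if pending)
+//   checkpoint end:     inprogress  --replaces-->  checkpointed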
+class block_table {
+public:
+ enum translation_type {
+ TRANSLATION_NONE = 0,
+ TRANSLATION_CURRENT,
+ TRANSLATION_INPROGRESS,
+ TRANSLATION_CHECKPOINTED,
+ TRANSLATION_DEBUG
+ };
+
+ void create();
+
+ int create_from_buffer(int fd, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer);
+
+ void destroy();
+
+ // Checkpointing
+ void note_start_checkpoint_unlocked();
+ void note_end_checkpoint(int fd);
+ void note_skipped_checkpoint();
+ void maybe_truncate_file_on_open(int fd);
+
+ // Blocknums
+ void allocate_blocknum(BLOCKNUM *res, struct ft *ft);
+ void realloc_on_disk(BLOCKNUM b, DISKOFF size, DISKOFF *offset, struct ft *ft, int fd, bool for_checkpoint, uint64_t heat);
+ void free_blocknum(BLOCKNUM *b, struct ft *ft, bool for_checkpoint);
+ void translate_blocknum_to_offset_size(BLOCKNUM b, DISKOFF *offset, DISKOFF *size);
+ void free_unused_blocknums(BLOCKNUM root);
+ void realloc_descriptor_on_disk(DISKOFF size, DISKOFF *offset, struct ft *ft, int fd);
+ void get_descriptor_offset_size(DISKOFF *offset, DISKOFF *size);
+
+    // External verification
+ void verify_blocknum_allocated(BLOCKNUM b);
+ void verify_no_data_blocks_except_root(BLOCKNUM root);
+ void verify_no_free_blocknums();
+
+ // Serialization
+ void serialize_translation_to_wbuf(int fd, struct wbuf *w, int64_t *address, int64_t *size);
+
+    // DEBUG ONLY (used by ftdump and by tests)
+ void blocknum_dump_translation(BLOCKNUM b);
+ void dump_translation_table_pretty(FILE *f);
+ void dump_translation_table(FILE *f);
+ void block_free(uint64_t offset);
+
+ int iterate(enum translation_type type, BLOCKTABLE_CALLBACK f, void *extra, bool data_only, bool used_only);
+ void internal_fragmentation(int64_t *total_sizep, int64_t *used_sizep);
+
+ // Requires: blocktable lock is held.
+ // Requires: report->file_size_bytes is already filled in.
+ void get_fragmentation_unlocked(TOKU_DB_FRAGMENTATION report);
+
+ int64_t get_blocks_in_use_unlocked();
+
+ void get_info64(struct ftinfo64 *);
+
+ int iterate_translation_tables(uint64_t, int (*)(uint64_t, int64_t, int64_t, int64_t, int64_t, void *), void *);
+
+private:
+ struct block_translation_pair {
+ // If in the freelist, use next_free_blocknum, otherwise diskoff.
+ union {
+ DISKOFF diskoff;
+ BLOCKNUM next_free_blocknum;
+ } u;
+
+ // Set to 0xFFFFFFFFFFFFFFFF for free
+ DISKOFF size;
+ };
+
+ // This is the BTT (block translation table)
+ // When the translation (btt) is stored on disk:
+ // In Header:
+ // size_on_disk
+ // location_on_disk
+ // In block translation table (in order):
+ // smallest_never_used_blocknum
+ // blocknum_freelist_head
+ // array
+ // a checksum
+ struct translation {
+ enum translation_type type;
+
+        // Number of elements in array (block_translation). Always >= smallest_never_used_blocknum.
+ int64_t length_of_array;
+ BLOCKNUM smallest_never_used_blocknum;
+
+ // Next (previously used) unused blocknum (free list)
+ BLOCKNUM blocknum_freelist_head;
+ struct block_translation_pair *block_translation;
+
+ // size_on_disk is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].size
+        // location_on_disk is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff
+ };
+
+ void _create_internal();
+ int _translation_deserialize_from_buffer(struct translation *t, // destination into which to deserialize
+ DISKOFF location_on_disk, // location of translation_buffer
+ uint64_t size_on_disk,
+ unsigned char * translation_buffer); // buffer with serialized translation
+
+ void _copy_translation(struct translation *dst, struct translation *src, enum translation_type newtype);
+ void _maybe_optimize_translation(struct translation *t);
+ void _maybe_expand_translation(struct translation *t);
+ bool _translation_prevents_freeing(struct translation *t, BLOCKNUM b, struct block_translation_pair *old_pair);
+ void _free_blocknum_in_translation(struct translation *t, BLOCKNUM b);
+ int64_t _calculate_size_on_disk(struct translation *t);
+ bool _pair_is_unallocated(struct block_translation_pair *pair);
+ void _alloc_inprogress_translation_on_disk_unlocked();
+ void _dump_translation_internal(FILE *f, struct translation *t);
+
+ // Blocknum management
+ void _allocate_blocknum_unlocked(BLOCKNUM *res, struct ft *ft);
+ void _free_blocknum_unlocked(BLOCKNUM *bp, struct ft *ft, bool for_checkpoint);
+ void _realloc_descriptor_on_disk_unlocked(DISKOFF size, DISKOFF *offset, struct ft *ft);
+ void _realloc_on_disk_internal(BLOCKNUM b, DISKOFF size, DISKOFF *offset, struct ft *ft, bool for_checkpoint, uint64_t heat);
+ void _translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, DISKOFF *offset, DISKOFF *size);
+
+ // File management
+ void _maybe_truncate_file(int fd, uint64_t size_needed_before);
+ void _ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOFF block_offset);
+
+ // Verification
+ bool _is_valid_blocknum(struct translation *t, BLOCKNUM b);
+ void _verify_valid_blocknum(struct translation *t, BLOCKNUM b);
+ bool _is_valid_freeable_blocknum(struct translation *t, BLOCKNUM b);
+ void _verify_valid_freeable_blocknum(struct translation *t, BLOCKNUM b);
+ bool _no_data_blocks_except_root(BLOCKNUM root);
+ bool _blocknum_allocated(BLOCKNUM b);
+
+ // Locking
+ //
+ // TODO: Move the lock to the FT
+ void _mutex_lock();
+ void _mutex_unlock();
+
+ // The current translation is the one used by client threads.
+ // It is not represented on disk.
+ struct translation _current;
+
+ // The translation used by the checkpoint currently in progress.
+ // If the checkpoint thread allocates a block, it must also update the current translation.
+ struct translation _inprogress;
+
+ // The translation for the data that shall remain inviolate on disk until the next checkpoint finishes,
+ // after which any blocks used only in this translation can be freed.
+ struct translation _checkpointed;
+
+ // The in-memory data structure for block allocation.
+ // There is no on-disk data structure for block allocation.
+ // Note: This is *allocation* not *translation* - the block allocator is unaware of which
+ // blocks are used for which translation, but simply allocates and deallocates blocks.
+ block_allocator _bt_block_allocator;
+ toku_mutex_t _mutex;
+ struct nb_mutex _safe_file_size_lock;
+ bool _checkpoint_skipped;
+ uint64_t _safe_file_size;
+
+ // Because the lock is in a weird place right now
+ friend void toku_ft_lock(struct ft *ft);
+ friend void toku_ft_unlock(struct ft *ft);
+};
+
+// For serialize / deserialize
+
+#include "ft/serialize/wbuf.h"
+
+static inline void wbuf_BLOCKNUM (struct wbuf *w, BLOCKNUM b) {
+ wbuf_ulonglong(w, b.b);
+}
+
+static inline void wbuf_nocrc_BLOCKNUM (struct wbuf *w, BLOCKNUM b) {
+ wbuf_nocrc_ulonglong(w, b.b);
+}
+
+static inline void wbuf_DISKOFF(struct wbuf *wb, DISKOFF off) {
+ wbuf_ulonglong(wb, (uint64_t) off);
+}
+
+#include "ft/serialize/rbuf.h"
+
+static inline DISKOFF rbuf_DISKOFF(struct rbuf *rb) {
+ return rbuf_ulonglong(rb);
+}
+
+static inline BLOCKNUM rbuf_blocknum(struct rbuf *rb) {
+ BLOCKNUM result = make_blocknum(rbuf_longlong(rb));
+ return result;
+}
+
+static inline void rbuf_ma_BLOCKNUM(struct rbuf *rb, memarena *UU(ma), BLOCKNUM *blocknum) {
+ *blocknum = rbuf_blocknum(rb);
+}
diff --git a/storage/tokudb/PerconaFT/ft/serialize/compress.cc b/storage/tokudb/PerconaFT/ft/serialize/compress.cc
new file mode 100644
index 00000000000..1719b6b7cb5
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/compress.cc
@@ -0,0 +1,257 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <toku_portability.h>
+#include <util/scoped_malloc.h>
+
+#include <zlib.h>
+#include <lzma.h>
+#include <snappy.h>
+
+#include "compress.h"
+#include "memory.h"
+#include "quicklz.h"
+#include "toku_assert.h"
+
+static inline enum toku_compression_method
+normalize_compression_method(enum toku_compression_method method)
+// Effect: resolve "friendly" names like "fast" and "small" into their real values.
+{
+ switch (method) {
+ case TOKU_DEFAULT_COMPRESSION_METHOD:
+ case TOKU_FAST_COMPRESSION_METHOD:
+ return TOKU_QUICKLZ_METHOD;
+ case TOKU_SMALL_COMPRESSION_METHOD:
+ return TOKU_LZMA_METHOD;
+ default:
+ return method; // everything else is fine
+ }
+}
+
+size_t toku_compress_bound (enum toku_compression_method a, size_t size)
+// See compress.h for the specification of this function.
+{
+ a = normalize_compression_method(a);
+ switch (a) {
+ case TOKU_NO_COMPRESSION:
+ return size + 1;
+ case TOKU_LZMA_METHOD:
+        return 1+lzma_stream_buffer_bound(size); // We need one extra for the rfc1950-style header byte (bits 0-3 are TOKU_LZMA_METHOD, bits 4-7 are the compression level)
+ case TOKU_QUICKLZ_METHOD:
+ return size+400 + 1; // quicklz manual says 400 bytes is enough. We need one more byte for the rfc1950-style header byte. bits 0-3 are 9, bits 4-7 are the QLZ_COMPRESSION_LEVEL.
+ case TOKU_ZLIB_METHOD:
+ return compressBound (size);
+ case TOKU_ZLIB_WITHOUT_CHECKSUM_METHOD:
+ return 2+deflateBound(nullptr, size); // We need one extra for the rfc1950-style header byte, and one extra to store windowBits (a bit over cautious about future upgrades maybe).
+ case TOKU_SNAPPY_METHOD:
+ return (1 + snappy::MaxCompressedLength(size));
+ default:
+ break;
+ }
+    // fall through for bad enum (thus the compiler can warn us if we didn't use all the enums)
+ assert(0); return 0;
+}
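+
+// Every buffer produced by toku_compress() begins with an rfc1950-style
+// header byte; a sketch of how it decodes (this matches the
+// "source[0] & 0xF" dispatch in toku_decompress below):
+//
+//   enum toku_compression_method method =
+//       (enum toku_compression_method) (buf[0] & 0xF); // low 4 bits: method
+//   int level = buf[0] >> 4;                           // high 4 bits: level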
+
+void toku_compress (enum toku_compression_method a,
+ // the following types and naming conventions come from zlib.h
+ Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen)
+// See compress.h for the specification of this function.
+{
+ static const int zlib_compression_level = 5;
+ static const int zlib_without_checksum_windowbits = -15;
+
+ a = normalize_compression_method(a);
+ assert(sourceLen < (1LL << 32));
+ switch (a) {
+ case TOKU_NO_COMPRESSION:
+ dest[0] = TOKU_NO_COMPRESSION;
+ memcpy(dest + 1, source, sourceLen);
+ *destLen = sourceLen + 1;
+ return;
+ case TOKU_ZLIB_METHOD: {
+ int r = compress2(dest, destLen, source, sourceLen, zlib_compression_level);
+ assert(r == Z_OK);
+ assert((dest[0]&0xF) == TOKU_ZLIB_METHOD);
+ return;
+ }
+ case TOKU_QUICKLZ_METHOD: {
+ if (sourceLen==0) {
+ // quicklz requires at least one byte, so we handle this ourselves
+ assert(1 <= *destLen);
+ *destLen = 1;
+ } else {
+ toku::scoped_calloc qsc_buf(sizeof(qlz_state_compress));
+ qlz_state_compress *qsc = reinterpret_cast<qlz_state_compress *>(qsc_buf.get());
+ size_t actual_destlen = qlz_compress(source, (char*)(dest+1), sourceLen, qsc);
+ assert(actual_destlen + 1 <= *destLen);
+ // add one for the rfc1950-style header byte.
+ *destLen = actual_destlen + 1;
+ }
+ // Fill in that first byte
+ dest[0] = TOKU_QUICKLZ_METHOD + (QLZ_COMPRESSION_LEVEL << 4);
+ return;
+ }
+ case TOKU_LZMA_METHOD: {
+ const int lzma_compression_level = 2;
+ if (sourceLen==0) {
+ // lzma version 4.999 requires at least one byte, so we'll do it ourselves.
+ assert(1<=*destLen);
+ *destLen = 1;
+ } else {
+ size_t out_pos = 1;
+ lzma_ret r = lzma_easy_buffer_encode(lzma_compression_level,
+ LZMA_CHECK_NONE, NULL,
+ source, sourceLen,
+ dest, &out_pos, *destLen);
+ assert(out_pos < *destLen);
+ if (r != LZMA_OK) {
+ fprintf(stderr, "lzma_easy_buffer_encode() returned %d\n", (int) r);
+ }
+ assert(r==LZMA_OK);
+ *destLen = out_pos;
+ }
+ dest[0] = TOKU_LZMA_METHOD + (lzma_compression_level << 4);
+ return;
+ }
+ case TOKU_ZLIB_WITHOUT_CHECKSUM_METHOD: {
+ z_stream strm;
+ strm.zalloc = Z_NULL;
+ strm.zfree = Z_NULL;
+ strm.opaque = Z_NULL;
+ strm.next_in = const_cast<Bytef *>(source);
+ strm.avail_in = sourceLen;
+ int r = deflateInit2(&strm, zlib_compression_level, Z_DEFLATED,
+ zlib_without_checksum_windowbits, 8, Z_DEFAULT_STRATEGY);
+ lazy_assert(r == Z_OK);
+ strm.next_out = dest + 2;
+ strm.avail_out = *destLen - 2;
+ r = deflate(&strm, Z_FINISH);
+ lazy_assert(r == Z_STREAM_END);
+ r = deflateEnd(&strm);
+ lazy_assert(r == Z_OK);
+ *destLen = strm.total_out + 2;
+ dest[0] = TOKU_ZLIB_WITHOUT_CHECKSUM_METHOD + (zlib_compression_level << 4);
+ dest[1] = zlib_without_checksum_windowbits;
+ return;
+ }
+ case TOKU_SNAPPY_METHOD: {
+ snappy::RawCompress((char*)source, sourceLen, (char*)dest + 1, destLen);
+ *destLen += 1;
+ dest[0] = TOKU_SNAPPY_METHOD;
+ return;
+ }
+ default:
+ break;
+ }
+ // default fall through to error.
+ assert(0);
+}
+
+void toku_decompress (Bytef *dest, uLongf destLen,
+ const Bytef *source, uLongf sourceLen)
+// See compress.h for the specification of this function.
+{
+ assert(sourceLen>=1); // need at least one byte for the RFC header.
+ switch (source[0] & 0xF) {
+ case TOKU_NO_COMPRESSION:
+ memcpy(dest, source + 1, sourceLen - 1);
+ return;
+ case TOKU_ZLIB_METHOD: {
+ uLongf actual_destlen = destLen;
+ int r = uncompress(dest, &actual_destlen, source, sourceLen);
+ assert(r == Z_OK);
+ assert(actual_destlen == destLen);
+ return;
+ }
+ case TOKU_QUICKLZ_METHOD:
+ if (sourceLen>1) {
+ toku::scoped_calloc state_buf(sizeof(qlz_state_decompress));
+ qlz_state_decompress *qsd = reinterpret_cast<qlz_state_decompress *>(state_buf.get());
+ uLongf actual_destlen = qlz_decompress((char*)source+1, dest, qsd);
+ assert(actual_destlen == destLen);
+ } else {
+ // length 1 means there is no data, so do nothing.
+ assert(destLen==0);
+ }
+ return;
+ case TOKU_LZMA_METHOD: {
+ if (sourceLen>1) {
+ uint64_t memlimit = UINT64_MAX;
+ size_t out_pos = 0;
+ size_t in_pos = 1;
+ lzma_ret r = lzma_stream_buffer_decode(&memlimit, // memlimit, use UINT64_MAX to disable this check
+ 0, // flags
+ NULL, // allocator
+ source, &in_pos, sourceLen,
+ dest, &out_pos, destLen);
+ assert(r==LZMA_OK);
+ assert(out_pos == destLen);
+ } else {
+ // length 1 means there is no data, so do nothing.
+ assert(destLen==0);
+ }
+ return;
+ }
+ case TOKU_ZLIB_WITHOUT_CHECKSUM_METHOD: {
+ z_stream strm;
+ strm.next_in = const_cast<Bytef *>(source + 2);
+ strm.avail_in = sourceLen - 2;
+ strm.zalloc = Z_NULL;
+ strm.zfree = Z_NULL;
+ strm.opaque = Z_NULL;
+ char windowBits = source[1];
+ int r = inflateInit2(&strm, windowBits);
+ lazy_assert(r == Z_OK);
+ strm.next_out = dest;
+ strm.avail_out = destLen;
+ r = inflate(&strm, Z_FINISH);
+ lazy_assert(r == Z_STREAM_END);
+ r = inflateEnd(&strm);
+ lazy_assert(r == Z_OK);
+ return;
+ }
+ case TOKU_SNAPPY_METHOD: {
+ bool r = snappy::RawUncompress((char*)source + 1, sourceLen - 1, (char*)dest);
+ assert(r);
+ return;
+ }
+ }
+ // default fall through to error.
+ assert(0);
+}
diff --git a/storage/tokudb/PerconaFT/ft/serialize/compress.h b/storage/tokudb/PerconaFT/ft/serialize/compress.h
new file mode 100644
index 00000000000..74307985e75
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/compress.h
@@ -0,0 +1,78 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <zlib.h>
+#include <db.h>
+
+// The following provides an abstraction over the supported compression libraries.
+// We offer four compression methods: ZLIB, QUICKLZ, LZMA, and SNAPPY, as well as a "no compression" option. These options are declared in make_tdb.c.
+// The resulting byte string includes enough information for us to decompress it. That is, we can tell whether it's z-compressed, qz-compressed, xz-compressed, or snappy-compressed.
+
+size_t toku_compress_bound (enum toku_compression_method a, size_t size);
+// Effect: Return the number of bytes needed to compress a buffer of size SIZE using compression method A.
+// Typically, the result is a little bit larger than SIZE, since some data cannot be compressed.
+// Usage note: It may help to know roughly how much space is involved.
+// zlib's bound is something like size + (size>>12) + (size>>14) + (size>>25) + 13.
+// quicklz's bound is something like size+400.
+
+void toku_compress (enum toku_compression_method a,
+ // the following types and naming conventions come from zlib.h
+ Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen);
+// Effect: Using compression method A, compress SOURCE into DEST. The number of bytes to compress is passed in SOURCELEN.
+// On input: *destLen is the size of the buffer.
+// On output: *destLen is the size of the actual compressed data.
+// Usage note: sourceLen may be zero (unlike for quicklz, which requires sourceLen>0).
+// Requires: The buffer must be big enough to hold the compressed data. (That is *destLen >= compressBound(a, sourceLen))
+// Requires: sourceLen < 2^32.
+// Usage note: Although we *try* to assert if DESTLEN isn't big enough, it's possible that it's too late by then (in the case of quicklz, which offers
+//  no way to avoid a buffer overrun.)  So we require that DESTLEN is big enough.
+// Rationale: zlib's argument order is DEST then SOURCE with the size of the buffer passed in *destLen, and the size of the result returned in *destLen.
+//   quicklz's argument order is SOURCE then DEST with the size returned (and it has no way to verify that an overrun didn't happen).
+// We use zlib's calling conventions partly because it is safer, and partly because it is more established.
+// We also use zlib's ugly camel case convention for destLen and sourceLen.
+// Unlike zlib, we return no error codes. Instead, we require that the data be OK and the size of the buffers is OK, and assert if there's a problem.
+
+void toku_decompress (Bytef *dest, uLongf destLen,
+ const Bytef *source, uLongf sourceLen);
+// Effect: Decompress source (length sourceLen) into dest (length destLen).
+// This function can decompress data compressed by toku_compress() with any of the supported methods (the header byte it writes tells us which one was used).
+// Requires: destLen is equal to the actual decompressed size of the data.
+// Requires: The source must have been properly compressed.
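+
+// Example round trip (a minimal sketch of the API above; the method choice
+// is illustrative and error handling is by assertion, as in toku_compress):
+//   uLongf clen = toku_compress_bound(TOKU_ZLIB_METHOD, srclen);
+//   Bytef *XMALLOC_N(clen, cbuf);
+//   toku_compress(TOKU_ZLIB_METHOD, cbuf, &clen, src, srclen);
+//   // ... store cbuf/clen somewhere, remembering srclen ...
+//   toku_decompress(dst, srclen, cbuf, clen); // dst must hold exactly srclen bytes
+//   toku_free(cbuf);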
diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft-node-deserialize.cc b/storage/tokudb/PerconaFT/ft/serialize/ft-node-deserialize.cc
new file mode 100644
index 00000000000..02a9dfd085c
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/ft-node-deserialize.cc
@@ -0,0 +1,186 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "ft/node.h"
+#include "ft/ft-internal.h"
+#include "ft/serialize/ft_node-serialize.h"
+
+/*
+ * ft-node-deserialize.c -
+ *     This file contains functions used by deserialization
+ * code paths in and out of the engine. The functions can,
+ * essentially, be broken up into two types. Some of these
+ * functions return error codes based on expected values inside
+ * the fractal tree node, others merely read specific
+ * quantities of bytes out of the buffer. It is expected
+ * that these will be called in the correct order by users
+ * of these functions/this API.
+ *
+ */
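+
+/*
+ * One plausible calling order, inferred from the functions below (a sketch,
+ * not a verbatim engine code path; error handling elided):
+ *
+ *     initialize_ftnode(node, blocknum);
+ *     r = read_and_check_magic(rb);
+ *     r = read_and_check_version(node, rb);
+ *     read_node_info(node, rb, node->layout_version_read_from_disk);
+ *     allocate_and_read_partition_offsets(node, rb, ndd);
+ *     r = check_node_info_checksum(rb);
+ */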
+
+// Sets initial values for the given fractal tree node to be
+// deserialized
+void
+initialize_ftnode(FTNODE node, BLOCKNUM blocknum)
+{
+ node->fullhash = 0xDEADBEEF; // <CER> Is this 'spoof' ok?
+ node->blocknum = blocknum;
+ node->dirty = 0;
+ node->bp = NULL;
+ // <CER> Can we use this initialization as a correctness assert in
+ // a later function?
+ node->layout_version_read_from_disk = 0;
+}
+
+/************************
+ * TODO: In other deserialization code, we check the rb size member. We
+ * verify that it is greater than or equal to 24. Ignoring this magic
+ * number for a moment, should we put this check in its own function?
+ ************************/
+
+
+// Read and check the 'magic' bytes on disk. Returns an error if
+// the magic does not match.
+int
+read_and_check_magic(struct rbuf *rb)
+{
+ int r = 0;
+ const void *magic;
+ rbuf_literal_bytes(rb, &magic, 8);
+ if (memcmp(magic, "tokuleaf", 8)!=0 &&
+ memcmp(magic, "tokunode", 8)!=0) {
+ r = DB_BADFORMAT; // TODO: Return more meaningful error.
+ }
+
+ return r;
+}
+
+// Reads the version number from the given buffer
+// and returns an error if the version is too old.
+int
+read_and_check_version(FTNODE node, struct rbuf *rb)
+{
+ int r = 0;
+ int version = rbuf_int(rb);
+ node->layout_version_read_from_disk = version;
+ if (version < FT_LAYOUT_MIN_SUPPORTED_VERSION) {
+ r = 1; // TODO: Better error reporting.
+ }
+
+ return r;
+}
+
+// Reads the basic version, build, and child info from
+// the given buffer.
+void
+read_node_info(FTNODE node, struct rbuf *rb, int version)
+{
+ node->layout_version = version;
+ node->layout_version_original = rbuf_int(rb);
+ node->build_id = rbuf_int(rb);
+ node->n_children = rbuf_int(rb);
+}
+
+// Allocates the partitions based on the given node's number
+// of children. It then reads, out of the given buffer,
+// the start and size of each child partition.
+// TODO: Should these be two separate functions?
+void
+allocate_and_read_partition_offsets(FTNODE node, struct rbuf *rb, FTNODE_DISK_DATA *ndd)
+{
+ XMALLOC_N(node->n_children, node->bp);
+ // TODO: Fix this to use xmalloc_n
+ XMALLOC_N(node->n_children, *ndd);
+ // Read the partition locations.
+ for (int i = 0; i < node->n_children; i++) {
+ BP_START(*ndd, i) = rbuf_int(rb);
+ BP_SIZE (*ndd, i) = rbuf_int(rb);
+ }
+}
+
+// Compares the checksum stored in the given buffer against
+// the checksum of the buffer contents. If these are NOT
+// equal, this function returns an appropriate error code.
+int
+check_node_info_checksum(struct rbuf *rb)
+{
+ int r = 0;
+ // Verify checksum of header stored.
+ uint32_t checksum = toku_x1764_memory(rb->buf, rb->ndone);
+ uint32_t stored_checksum = rbuf_int(rb);
+
+ if (stored_checksum != checksum) {
+ // TODO: dump_bad_block(rb->buf, rb->size);
+ r = TOKUDB_BAD_CHECKSUM;
+ }
+
+ return r;
+}
+
+// Reads node info from older (13 and 14) fractal tree nodes
+// out of the given buffer.
+void
+read_legacy_node_info(FTNODE node, struct rbuf *rb, int version)
+{
+ (void)rbuf_int(rb); // 1. nodesize
+ node->flags = rbuf_int(rb); // 2. flags
+ node->height = rbuf_int(rb); // 3. height
+
+    // If the version is less than 14, there are two extra ints here;
+    // we read and ignore them.
+ if (version == FT_LAYOUT_VERSION_13) {
+ (void) rbuf_int(rb); // 4. rand4
+ (void) rbuf_int(rb); // 5. local
+ }
+}
+
+// Assuming the given buffer is in the correct position,
+// this checks to see if the stored checksum matches the
+// checksum of the entire buffer.
+int
+check_legacy_end_checksum(struct rbuf *rb)
+{
+ int r = 0;
+ uint32_t expected_xsum = rbuf_int(rb);
+ uint32_t actual_xsum = toku_x1764_memory(rb->buf, rb->size - 4);
+ if (expected_xsum != actual_xsum) {
+ r = TOKUDB_BAD_CHECKSUM;
+ }
+
+ return r;
+}
diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc b/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc
new file mode 100644
index 00000000000..a7bc2949276
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc
@@ -0,0 +1,812 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "ft/ft.h"
+#include "ft/ft-internal.h"
+#include "ft/msg.h"
+#include "ft/serialize/block_allocator.h"
+#include "ft/serialize/block_table.h"
+#include "ft/serialize/compress.h"
+#include "ft/serialize/ft-serialize.h"
+
+// not version-sensitive because we only serialize a descriptor using the current layout_version
+uint32_t
+toku_serialize_descriptor_size(DESCRIPTOR desc) {
+ //Checksum NOT included in this. Checksum only exists in header's version.
+ uint32_t size = 4; // four bytes for size of descriptor
+ size += desc->dbt.size;
+ return size;
+}
+
+static uint32_t
+deserialize_descriptor_size(DESCRIPTOR desc, int layout_version) {
+ //Checksum NOT included in this. Checksum only exists in header's version.
+ uint32_t size = 4; // four bytes for size of descriptor
+ if (layout_version == FT_LAYOUT_VERSION_13)
+ size += 4; // for version 13, include four bytes of "version"
+ size += desc->dbt.size;
+ return size;
+}
+
+void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, DESCRIPTOR desc) {
+ wbuf_bytes(wb, desc->dbt.data, desc->dbt.size);
+}
+
+//Descriptor is written to disk during toku_ft_handle_open iff we have a new (or changed)
+//descriptor.
+//Descriptors are NOT written during the header checkpoint process.
+void
+toku_serialize_descriptor_contents_to_fd(int fd, DESCRIPTOR desc, DISKOFF offset) {
+ // make the checksum
+ int64_t size = toku_serialize_descriptor_size(desc)+4; //4 for checksum
+ int64_t size_aligned = roundup_to_multiple(512, size);
+ struct wbuf w;
+ char *XMALLOC_N_ALIGNED(512, size_aligned, aligned_buf);
+ for (int64_t i=size; i<size_aligned; i++) aligned_buf[i] = 0;
+ wbuf_init(&w, aligned_buf, size);
+ toku_serialize_descriptor_contents_to_wbuf(&w, desc);
+ {
+ //Add checksum
+ uint32_t checksum = toku_x1764_finish(&w.checksum);
+ wbuf_int(&w, checksum);
+ }
+ lazy_assert(w.ndone==w.size);
+ {
+        //Actually write the descriptor
+ toku_os_full_pwrite(fd, w.buf, size_aligned, offset);
+ }
+ toku_free(w.buf);
+}
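+
+// On-disk descriptor layout implied by the code above (a sketch):
+//   [4-byte data size][dbt.size bytes of data][4-byte x1764 checksum]
+// zero-padded up to the next 512-byte boundary for O_DIRECT.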
+
+static void
+deserialize_descriptor_from_rbuf(struct rbuf *rb, DESCRIPTOR desc, int layout_version) {
+ if (layout_version <= FT_LAYOUT_VERSION_13) {
+ // in older versions of tokuft, the descriptor had a 4 byte
+ // version, which we skip over
+ (void) rbuf_int(rb);
+ }
+
+ uint32_t size;
+ const void *data;
+ rbuf_bytes(rb, &data, &size);
+ toku_memdup_dbt(&desc->dbt, data, size);
+}
+
+static int
+deserialize_descriptor_from(int fd, block_table *bt, DESCRIPTOR desc, int layout_version) {
+ int r = 0;
+ DISKOFF offset;
+ DISKOFF size;
+ unsigned char *dbuf = nullptr;
+ bt->get_descriptor_offset_size(&offset, &size);
+ memset(desc, 0, sizeof(*desc));
+ if (size > 0) {
+ lazy_assert(size>=4); //4 for checksum
+ {
+ ssize_t size_to_malloc = roundup_to_multiple(512, size);
+ XMALLOC_N_ALIGNED(512, size_to_malloc, dbuf);
+ {
+
+ ssize_t sz_read = toku_os_pread(fd, dbuf, size_to_malloc, offset);
+ lazy_assert(sz_read==size_to_malloc);
+ }
+ {
+ // check the checksum
+ uint32_t x1764 = toku_x1764_memory(dbuf, size-4);
+ //printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk);
+ uint32_t stored_x1764 = toku_dtoh32(*(int*)(dbuf + size-4));
+ if (x1764 != stored_x1764) {
+ fprintf(stderr, "Descriptor checksum failure: calc=0x%08x read=0x%08x\n", x1764, stored_x1764);
+ r = TOKUDB_BAD_CHECKSUM;
+ toku_free(dbuf);
+ goto exit;
+ }
+ }
+
+ struct rbuf rb = { .buf = dbuf, .size = (unsigned int) size, .ndone = 0 };
+ deserialize_descriptor_from_rbuf(&rb, desc, layout_version);
+ lazy_assert(deserialize_descriptor_size(desc, layout_version) + 4 == size);
+ toku_free(dbuf);
+ }
+ }
+exit:
+ return r;
+}
+
+int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
+// Effect: Deserialize the ft header.
+// We deserialize ft_header only once and then share everything with all the FTs.
+{
+ int r;
+ FT ft = NULL;
+ paranoid_invariant(version >= FT_LAYOUT_MIN_SUPPORTED_VERSION);
+ paranoid_invariant(version <= FT_LAYOUT_VERSION);
+ // We already know:
+ // we have an rbuf representing the header.
+ // The checksum has been validated
+
+ //Verification of initial elements.
+ //Check magic number
+ const void *magic;
+ rbuf_literal_bytes(rb, &magic, 8);
+ lazy_assert(memcmp(magic,"tokudata",8)==0);
+
+ XCALLOC(ft);
+ ft->checkpoint_header = NULL;
+ toku_list_init(&ft->live_ft_handles);
+
+ //version MUST be in network order on disk regardless of disk order
+ ft->layout_version_read_from_disk = rbuf_network_int(rb);
+ invariant(ft->layout_version_read_from_disk >= FT_LAYOUT_MIN_SUPPORTED_VERSION);
+ invariant(ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION);
+
+ //build_id MUST be in network order on disk regardless of disk order
+ uint32_t build_id;
+ build_id = rbuf_network_int(rb);
+
+ //Size MUST be in network order regardless of disk order.
+ uint32_t size;
+ size = rbuf_network_int(rb);
+ lazy_assert(size == rb->size);
+
+ const void *tmp_byte_order_check;
+ lazy_assert((sizeof tmp_byte_order_check) >= 8);
+ rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order
+ int64_t byte_order_stored;
+ byte_order_stored = *(int64_t*)tmp_byte_order_check;
+ lazy_assert(byte_order_stored == toku_byte_order_host);
+
+ uint64_t checkpoint_count;
+ checkpoint_count = rbuf_ulonglong(rb);
+ LSN checkpoint_lsn;
+ checkpoint_lsn = rbuf_LSN(rb);
+ unsigned nodesize;
+ nodesize = rbuf_int(rb);
+ DISKOFF translation_address_on_disk;
+ translation_address_on_disk = rbuf_DISKOFF(rb);
+ DISKOFF translation_size_on_disk;
+ translation_size_on_disk = rbuf_DISKOFF(rb);
+ lazy_assert(translation_address_on_disk > 0);
+ lazy_assert(translation_size_on_disk > 0);
+
+ // initialize the tree lock
+ toku_ft_init_reflock(ft);
+
+ //Load translation table
+ {
+ size_t size_to_read = roundup_to_multiple(512, translation_size_on_disk);
+ unsigned char *XMALLOC_N_ALIGNED(512, size_to_read, tbuf);
+ {
+            // This cast is messed up on 32-bit systems if the block translation
+ // table is ever more than 4GB. But in that case, the
+ // translation table itself won't fit in main memory.
+ ssize_t readsz = toku_os_pread(fd, tbuf, size_to_read,
+ translation_address_on_disk);
+ assert(readsz >= translation_size_on_disk);
+ assert(readsz <= (ssize_t)size_to_read);
+ }
+ // Create table and read in data.
+ r = ft->blocktable.create_from_buffer(fd,
+ translation_address_on_disk,
+ translation_size_on_disk,
+ tbuf);
+ toku_free(tbuf);
+ if (r != 0) {
+ goto exit;
+ }
+ }
+
+ BLOCKNUM root_blocknum;
+ root_blocknum = rbuf_blocknum(rb);
+ unsigned flags;
+ flags = rbuf_int(rb);
+ if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_13) {
+        // 'TOKU_DB_VALCMP_BUILTIN' is deprecated; just remove the flag
+ flags &= ~TOKU_DB_VALCMP_BUILTIN_13;
+ }
+ int layout_version_original;
+ layout_version_original = rbuf_int(rb);
+ uint32_t build_id_original;
+ build_id_original = rbuf_int(rb);
+ uint64_t time_of_creation;
+ time_of_creation = rbuf_ulonglong(rb);
+ uint64_t time_of_last_modification;
+ time_of_last_modification = rbuf_ulonglong(rb);
+
+ if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_18) {
+        // 17 was the last version with these fields; we no longer store
+        // them, so read and discard them
+ (void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_13
+ if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) {
+ (void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_14
+ }
+ }
+
+ // fake creation during the last checkpoint
+ TXNID root_xid_that_created;
+ root_xid_that_created = checkpoint_lsn.lsn;
+ if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_14) {
+ rbuf_TXNID(rb, &root_xid_that_created);
+ }
+
+ // TODO(leif): get this to default to what's specified, not the
+ // hard-coded default
+ unsigned basementnodesize;
+ basementnodesize = FT_DEFAULT_BASEMENT_NODE_SIZE;
+ uint64_t time_of_last_verification;
+ time_of_last_verification = 0;
+ if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) {
+ basementnodesize = rbuf_int(rb);
+ time_of_last_verification = rbuf_ulonglong(rb);
+ }
+
+ STAT64INFO_S on_disk_stats;
+ on_disk_stats = ZEROSTATS;
+ uint64_t time_of_last_optimize_begin;
+ time_of_last_optimize_begin = 0;
+ uint64_t time_of_last_optimize_end;
+ time_of_last_optimize_end = 0;
+ uint32_t count_of_optimize_in_progress;
+ count_of_optimize_in_progress = 0;
+ MSN msn_at_start_of_last_completed_optimize;
+ msn_at_start_of_last_completed_optimize = ZERO_MSN;
+ if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_18) {
+ on_disk_stats.numrows = rbuf_ulonglong(rb);
+ on_disk_stats.numbytes = rbuf_ulonglong(rb);
+ ft->in_memory_stats = on_disk_stats;
+ time_of_last_optimize_begin = rbuf_ulonglong(rb);
+ time_of_last_optimize_end = rbuf_ulonglong(rb);
+ count_of_optimize_in_progress = rbuf_int(rb);
+ msn_at_start_of_last_completed_optimize = rbuf_MSN(rb);
+ }
+
+ enum toku_compression_method compression_method;
+ MSN highest_unused_msn_for_upgrade;
+ highest_unused_msn_for_upgrade.msn = (MIN_MSN.msn - 1);
+ if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_19) {
+ unsigned char method = rbuf_char(rb);
+ compression_method = (enum toku_compression_method) method;
+ highest_unused_msn_for_upgrade = rbuf_MSN(rb);
+ } else {
+        // we hard coded zlib before 5.2, then switched to quicklz in 5.2
+ if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) {
+ compression_method = TOKU_ZLIB_METHOD;
+ } else {
+ compression_method = TOKU_QUICKLZ_METHOD;
+ }
+ }
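+    // To summarize the branches above: layout versions < 18 imply zlib,
+    // version 18 implies quicklz, and versions >= 19 store the compression
+    // method explicitly in the header.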
+
+ MSN max_msn_in_ft;
+ max_msn_in_ft = ZERO_MSN; // We'll upgrade it from the root node later if necessary
+ if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_21) {
+ max_msn_in_ft = rbuf_MSN(rb);
+ }
+
+ unsigned fanout;
+ fanout = FT_DEFAULT_FANOUT;
+ if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_28) {
+ fanout = rbuf_int(rb);
+ }
+
+ (void) rbuf_int(rb); //Read in checksum and ignore (already verified).
+ if (rb->ndone != rb->size) {
+ fprintf(stderr, "Header size did not match contents.\n");
+ r = EINVAL;
+ goto exit;
+ }
+
+ {
+ struct ft_header h = {
+ .type = FT_CURRENT,
+ .dirty = 0,
+ .checkpoint_count = checkpoint_count,
+ .checkpoint_lsn = checkpoint_lsn,
+ .layout_version = FT_LAYOUT_VERSION,
+ .layout_version_original = layout_version_original,
+ .build_id = build_id,
+ .build_id_original = build_id_original,
+ .time_of_creation = time_of_creation,
+ .root_xid_that_created = root_xid_that_created,
+ .time_of_last_modification = time_of_last_modification,
+ .time_of_last_verification = time_of_last_verification,
+ .root_blocknum = root_blocknum,
+ .flags = flags,
+ .nodesize = nodesize,
+ .basementnodesize = basementnodesize,
+ .compression_method = compression_method,
+ .fanout = fanout,
+ .highest_unused_msn_for_upgrade = highest_unused_msn_for_upgrade,
+ .max_msn_in_ft = max_msn_in_ft,
+ .time_of_last_optimize_begin = time_of_last_optimize_begin,
+ .time_of_last_optimize_end = time_of_last_optimize_end,
+ .count_of_optimize_in_progress = count_of_optimize_in_progress,
+ .count_of_optimize_in_progress_read_from_disk = count_of_optimize_in_progress,
+ .msn_at_start_of_last_completed_optimize = msn_at_start_of_last_completed_optimize,
+ .on_disk_stats = on_disk_stats
+ };
+ XMEMDUP(ft->h, &h);
+ }
+
+ if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) {
+ // This needs ft->h to be non-null, so we have to do it after we
+ // read everything else.
+ r = toku_upgrade_subtree_estimates_to_stat64info(fd, ft);
+ if (r != 0) {
+ goto exit;
+ }
+ }
+ if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_21) {
+ r = toku_upgrade_msn_from_root_to_header(fd, ft);
+ if (r != 0) {
+ goto exit;
+ }
+ }
+
+ invariant((uint32_t) ft->layout_version_read_from_disk == version);
+ r = deserialize_descriptor_from(fd, &ft->blocktable, &ft->descriptor, version);
+ if (r != 0) {
+ goto exit;
+ }
+
+ // initialize for svn #4541
+ toku_clone_dbt(&ft->cmp_descriptor.dbt, ft->descriptor.dbt);
+
+ // Version 13 descriptors had an extra 4 bytes that we don't read
+ // anymore. Since the header is going to think it's the current
+ // version if it gets written out, we need to write the descriptor in
+ // the new format (without those bytes) before that happens.
+ if (version <= FT_LAYOUT_VERSION_13) {
+ toku_ft_update_descriptor_with_fd(ft, &ft->cmp_descriptor, fd);
+ }
+ r = 0;
+exit:
+ if (r != 0 && ft != NULL) {
+ toku_free(ft);
+ ft = NULL;
+ }
+ *ftp = ft;
+ return r;
+}
+
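+// serialize_ft_min_size computes the minimum on-disk header size for a given
+// layout version. Note that the switch below deliberately falls through: each
+// case adds (or, for version 18, subtracts) only the fields that changed at
+// that version, accumulating everything the older versions already had. For
+// example, the minimum size at version 27 is exactly the version-28 size
+// minus the 4-byte fanout field.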
+static size_t
+serialize_ft_min_size (uint32_t version) {
+ size_t size = 0;
+
+ switch(version) {
+ case FT_LAYOUT_VERSION_28:
+ size += sizeof(uint32_t); // fanout in ft
+ case FT_LAYOUT_VERSION_27:
+ case FT_LAYOUT_VERSION_26:
+ case FT_LAYOUT_VERSION_25:
+ case FT_LAYOUT_VERSION_24:
+ case FT_LAYOUT_VERSION_23:
+ case FT_LAYOUT_VERSION_22:
+ case FT_LAYOUT_VERSION_21:
+ size += sizeof(MSN); // max_msn_in_ft
+ case FT_LAYOUT_VERSION_20:
+ case FT_LAYOUT_VERSION_19:
+ size += 1; // compression method
+ size += sizeof(MSN); // highest_unused_msn_for_upgrade
+ case FT_LAYOUT_VERSION_18:
+ size += sizeof(uint64_t); // time_of_last_optimize_begin
+ size += sizeof(uint64_t); // time_of_last_optimize_end
+ size += sizeof(uint32_t); // count_of_optimize_in_progress
+ size += sizeof(MSN); // msn_at_start_of_last_completed_optimize
+ size -= 8; // removed num_blocks_to_upgrade_14
+ size -= 8; // removed num_blocks_to_upgrade_13
+ case FT_LAYOUT_VERSION_17:
+ size += 16;
+ invariant(sizeof(STAT64INFO_S) == 16);
+ case FT_LAYOUT_VERSION_16:
+ case FT_LAYOUT_VERSION_15:
+ size += 4; // basement node size
+        size += 8; // num_blocks_to_upgrade_14 (previously num_blocks_to_upgrade, now one int each for upgrade from 13, 14)
+ size += 8; // time of last verification
+ case FT_LAYOUT_VERSION_14:
+ size += 8; //TXNID that created
+ case FT_LAYOUT_VERSION_13:
+ size += ( 4 // build_id
+ +4 // build_id_original
+ +8 // time_of_creation
+ +8 // time_of_last_modification
+ );
+ // fall through
+ case FT_LAYOUT_VERSION_12:
+ size += (+8 // "tokudata"
+ +4 // version
+ +4 // original_version
+ +4 // size
+ +8 // byte order verification
+ +8 // checkpoint_count
+ +8 // checkpoint_lsn
+ +4 // tree's nodesize
+ +8 // translation_size_on_disk
+ +8 // translation_address_on_disk
+ +4 // checksum
+ +8 // Number of blocks in old version.
+ +8 // diskoff
+ +4 // flags
+ );
+ break;
+ default:
+ abort();
+ }
+
+ lazy_assert(size <= block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
+ return size;
+}
+
+int deserialize_ft_from_fd_into_rbuf(int fd,
+ toku_off_t offset_of_header,
+ struct rbuf *rb,
+ uint64_t *checkpoint_count,
+ LSN *checkpoint_lsn,
+ uint32_t * version_p)
+// Effect: Read and parse the header of a fractal tree
+//
+// Simply reading the raw bytes of the header into an rbuf is insensitive
+// to disk format version. If that ever changes, then modify this.
+//
+// TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the
+// file AND the header is useless
+{
+ int r = 0;
+ const int64_t prefix_size = 8 + // magic ("tokudata")
+ 4 + // version
+ 4 + // build_id
+ 4; // size
+ const int64_t read_size = roundup_to_multiple(512, prefix_size);
+ unsigned char *XMALLOC_N_ALIGNED(512, read_size, prefix);
+ rb->buf = NULL;
+ int64_t n = toku_os_pread(fd, prefix, read_size, offset_of_header);
+ if (n != read_size) {
+ if (n==0) {
+ r = TOKUDB_DICTIONARY_NO_HEADER;
+ } else if (n<0) {
+ r = get_error_errno();
+ } else {
+ r = EINVAL;
+ }
+ toku_free(prefix);
+ goto exit;
+ }
+
+ rbuf_init(rb, prefix, prefix_size);
+
+ //Check magic number
+ const void *magic;
+ rbuf_literal_bytes(rb, &magic, 8);
+ if (memcmp(magic,"tokudata",8)!=0) {
+ if ((*(uint64_t*)magic) == 0) {
+ r = TOKUDB_DICTIONARY_NO_HEADER;
+ } else {
+ r = EINVAL; //Not a tokudb file! Do not use.
+ }
+ goto exit;
+ }
+
+ //Version MUST be in network order regardless of disk order.
+ uint32_t version;
+ version = rbuf_network_int(rb);
+ *version_p = version;
+ if (version < FT_LAYOUT_MIN_SUPPORTED_VERSION) {
+ r = TOKUDB_DICTIONARY_TOO_OLD; //Cannot use
+ goto exit;
+ } else if (version > FT_LAYOUT_VERSION) {
+ r = TOKUDB_DICTIONARY_TOO_NEW; //Cannot use
+ goto exit;
+ }
+
+ //build_id MUST be in network order regardless of disk order.
+ uint32_t build_id __attribute__((__unused__));
+ build_id = rbuf_network_int(rb);
+ int64_t min_header_size;
+ min_header_size = serialize_ft_min_size(version);
+
+ //Size MUST be in network order regardless of disk order.
+ uint32_t size;
+ size = rbuf_network_int(rb);
+    //If too big, it is corrupt. We would probably notice during checksum
+    //verification, but may have to do a multi-gigabyte malloc+read to find out.
+    //If it's too small, reading the rbuf would crash, so verify.
+ if (size > block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE || size < min_header_size) {
+ r = TOKUDB_DICTIONARY_NO_HEADER;
+ goto exit;
+ }
+
+ lazy_assert(rb->ndone==prefix_size);
+ rb->size = size;
+ {
+ toku_free(rb->buf);
+ uint32_t size_to_read = roundup_to_multiple(512, size);
+ XMALLOC_N_ALIGNED(512, size_to_read, rb->buf);
+
+ assert(offset_of_header%512==0);
+ n = toku_os_pread(fd, rb->buf, size_to_read, offset_of_header);
+ if (n != size_to_read) {
+ if (n < 0) {
+ r = get_error_errno();
+ } else {
+ r = EINVAL; //Header might be useless (wrong size) or could be a disk read error.
+ }
+ goto exit;
+ }
+ }
+    //It's a supported version (13 or later). Magic looks OK.
+ //We have an rbuf that represents the header.
+ //Size is within acceptable bounds.
+
+ //Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function changed)
+ uint32_t calculated_x1764;
+ calculated_x1764 = toku_x1764_memory(rb->buf, rb->size-4);
+ uint32_t stored_x1764;
+ stored_x1764 = toku_dtoh32(*(int*)(rb->buf+rb->size-4));
+ if (calculated_x1764 != stored_x1764) {
+ r = TOKUDB_BAD_CHECKSUM; //Header useless
+ fprintf(stderr, "Header checksum failure: calc=0x%08x read=0x%08x\n", calculated_x1764, stored_x1764);
+ goto exit;
+ }
+
+ //Verify byte order
+ const void *tmp_byte_order_check;
+ lazy_assert((sizeof toku_byte_order_host) == 8);
+ rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order
+ int64_t byte_order_stored;
+ byte_order_stored = *(int64_t*)tmp_byte_order_check;
+ if (byte_order_stored != toku_byte_order_host) {
+ r = TOKUDB_DICTIONARY_NO_HEADER; //Cannot use dictionary
+ goto exit;
+ }
+
+ //Load checkpoint count
+ *checkpoint_count = rbuf_ulonglong(rb);
+ *checkpoint_lsn = rbuf_LSN(rb);
+ //Restart at beginning during regular deserialization
+ rb->ndone = 0;
+
+exit:
+ if (r != 0 && rb->buf != NULL) {
+ toku_free(rb->buf);
+ rb->buf = NULL;
+ }
+ return r;
+}
+
+// Read ft from file into struct. Read both headers and use one.
+// We want the latest acceptable header whose checkpoint_lsn is no later
+// than max_acceptable_lsn.
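+// For illustration: if both headers pass their checksums and have a
+// checkpoint_lsn <= max_acceptable_lsn, the one with the larger
+// checkpoint_count wins; because checkpoints alternate between the two
+// header slots, the counts must differ by exactly one (e.g. counts 5 and 4
+// select the count-5 header).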
+int
+toku_deserialize_ft_from(int fd,
+ LSN max_acceptable_lsn,
+ FT *ft)
+{
+ struct rbuf rb_0;
+ struct rbuf rb_1;
+ uint64_t checkpoint_count_0 = 0;
+ uint64_t checkpoint_count_1 = 0;
+ LSN checkpoint_lsn_0;
+ LSN checkpoint_lsn_1;
+ uint32_t version_0 = 0, version_1 = 0, version = 0;
+ bool h0_acceptable = false;
+ bool h1_acceptable = false;
+ struct rbuf *rb = NULL;
+ int r0, r1, r;
+
+ toku_off_t header_0_off = 0;
+ r0 = deserialize_ft_from_fd_into_rbuf(fd, header_0_off, &rb_0, &checkpoint_count_0, &checkpoint_lsn_0, &version_0);
+ if (r0 == 0 && checkpoint_lsn_0.lsn <= max_acceptable_lsn.lsn) {
+ h0_acceptable = true;
+ }
+
+ toku_off_t header_1_off = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
+ r1 = deserialize_ft_from_fd_into_rbuf(fd, header_1_off, &rb_1, &checkpoint_count_1, &checkpoint_lsn_1, &version_1);
+ if (r1 == 0 && checkpoint_lsn_1.lsn <= max_acceptable_lsn.lsn) {
+ h1_acceptable = true;
+ }
+
+ // if either header is too new, the dictionary is unreadable
+ if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW ||
+ !(h0_acceptable || h1_acceptable)) {
+ // We were unable to read either header or at least one is too
+ // new. Certain errors are higher priority than others. Order of
+ // these if/else if is important.
+ if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW) {
+ r = TOKUDB_DICTIONARY_TOO_NEW;
+ } else if (r0 == TOKUDB_DICTIONARY_TOO_OLD || r1 == TOKUDB_DICTIONARY_TOO_OLD) {
+ r = TOKUDB_DICTIONARY_TOO_OLD;
+ } else if (r0 == TOKUDB_BAD_CHECKSUM && r1 == TOKUDB_BAD_CHECKSUM) {
+ fprintf(stderr, "Both header checksums failed.\n");
+ r = TOKUDB_BAD_CHECKSUM;
+ } else if (r0 == TOKUDB_DICTIONARY_NO_HEADER || r1 == TOKUDB_DICTIONARY_NO_HEADER) {
+ r = TOKUDB_DICTIONARY_NO_HEADER;
+ } else {
+            r = r0 ? r0 : r1; //Arbitrarily report the error from the
+                              //first header, unless the first header was readable
+ }
+
+ // it should not be possible for both headers to be later than the max_acceptable_lsn
+ invariant(!((r0==0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) &&
+ (r1==0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn)));
+ invariant(r!=0);
+ goto exit;
+ }
+
+ if (h0_acceptable && h1_acceptable) {
+ if (checkpoint_count_0 > checkpoint_count_1) {
+ invariant(checkpoint_count_0 == checkpoint_count_1 + 1);
+ invariant(version_0 >= version_1);
+ rb = &rb_0;
+ version = version_0;
+ }
+ else {
+ invariant(checkpoint_count_1 == checkpoint_count_0 + 1);
+ invariant(version_1 >= version_0);
+ rb = &rb_1;
+ version = version_1;
+ }
+ } else if (h0_acceptable) {
+ if (r1 == TOKUDB_BAD_CHECKSUM) {
+ // print something reassuring
+ fprintf(stderr, "Header 2 checksum failed, but header 1 ok. Proceeding.\n");
+ }
+ rb = &rb_0;
+ version = version_0;
+ } else if (h1_acceptable) {
+ if (r0 == TOKUDB_BAD_CHECKSUM) {
+ // print something reassuring
+ fprintf(stderr, "Header 1 checksum failed, but header 2 ok. Proceeding.\n");
+ }
+ rb = &rb_1;
+ version = version_1;
+ }
+
+ paranoid_invariant(rb);
+ r = deserialize_ft_versioned(fd, rb, ft, version);
+
+exit:
+ if (rb_0.buf) {
+ toku_free(rb_0.buf);
+ }
+ if (rb_1.buf) {
+ toku_free(rb_1.buf);
+ }
+ return r;
+}
+
+
+size_t toku_serialize_ft_size (FT_HEADER h) {
+ size_t size = serialize_ft_min_size(h->layout_version);
+ //There is no dynamic data.
+ lazy_assert(size <= block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
+ return size;
+}
+
+
+void toku_serialize_ft_to_wbuf (
+ struct wbuf *wbuf,
+ FT_HEADER h,
+ DISKOFF translation_location_on_disk,
+ DISKOFF translation_size_on_disk
+ )
+{
+ wbuf_literal_bytes(wbuf, "tokudata", 8);
+ wbuf_network_int (wbuf, h->layout_version); //MUST be in network order regardless of disk order
+ wbuf_network_int (wbuf, BUILD_ID); //MUST be in network order regardless of disk order
+ wbuf_network_int (wbuf, wbuf->size); //MUST be in network order regardless of disk order
+ wbuf_literal_bytes(wbuf, &toku_byte_order_host, 8); //Must not translate byte order
+ wbuf_ulonglong(wbuf, h->checkpoint_count);
+ wbuf_LSN (wbuf, h->checkpoint_lsn);
+ wbuf_int (wbuf, h->nodesize);
+
+ wbuf_DISKOFF(wbuf, translation_location_on_disk);
+ wbuf_DISKOFF(wbuf, translation_size_on_disk);
+ wbuf_BLOCKNUM(wbuf, h->root_blocknum);
+ wbuf_int(wbuf, h->flags);
+ wbuf_int(wbuf, h->layout_version_original);
+ wbuf_int(wbuf, h->build_id_original);
+ wbuf_ulonglong(wbuf, h->time_of_creation);
+ wbuf_ulonglong(wbuf, h->time_of_last_modification);
+ wbuf_TXNID(wbuf, h->root_xid_that_created);
+ wbuf_int(wbuf, h->basementnodesize);
+ wbuf_ulonglong(wbuf, h->time_of_last_verification);
+ wbuf_ulonglong(wbuf, h->on_disk_stats.numrows);
+ wbuf_ulonglong(wbuf, h->on_disk_stats.numbytes);
+ wbuf_ulonglong(wbuf, h->time_of_last_optimize_begin);
+ wbuf_ulonglong(wbuf, h->time_of_last_optimize_end);
+ wbuf_int(wbuf, h->count_of_optimize_in_progress);
+ wbuf_MSN(wbuf, h->msn_at_start_of_last_completed_optimize);
+ wbuf_char(wbuf, (unsigned char) h->compression_method);
+ wbuf_MSN(wbuf, h->highest_unused_msn_for_upgrade);
+ wbuf_MSN(wbuf, h->max_msn_in_ft);
+ wbuf_int(wbuf, h->fanout);
+ uint32_t checksum = toku_x1764_finish(&wbuf->checksum);
+ wbuf_int(wbuf, checksum);
+ lazy_assert(wbuf->ndone == wbuf->size);
+}
+
+void toku_serialize_ft_to(int fd, FT_HEADER h, block_table *bt, CACHEFILE cf) {
+ lazy_assert(h->type==FT_CHECKPOINT_INPROGRESS);
+ struct wbuf w_translation;
+ int64_t size_translation;
+ int64_t address_translation;
+
+ // Must serialize translation first, to get address,size for header.
+ bt->serialize_translation_to_wbuf(fd, &w_translation,
+ &address_translation,
+ &size_translation);
+ assert(size_translation == w_translation.ndone);
+
+ // the number of bytes available in the buffer is 0 mod 512, and those last bytes are all initialized.
+ assert(w_translation.size % 512 == 0);
+
+ struct wbuf w_main;
+ size_t size_main = toku_serialize_ft_size(h);
+ size_t size_main_aligned = roundup_to_multiple(512, size_main);
+ assert(size_main_aligned<block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
+ char *XMALLOC_N_ALIGNED(512, size_main_aligned, mainbuf);
+ for (size_t i=size_main; i<size_main_aligned; i++) mainbuf[i]=0; // initialize the end of the buffer with zeros
+ wbuf_init(&w_main, mainbuf, size_main);
+ toku_serialize_ft_to_wbuf(&w_main, h, address_translation, size_translation);
+ lazy_assert(w_main.ndone == size_main);
+
+ // Actually write translation table
+    // This write only reads initialized data at the end of the buffer, since
+    // w_translation.buf is padded with zeros to a 512-byte boundary.
+ toku_os_full_pwrite(fd, w_translation.buf, roundup_to_multiple(512, size_translation), address_translation);
+
+ //Everything but the header MUST be on disk before header starts.
+ //Otherwise we will think the header is good and some blocks might not
+ //yet be on disk.
+ //If the header has a cachefile we need to do cachefile fsync (to
+ //prevent crash if we redirected to dev null)
+ //If there is no cachefile we still need to do an fsync.
+ if (cf) {
+ toku_cachefile_fsync(cf);
+ }
+ else {
+ toku_file_fsync(fd);
+ }
+
+ //Alternate writing header to two locations:
+ // Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE
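+    //  For illustration: checkpoint_count 1 writes at offset 0, count 2 at
+    //  BLOCK_ALLOCATOR_HEADER_RESERVE, count 3 at offset 0 again, so the
+    //  previous checkpoint's header always survives a crash mid-write.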
+ toku_off_t main_offset;
+ main_offset = (h->checkpoint_count & 0x1) ? 0 : block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
+ toku_os_full_pwrite(fd, w_main.buf, size_main_aligned, main_offset);
+ toku_free(w_main.buf);
+ toku_free(w_translation.buf);
+}
diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.h b/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.h
new file mode 100644
index 00000000000..fe31ff7c5fd
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.h
@@ -0,0 +1,62 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "ft/ft.h"
+#include "ft/serialize/block_table.h"
+
+size_t toku_serialize_ft_size(struct ft_header *h);
+void toku_serialize_ft_to(int fd, struct ft_header *h, block_table *bt, CACHEFILE cf);
+void toku_serialize_ft_to_wbuf(struct wbuf *wbuf, struct ft_header *h, DISKOFF translation_location_on_disk, DISKOFF translation_size_on_disk);
+void toku_serialize_descriptor_contents_to_fd(int fd, DESCRIPTOR desc, DISKOFF offset);
+void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, DESCRIPTOR desc);
+
+int toku_deserialize_ft_from(int fd, LSN max_acceptable_lsn, FT *ft);
+
+// TODO rename
+int deserialize_ft_from_fd_into_rbuf(int fd,
+ toku_off_t offset_of_header,
+ struct rbuf *rb,
+ uint64_t *checkpoint_count,
+ LSN *checkpoint_lsn,
+ uint32_t *version_p);
+
+// used by verify
+// TODO rename
+int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ft, uint32_t version);
diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft_layout_version.h b/storage/tokudb/PerconaFT/ft/serialize/ft_layout_version.h
new file mode 100644
index 00000000000..72b6882bc06
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/ft_layout_version.h
@@ -0,0 +1,79 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+//Must be defined before other recursive headers can include logger/recover.h
+enum ft_layout_version_e {
+ FT_LAYOUT_VERSION_5 = 5,
+ FT_LAYOUT_VERSION_6 = 6, // Diff from 5 to 6: Add leafentry_estimate
+    FT_LAYOUT_VERSION_7 = 7, // Diff from 6 to 7: Add exact-bit to leafentry_estimate #818, add magic to header #22, add per-subdatabase flags #333
+    FT_LAYOUT_VERSION_8 = 8, // Diff from 7 to 8: Use murmur instead of crc32. We are going to make a simplification and stop supporting version 7 and before. Current as of Beta 1.0.6
+ FT_LAYOUT_VERSION_9 = 9, // Diff from 8 to 9: Variable-sized blocks and compression.
+ FT_LAYOUT_VERSION_10 = 10, // Diff from 9 to 10: Variable number of compressed sub-blocks per block, disk byte order == intel byte order, Subtree estimates instead of just leafentry estimates, translation table, dictionary descriptors, checksum in header, subdb support removed from ft layer
+ FT_LAYOUT_VERSION_11 = 11, // Diff from 10 to 11: Nested transaction leafentries (completely redesigned). FT_CMDs on disk now support XIDS (multiple txnids) instead of exactly one.
+ FT_LAYOUT_VERSION_12 = 12, // Diff from 11 to 12: Added FT_CMD 'FT_INSERT_NO_OVERWRITE', compressed block format, num old blocks
+ FT_LAYOUT_VERSION_13 = 13, // Diff from 12 to 13: Fixed loader pivot bug, added build_id to every node, timestamps to ft
+ FT_LAYOUT_VERSION_14 = 14, // Diff from 13 to 14: Added MVCC; deprecated TOKU_DB_VALCMP_BUILTIN(_13); Remove fingerprints; Support QUICKLZ; add end-to-end checksum on uncompressed data.
+ FT_LAYOUT_VERSION_15 = 15, // Diff from 14 to 15: basement nodes, last verification time
+ FT_LAYOUT_VERSION_16 = 16, // Dr. No: No subtree estimates, partition layout information represented more transparently.
+ // ALERT ALERT ALERT: version 16 never released to customers, internal and beta use only
+ FT_LAYOUT_VERSION_17 = 17, // Dr. No: Add STAT64INFO_S to ft header
+ FT_LAYOUT_VERSION_18 = 18, // Dr. No: Add HOT info to ft header
+ FT_LAYOUT_VERSION_19 = 19, // Doofenshmirtz: Add compression method, highest_unused_msn_for_upgrade
+ FT_LAYOUT_VERSION_20 = 20, // Deadshot: Add compression method to log_fcreate,
+ // mgr_last_xid after begin checkpoint,
+ // last_xid to shutdown
+ FT_LAYOUT_VERSION_21 = 21, // Ming: Add max_msn_in_ft to header,
+ // Removed log suppression logentry
+ FT_LAYOUT_VERSION_22 = 22, // Ming: Add oldest known referenced xid to each ftnode, for better garbage collection
+ FT_LAYOUT_VERSION_23 = 23, // Ming: Fix upgrade path #5902
+ FT_LAYOUT_VERSION_24 = 24, // Riddler: change logentries that log transactions to store TXNID_PAIRs instead of TXNIDs
+ FT_LAYOUT_VERSION_25 = 25, // SecretSquirrel: ROLLBACK_LOG_NODES (on disk and in memory) now just use blocknum (instead of blocknum + hash) to point to other log nodes. same for xstillopen log entry
+ FT_LAYOUT_VERSION_26 = 26, // Hojo: basements store key/vals separately on disk for fixed klpair length BNs
+ FT_LAYOUT_VERSION_27 = 27, // serialize message trees with nonleaf buffers to avoid key, msn sort on deserialize
+ FT_LAYOUT_VERSION_28 = 28, // Add fanout to ft_header
+ FT_NEXT_VERSION, // the version after the current version
+ FT_LAYOUT_VERSION = FT_NEXT_VERSION-1, // A hack so I don't have to change this line.
+ FT_LAYOUT_MIN_SUPPORTED_VERSION = FT_LAYOUT_VERSION_13, // Minimum version supported
+
+ // Define these symbolically so the knowledge of exactly which layout version got rid of fingerprints isn't spread all over the code.
+ FT_LAST_LAYOUT_VERSION_WITH_FINGERPRINT = FT_LAYOUT_VERSION_13,
+ FT_FIRST_LAYOUT_VERSION_WITH_END_TO_END_CHECKSUM = FT_LAYOUT_VERSION_14,
+ FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES = FT_LAYOUT_VERSION_15,
+};
diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc b/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc
new file mode 100644
index 00000000000..c4f4886b6a0
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc
@@ -0,0 +1,2872 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "portability/toku_atomic.h"
+
+#include "ft/cachetable/cachetable.h"
+#include "ft/ft.h"
+#include "ft/ft-internal.h"
+#include "ft/node.h"
+#include "ft/logger/log-internal.h"
+#include "ft/txn/rollback.h"
+#include "ft/serialize/block_allocator.h"
+#include "ft/serialize/block_table.h"
+#include "ft/serialize/compress.h"
+#include "ft/serialize/ft_node-serialize.h"
+#include "ft/serialize/sub_block.h"
+#include "util/sort.h"
+#include "util/threadpool.h"
+#include "util/status.h"
+#include "util/scoped_malloc.h"
+
+static FT_UPGRADE_STATUS_S ft_upgrade_status;
+
+#define STATUS_INIT(k,c,t,l,inc) TOKUFT_STATUS_INIT(ft_upgrade_status, k, c, t, "ft upgrade: " l, inc)
+
+static void
+status_init(void)
+{
+ // Note, this function initializes the keyname, type, and legend fields.
+    // Value fields are initialized to zero by the compiler.
+ STATUS_INIT(FT_UPGRADE_FOOTPRINT, nullptr, UINT64, "footprint", TOKU_ENGINE_STATUS);
+ ft_upgrade_status.initialized = true;
+}
+#undef STATUS_INIT
+
+#define UPGRADE_STATUS_VALUE(x) ft_upgrade_status.status[x].value.num
+
+void
+toku_ft_upgrade_get_status(FT_UPGRADE_STATUS s) {
+ if (!ft_upgrade_status.initialized) {
+ status_init();
+ }
+ UPGRADE_STATUS_VALUE(FT_UPGRADE_FOOTPRINT) = toku_log_upgrade_get_footprint();
+ *s = ft_upgrade_status;
+}
+
+static int num_cores = 0; // cache the number of cores for the parallelization
+static struct toku_thread_pool *ft_pool = NULL;
+bool toku_serialize_in_parallel;
+
+int get_num_cores(void) {
+ return num_cores;
+}
+
+struct toku_thread_pool *get_ft_pool(void) {
+ return ft_pool;
+}
+
+void toku_serialize_set_parallel(bool in_parallel) {
+ toku_unsafe_set(&toku_serialize_in_parallel, in_parallel);
+}
+
+void toku_ft_serialize_layer_init(void) {
+ num_cores = toku_os_get_number_active_processors();
+ int r = toku_thread_pool_create(&ft_pool, num_cores);
+ lazy_assert_zero(r);
+ block_allocator::maybe_initialize_trace();
+ toku_serialize_in_parallel = false;
+}
+
+void toku_ft_serialize_layer_destroy(void) {
+ toku_thread_pool_destroy(&ft_pool);
+ block_allocator::maybe_close_trace();
+}
+
+enum { FILE_CHANGE_INCREMENT = (16 << 20) };
+
+static inline uint64_t
+alignup64(uint64_t a, uint64_t b) {
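+    // Round a up to the next multiple of b, e.g. alignup64(1000, 512) == 1024.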
+ return ((a+b-1)/b)*b;
+}
+
+// safe_file_size_lock must be held.
+void
+toku_maybe_truncate_file (int fd, uint64_t size_used, uint64_t expected_size, uint64_t *new_sizep)
+// Effect: If the file size is at least size_used + 32MiB, reduce the file size
+// (32MiB instead of 16MiB, for hysteresis).
+// Stores the resulting file size in *new_sizep.
+{
+ int64_t file_size;
+ {
+ int r = toku_os_get_file_size(fd, &file_size);
+ lazy_assert_zero(r);
+ invariant(file_size >= 0);
+ }
+ invariant(expected_size == (uint64_t)file_size);
+ // If file space is overallocated by at least 32M
+ if ((uint64_t)file_size >= size_used + (2*FILE_CHANGE_INCREMENT)) {
+        toku_off_t new_size = alignup64(size_used, (2*FILE_CHANGE_INCREMENT)); //Round size_used up to a 32MiB boundary.
+ invariant(new_size < file_size);
+ invariant(new_size >= 0);
+ int r = ftruncate(fd, new_size);
+ lazy_assert_zero(r);
+ *new_sizep = new_size;
+ }
+ else {
+ *new_sizep = file_size;
+ }
+ return;
+}
+
+static int64_t
+min64(int64_t a, int64_t b) {
+ if (a<b) return a;
+ return b;
+}
+
+void
+toku_maybe_preallocate_in_file (int fd, int64_t size, int64_t expected_size, int64_t *new_size)
+// Effect: Make the file bigger by either doubling it or growing by 16MiB, whichever is less, until it is at least size.
+// Stores the resulting file size in *new_size.
+{
+ int64_t file_size = 0;
+ //TODO(yoni): Allow variable stripe_width (perhaps from ft) for larger raids
+ const uint64_t stripe_width = 4096;
+ {
+ int r = toku_os_get_file_size(fd, &file_size);
+ if (r != 0) { // debug #2463
+ int the_errno = get_maybe_error_errno();
+ fprintf(stderr, "%s:%d fd=%d size=%" PRIu64 " r=%d errno=%d\n", __FUNCTION__, __LINE__, fd, size, r, the_errno); fflush(stderr);
+ }
+ lazy_assert_zero(r);
+ }
+ invariant(file_size >= 0);
+ invariant(expected_size == file_size);
+ // We want to double the size of the file, or add 16MiB, whichever is less.
+ // We emulate calling this function repeatedly until it satisfies the request.
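+    // For illustration, starting from an empty file the loop below produces
+    // to_write = 4096, 8192, 16384, ..., doubling until the 16MiB
+    // FILE_CHANGE_INCREMENT cap is reached, then growing linearly by 16MiB
+    // per iteration until the request is satisfied.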
+ int64_t to_write = 0;
+ if (file_size == 0) {
+ // Prevent infinite loop by starting with stripe_width as a base case.
+ to_write = stripe_width;
+ }
+ while (file_size + to_write < size) {
+ to_write += alignup64(min64(file_size + to_write, FILE_CHANGE_INCREMENT), stripe_width);
+ }
+ if (to_write > 0) {
+ assert(to_write%512==0);
+ toku::scoped_malloc_aligned wbuf_aligned(to_write, 512);
+ char *wbuf = reinterpret_cast<char *>(wbuf_aligned.get());
+ memset(wbuf, 0, to_write);
+ toku_off_t start_write = alignup64(file_size, stripe_width);
+ invariant(start_write >= file_size);
+ toku_os_full_pwrite(fd, wbuf, to_write, start_write);
+ *new_size = start_write + to_write;
+ }
+ else {
+ *new_size = file_size;
+ }
+}
+
+// Don't include the sub_block header
+// Overhead calculated in the same order fields are written to wbuf
+enum {
+ node_header_overhead = (8+ // magic "tokunode" or "tokuleaf" or "tokuroll"
+ 4+ // layout_version
+ 4+ // layout_version_original
+ 4), // build_id
+};
+
+// uncompressed header offsets
+enum {
+ uncompressed_magic_offset = 0,
+ uncompressed_version_offset = 8,
+};
+
+static uint32_t
+serialize_node_header_size(FTNODE node) {
+ uint32_t retval = 0;
+ retval += 8; // magic
+ retval += sizeof(node->layout_version);
+ retval += sizeof(node->layout_version_original);
+ retval += 4; // BUILD_ID
+ retval += 4; // n_children
+ retval += node->n_children*8; // encode start offset and length of each partition
+ retval += 4; // checksum
+ return retval;
+}
+
+static void
+serialize_node_header(FTNODE node, FTNODE_DISK_DATA ndd, struct wbuf *wbuf) {
+ if (node->height == 0)
+ wbuf_nocrc_literal_bytes(wbuf, "tokuleaf", 8);
+ else
+ wbuf_nocrc_literal_bytes(wbuf, "tokunode", 8);
+ paranoid_invariant(node->layout_version == FT_LAYOUT_VERSION);
+ wbuf_nocrc_int(wbuf, node->layout_version);
+ wbuf_nocrc_int(wbuf, node->layout_version_original);
+ wbuf_nocrc_uint(wbuf, BUILD_ID);
+ wbuf_nocrc_int (wbuf, node->n_children);
+ for (int i=0; i<node->n_children; i++) {
+ assert(BP_SIZE(ndd,i)>0);
+ wbuf_nocrc_int(wbuf, BP_START(ndd, i)); // save the beginning of the partition
+ wbuf_nocrc_int(wbuf, BP_SIZE (ndd, i)); // and the size
+ }
+ // checksum the header
+ uint32_t end_to_end_checksum = toku_x1764_memory(wbuf->buf, wbuf_get_woffset(wbuf));
+ wbuf_nocrc_int(wbuf, end_to_end_checksum);
+ invariant(wbuf->ndone == wbuf->size);
+}
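+
+// For reference, the header written above lays out as:
+//   magic(8) | layout_version(4) | layout_version_original(4) | build_id(4) |
+//   n_children(4) | { BP_START(4), BP_SIZE(4) } per child | x1764 checksum(4)
+// which matches serialize_node_header_size().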
+
+static uint32_t
+serialize_ftnode_partition_size (FTNODE node, int i)
+{
+ uint32_t result = 0;
+ paranoid_invariant(node->bp[i].state == PT_AVAIL);
+ result++; // Byte that states what the partition is
+ if (node->height > 0) {
+ NONLEAF_CHILDINFO bnc = BNC(node, i);
+ // number of messages (4 bytes) plus size of the buffer
+ result += (4 + toku_bnc_nbytesinbuf(bnc));
+ // number of offsets (4 bytes) plus an array of 4 byte offsets, for each message tree
+ result += (4 + (4 * bnc->fresh_message_tree.size()));
+ result += (4 + (4 * bnc->stale_message_tree.size()));
+ result += (4 + (4 * bnc->broadcast_list.size()));
+ }
+ else {
+ result += 4 + bn_data::HEADER_LENGTH; // n_entries in buffer table + basement header
+ result += BLB_NBYTESINDATA(node, i);
+ }
+ result += 4; // checksum
+ return result;
+}
+
+#define FTNODE_PARTITION_DMT_LEAVES 0xaa
+#define FTNODE_PARTITION_MSG_BUFFER 0xbb
+
+UU() static int
+assert_fresh(const int32_t &offset, const uint32_t UU(idx), message_buffer *const msg_buffer) {
+ bool is_fresh = msg_buffer->get_freshness(offset);
+ assert(is_fresh);
+ return 0;
+}
+
+UU() static int
+assert_stale(const int32_t &offset, const uint32_t UU(idx), message_buffer *const msg_buffer) {
+ bool is_fresh = msg_buffer->get_freshness(offset);
+ assert(!is_fresh);
+ return 0;
+}
+
+static void bnc_verify_message_trees(NONLEAF_CHILDINFO UU(bnc)) {
+#ifdef TOKU_DEBUG_PARANOID
+ bnc->fresh_message_tree.iterate<message_buffer, assert_fresh>(&bnc->msg_buffer);
+ bnc->stale_message_tree.iterate<message_buffer, assert_stale>(&bnc->msg_buffer);
+#endif
+}
+
+static int
+wbuf_write_offset(const int32_t &offset, const uint32_t UU(idx), struct wbuf *const wb) {
+ wbuf_nocrc_int(wb, offset);
+ return 0;
+}
+
+static void serialize_child_buffer(NONLEAF_CHILDINFO bnc, struct wbuf *wb) {
+ unsigned char ch = FTNODE_PARTITION_MSG_BUFFER;
+ wbuf_nocrc_char(wb, ch);
+
+ // serialize the message buffer
+ bnc->msg_buffer.serialize_to_wbuf(wb);
+
+ // serialize the message trees (num entries, offsets array):
+ // first, verify their contents are consistent with the message buffer
+ bnc_verify_message_trees(bnc);
+
+ // fresh
+ wbuf_nocrc_int(wb, bnc->fresh_message_tree.size());
+ bnc->fresh_message_tree.iterate<struct wbuf, wbuf_write_offset>(wb);
+
+ // stale
+ wbuf_nocrc_int(wb, bnc->stale_message_tree.size());
+ bnc->stale_message_tree.iterate<struct wbuf, wbuf_write_offset>(wb);
+
+ // broadcast
+ wbuf_nocrc_int(wb, bnc->broadcast_list.size());
+ bnc->broadcast_list.iterate<struct wbuf, wbuf_write_offset>(wb);
+}
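+
+// For reference, serialize_child_buffer writes:
+//   tag byte (FTNODE_PARTITION_MSG_BUFFER) | message buffer |
+//   fresh count(4) + offsets | stale count(4) + offsets |
+//   broadcast count(4) + offsets
+// The partition checksum is appended afterwards by serialize_ftnode_partition.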
+
+//
+// Serialize the i'th partition of node into sb
+// For leaf nodes, this would be the i'th basement node
+// For internal nodes, this would be the i'th child's message buffer
+//
+static void
+serialize_ftnode_partition(FTNODE node, int i, struct sub_block *sb) {
+ // Caller should have allocated memory.
+ invariant_notnull(sb->uncompressed_ptr);
+ invariant(sb->uncompressed_size > 0);
+ paranoid_invariant(sb->uncompressed_size == serialize_ftnode_partition_size(node, i));
+
+ //
+ // Now put the data into sb->uncompressed_ptr
+ //
+ struct wbuf wb;
+ wbuf_init(&wb, sb->uncompressed_ptr, sb->uncompressed_size);
+ if (node->height > 0) {
+ // TODO: (Zardosht) possibly exit early if there are no messages
+ serialize_child_buffer(BNC(node, i), &wb);
+ }
+ else {
+ unsigned char ch = FTNODE_PARTITION_DMT_LEAVES;
+ bn_data* bd = BLB_DATA(node, i);
+
+ wbuf_nocrc_char(&wb, ch);
+ wbuf_nocrc_uint(&wb, bd->num_klpairs());
+
+ bd->serialize_to_wbuf(&wb);
+ }
+ uint32_t end_to_end_checksum = toku_x1764_memory(sb->uncompressed_ptr, wbuf_get_woffset(&wb));
+ wbuf_nocrc_int(&wb, end_to_end_checksum);
+ invariant(wb.ndone == wb.size);
+ invariant(sb->uncompressed_size==wb.ndone);
+}
+
+//
+// Takes the data in sb->uncompressed_ptr, and compresses it
+// into a newly allocated buffer sb->compressed_ptr
+//
+static void
+compress_ftnode_sub_block(struct sub_block *sb, enum toku_compression_method method) {
+ invariant(sb->compressed_ptr != nullptr);
+ invariant(sb->compressed_size_bound > 0);
+ paranoid_invariant(sb->compressed_size_bound == toku_compress_bound(method, sb->uncompressed_size));
+
+ //
+ // This probably seems a bit complicated. Here is what is going on.
+ // In PerconaFT 5.0, sub_blocks were compressed and the compressed data
+ // was checksummed. The checksum did NOT include the size of the compressed data
+ // and the size of the uncompressed data. The fields of sub_block only reference the
+ // compressed data, and it is the responsibility of the user of the sub_block
+ // to write the length
+ //
+ // For Dr. No, we want the checksum to also include the size of the compressed data, and the
+ // size of the decompressed data, because this data
+ // may be read off of disk alone, so it must be verifiable alone.
+ //
+ // So, we pass in a buffer to compress_nocrc_sub_block that starts 8 bytes after the beginning
+ // of sb->compressed_ptr, so we have space to put in the sizes, and then run the checksum.
+ //
+ sb->compressed_size = compress_nocrc_sub_block(
+ sb,
+ (char *)sb->compressed_ptr + 8,
+ sb->compressed_size_bound,
+ method
+ );
+
+ uint32_t* extra = (uint32_t *)(sb->compressed_ptr);
+ // store the compressed and uncompressed size at the beginning
+ extra[0] = toku_htod32(sb->compressed_size);
+ extra[1] = toku_htod32(sb->uncompressed_size);
+ // now checksum the entire thing
+ sb->compressed_size += 8; // now add the eight bytes that we saved for the sizes
+ sb->xsum = toku_x1764_memory(sb->compressed_ptr,sb->compressed_size);
+
+ //
+ // This is the end result for Dr. No and forward. For ftnodes, sb->compressed_ptr contains
+ // two integers at the beginning, the size and uncompressed size, and then the compressed
+ // data. sb->xsum contains the checksum of this entire thing.
+ //
+ // In PerconaFT 5.0, sb->compressed_ptr only contained the compressed data, sb->xsum
+    // checksummed only the compressed data, and the checksumming of the sizes was not
+    // done here.
+ //
+}
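+
+// For reference, after compress_ftnode_sub_block the buffer lays out as:
+//   compressed_size(4, disk order) | uncompressed_size(4, disk order) |
+//   compressed payload
+// sb->compressed_size covers all of the above, and sb->xsum is the x1764
+// checksum over that whole region; the caller appends the xsum when
+// assembling the node.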
+
+//
+// Returns the size needed to serialize the ftnode info
+// Does not include header information that is common with rollback logs
+// such as the magic, layout_version, and build_id
+// Includes only node specific info such as pivot information, n_children, and so on
+//
+static uint32_t
+serialize_ftnode_info_size(FTNODE node)
+{
+ uint32_t retval = 0;
+ retval += 8; // max_msn_applied_to_node_on_disk
+ retval += 4; // nodesize
+ retval += 4; // flags
+ retval += 4; // height;
+ retval += 8; // oldest_referenced_xid_known
+ retval += node->pivotkeys.serialized_size();
+ retval += (node->n_children-1)*4; // encode length of each pivot
+ if (node->height > 0) {
+ retval += node->n_children*8; // child blocknum's
+ }
+ retval += 4; // checksum
+ return retval;
+}
+
+static void serialize_ftnode_info(FTNODE node, SUB_BLOCK sb) {
+ // Memory must have been allocated by our caller.
+ invariant(sb->uncompressed_size > 0);
+ invariant_notnull(sb->uncompressed_ptr);
+ paranoid_invariant(sb->uncompressed_size == serialize_ftnode_info_size(node));
+
+ struct wbuf wb;
+ wbuf_init(&wb, sb->uncompressed_ptr, sb->uncompressed_size);
+
+ wbuf_MSN(&wb, node->max_msn_applied_to_node_on_disk);
+ wbuf_nocrc_uint(&wb, 0); // write a dummy value for where node->nodesize used to be
+ wbuf_nocrc_uint(&wb, node->flags);
+ wbuf_nocrc_int (&wb, node->height);
+ wbuf_TXNID(&wb, node->oldest_referenced_xid_known);
+ node->pivotkeys.serialize_to_wbuf(&wb);
+
+ // child blocks, only for internal nodes
+ if (node->height > 0) {
+ for (int i = 0; i < node->n_children; i++) {
+ wbuf_nocrc_BLOCKNUM(&wb, BP_BLOCKNUM(node,i));
+ }
+ }
+
+ uint32_t end_to_end_checksum = toku_x1764_memory(sb->uncompressed_ptr, wbuf_get_woffset(&wb));
+ wbuf_nocrc_int(&wb, end_to_end_checksum);
+ invariant(wb.ndone == wb.size);
+ invariant(sb->uncompressed_size==wb.ndone);
+}
+
+// This is the size of the uncompressed data, not including the compression headers
+unsigned int
+toku_serialize_ftnode_size (FTNODE node) {
+ unsigned int result = 0;
+ //
+ // As of now, this seems to be called if and only if the entire node is supposed
+ // to be in memory, so we will assert it.
+ //
+ toku_ftnode_assert_fully_in_memory(node);
+ result += serialize_node_header_size(node);
+ result += serialize_ftnode_info_size(node);
+ for (int i = 0; i < node->n_children; i++) {
+ result += serialize_ftnode_partition_size(node,i);
+ }
+ return result;
+}
+
+struct serialize_times {
+ tokutime_t serialize_time;
+ tokutime_t compress_time;
+};
+
+static void
+serialize_and_compress_partition(FTNODE node,
+ int childnum,
+ enum toku_compression_method compression_method,
+ SUB_BLOCK sb,
+ struct serialize_times *st)
+{
+ // serialize, compress, update status
+ tokutime_t t0 = toku_time_now();
+ serialize_ftnode_partition(node, childnum, sb);
+ tokutime_t t1 = toku_time_now();
+ compress_ftnode_sub_block(sb, compression_method);
+ tokutime_t t2 = toku_time_now();
+
+ st->serialize_time += t1 - t0;
+ st->compress_time += t2 - t1;
+}
+
+void
+toku_create_compressed_partition_from_available(
+ FTNODE node,
+ int childnum,
+ enum toku_compression_method compression_method,
+ SUB_BLOCK sb
+ )
+{
+ tokutime_t t0 = toku_time_now();
+
+ // serialize
+ sb->uncompressed_size = serialize_ftnode_partition_size(node, childnum);
+ toku::scoped_malloc uncompressed_buf(sb->uncompressed_size);
+ sb->uncompressed_ptr = uncompressed_buf.get();
+ serialize_ftnode_partition(node, childnum, sb);
+
+ tokutime_t t1 = toku_time_now();
+
+ // compress. no need to pad with extra bytes for sizes/xsum - we're not storing them
+ set_compressed_size_bound(sb, compression_method);
+ sb->compressed_ptr = toku_xmalloc(sb->compressed_size_bound);
+ sb->compressed_size = compress_nocrc_sub_block(
+ sb,
+ sb->compressed_ptr,
+ sb->compressed_size_bound,
+ compression_method
+ );
+ sb->uncompressed_ptr = NULL;
+
+ tokutime_t t2 = toku_time_now();
+
+ toku_ft_status_update_serialize_times(node, t1 - t0, t2 - t1);
+}
+
+static void
+serialize_and_compress_serially(FTNODE node,
+ int npartitions,
+ enum toku_compression_method compression_method,
+ struct sub_block sb[],
+ struct serialize_times *st) {
+ for (int i = 0; i < npartitions; i++) {
+ serialize_and_compress_partition(node, i, compression_method, &sb[i], st);
+ }
+}
+
+struct serialize_compress_work {
+ struct work base;
+ FTNODE node;
+ int i;
+ enum toku_compression_method compression_method;
+ struct sub_block *sb;
+ struct serialize_times st;
+};
+
+static void *
+serialize_and_compress_worker(void *arg) {
+ struct workset *ws = (struct workset *) arg;
+ while (1) {
+ struct serialize_compress_work *w = (struct serialize_compress_work *) workset_get(ws);
+ if (w == NULL)
+ break;
+ int i = w->i;
+ serialize_and_compress_partition(w->node, i, w->compression_method, &w->sb[i], &w->st);
+ }
+ workset_release_ref(ws);
+ return arg;
+}
+
+static void
+serialize_and_compress_in_parallel(FTNODE node,
+ int npartitions,
+ enum toku_compression_method compression_method,
+ struct sub_block sb[],
+ struct serialize_times *st) {
+ if (npartitions == 1) {
+ serialize_and_compress_partition(node, 0, compression_method, &sb[0], st);
+ } else {
+ int T = num_cores;
+ if (T > npartitions)
+ T = npartitions;
+ if (T > 0)
+ T = T - 1;
+ struct workset ws;
+ ZERO_STRUCT(ws);
+ workset_init(&ws);
+ struct serialize_compress_work work[npartitions];
+ workset_lock(&ws);
+ for (int i = 0; i < npartitions; i++) {
+ work[i] = (struct serialize_compress_work) { .base = {{NULL, NULL}},
+ .node = node,
+ .i = i,
+ .compression_method = compression_method,
+ .sb = sb,
+ .st = { .serialize_time = 0, .compress_time = 0} };
+ workset_put_locked(&ws, &work[i].base);
+ }
+ workset_unlock(&ws);
+ toku_thread_pool_run(ft_pool, 0, &T, serialize_and_compress_worker, &ws);
+ workset_add_ref(&ws, T);
+ serialize_and_compress_worker(&ws);
+ workset_join(&ws);
+ workset_destroy(&ws);
+
+ // gather up the statistics from each thread's work item
+ for (int i = 0; i < npartitions; i++) {
+ st->serialize_time += work[i].st.serialize_time;
+ st->compress_time += work[i].st.compress_time;
+ }
+ }
+}
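+
+// Note on the parallel path above: T is capped at npartitions and then
+// decremented because the calling thread also runs
+// serialize_and_compress_worker, so at most min(num_cores, npartitions)
+// threads drain the shared workset.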
+
+static void
+serialize_and_compress_sb_node_info(FTNODE node, struct sub_block *sb,
+ enum toku_compression_method compression_method, struct serialize_times *st) {
+ // serialize, compress, update serialize times.
+ tokutime_t t0 = toku_time_now();
+ serialize_ftnode_info(node, sb);
+ tokutime_t t1 = toku_time_now();
+ compress_ftnode_sub_block(sb, compression_method);
+ tokutime_t t2 = toku_time_now();
+
+ st->serialize_time += t1 - t0;
+ st->compress_time += t2 - t1;
+}
+
+int toku_serialize_ftnode_to_memory(FTNODE node,
+ FTNODE_DISK_DATA* ndd,
+ unsigned int basementnodesize,
+ enum toku_compression_method compression_method,
+ bool do_rebalancing,
+                                    bool in_parallel, // true for the loader; false for toku_ftnode_flush_callback
+ /*out*/ size_t *n_bytes_to_write,
+ /*out*/ size_t *n_uncompressed_bytes,
+ /*out*/ char **bytes_to_write)
+// Effect: Writes out each child to a separate malloc'd buffer, then compresses
+// all of them, and writes the uncompressed header to bytes_to_write,
+// which is malloc'd.
+//
+// The resulting buffer is guaranteed to be 512-byte aligned and the total length is a multiple of 512 (so we pad with zeros at the end if needed).
+// 512-byte padding is for O_DIRECT to work.
+{
+ toku_ftnode_assert_fully_in_memory(node);
+
+ if (do_rebalancing && node->height == 0) {
+ toku_ftnode_leaf_rebalance(node, basementnodesize);
+ }
+ const int npartitions = node->n_children;
+
+ // Each partition represents a compressed sub block
+ // For internal nodes, a sub block is a message buffer
+ // For leaf nodes, a sub block is a basement node
+ toku::scoped_calloc sb_buf(sizeof(struct sub_block) * npartitions);
+ struct sub_block *sb = reinterpret_cast<struct sub_block *>(sb_buf.get());
+ XREALLOC_N(npartitions, *ndd);
+
+ //
+ // First, let's serialize and compress the individual sub blocks
+ //
+
+ // determine how large our serialization and compression buffers need to be.
+ size_t serialize_buf_size = 0, compression_buf_size = 0;
+ for (int i = 0; i < node->n_children; i++) {
+ sb[i].uncompressed_size = serialize_ftnode_partition_size(node, i);
+ sb[i].compressed_size_bound = toku_compress_bound(compression_method, sb[i].uncompressed_size);
+ serialize_buf_size += sb[i].uncompressed_size;
+ compression_buf_size += sb[i].compressed_size_bound + 8; // add 8 extra bytes, 4 for compressed size, 4 for decompressed size
+ }
+
+ // give each sub block a base pointer to enough buffer space for serialization and compression
+ toku::scoped_malloc serialize_buf(serialize_buf_size);
+ toku::scoped_malloc compression_buf(compression_buf_size);
+ for (size_t i = 0, uncompressed_offset = 0, compressed_offset = 0; i < (size_t) node->n_children; i++) {
+ sb[i].uncompressed_ptr = reinterpret_cast<char *>(serialize_buf.get()) + uncompressed_offset;
+ sb[i].compressed_ptr = reinterpret_cast<char *>(compression_buf.get()) + compressed_offset;
+ uncompressed_offset += sb[i].uncompressed_size;
+ compressed_offset += sb[i].compressed_size_bound + 8; // add 8 extra bytes, 4 for compressed size, 4 for decompressed size
+ invariant(uncompressed_offset <= serialize_buf_size);
+ invariant(compressed_offset <= compression_buf_size);
+ }
+
+ // do the actual serialization now that we have buffer space
+ struct serialize_times st = { 0, 0 };
+ if (in_parallel) {
+ serialize_and_compress_in_parallel(node, npartitions, compression_method, sb, &st);
+ } else {
+ serialize_and_compress_serially(node, npartitions, compression_method, sb, &st);
+ }
+
+ //
+    // Now let's create a sub-block that has the common node information.
+    // This does NOT include the header
+ //
+
+    // determine how large our serialization and compression buffers need to be
+ struct sub_block sb_node_info;
+ sub_block_init(&sb_node_info);
+ size_t sb_node_info_uncompressed_size = serialize_ftnode_info_size(node);
+ size_t sb_node_info_compressed_size_bound = toku_compress_bound(compression_method, sb_node_info_uncompressed_size);
+ toku::scoped_malloc sb_node_info_uncompressed_buf(sb_node_info_uncompressed_size);
+ toku::scoped_malloc sb_node_info_compressed_buf(sb_node_info_compressed_size_bound + 8); // add 8 extra bytes, 4 for compressed size, 4 for decompressed size
+ sb_node_info.uncompressed_size = sb_node_info_uncompressed_size;
+ sb_node_info.uncompressed_ptr = sb_node_info_uncompressed_buf.get();
+ sb_node_info.compressed_size_bound = sb_node_info_compressed_size_bound;
+ sb_node_info.compressed_ptr = sb_node_info_compressed_buf.get();
+
+ // do the actual serialization now that we have buffer space
+ serialize_and_compress_sb_node_info(node, &sb_node_info, compression_method, &st);
+
+ //
+ // At this point, we have compressed each of our pieces into individual sub_blocks,
+ // we can put the header and all the subblocks into a single buffer and return it.
+ //
+
+ // update the serialize times, ignore the header for simplicity. we captured all
+ // of the partitions' serialize times so that's probably good enough.
+ toku_ft_status_update_serialize_times(node, st.serialize_time, st.compress_time);
+
+ // The total size of the node is:
+ // size of header + disk size of the n+1 sub_block's created above
+ uint32_t total_node_size = (serialize_node_header_size(node) // uncompressed header
+ + sb_node_info.compressed_size // compressed nodeinfo (without its checksum)
+ + 4); // nodeinfo's checksum
+ uint32_t total_uncompressed_size = (serialize_node_header_size(node) // uncompressed header
+ + sb_node_info.uncompressed_size // uncompressed nodeinfo (without its checksum)
+ + 4); // nodeinfo's checksum
+ // store the BP_SIZESs
+ for (int i = 0; i < node->n_children; i++) {
+ uint32_t len = sb[i].compressed_size + 4; // data and checksum
+ BP_SIZE (*ndd,i) = len;
+ BP_START(*ndd,i) = total_node_size;
+ total_node_size += sb[i].compressed_size + 4;
+ total_uncompressed_size += sb[i].uncompressed_size + 4;
+ }
+
+ // now create the final serialized node
+ uint32_t total_buffer_size = roundup_to_multiple(512, total_node_size); // make the buffer be 512 bytes.
+ char *XMALLOC_N_ALIGNED(512, total_buffer_size, data);
+ char *curr_ptr = data;
+
+ // write the header
+ struct wbuf wb;
+ wbuf_init(&wb, curr_ptr, serialize_node_header_size(node));
+ serialize_node_header(node, *ndd, &wb);
+ assert(wb.ndone == wb.size);
+ curr_ptr += serialize_node_header_size(node);
+
+ // now write sb_node_info
+ memcpy(curr_ptr, sb_node_info.compressed_ptr, sb_node_info.compressed_size);
+ curr_ptr += sb_node_info.compressed_size;
+ // write the checksum
+ *(uint32_t *)curr_ptr = toku_htod32(sb_node_info.xsum);
+ curr_ptr += sizeof(sb_node_info.xsum);
+
+ for (int i = 0; i < npartitions; i++) {
+ memcpy(curr_ptr, sb[i].compressed_ptr, sb[i].compressed_size);
+ curr_ptr += sb[i].compressed_size;
+ // write the checksum
+ *(uint32_t *)curr_ptr = toku_htod32(sb[i].xsum);
+ curr_ptr += sizeof(sb[i].xsum);
+ }
+ // Zero the rest of the buffer
+ memset(data + total_node_size, 0, total_buffer_size - total_node_size);
+
+ assert(curr_ptr - data == total_node_size);
+ *bytes_to_write = data;
+ *n_bytes_to_write = total_buffer_size;
+ *n_uncompressed_bytes = total_uncompressed_size;
+
+ invariant(*n_bytes_to_write % 512 == 0);
+ invariant(reinterpret_cast<unsigned long long>(*bytes_to_write) % 512 == 0);
+ return 0;
+}
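+
+// For reference, the serialized node produced above lays out as:
+//   uncompressed header (self-checksummed) | compressed node info | xsum(4) |
+//   compressed partition 0 | xsum(4) | ... | zero padding to a 512 multiple
+// BP_START/BP_SIZE in the header locate each partition within the node.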
+
+int
+toku_serialize_ftnode_to (int fd, BLOCKNUM blocknum, FTNODE node, FTNODE_DISK_DATA* ndd, bool do_rebalancing, FT ft, bool for_checkpoint) {
+
+ size_t n_to_write;
+ size_t n_uncompressed_bytes;
+ char *compressed_buf = nullptr;
+
+    // Because toku_serialize_ftnode_to is only called from
+    // toku_ftnode_flush_callback, in_parallel is normally false here
+    // (the loader enables toku_serialize_in_parallel for its own writes).
+    // The reasoning is that when we write nodes to disk via
+    // toku_ftnode_flush_callback, we assume that it is being done on a
+    // non-critical background thread (probably for checkpointing), and
+    // therefore should not hog CPU.
+    //
+    // Should the above facts change, we may want to revisit the
+    // in_parallel value passed here. Alternatively, we could have made
+    // in_parallel a parameter of toku_serialize_ftnode_to.
+ int r = toku_serialize_ftnode_to_memory(
+ node,
+ ndd,
+ ft->h->basementnodesize,
+ ft->h->compression_method,
+ do_rebalancing,
+ toku_unsafe_fetch(&toku_serialize_in_parallel),
+ &n_to_write,
+ &n_uncompressed_bytes,
+ &compressed_buf
+ );
+ if (r != 0) {
+ return r;
+ }
+
+ // If the node has never been written, then write the whole buffer, including the zeros
+ invariant(blocknum.b>=0);
+ DISKOFF offset;
+
+ // Dirties the ft
+ ft->blocktable.realloc_on_disk(blocknum, n_to_write, &offset,
+ ft, fd, for_checkpoint,
+ // Allocations for nodes high in the tree are considered 'hot',
+ // as they are likely to move again in the next checkpoint.
+ node->height);
+
+ tokutime_t t0 = toku_time_now();
+ toku_os_full_pwrite(fd, compressed_buf, n_to_write, offset);
+ tokutime_t t1 = toku_time_now();
+
+ tokutime_t io_time = t1 - t0;
+ toku_ft_status_update_flush_reason(node, n_uncompressed_bytes, n_to_write, io_time, for_checkpoint);
+
+ toku_free(compressed_buf);
+ node->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction.
+ return 0;
+}
+
+static void
+sort_and_steal_offset_arrays(NONLEAF_CHILDINFO bnc,
+ const toku::comparator &cmp,
+ int32_t **fresh_offsets, int32_t nfresh,
+ int32_t **stale_offsets, int32_t nstale,
+ int32_t **broadcast_offsets, int32_t nbroadcast) {
+ // We always have fresh / broadcast offsets (even if they are empty)
+    // but we may not have stale offsets, in the case of a v13 upgrade.
+ invariant(fresh_offsets != nullptr);
+ invariant(broadcast_offsets != nullptr);
+ invariant(cmp.valid());
+
+ typedef toku::sort<int32_t, const struct toku_msg_buffer_key_msn_cmp_extra, toku_msg_buffer_key_msn_cmp> msn_sort;
+
+ const int32_t n_in_this_buffer = nfresh + nstale + nbroadcast;
+ struct toku_msg_buffer_key_msn_cmp_extra extra(cmp, &bnc->msg_buffer);
+ msn_sort::mergesort_r(*fresh_offsets, nfresh, extra);
+ bnc->fresh_message_tree.destroy();
+ bnc->fresh_message_tree.create_steal_sorted_array(fresh_offsets, nfresh, n_in_this_buffer);
+ if (stale_offsets) {
+ msn_sort::mergesort_r(*stale_offsets, nstale, extra);
+ bnc->stale_message_tree.destroy();
+ bnc->stale_message_tree.create_steal_sorted_array(stale_offsets, nstale, n_in_this_buffer);
+ }
+ bnc->broadcast_list.destroy();
+ bnc->broadcast_list.create_steal_sorted_array(broadcast_offsets, nbroadcast, n_in_this_buffer);
+}
+
+static MSN
+deserialize_child_buffer_v13(FT ft, NONLEAF_CHILDINFO bnc, struct rbuf *rb) {
+ // We skip 'stale' offsets for upgraded nodes.
+ int32_t nfresh = 0, nbroadcast = 0;
+ int32_t *fresh_offsets = nullptr, *broadcast_offsets = nullptr;
+
+ // Only sort buffers if we have a valid comparison function. In certain scenarios,
+    // like deserialize_ft_versioned() or tokuftdump, we'll need to deserialize ftnodes
+ // for simple inspection and don't actually require that the message buffers are
+ // properly sorted. This is very ugly, but correct.
+ const bool sort = ft->cmp.valid();
+
+ MSN highest_msn_in_this_buffer =
+ bnc->msg_buffer.deserialize_from_rbuf_v13(rb, &ft->h->highest_unused_msn_for_upgrade,
+ sort ? &fresh_offsets : nullptr, &nfresh,
+ sort ? &broadcast_offsets : nullptr, &nbroadcast);
+
+ if (sort) {
+ sort_and_steal_offset_arrays(bnc, ft->cmp,
+ &fresh_offsets, nfresh,
+ nullptr, 0, // no stale offsets
+ &broadcast_offsets, nbroadcast);
+ }
+
+ return highest_msn_in_this_buffer;
+}
+
+static void
+deserialize_child_buffer_v26(NONLEAF_CHILDINFO bnc, struct rbuf *rb, const toku::comparator &cmp) {
+ int32_t nfresh = 0, nstale = 0, nbroadcast = 0;
+ int32_t *fresh_offsets, *stale_offsets, *broadcast_offsets;
+
+ // Only sort buffers if we have a valid comparison function. In certain scenarios,
+    // like deserialize_ft_versioned() or tokuftdump, we'll need to deserialize ftnodes
+ // for simple inspection and don't actually require that the message buffers are
+ // properly sorted. This is very ugly, but correct.
+ const bool sort = cmp.valid();
+
+ // read in the message buffer
+ bnc->msg_buffer.deserialize_from_rbuf(rb,
+ sort ? &fresh_offsets : nullptr, &nfresh,
+ sort ? &stale_offsets : nullptr, &nstale,
+ sort ? &broadcast_offsets : nullptr, &nbroadcast);
+
+ if (sort) {
+ sort_and_steal_offset_arrays(bnc, cmp,
+ &fresh_offsets, nfresh,
+ &stale_offsets, nstale,
+ &broadcast_offsets, nbroadcast);
+ }
+}
+
+static void
+deserialize_child_buffer(NONLEAF_CHILDINFO bnc, struct rbuf *rb) {
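+    // Note: unlike the v26 path above, no sorting is needed here; nodes
+    // written with the current layout serialize the fresh/stale/broadcast
+    // offset arrays to disk already sorted, so below we steal them into
+    // the message trees directly.
+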
+ // read in the message buffer
+ bnc->msg_buffer.deserialize_from_rbuf(rb,
+ nullptr, nullptr, // fresh_offsets, nfresh,
+ nullptr, nullptr, // stale_offsets, nstale,
+ nullptr, nullptr); // broadcast_offsets, nbroadcast
+
+ // read in each message tree (fresh, stale, broadcast)
+ int32_t nfresh = rbuf_int(rb);
+ int32_t *XMALLOC_N(nfresh, fresh_offsets);
+ for (int i = 0; i < nfresh; i++) {
+ fresh_offsets[i] = rbuf_int(rb);
+ }
+
+ int32_t nstale = rbuf_int(rb);
+ int32_t *XMALLOC_N(nstale, stale_offsets);
+ for (int i = 0; i < nstale; i++) {
+ stale_offsets[i] = rbuf_int(rb);
+ }
+
+ int32_t nbroadcast = rbuf_int(rb);
+ int32_t *XMALLOC_N(nbroadcast, broadcast_offsets);
+ for (int i = 0; i < nbroadcast; i++) {
+ broadcast_offsets[i] = rbuf_int(rb);
+ }
+
+ // build OMTs out of each offset array
+ bnc->fresh_message_tree.destroy();
+ bnc->fresh_message_tree.create_steal_sorted_array(&fresh_offsets, nfresh, nfresh);
+ bnc->stale_message_tree.destroy();
+ bnc->stale_message_tree.create_steal_sorted_array(&stale_offsets, nstale, nstale);
+ bnc->broadcast_list.destroy();
+ bnc->broadcast_list.create_steal_sorted_array(&broadcast_offsets, nbroadcast, nbroadcast);
+}
+
+// dump a buffer to stderr
+// no locking around this for now
+void
+dump_bad_block(unsigned char *vp, uint64_t size) {
+ const uint64_t linesize = 64;
+ uint64_t n = size / linesize;
+ for (uint64_t i = 0; i < n; i++) {
+ fprintf(stderr, "%p: ", vp);
+ for (uint64_t j = 0; j < linesize; j++) {
+ unsigned char c = vp[j];
+ fprintf(stderr, "%2.2X", c);
+ }
+ fprintf(stderr, "\n");
+ vp += linesize;
+ }
+ size = size % linesize;
+ for (uint64_t i=0; i<size; i++) {
+ if ((i % linesize) == 0)
+ fprintf(stderr, "%p: ", vp+i);
+ fprintf(stderr, "%2.2X", vp[i]);
+ if (((i+1) % linesize) == 0)
+ fprintf(stderr, "\n");
+ }
+ fprintf(stderr, "\n");
+}
+
+////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////
+
+BASEMENTNODE toku_create_empty_bn(void) {
+ BASEMENTNODE bn = toku_create_empty_bn_no_buffer();
+ bn->data_buffer.initialize_empty();
+ return bn;
+}
+
+BASEMENTNODE toku_clone_bn(BASEMENTNODE orig_bn) {
+ BASEMENTNODE bn = toku_create_empty_bn_no_buffer();
+ bn->max_msn_applied = orig_bn->max_msn_applied;
+ bn->seqinsert = orig_bn->seqinsert;
+ bn->stale_ancestor_messages_applied = orig_bn->stale_ancestor_messages_applied;
+ bn->stat64_delta = orig_bn->stat64_delta;
+ bn->data_buffer.clone(&orig_bn->data_buffer);
+ return bn;
+}
+
+BASEMENTNODE toku_create_empty_bn_no_buffer(void) {
+ BASEMENTNODE XMALLOC(bn);
+ bn->max_msn_applied.msn = 0;
+ bn->seqinsert = 0;
+ bn->stale_ancestor_messages_applied = false;
+ bn->stat64_delta = ZEROSTATS;
+ bn->data_buffer.init_zero();
+ return bn;
+}
+
+NONLEAF_CHILDINFO toku_create_empty_nl(void) {
+ NONLEAF_CHILDINFO XMALLOC(cn);
+ cn->msg_buffer.create();
+ cn->fresh_message_tree.create_no_array();
+ cn->stale_message_tree.create_no_array();
+ cn->broadcast_list.create_no_array();
+ memset(cn->flow, 0, sizeof cn->flow);
+ return cn;
+}
+
+// must clone the OMTs, since we serialize them along with the message buffer
+NONLEAF_CHILDINFO toku_clone_nl(NONLEAF_CHILDINFO orig_childinfo) {
+ NONLEAF_CHILDINFO XMALLOC(cn);
+ cn->msg_buffer.clone(&orig_childinfo->msg_buffer);
+ cn->fresh_message_tree.create_no_array();
+ cn->fresh_message_tree.clone(orig_childinfo->fresh_message_tree);
+ cn->stale_message_tree.create_no_array();
+ cn->stale_message_tree.clone(orig_childinfo->stale_message_tree);
+ cn->broadcast_list.create_no_array();
+ cn->broadcast_list.clone(orig_childinfo->broadcast_list);
+ memset(cn->flow, 0, sizeof cn->flow);
+ return cn;
+}
+
+void destroy_basement_node (BASEMENTNODE bn)
+{
+ bn->data_buffer.destroy();
+ toku_free(bn);
+}
+
+void destroy_nonleaf_childinfo (NONLEAF_CHILDINFO nl)
+{
+ nl->msg_buffer.destroy();
+ nl->fresh_message_tree.destroy();
+ nl->stale_message_tree.destroy();
+ nl->broadcast_list.destroy();
+ toku_free(nl);
+}
+
+void read_block_from_fd_into_rbuf(
+ int fd,
+ BLOCKNUM blocknum,
+ FT ft,
+ struct rbuf *rb
+ )
+{
+ // get the file offset and block size for the block
+ DISKOFF offset, size;
+ ft->blocktable.translate_blocknum_to_offset_size(blocknum, &offset, &size);
+ DISKOFF size_aligned = roundup_to_multiple(512, size);
+ uint8_t *XMALLOC_N_ALIGNED(512, size_aligned, raw_block);
+ rbuf_init(rb, raw_block, size);
+ // read the block
+ ssize_t rlen = toku_os_pread(fd, raw_block, size_aligned, offset);
+ assert((DISKOFF)rlen >= size);
+ assert((DISKOFF)rlen <= size_aligned);
+}
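+
+// roundup_to_multiple() is used above (and throughout this file) to keep
+// offsets and buffer sizes aligned for O_DIRECT. A minimal sketch of what
+// it computes, assuming the alignment is a power of two such as 512:
+//
+//   static inline uint64_t roundup_to_multiple(uint64_t align, uint64_t v) {
+//       return (v + align - 1) & ~(align - 1); // e.g. (512, 1000) -> 1024
+//   }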
+
+static const int read_header_heuristic_max = 32*1024;
+
+#ifndef MIN
+#define MIN(a,b) (((a)>(b)) ? (b) : (a))
+#endif
+
+// Effect: If the header part of the node is small enough, then read it into the rbuf. The rbuf will be allocated to be big enough in any case.
+static void read_ftnode_header_from_fd_into_rbuf_if_small_enough(int fd, BLOCKNUM blocknum,
+ FT ft, struct rbuf *rb,
+ ftnode_fetch_extra *bfe) {
+ DISKOFF offset, size;
+ ft->blocktable.translate_blocknum_to_offset_size(blocknum, &offset, &size);
+ DISKOFF read_size = roundup_to_multiple(512, MIN(read_header_heuristic_max, size));
+ uint8_t *XMALLOC_N_ALIGNED(512, roundup_to_multiple(512, size), raw_block);
+ rbuf_init(rb, raw_block, read_size);
+
+ // read the block
+ tokutime_t t0 = toku_time_now();
+ ssize_t rlen = toku_os_pread(fd, raw_block, read_size, offset);
+ tokutime_t t1 = toku_time_now();
+
+ assert(rlen >= 0);
+ rbuf_init(rb, raw_block, rlen);
+
+ bfe->bytes_read = rlen;
+ bfe->io_time = t1 - t0;
+ toku_ft_status_update_pivot_fetch_reason(bfe);
+}
+
+//
+// read the compressed partition into the sub_block,
+// validate the checksum of the compressed data
+//
+int
+read_compressed_sub_block(struct rbuf *rb, struct sub_block *sb)
+{
+ int r = 0;
+ sb->compressed_size = rbuf_int(rb);
+ sb->uncompressed_size = rbuf_int(rb);
+ const void **cp = (const void **) &sb->compressed_ptr;
+ rbuf_literal_bytes(rb, cp, sb->compressed_size);
+ sb->xsum = rbuf_int(rb);
+ // let's check the checksum
+ uint32_t actual_xsum = toku_x1764_memory((char *)sb->compressed_ptr-8, 8+sb->compressed_size);
+ if (sb->xsum != actual_xsum) {
+ r = TOKUDB_BAD_CHECKSUM;
+ }
+ return r;
+}
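+
+// For reference, each compressed sub block read above is laid out on disk as
+// (a sketch reconstructed from the reads in read_compressed_sub_block()):
+//
+//   [compressed_size: 4][uncompressed_size: 4][compressed bytes][xsum: 4]
+//
+// The x1764 checksum covers the two size fields plus the compressed bytes,
+// which is why the check above starts 8 bytes before compressed_ptr.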
+
+static int
+read_and_decompress_sub_block(struct rbuf *rb, struct sub_block *sb)
+{
+ int r = 0;
+ r = read_compressed_sub_block(rb, sb);
+ if (r != 0) {
+ goto exit;
+ }
+
+ just_decompress_sub_block(sb);
+exit:
+ return r;
+}
+
+// Allocates space for the sub-block and decompresses the data from
+// the supplied compressed pointer.
+void
+just_decompress_sub_block(struct sub_block *sb)
+{
+ // <CER> TODO: Add assert that the subblock was read in.
+ sb->uncompressed_ptr = toku_xmalloc(sb->uncompressed_size);
+
+ toku_decompress(
+ (Bytef *) sb->uncompressed_ptr,
+ sb->uncompressed_size,
+ (Bytef *) sb->compressed_ptr,
+ sb->compressed_size
+ );
+}
+
+// verify the checksum
+int
+verify_ftnode_sub_block (struct sub_block *sb)
+{
+ int r = 0;
+ // first verify the checksum
+ uint32_t data_size = sb->uncompressed_size - 4; // checksum is 4 bytes at end
+ uint32_t stored_xsum = toku_dtoh32(*((uint32_t *)((char *)sb->uncompressed_ptr + data_size)));
+ uint32_t actual_xsum = toku_x1764_memory(sb->uncompressed_ptr, data_size);
+ if (stored_xsum != actual_xsum) {
+ dump_bad_block((Bytef *) sb->uncompressed_ptr, sb->uncompressed_size);
+ r = TOKUDB_BAD_CHECKSUM;
+ }
+ return r;
+}
+
+// This function deserializes the data stored by serialize_ftnode_info
+static int
+deserialize_ftnode_info(
+ struct sub_block *sb,
+ FTNODE node
+ )
+{
+    // sb->uncompressed_ptr stores the serialized node information
+ // this function puts that information into node
+
+ // first verify the checksum
+ int r = 0;
+ r = verify_ftnode_sub_block(sb);
+ if (r != 0) {
+ goto exit;
+ }
+
+ uint32_t data_size;
+ data_size = sb->uncompressed_size - 4; // checksum is 4 bytes at end
+
+ // now with the data verified, we can read the information into the node
+ struct rbuf rb;
+ rbuf_init(&rb, (unsigned char *) sb->uncompressed_ptr, data_size);
+
+ node->max_msn_applied_to_node_on_disk = rbuf_MSN(&rb);
+ (void)rbuf_int(&rb);
+ node->flags = rbuf_int(&rb);
+ node->height = rbuf_int(&rb);
+ if (node->layout_version_read_from_disk < FT_LAYOUT_VERSION_19) {
+ (void) rbuf_int(&rb); // optimized_for_upgrade
+ }
+ if (node->layout_version_read_from_disk >= FT_LAYOUT_VERSION_22) {
+ rbuf_TXNID(&rb, &node->oldest_referenced_xid_known);
+ }
+
+    // now create the basement nodes or childinfos, depending on whether this is a
+    // leaf node or an internal node
+    // (subtree estimates are no longer stored here)
+
+    // n_children is now in the header, and the allocation of node->bp is done in
+    // deserialize_ftnode_from_rbuf.
+
+ // now the pivots
+ if (node->n_children > 1) {
+ node->pivotkeys.deserialize_from_rbuf(&rb, node->n_children - 1);
+ } else {
+ node->pivotkeys.create_empty();
+ }
+
+ // if this is an internal node, unpack the block nums, and fill in necessary fields
+ // of childinfo
+ if (node->height > 0) {
+ for (int i = 0; i < node->n_children; i++) {
+ BP_BLOCKNUM(node,i) = rbuf_blocknum(&rb);
+ BP_WORKDONE(node, i) = 0;
+ }
+ }
+
+ // make sure that all the data was read
+ if (data_size != rb.ndone) {
+ dump_bad_block(rb.buf, rb.size);
+ abort();
+ }
+exit:
+ return r;
+}
+
+static void
+setup_available_ftnode_partition(FTNODE node, int i) {
+ if (node->height == 0) {
+ set_BLB(node, i, toku_create_empty_bn());
+ BLB_MAX_MSN_APPLIED(node,i) = node->max_msn_applied_to_node_on_disk;
+ }
+ else {
+ set_BNC(node, i, toku_create_empty_nl());
+ }
+}
+
+// Assign the child_to_read member of the bfe from the given ftnode
+// that has been brought into memory.
+static void
+update_bfe_using_ftnode(FTNODE node, ftnode_fetch_extra *bfe)
+{
+ if (bfe->type == ftnode_fetch_subset && bfe->search != NULL) {
+ // we do not take into account prefetching yet
+ // as of now, if we need a subset, the only thing
+ // we can possibly require is a single basement node
+ // we find out what basement node the query cares about
+ // and check if it is available
+ bfe->child_to_read = toku_ft_search_which_child(
+ bfe->ft->cmp,
+ node,
+ bfe->search
+ );
+ } else if (bfe->type == ftnode_fetch_keymatch) {
+ // we do not take into account prefetching yet
+ // as of now, if we need a subset, the only thing
+ // we can possibly require is a single basement node
+ // we find out what basement node the query cares about
+ // and check if it is available
+ if (node->height == 0) {
+ int left_child = bfe->leftmost_child_wanted(node);
+ int right_child = bfe->rightmost_child_wanted(node);
+ if (left_child == right_child) {
+ bfe->child_to_read = left_child;
+ }
+ }
+ }
+}
+
+// Using the search parameters in the bfe, this function will
+// initialize all of the given ftnode's partitions.
+static void
+setup_partitions_using_bfe(FTNODE node,
+ ftnode_fetch_extra *bfe,
+ bool data_in_memory)
+{
+ // Leftmost and Rightmost Child bounds.
+ int lc, rc;
+ if (bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_prefetch) {
+ lc = bfe->leftmost_child_wanted(node);
+ rc = bfe->rightmost_child_wanted(node);
+ } else {
+ lc = -1;
+ rc = -1;
+ }
+
+ //
+ // setup memory needed for the node
+ //
+ //printf("node height %d, blocknum %" PRId64 ", type %d lc %d rc %d\n", node->height, node->blocknum.b, bfe->type, lc, rc);
+ for (int i = 0; i < node->n_children; i++) {
+ BP_INIT_UNTOUCHED_CLOCK(node,i);
+ if (data_in_memory) {
+ BP_STATE(node, i) = ((bfe->wants_child_available(i) || (lc <= i && i <= rc))
+ ? PT_AVAIL : PT_COMPRESSED);
+ } else {
+ BP_STATE(node, i) = PT_ON_DISK;
+ }
+ BP_WORKDONE(node,i) = 0;
+
+ switch (BP_STATE(node,i)) {
+ case PT_AVAIL:
+ setup_available_ftnode_partition(node, i);
+ BP_TOUCH_CLOCK(node,i);
+ break;
+ case PT_COMPRESSED:
+ set_BSB(node, i, sub_block_creat());
+ break;
+ case PT_ON_DISK:
+ set_BNULL(node, i);
+ break;
+ case PT_INVALID:
+ abort();
+ }
+ }
+}
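+
+// For reference, the partition states assigned above mean:
+//   PT_AVAIL      - decompressed and usable: a basement node (leaf) or
+//                   message buffer (nonleaf) exists in memory
+//   PT_COMPRESSED - only the raw compressed bytes are held, in a sub_block
+//   PT_ON_DISK    - nothing for this partition is held in memory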
+
+static void setup_ftnode_partitions(FTNODE node, ftnode_fetch_extra *bfe, bool data_in_memory)
+// Effect: Used when reading a ftnode into main memory, this sets up the partitions.
+// We set bfe->child_to_read as well as the BP_STATE and the data pointers (e.g., with set_BSB or set_BNULL or other set_ operations).
+// Arguments: Node: the node to set up.
+// bfe: Describes the key range needed.
+//             data_in_memory: true if we have all the data (in which case we set the BP_STATE to be either PT_AVAIL or PT_COMPRESSED depending on the bfe).
+//                             false if we don't have the partitions in main memory (in which case we set the state to PT_ON_DISK).
+{
+ // Set bfe->child_to_read.
+ update_bfe_using_ftnode(node, bfe);
+
+ // Setup the partitions.
+ setup_partitions_using_bfe(node, bfe, data_in_memory);
+}
+
+/* deserialize the partition from the sub-block's uncompressed buffer
+ * and destroy the uncompressed buffer
+ */
+static int
+deserialize_ftnode_partition(
+ struct sub_block *sb,
+ FTNODE node,
+ int childnum, // which partition to deserialize
+ const toku::comparator &cmp
+ )
+{
+ int r = 0;
+ r = verify_ftnode_sub_block(sb);
+ if (r != 0) {
+ goto exit;
+ }
+ uint32_t data_size;
+ data_size = sb->uncompressed_size - 4; // checksum is 4 bytes at end
+
+ // now with the data verified, we can read the information into the node
+ struct rbuf rb;
+ rbuf_init(&rb, (unsigned char *) sb->uncompressed_ptr, data_size);
+ unsigned char ch;
+ ch = rbuf_char(&rb);
+
+ if (node->height > 0) {
+ assert(ch == FTNODE_PARTITION_MSG_BUFFER);
+ NONLEAF_CHILDINFO bnc = BNC(node, childnum);
+ if (node->layout_version_read_from_disk <= FT_LAYOUT_VERSION_26) {
+ // Layout version <= 26 did not serialize sorted message trees to disk.
+ deserialize_child_buffer_v26(bnc, &rb, cmp);
+ } else {
+ deserialize_child_buffer(bnc, &rb);
+ }
+ BP_WORKDONE(node, childnum) = 0;
+ }
+ else {
+ assert(ch == FTNODE_PARTITION_DMT_LEAVES);
+ BLB_SEQINSERT(node, childnum) = 0;
+ uint32_t num_entries = rbuf_int(&rb);
+ // we are now at the first byte of first leafentry
+ data_size -= rb.ndone; // remaining bytes of leafentry data
+
+ BASEMENTNODE bn = BLB(node, childnum);
+ bn->data_buffer.deserialize_from_rbuf(num_entries, &rb, data_size, node->layout_version_read_from_disk);
+ }
+ assert(rb.ndone == rb.size);
+exit:
+ return r;
+}
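+
+// For reference, the uncompressed partition payload parsed above is laid
+// out as (a sketch reconstructed from the reads in this function):
+//
+//   nonleaf: [FTNODE_PARTITION_MSG_BUFFER: 1 byte][message buffer]
+//            [fresh/stale/broadcast offset arrays]      (layout > 26 only)
+//   leaf:    [FTNODE_PARTITION_DMT_LEAVES: 1 byte][num_entries: 4][leaf entries]
+//
+// with the trailing 4-byte xsum already verified by verify_ftnode_sub_block().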
+
+static int
+decompress_and_deserialize_worker(struct rbuf curr_rbuf, struct sub_block curr_sb, FTNODE node, int child,
+ const toku::comparator &cmp, tokutime_t *decompress_time)
+{
+ int r = 0;
+ tokutime_t t0 = toku_time_now();
+ r = read_and_decompress_sub_block(&curr_rbuf, &curr_sb);
+ tokutime_t t1 = toku_time_now();
+ if (r == 0) {
+ // at this point, sb->uncompressed_ptr stores the serialized node partition
+ r = deserialize_ftnode_partition(&curr_sb, node, child, cmp);
+ }
+ *decompress_time = t1 - t0;
+
+ toku_free(curr_sb.uncompressed_ptr);
+ return r;
+}
+
+static int
+check_and_copy_compressed_sub_block_worker(struct rbuf curr_rbuf, struct sub_block curr_sb, FTNODE node, int child)
+{
+ int r = 0;
+ r = read_compressed_sub_block(&curr_rbuf, &curr_sb);
+ if (r != 0) {
+ goto exit;
+ }
+
+ SUB_BLOCK bp_sb;
+ bp_sb = BSB(node, child);
+ bp_sb->compressed_size = curr_sb.compressed_size;
+ bp_sb->uncompressed_size = curr_sb.uncompressed_size;
+ bp_sb->compressed_ptr = toku_xmalloc(bp_sb->compressed_size);
+ memcpy(bp_sb->compressed_ptr, curr_sb.compressed_ptr, bp_sb->compressed_size);
+exit:
+ return r;
+}
+
+static FTNODE alloc_ftnode_for_deserialize(uint32_t fullhash, BLOCKNUM blocknum) {
+// Effect: Allocate an FTNODE and fill in the values that are not read from disk.
+ FTNODE XMALLOC(node);
+ node->fullhash = fullhash;
+ node->blocknum = blocknum;
+ node->dirty = 0;
+ node->bp = nullptr;
+ node->oldest_referenced_xid_known = TXNID_NONE;
+ return node;
+}
+
+static int
+deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode,
+ FTNODE_DISK_DATA* ndd,
+ BLOCKNUM blocknum,
+ uint32_t fullhash,
+ ftnode_fetch_extra *bfe,
+ struct rbuf *rb,
+ int fd)
+// If we have enough information in the rbuf to construct a header, then do so.
+// Also fetch in the basement node if needed.
+// Return 0 if it worked. If something goes wrong (including that we are looking at some old data format that doesn't have partitions) then return nonzero.
+{
+ int r = 0;
+
+ tokutime_t t0, t1;
+ tokutime_t decompress_time = 0;
+ tokutime_t deserialize_time = 0;
+
+ t0 = toku_time_now();
+
+ FTNODE node = alloc_ftnode_for_deserialize(fullhash, blocknum);
+
+ if (rb->size < 24) {
+ // TODO: What error do we return here?
+ // Does it even matter?
+ r = toku_db_badformat();
+ goto cleanup;
+ }
+
+ const void *magic;
+ rbuf_literal_bytes(rb, &magic, 8);
+ if (memcmp(magic, "tokuleaf", 8)!=0 &&
+ memcmp(magic, "tokunode", 8)!=0) {
+ r = toku_db_badformat();
+ goto cleanup;
+ }
+
+ node->layout_version_read_from_disk = rbuf_int(rb);
+ if (node->layout_version_read_from_disk < FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES) {
+ // This code path doesn't have to worry about upgrade.
+ r = toku_db_badformat();
+ goto cleanup;
+ }
+
+ // If we get here, we know the node is at least
+ // FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES. We haven't changed
+ // the serialization format since then (this comment is correct as of
+ // version 20, which is Deadshot) so we can go ahead and say the
+ // layout version is current (it will be as soon as we finish
+ // deserializing).
+ // TODO(leif): remove node->layout_version (#5174)
+ node->layout_version = FT_LAYOUT_VERSION;
+
+ node->layout_version_original = rbuf_int(rb);
+ node->build_id = rbuf_int(rb);
+ node->n_children = rbuf_int(rb);
+    // Guaranteed to have been able to read up to here. If n_children
+ // is too big, we may have a problem, so check that we won't overflow
+ // while reading the partition locations.
+ unsigned int nhsize;
+ nhsize = serialize_node_header_size(node); // we can do this because n_children is filled in.
+ unsigned int needed_size;
+ needed_size = nhsize + 12; // we need 12 more so that we can read the compressed block size information that follows for the nodeinfo.
+ if (needed_size > rb->size) {
+ r = toku_db_badformat();
+ goto cleanup;
+ }
+
+ XMALLOC_N(node->n_children, node->bp);
+ XMALLOC_N(node->n_children, *ndd);
+ // read the partition locations
+ for (int i=0; i<node->n_children; i++) {
+ BP_START(*ndd,i) = rbuf_int(rb);
+ BP_SIZE (*ndd,i) = rbuf_int(rb);
+ }
+
+ uint32_t checksum;
+ checksum = toku_x1764_memory(rb->buf, rb->ndone);
+ uint32_t stored_checksum;
+ stored_checksum = rbuf_int(rb);
+ if (stored_checksum != checksum) {
+ dump_bad_block(rb->buf, rb->size);
+ r = TOKUDB_BAD_CHECKSUM;
+ goto cleanup;
+ }
+
+ // Now we want to read the pivot information.
+ struct sub_block sb_node_info;
+ sub_block_init(&sb_node_info);
+ sb_node_info.compressed_size = rbuf_int(rb); // we'll be able to read these because we checked the size earlier.
+ sb_node_info.uncompressed_size = rbuf_int(rb);
+ if (rb->size-rb->ndone < sb_node_info.compressed_size + 8) {
+ r = toku_db_badformat();
+ goto cleanup;
+ }
+
+    // Finish reading the compressed sub_block
+ const void **cp;
+ cp = (const void **) &sb_node_info.compressed_ptr;
+ rbuf_literal_bytes(rb, cp, sb_node_info.compressed_size);
+ sb_node_info.xsum = rbuf_int(rb);
+ // let's check the checksum
+ uint32_t actual_xsum;
+ actual_xsum = toku_x1764_memory((char *)sb_node_info.compressed_ptr-8, 8+sb_node_info.compressed_size);
+ if (sb_node_info.xsum != actual_xsum) {
+ r = TOKUDB_BAD_CHECKSUM;
+ goto cleanup;
+ }
+
+ // Now decompress the subblock
+ {
+ toku::scoped_malloc sb_node_info_buf(sb_node_info.uncompressed_size);
+ sb_node_info.uncompressed_ptr = sb_node_info_buf.get();
+ tokutime_t decompress_t0 = toku_time_now();
+ toku_decompress(
+ (Bytef *) sb_node_info.uncompressed_ptr,
+ sb_node_info.uncompressed_size,
+ (Bytef *) sb_node_info.compressed_ptr,
+ sb_node_info.compressed_size
+ );
+ tokutime_t decompress_t1 = toku_time_now();
+ decompress_time = decompress_t1 - decompress_t0;
+
+ // at this point sb->uncompressed_ptr stores the serialized node info.
+ r = deserialize_ftnode_info(&sb_node_info, node);
+ if (r != 0) {
+ goto cleanup;
+ }
+ }
+
+ // Now we have the ftnode_info. We have a bunch more stuff in the
+ // rbuf, so we might be able to store the compressed data for some
+ // objects.
+ // We can proceed to deserialize the individual subblocks.
+
+ // setup the memory of the partitions
+ // for partitions being decompressed, create either message buffer or basement node
+ // for partitions staying compressed, create sub_block
+ setup_ftnode_partitions(node, bfe, false);
+
+ // We must capture deserialize and decompression time before
+ // the pf_callback, otherwise we would double-count.
+ t1 = toku_time_now();
+ deserialize_time = (t1 - t0) - decompress_time;
+
+ // do partial fetch if necessary
+ if (bfe->type != ftnode_fetch_none) {
+ PAIR_ATTR attr;
+ r = toku_ftnode_pf_callback(node, *ndd, bfe, fd, &attr);
+ if (r != 0) {
+ goto cleanup;
+ }
+ }
+
+ // handle clock
+ for (int i = 0; i < node->n_children; i++) {
+ if (bfe->wants_child_available(i)) {
+ paranoid_invariant(BP_STATE(node,i) == PT_AVAIL);
+ BP_TOUCH_CLOCK(node,i);
+ }
+ }
+ *ftnode = node;
+ r = 0;
+
+cleanup:
+ if (r == 0) {
+ bfe->deserialize_time += deserialize_time;
+ bfe->decompress_time += decompress_time;
+ toku_ft_status_update_deserialize_times(node, deserialize_time, decompress_time);
+ }
+ if (r != 0) {
+ if (node) {
+ toku_free(*ndd);
+ toku_free(node->bp);
+ toku_free(node);
+ }
+ }
+ return r;
+}
+
+// This function takes a deserialized version 13 or 14 buffer and
+// constructs the associated internal, non-leaf ftnode object. It
+// also creates MSN's for older messages created in older versions
+// that did not generate MSN's for messages. These new MSN's are
+// generated from the root downwards, counting backwards from MIN_MSN
+// and persisted in the ft header.
+static int
+deserialize_and_upgrade_internal_node(FTNODE node,
+ struct rbuf *rb,
+ ftnode_fetch_extra *bfe,
+ STAT64INFO info)
+{
+ int version = node->layout_version_read_from_disk;
+
+ if (version == FT_LAST_LAYOUT_VERSION_WITH_FINGERPRINT) {
+ (void) rbuf_int(rb); // 10. fingerprint
+ }
+
+ node->n_children = rbuf_int(rb); // 11. n_children
+
+    // Sub-tree estimates...
+ for (int i = 0; i < node->n_children; ++i) {
+ if (version == FT_LAST_LAYOUT_VERSION_WITH_FINGERPRINT) {
+ (void) rbuf_int(rb); // 12. fingerprint
+ }
+ uint64_t nkeys = rbuf_ulonglong(rb); // 13. nkeys
+ uint64_t ndata = rbuf_ulonglong(rb); // 14. ndata
+ uint64_t dsize = rbuf_ulonglong(rb); // 15. dsize
+ (void) rbuf_char(rb); // 16. exact (char)
+ invariant(nkeys == ndata);
+ if (info) {
+ // info is non-null if we're trying to upgrade old subtree
+ // estimates to stat64info
+ info->numrows += nkeys;
+ info->numbytes += dsize;
+ }
+ }
+
+ // Pivot keys
+ node->pivotkeys.deserialize_from_rbuf(rb, node->n_children - 1);
+
+ // Create space for the child node buffers (a.k.a. partitions).
+ XMALLOC_N(node->n_children, node->bp);
+
+ // Set the child blocknums.
+ for (int i = 0; i < node->n_children; ++i) {
+ BP_BLOCKNUM(node, i) = rbuf_blocknum(rb); // 18. blocknums
+ BP_WORKDONE(node, i) = 0;
+ }
+
+ // Read in the child buffer maps.
+ for (int i = 0; i < node->n_children; ++i) {
+ // The following fields were previously used by the `sub_block_map'
+ // They include:
+ // - 4 byte index
+ (void) rbuf_int(rb);
+ // - 4 byte offset
+ (void) rbuf_int(rb);
+ // - 4 byte size
+ (void) rbuf_int(rb);
+ }
+
+    // We need to set up this node's partitions, but we can't call the
+    // existing function (setup_ftnode_partitions) because there are
+    // existing optimizations that would prevent us from bringing all
+    // of this node's partitions into memory. Instead, we use the
+    // existing bfe and node to set the bfe's child_to_read member.
+    // Then we create a temporary bfe that wants a full read to make
+    // sure we properly initialize our partitions before filling them
+    // in from our soon-to-be-upgraded node.
+ update_bfe_using_ftnode(node, bfe);
+ ftnode_fetch_extra temp_bfe;
+ temp_bfe.create_for_full_read(nullptr);
+ setup_partitions_using_bfe(node, &temp_bfe, true);
+
+    // Cache the highest MSN generated for the message buffers. This
+    // will be set in the ftnode.
+    //
+    // The way we choose MSNs for upgraded messages is delicate. The
+    // field `highest_unused_msn_for_upgrade' in the header is always an
+    // MSN that no message has yet. So when we have N messages that need
+    // MSNs, we decrement it by N, and then use it and the N-1 MSNs less
+    // than it, but we do not use the value we decremented it to.
+    //
+    // The decrementing and the assignment of MSNs to individual messages
+    // happen inside deserialize_from_rbuf_v13() (called via
+    // deserialize_child_buffer_v13() below); here we only record the
+    // highest MSN handed out for this node.
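+    //
+    // For example (illustrative numbers only): if
+    // `highest_unused_msn_for_upgrade' is 1000 and this node's buffers
+    // need MSNs for 3 messages, those messages receive MSNs 998, 999,
+    // and 1000, and the field is left at 997, which remains unused.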
+ MSN highest_msn;
+ highest_msn.msn = 0;
+
+ // Deserialize de-compressed buffers.
+ for (int i = 0; i < node->n_children; ++i) {
+ NONLEAF_CHILDINFO bnc = BNC(node, i);
+ MSN highest_msn_in_this_buffer = deserialize_child_buffer_v13(bfe->ft, bnc, rb);
+ if (highest_msn.msn == 0) {
+ highest_msn.msn = highest_msn_in_this_buffer.msn;
+ }
+ }
+
+ // Assign the highest msn from our upgrade message buffers
+ node->max_msn_applied_to_node_on_disk = highest_msn;
+ // Since we assigned MSNs to this node's messages, we need to dirty it.
+ node->dirty = 1;
+
+ // Must compute the checksum now (rather than at the end, while we
+ // still have the pointer to the buffer).
+ if (version >= FT_FIRST_LAYOUT_VERSION_WITH_END_TO_END_CHECKSUM) {
+ uint32_t expected_xsum = toku_dtoh32(*(uint32_t*)(rb->buf+rb->size-4)); // 27. checksum
+ uint32_t actual_xsum = toku_x1764_memory(rb->buf, rb->size-4);
+ if (expected_xsum != actual_xsum) {
+ fprintf(stderr, "%s:%d: Bad checksum: expected = %" PRIx32 ", actual= %" PRIx32 "\n",
+ __FUNCTION__,
+ __LINE__,
+ expected_xsum,
+ actual_xsum);
+ fprintf(stderr,
+ "Checksum failure while reading node in file %s.\n",
+ toku_cachefile_fname_in_env(bfe->ft->cf));
+ fflush(stderr);
+ return toku_db_badformat();
+ }
+ }
+
+ return 0;
+}
+
+// This function takes a deserialized version 13 or 14 buffer and
+// constructs the associated leaf ftnode object.
+static int
+deserialize_and_upgrade_leaf_node(FTNODE node,
+ struct rbuf *rb,
+ ftnode_fetch_extra *bfe,
+ STAT64INFO info)
+{
+ int r = 0;
+ int version = node->layout_version_read_from_disk;
+
+ // This is a leaf node, so the offsets in the buffer will be
+ // different from the internal node offsets above.
+ uint64_t nkeys = rbuf_ulonglong(rb); // 10. nkeys
+ uint64_t ndata = rbuf_ulonglong(rb); // 11. ndata
+ uint64_t dsize = rbuf_ulonglong(rb); // 12. dsize
+ invariant(nkeys == ndata);
+ if (info) {
+ // info is non-null if we're trying to upgrade old subtree
+ // estimates to stat64info
+ info->numrows += nkeys;
+ info->numbytes += dsize;
+ }
+
+ // This is the optimized for upgrade field.
+ if (version == FT_LAYOUT_VERSION_14) {
+ (void) rbuf_int(rb); // 13. optimized
+ }
+
+ // npartitions - This is really the number of leaf entries in
+ // our single basement node. There should only be 1 (ONE)
+ // partition, so there shouldn't be any pivot key stored. This
+ // means the loop will not iterate. We could remove the loop and
+ // assert that the value is indeed 1.
+ int npartitions = rbuf_int(rb); // 14. npartitions
+ assert(npartitions == 1);
+
+ // Set number of children to 1, since we will only have one
+ // basement node.
+ node->n_children = 1;
+ XMALLOC_N(node->n_children, node->bp);
+ node->pivotkeys.create_empty();
+
+ // Create one basement node to contain all the leaf entries by
+ // setting up the single partition and updating the bfe.
+ update_bfe_using_ftnode(node, bfe);
+ ftnode_fetch_extra temp_bfe;
+ temp_bfe.create_for_full_read(bfe->ft);
+ setup_partitions_using_bfe(node, &temp_bfe, true);
+
+ // 11. Deserialize the partition maps, though they are not used in the
+ // newer versions of ftnodes.
+ for (int i = 0; i < node->n_children; ++i) {
+ // The following fields were previously used by the `sub_block_map'
+ // They include:
+ // - 4 byte index
+ (void) rbuf_int(rb);
+ // - 4 byte offset
+ (void) rbuf_int(rb);
+ // - 4 byte size
+ (void) rbuf_int(rb);
+ }
+
+ // Copy all of the leaf entries into the single basement node.
+
+ // The number of leaf entries in buffer.
+ int n_in_buf = rbuf_int(rb); // 15. # of leaves
+ BLB_SEQINSERT(node,0) = 0;
+ BASEMENTNODE bn = BLB(node, 0);
+
+ // Read the leaf entries from the buffer, advancing the buffer
+ // as we go.
+ bool has_end_to_end_checksum = (version >= FT_FIRST_LAYOUT_VERSION_WITH_END_TO_END_CHECKSUM);
+ if (version <= FT_LAYOUT_VERSION_13) {
+ // Create our mempool.
+        // Loop through the old leaf entries, upgrading each to the current format.
+ for (int i = 0; i < n_in_buf; ++i) {
+ LEAFENTRY_13 le = reinterpret_cast<LEAFENTRY_13>(&rb->buf[rb->ndone]);
+ uint32_t disksize = leafentry_disksize_13(le);
+ rb->ndone += disksize; // 16. leaf entry (13)
+ invariant(rb->ndone<=rb->size);
+ LEAFENTRY new_le;
+ size_t new_le_size;
+ void* key = NULL;
+ uint32_t keylen = 0;
+ r = toku_le_upgrade_13_14(le,
+ &key,
+ &keylen,
+ &new_le_size,
+ &new_le);
+ assert_zero(r);
+ // Copy the pointer value straight into the OMT
+ LEAFENTRY new_le_in_bn = nullptr;
+ void *maybe_free;
+ bn->data_buffer.get_space_for_insert(
+ i,
+ key,
+ keylen,
+ new_le_size,
+ &new_le_in_bn,
+ &maybe_free
+ );
+ if (maybe_free) {
+ toku_free(maybe_free);
+ }
+ memcpy(new_le_in_bn, new_le, new_le_size);
+ toku_free(new_le);
+ }
+ } else {
+ uint32_t data_size = rb->size - rb->ndone;
+ if (has_end_to_end_checksum) {
+ data_size -= sizeof(uint32_t);
+ }
+ bn->data_buffer.deserialize_from_rbuf(n_in_buf, rb, data_size, node->layout_version_read_from_disk);
+ }
+
+ // Whatever this is must be less than the MSNs of every message above
+ // it, so it's ok to take it here.
+ bn->max_msn_applied = bfe->ft->h->highest_unused_msn_for_upgrade;
+ bn->stale_ancestor_messages_applied = false;
+ node->max_msn_applied_to_node_on_disk = bn->max_msn_applied;
+
+ // Checksum (end to end) is only on version 14
+ if (has_end_to_end_checksum) {
+ uint32_t expected_xsum = rbuf_int(rb); // 17. checksum
+ uint32_t actual_xsum = toku_x1764_memory(rb->buf, rb->size - 4);
+ if (expected_xsum != actual_xsum) {
+ fprintf(stderr, "%s:%d: Bad checksum: expected = %" PRIx32 ", actual= %" PRIx32 "\n",
+ __FUNCTION__,
+ __LINE__,
+ expected_xsum,
+ actual_xsum);
+ fprintf(stderr,
+ "Checksum failure while reading node in file %s.\n",
+ toku_cachefile_fname_in_env(bfe->ft->cf));
+ fflush(stderr);
+ return toku_db_badformat();
+ }
+ }
+
+ // We should have read the whole block by this point.
+ if (rb->ndone != rb->size) {
+ // TODO: Error handling.
+ return 1;
+ }
+
+ return r;
+}
+
+static int
+read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum,
+ DISKOFF offset, DISKOFF size,
+ FT ft,
+ struct rbuf *rb,
+ /* out */ int *layout_version_p);
+
+// This function upgrades a version 14 or 13 ftnode to the current
+// version. NOTE: This code assumes the first field of the rbuf has
+// already been read from the buffer (namely the layout_version of the
+// ftnode.)
+static int
+deserialize_and_upgrade_ftnode(FTNODE node,
+ FTNODE_DISK_DATA* ndd,
+ BLOCKNUM blocknum,
+ ftnode_fetch_extra *bfe,
+ STAT64INFO info,
+ int fd)
+{
+ int r = 0;
+ int version;
+
+ // I. First we need to de-compress the entire node, only then can
+ // we read the different sub-sections.
+ // get the file offset and block size for the block
+ DISKOFF offset, size;
+ bfe->ft->blocktable.translate_blocknum_to_offset_size(blocknum, &offset, &size);
+
+ struct rbuf rb;
+ r = read_and_decompress_block_from_fd_into_rbuf(fd,
+ blocknum,
+ offset,
+ size,
+ bfe->ft,
+ &rb,
+ &version);
+ if (r != 0) {
+ goto exit;
+ }
+
+ // Re-read the magic field from the previous call, since we are
+ // restarting with a fresh rbuf.
+ {
+ const void *magic;
+ rbuf_literal_bytes(&rb, &magic, 8); // 1. magic
+ }
+
+ // II. Start reading ftnode fields out of the decompressed buffer.
+
+ // Copy over old version info.
+ node->layout_version_read_from_disk = rbuf_int(&rb); // 2. layout version
+ version = node->layout_version_read_from_disk;
+ assert(version <= FT_LAYOUT_VERSION_14);
+ // Upgrade the current version number to the current version.
+ node->layout_version = FT_LAYOUT_VERSION;
+
+ node->layout_version_original = rbuf_int(&rb); // 3. original layout
+ node->build_id = rbuf_int(&rb); // 4. build id
+
+ // The remaining offsets into the rbuf do not map to the current
+ // version, so we need to fill in the blanks and ignore older
+ // fields.
+ (void)rbuf_int(&rb); // 5. nodesize
+ node->flags = rbuf_int(&rb); // 6. flags
+ node->height = rbuf_int(&rb); // 7. height
+
+ // If the version is less than 14, there are two extra ints here.
+    // We would need to ignore them if they are there.
+ // These are the 'fingerprints'.
+ if (version == FT_LAYOUT_VERSION_13) {
+ (void) rbuf_int(&rb); // 8. rand4
+ (void) rbuf_int(&rb); // 9. local
+ }
+
+ // The next offsets are dependent on whether this is a leaf node
+ // or not.
+
+ // III. Read in Leaf and Internal Node specific data.
+
+ // Check height to determine whether this is a leaf node or not.
+ if (node->height > 0) {
+ r = deserialize_and_upgrade_internal_node(node, &rb, bfe, info);
+ } else {
+ r = deserialize_and_upgrade_leaf_node(node, &rb, bfe, info);
+ }
+
+ XMALLOC_N(node->n_children, *ndd);
+ // Initialize the partition locations to zero, because version 14
+ // and below have no notion of partitions on disk.
+ for (int i=0; i<node->n_children; i++) {
+ BP_START(*ndd,i) = 0;
+ BP_SIZE (*ndd,i) = 0;
+ }
+
+ toku_free(rb.buf);
+exit:
+ return r;
+}
+
+static int
+deserialize_ftnode_from_rbuf(
+ FTNODE *ftnode,
+ FTNODE_DISK_DATA* ndd,
+ BLOCKNUM blocknum,
+ uint32_t fullhash,
+ ftnode_fetch_extra *bfe,
+ STAT64INFO info,
+ struct rbuf *rb,
+ int fd
+ )
+// Effect: deserializes a ftnode that is in rb (with pointer of rb just past the magic) into a FTNODE.
+{
+ int r = 0;
+ struct sub_block sb_node_info;
+
+ tokutime_t t0, t1;
+ tokutime_t decompress_time = 0;
+ tokutime_t deserialize_time = 0;
+
+ t0 = toku_time_now();
+
+ FTNODE node = alloc_ftnode_for_deserialize(fullhash, blocknum);
+
+ // now start reading from rbuf
+ // first thing we do is read the header information
+ const void *magic;
+ rbuf_literal_bytes(rb, &magic, 8);
+ if (memcmp(magic, "tokuleaf", 8)!=0 &&
+ memcmp(magic, "tokunode", 8)!=0) {
+ r = toku_db_badformat();
+ goto cleanup;
+ }
+
+ node->layout_version_read_from_disk = rbuf_int(rb);
+ lazy_assert(node->layout_version_read_from_disk >= FT_LAYOUT_MIN_SUPPORTED_VERSION);
+
+ // Check if we are reading in an older node version.
+ if (node->layout_version_read_from_disk <= FT_LAYOUT_VERSION_14) {
+ int version = node->layout_version_read_from_disk;
+ // Perform the upgrade.
+ r = deserialize_and_upgrade_ftnode(node, ndd, blocknum, bfe, info, fd);
+ if (r != 0) {
+ goto cleanup;
+ }
+
+ if (version <= FT_LAYOUT_VERSION_13) {
+ // deprecate 'TOKU_DB_VALCMP_BUILTIN'. just remove the flag
+ node->flags &= ~TOKU_DB_VALCMP_BUILTIN_13;
+ }
+
+        // If everything is ok, just re-assign the ftnode and return.
+ *ftnode = node;
+ r = 0;
+ goto cleanup;
+ }
+
+    // Upgrade versions after 14 to current. This upgrade is trivial; it
+ // removes the optimized for upgrade field, which has already been
+ // removed in the deserialization code (see
+ // deserialize_ftnode_info()).
+ node->layout_version = FT_LAYOUT_VERSION;
+ node->layout_version_original = rbuf_int(rb);
+ node->build_id = rbuf_int(rb);
+ node->n_children = rbuf_int(rb);
+ XMALLOC_N(node->n_children, node->bp);
+ XMALLOC_N(node->n_children, *ndd);
+ // read the partition locations
+ for (int i=0; i<node->n_children; i++) {
+ BP_START(*ndd,i) = rbuf_int(rb);
+ BP_SIZE (*ndd,i) = rbuf_int(rb);
+ }
+ // verify checksum of header stored
+ uint32_t checksum;
+ checksum = toku_x1764_memory(rb->buf, rb->ndone);
+ uint32_t stored_checksum;
+ stored_checksum = rbuf_int(rb);
+ if (stored_checksum != checksum) {
+ dump_bad_block(rb->buf, rb->size);
+ invariant(stored_checksum == checksum);
+ }
+
+ // now we read and decompress the pivot and child information
+ sub_block_init(&sb_node_info);
+ {
+ tokutime_t sb_decompress_t0 = toku_time_now();
+ r = read_and_decompress_sub_block(rb, &sb_node_info);
+ tokutime_t sb_decompress_t1 = toku_time_now();
+ decompress_time += sb_decompress_t1 - sb_decompress_t0;
+ }
+ if (r != 0) {
+ goto cleanup;
+ }
+
+ // at this point, sb->uncompressed_ptr stores the serialized node info
+ r = deserialize_ftnode_info(&sb_node_info, node);
+ if (r != 0) {
+ goto cleanup;
+ }
+ toku_free(sb_node_info.uncompressed_ptr);
+
+ // now that the node info has been deserialized, we can proceed to deserialize
+ // the individual sub blocks
+
+ // setup the memory of the partitions
+ // for partitions being decompressed, create either message buffer or basement node
+ // for partitions staying compressed, create sub_block
+ setup_ftnode_partitions(node, bfe, true);
+
+    // This loop is parallelizable, since we don't have a dependency on the work done so far.
+ for (int i = 0; i < node->n_children; i++) {
+ uint32_t curr_offset = BP_START(*ndd,i);
+ uint32_t curr_size = BP_SIZE(*ndd,i);
+ // the compressed, serialized partitions start at where rb is currently pointing,
+ // which would be rb->buf + rb->ndone
+        // we need to initialize curr_rbuf to point to this place
+ struct rbuf curr_rbuf = {.buf = NULL, .size = 0, .ndone = 0};
+ rbuf_init(&curr_rbuf, rb->buf + curr_offset, curr_size);
+
+ //
+ // now we are at the point where we have:
+ // - read the entire compressed node off of disk,
+ // - decompressed the pivot and offset information,
+ // - have arrived at the individual partitions.
+ //
+        // Based on the information in bfe, we want to decompress a
+        // subset of the compressed partitions (possibly none, possibly
+        // all). The partitions that we want to decompress and make
+        // available to the node, we do; the rest we simply copy in
+        // compressed form into the node, and set the state of the
+        // partition to PT_COMPRESSED.
+ //
+
+ struct sub_block curr_sb;
+ sub_block_init(&curr_sb);
+
+ // curr_rbuf is passed by value to decompress_and_deserialize_worker, so there's no ugly race condition.
+ // This would be more obvious if curr_rbuf were an array.
+
+        // setup_ftnode_partitions (called above) figured out what the
+        // state of each partition should be and set up the memory so
+        // that we are ready to use it
+
+ switch (BP_STATE(node,i)) {
+ case PT_AVAIL: {
+ // case where we read and decompress the partition
+ tokutime_t partition_decompress_time;
+ r = decompress_and_deserialize_worker(curr_rbuf, curr_sb, node, i,
+ bfe->ft->cmp, &partition_decompress_time);
+ decompress_time += partition_decompress_time;
+ if (r != 0) {
+ goto cleanup;
+ }
+ break;
+ }
+ case PT_COMPRESSED:
+ // case where we leave the partition in the compressed state
+ r = check_and_copy_compressed_sub_block_worker(curr_rbuf, curr_sb, node, i);
+ if (r != 0) {
+ goto cleanup;
+ }
+ break;
+ case PT_INVALID: // this is really bad
+ case PT_ON_DISK: // it's supposed to be in memory.
+ abort();
+ }
+ }
+ *ftnode = node;
+ r = 0;
+
+cleanup:
+ if (r == 0) {
+ t1 = toku_time_now();
+ deserialize_time = (t1 - t0) - decompress_time;
+ bfe->deserialize_time += deserialize_time;
+ bfe->decompress_time += decompress_time;
+ toku_ft_status_update_deserialize_times(node, deserialize_time, decompress_time);
+ }
+ if (r != 0) {
+ // NOTE: Right now, callers higher in the stack will assert on
+ // failure, so this is OK for production. However, if we
+ // create tools that use this function to search for errors in
+ // the FT, then we will leak memory.
+ if (node) {
+ toku_free(node);
+ }
+ }
+ return r;
+}
+
+int
+toku_deserialize_bp_from_disk(FTNODE node, FTNODE_DISK_DATA ndd, int childnum, int fd, ftnode_fetch_extra *bfe) {
+ int r = 0;
+ assert(BP_STATE(node,childnum) == PT_ON_DISK);
+ assert(node->bp[childnum].ptr.tag == BCT_NULL);
+
+ //
+ // setup the partition
+ //
+ setup_available_ftnode_partition(node, childnum);
+ BP_STATE(node,childnum) = PT_AVAIL;
+
+ //
+ // read off disk and make available in memory
+ //
+ // get the file offset and block size for the block
+ DISKOFF node_offset, total_node_disk_size;
+ bfe->ft->blocktable.translate_blocknum_to_offset_size(node->blocknum, &node_offset, &total_node_disk_size);
+
+ uint32_t curr_offset = BP_START(ndd, childnum);
+ uint32_t curr_size = BP_SIZE (ndd, childnum);
+
+ struct rbuf rb;
+ rbuf_init(&rb, nullptr, 0);
+
+ uint32_t pad_at_beginning = (node_offset+curr_offset)%512;
+ uint32_t padded_size = roundup_to_multiple(512, pad_at_beginning + curr_size);
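+    // Illustrative numbers for the padding math: if the partition starts at
+    // absolute offset 5000 with curr_size 300, then pad_at_beginning =
+    // 5000 % 512 = 392, padded_size = roundup_to_multiple(512, 692) = 1024,
+    // and the pread below starts at 5000 - 392 = 4608, which is 512-aligned.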
+
+ toku::scoped_malloc_aligned raw_block_buf(padded_size, 512);
+ uint8_t *raw_block = reinterpret_cast<uint8_t *>(raw_block_buf.get());
+ rbuf_init(&rb, pad_at_beginning+raw_block, curr_size);
+ tokutime_t t0 = toku_time_now();
+
+ // read the block
+ assert(0==((unsigned long long)raw_block)%512); // for O_DIRECT
+ assert(0==(padded_size)%512);
+ assert(0==(node_offset+curr_offset-pad_at_beginning)%512);
+ ssize_t rlen = toku_os_pread(fd, raw_block, padded_size, node_offset+curr_offset-pad_at_beginning);
+ assert((DISKOFF)rlen >= pad_at_beginning + curr_size); // we read in at least enough to get what we wanted
+ assert((DISKOFF)rlen <= padded_size); // we didn't read in too much.
+
+ tokutime_t t1 = toku_time_now();
+
+ // read sub block
+ struct sub_block curr_sb;
+ sub_block_init(&curr_sb);
+ r = read_compressed_sub_block(&rb, &curr_sb);
+ if (r != 0) {
+ return r;
+ }
+ invariant(curr_sb.compressed_ptr != NULL);
+
+ // decompress
+ toku::scoped_malloc uncompressed_buf(curr_sb.uncompressed_size);
+ curr_sb.uncompressed_ptr = uncompressed_buf.get();
+ toku_decompress((Bytef *) curr_sb.uncompressed_ptr, curr_sb.uncompressed_size,
+ (Bytef *) curr_sb.compressed_ptr, curr_sb.compressed_size);
+
+ // deserialize
+ tokutime_t t2 = toku_time_now();
+
+ r = deserialize_ftnode_partition(&curr_sb, node, childnum, bfe->ft->cmp);
+
+ tokutime_t t3 = toku_time_now();
+
+ // capture stats
+ tokutime_t io_time = t1 - t0;
+ tokutime_t decompress_time = t2 - t1;
+ tokutime_t deserialize_time = t3 - t2;
+ bfe->deserialize_time += deserialize_time;
+ bfe->decompress_time += decompress_time;
+ toku_ft_status_update_deserialize_times(node, deserialize_time, decompress_time);
+
+ bfe->bytes_read = rlen;
+ bfe->io_time = io_time;
+
+ return r;
+}
+
+// Take an ftnode partition that is in the compressed state, and make it available
+int
+toku_deserialize_bp_from_compressed(FTNODE node, int childnum, ftnode_fetch_extra *bfe) {
+ int r = 0;
+ assert(BP_STATE(node, childnum) == PT_COMPRESSED);
+ SUB_BLOCK curr_sb = BSB(node, childnum);
+
+ toku::scoped_malloc uncompressed_buf(curr_sb->uncompressed_size);
+ assert(curr_sb->uncompressed_ptr == NULL);
+ curr_sb->uncompressed_ptr = uncompressed_buf.get();
+
+ setup_available_ftnode_partition(node, childnum);
+ BP_STATE(node,childnum) = PT_AVAIL;
+
+ // decompress the sub_block
+ tokutime_t t0 = toku_time_now();
+
+ toku_decompress(
+ (Bytef *) curr_sb->uncompressed_ptr,
+ curr_sb->uncompressed_size,
+ (Bytef *) curr_sb->compressed_ptr,
+ curr_sb->compressed_size
+ );
+
+ tokutime_t t1 = toku_time_now();
+
+ r = deserialize_ftnode_partition(curr_sb, node, childnum, bfe->ft->cmp);
+
+ tokutime_t t2 = toku_time_now();
+
+ tokutime_t decompress_time = t1 - t0;
+ tokutime_t deserialize_time = t2 - t1;
+ bfe->deserialize_time += deserialize_time;
+ bfe->decompress_time += decompress_time;
+ toku_ft_status_update_deserialize_times(node, deserialize_time, decompress_time);
+
+ toku_free(curr_sb->compressed_ptr);
+ toku_free(curr_sb);
+ return r;
+}
+
+static int
+deserialize_ftnode_from_fd(int fd,
+ BLOCKNUM blocknum,
+ uint32_t fullhash,
+ FTNODE *ftnode,
+ FTNODE_DISK_DATA *ndd,
+ ftnode_fetch_extra *bfe,
+ STAT64INFO info)
+{
+ struct rbuf rb = RBUF_INITIALIZER;
+
+ tokutime_t t0 = toku_time_now();
+ read_block_from_fd_into_rbuf(fd, blocknum, bfe->ft, &rb);
+ tokutime_t t1 = toku_time_now();
+
+ // Decompress and deserialize the ftnode. Time statistics
+ // are taken inside this function.
+ int r = deserialize_ftnode_from_rbuf(ftnode, ndd, blocknum, fullhash, bfe, info, &rb, fd);
+ if (r != 0) {
+ dump_bad_block(rb.buf,rb.size);
+ }
+
+ bfe->bytes_read = rb.size;
+ bfe->io_time = t1 - t0;
+ toku_free(rb.buf);
+ return r;
+}
+
+// Read ftnode from file into struct. Perform version upgrade if necessary.
+int
+toku_deserialize_ftnode_from (int fd,
+ BLOCKNUM blocknum,
+ uint32_t fullhash,
+ FTNODE *ftnode,
+ FTNODE_DISK_DATA* ndd,
+ ftnode_fetch_extra *bfe
+ )
+// Effect: Read a node in. If possible, read just the header.
+{
+ int r = 0;
+ struct rbuf rb = RBUF_INITIALIZER;
+
+    // each function below records the appropriate io/decompression/deserialize statistics
+
+ if (!bfe->read_all_partitions) {
+ read_ftnode_header_from_fd_into_rbuf_if_small_enough(fd, blocknum, bfe->ft, &rb, bfe);
+ r = deserialize_ftnode_header_from_rbuf_if_small_enough(ftnode, ndd, blocknum, fullhash, bfe, &rb, fd);
+ } else {
+ // force us to do it the old way
+ r = -1;
+ }
+ if (r != 0) {
+ // Something went wrong, go back to doing it the old way.
+ r = deserialize_ftnode_from_fd(fd, blocknum, fullhash, ftnode, ndd, bfe, NULL);
+ }
+
+ toku_free(rb.buf);
+ return r;
+}
+
+void
+toku_verify_or_set_counts(FTNODE UU(node)) {
+}
+
+int
+toku_db_badformat(void) {
+ return DB_BADFORMAT;
+}
+
+static size_t
+serialize_rollback_log_size(ROLLBACK_LOG_NODE log) {
+ size_t size = node_header_overhead //8 "tokuroll", 4 version, 4 version_original, 4 build_id
+ +16 //TXNID_PAIR
+ +8 //sequence
+ +8 //blocknum
+ +8 //previous (blocknum)
+ +8 //resident_bytecount
+ +8 //memarena size
+ +log->rollentry_resident_bytecount;
+ return size;
+}
+
+static void
+serialize_rollback_log_node_to_buf(ROLLBACK_LOG_NODE log, char *buf, size_t calculated_size, int UU(n_sub_blocks), struct sub_block UU(sub_block[])) {
+ struct wbuf wb;
+ wbuf_init(&wb, buf, calculated_size);
+ { //Serialize rollback log to local wbuf
+ wbuf_nocrc_literal_bytes(&wb, "tokuroll", 8);
+ lazy_assert(log->layout_version == FT_LAYOUT_VERSION);
+ wbuf_nocrc_int(&wb, log->layout_version);
+ wbuf_nocrc_int(&wb, log->layout_version_original);
+ wbuf_nocrc_uint(&wb, BUILD_ID);
+ wbuf_nocrc_TXNID_PAIR(&wb, log->txnid);
+ wbuf_nocrc_ulonglong(&wb, log->sequence);
+ wbuf_nocrc_BLOCKNUM(&wb, log->blocknum);
+ wbuf_nocrc_BLOCKNUM(&wb, log->previous);
+ wbuf_nocrc_ulonglong(&wb, log->rollentry_resident_bytecount);
+ //Write down memarena size needed to restore
+ wbuf_nocrc_ulonglong(&wb, log->rollentry_arena.total_size_in_use());
+
+ {
+ //Store rollback logs
+ struct roll_entry *item;
+ size_t done_before = wb.ndone;
+ for (item = log->newest_logentry; item; item = item->prev) {
+ toku_logger_rollback_wbuf_nocrc_write(&wb, item);
+ }
+ lazy_assert(done_before + log->rollentry_resident_bytecount == wb.ndone);
+ }
+ }
+ lazy_assert(wb.ndone == wb.size);
+ lazy_assert(calculated_size==wb.ndone);
+}
+
+static void
+serialize_uncompressed_block_to_memory(char * uncompressed_buf,
+ int n_sub_blocks,
+ struct sub_block sub_block[/*n_sub_blocks*/],
+ enum toku_compression_method method,
+ /*out*/ size_t *n_bytes_to_write,
+ /*out*/ char **bytes_to_write)
+// Guarantees that the malloc'd BYTES_TO_WRITE is 512-byte aligned (so that O_DIRECT will work)
+{
+    // allocate space for the compressed version of uncompressed_buf
+ size_t compressed_len = get_sum_compressed_size_bound(n_sub_blocks, sub_block, method);
+ size_t sub_block_header_len = sub_block_header_size(n_sub_blocks);
+ size_t header_len = node_header_overhead + sub_block_header_len + sizeof (uint32_t); // node + sub_block + checksum
+ char *XMALLOC_N_ALIGNED(512, roundup_to_multiple(512, header_len + compressed_len), compressed_buf);
+
+ // copy the header
+ memcpy(compressed_buf, uncompressed_buf, node_header_overhead);
+ if (0) printf("First 4 bytes before compressing data are %02x%02x%02x%02x\n",
+ uncompressed_buf[node_header_overhead], uncompressed_buf[node_header_overhead+1],
+ uncompressed_buf[node_header_overhead+2], uncompressed_buf[node_header_overhead+3]);
+
+ // compress all of the sub blocks
+ char *uncompressed_ptr = uncompressed_buf + node_header_overhead;
+ char *compressed_ptr = compressed_buf + header_len;
+ compressed_len = compress_all_sub_blocks(n_sub_blocks, sub_block, uncompressed_ptr, compressed_ptr, num_cores, ft_pool, method);
+
+ //if (0) printf("Block %" PRId64 " Size before compressing %u, after compression %" PRIu64 "\n", blocknum.b, calculated_size-node_header_overhead, (uint64_t) compressed_len);
+
+ // serialize the sub block header
+ uint32_t *ptr = (uint32_t *)(compressed_buf + node_header_overhead);
+ *ptr++ = toku_htod32(n_sub_blocks);
+ for (int i=0; i<n_sub_blocks; i++) {
+ ptr[0] = toku_htod32(sub_block[i].compressed_size);
+ ptr[1] = toku_htod32(sub_block[i].uncompressed_size);
+ ptr[2] = toku_htod32(sub_block[i].xsum);
+ ptr += 3;
+ }
+
+ // compute the header checksum and serialize it
+ uint32_t header_length = (char *)ptr - (char *)compressed_buf;
+ uint32_t xsum = toku_x1764_memory(compressed_buf, header_length);
+ *ptr = toku_htod32(xsum);
+
+ uint32_t padded_len = roundup_to_multiple(512, header_len + compressed_len);
+ // Zero out padding.
+ for (uint32_t i = header_len+compressed_len; i < padded_len; i++) {
+ compressed_buf[i] = 0;
+ }
+ *n_bytes_to_write = padded_len;
+ *bytes_to_write = compressed_buf;
+}
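+
+// For reference, the buffer produced above is laid out as (a sketch derived
+// from this function):
+//
+//   [node header: node_header_overhead bytes]            (copied verbatim)
+//   [n_sub_blocks: 4]
+//   [compressed_size: 4][uncompressed_size: 4][xsum: 4]  (repeated per sub block)
+//   [header xsum: 4]
+//   [compressed sub block data ...]
+//   [zero padding up to a multiple of 512]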
+
+void
+toku_serialize_rollback_log_to_memory_uncompressed(ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized) {
+ // get the size of the serialized node
+ size_t calculated_size = serialize_rollback_log_size(log);
+
+ serialized->len = calculated_size;
+ serialized->n_sub_blocks = 0;
+ // choose sub block parameters
+ int sub_block_size = 0;
+ size_t data_size = calculated_size - node_header_overhead;
+ choose_sub_block_size(data_size, max_sub_blocks, &sub_block_size, &serialized->n_sub_blocks);
+ lazy_assert(0 < serialized->n_sub_blocks && serialized->n_sub_blocks <= max_sub_blocks);
+ lazy_assert(sub_block_size > 0);
+
+ // set the initial sub block size for all of the sub blocks
+ for (int i = 0; i < serialized->n_sub_blocks; i++)
+ sub_block_init(&serialized->sub_block[i]);
+ set_all_sub_block_sizes(data_size, sub_block_size, serialized->n_sub_blocks, serialized->sub_block);
+
+ // allocate space for the serialized node
+ XMALLOC_N(calculated_size, serialized->data);
+ // serialize the node into buf
+ serialize_rollback_log_node_to_buf(log, serialized->data, calculated_size, serialized->n_sub_blocks, serialized->sub_block);
+ serialized->blocknum = log->blocknum;
+}
+
+int
+toku_serialize_rollback_log_to (int fd, ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized_log, bool is_serialized,
+ FT ft, bool for_checkpoint) {
+ size_t n_to_write;
+ char *compressed_buf;
+ struct serialized_rollback_log_node serialized_local;
+
+ if (is_serialized) {
+ invariant_null(log);
+ } else {
+ invariant_null(serialized_log);
+ serialized_log = &serialized_local;
+ toku_serialize_rollback_log_to_memory_uncompressed(log, serialized_log);
+ }
+
+ BLOCKNUM blocknum = serialized_log->blocknum;
+ invariant(blocknum.b >= 0);
+
+ // Compress and malloc buffer to write
+ serialize_uncompressed_block_to_memory(serialized_log->data,
+ serialized_log->n_sub_blocks,
+ serialized_log->sub_block,
+ ft->h->compression_method,
+ &n_to_write, &compressed_buf);
+
+ // Dirties the ft
+ DISKOFF offset;
+ ft->blocktable.realloc_on_disk(blocknum, n_to_write, &offset,
+ ft, fd, for_checkpoint,
+ // We consider rollback log flushing the hottest possible allocation,
+ // since rollback logs are short-lived compared to FT nodes.
+ INT_MAX);
+
+ toku_os_full_pwrite(fd, compressed_buf, n_to_write, offset);
+ toku_free(compressed_buf);
+ if (!is_serialized) {
+ toku_static_serialized_rollback_log_destroy(&serialized_local);
+ log->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction.
+ }
+ return 0;
+}
+
+static int
+deserialize_rollback_log_from_rbuf (BLOCKNUM blocknum, ROLLBACK_LOG_NODE *log_p, struct rbuf *rb) {
+ ROLLBACK_LOG_NODE MALLOC(result);
+ int r;
+ if (result==NULL) {
+ r=get_error_errno();
+ if (0) { died0: toku_free(result); }
+ return r;
+ }
+
+ const void *magic;
+ rbuf_literal_bytes(rb, &magic, 8);
+ lazy_assert(!memcmp(magic, "tokuroll", 8));
+
+ result->layout_version = rbuf_int(rb);
+ lazy_assert((FT_LAYOUT_VERSION_25 <= result->layout_version && result->layout_version <= FT_LAYOUT_VERSION_27) ||
+ (result->layout_version == FT_LAYOUT_VERSION));
+ result->layout_version_original = rbuf_int(rb);
+ result->layout_version_read_from_disk = result->layout_version;
+ result->build_id = rbuf_int(rb);
+ result->dirty = false;
+ //TODO: Maybe add descriptor (or just descriptor version) here eventually?
+ //TODO: This is hard.. everything is shared in a single dictionary.
+ rbuf_TXNID_PAIR(rb, &result->txnid);
+ result->sequence = rbuf_ulonglong(rb);
+ result->blocknum = rbuf_blocknum(rb);
+ if (result->blocknum.b != blocknum.b) {
+ r = toku_db_badformat();
+ goto died0;
+ }
+ result->previous = rbuf_blocknum(rb);
+ result->rollentry_resident_bytecount = rbuf_ulonglong(rb);
+
+ size_t arena_initial_size = rbuf_ulonglong(rb);
+ result->rollentry_arena.create(arena_initial_size);
+ if (0) { died1: result->rollentry_arena.destroy(); goto died0; }
+
+ //Load rollback entries
+ lazy_assert(rb->size > 4);
+ //Start with empty list
+ result->oldest_logentry = result->newest_logentry = NULL;
+ while (rb->ndone < rb->size) {
+ struct roll_entry *item;
+        uint32_t rollback_fsize = rbuf_int(rb); //Counts itself: these 4 bytes are already read, so the payload is 4 smaller
+ const void *item_vec;
+ rbuf_literal_bytes(rb, &item_vec, rollback_fsize-4);
+ unsigned char* item_buf = (unsigned char*)item_vec;
+ r = toku_parse_rollback(item_buf, rollback_fsize-4, &item, &result->rollentry_arena);
+ if (r!=0) {
+ r = toku_db_badformat();
+ goto died1;
+ }
+ //Add to head of list
+ if (result->oldest_logentry) {
+ result->oldest_logentry->prev = item;
+ result->oldest_logentry = item;
+ item->prev = NULL;
+ }
+ else {
+ result->oldest_logentry = result->newest_logentry = item;
+ item->prev = NULL;
+ }
+ }
+
+ toku_free(rb->buf);
+ rb->buf = NULL;
+ *log_p = result;
+ return 0;
+}
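+
+// For reference (not part of the original source), the parse above implies this
+// layout for a decompressed rollback node:
+//
+//   "tokuroll" magic (8) | layout_version (4) | layout_version_original (4)
+//   | build_id (4) | txnid pair | sequence (8) | blocknum (8) | previous (8)
+//   | rollentry_resident_bytecount (8) | arena initial size (8)
+//   | repeated { rollback_fsize (4) | entry payload (rollback_fsize - 4) }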
+
+static int
+deserialize_rollback_log_from_rbuf_versioned (uint32_t version, BLOCKNUM blocknum,
+ ROLLBACK_LOG_NODE *log,
+ struct rbuf *rb) {
+ int r = 0;
+ ROLLBACK_LOG_NODE rollback_log_node = NULL;
+ invariant((FT_LAYOUT_VERSION_25 <= version && version <= FT_LAYOUT_VERSION_27) || version == FT_LAYOUT_VERSION);
+ r = deserialize_rollback_log_from_rbuf(blocknum, &rollback_log_node, rb);
+ if (r==0) {
+ *log = rollback_log_node;
+ }
+ return r;
+}
+
+int
+decompress_from_raw_block_into_rbuf(uint8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum) {
+ int r = 0;
+ // get the number of compressed sub blocks
+ int n_sub_blocks;
+ n_sub_blocks = toku_dtoh32(*(uint32_t*)(&raw_block[node_header_overhead]));
+
+ // verify the number of sub blocks
+ invariant(0 <= n_sub_blocks);
+ invariant(n_sub_blocks <= max_sub_blocks);
+
+ { // verify the header checksum
+ uint32_t header_length = node_header_overhead + sub_block_header_size(n_sub_blocks);
+ invariant(header_length <= raw_block_size);
+ uint32_t xsum = toku_x1764_memory(raw_block, header_length);
+ uint32_t stored_xsum = toku_dtoh32(*(uint32_t *)(raw_block + header_length));
+ if (xsum != stored_xsum) {
+ r = TOKUDB_BAD_CHECKSUM;
+ }
+ }
+
+ // deserialize the sub block header
+ struct sub_block sub_block[n_sub_blocks];
+ uint32_t *sub_block_header = (uint32_t *) &raw_block[node_header_overhead+4];
+ for (int i = 0; i < n_sub_blocks; i++) {
+ sub_block_init(&sub_block[i]);
+ sub_block[i].compressed_size = toku_dtoh32(sub_block_header[0]);
+ sub_block[i].uncompressed_size = toku_dtoh32(sub_block_header[1]);
+ sub_block[i].xsum = toku_dtoh32(sub_block_header[2]);
+ sub_block_header += 3;
+ }
+
+    // This check is deferred to here, rather than done where r is set,
+    // so that the goto does not jump over the declaration of the
+    // variable-length sub_block array above, which the compiler rejects.
+ if (r == TOKUDB_BAD_CHECKSUM) {
+ goto exit;
+ }
+
+ // verify sub block sizes
+ for (int i = 0; i < n_sub_blocks; i++) {
+ uint32_t compressed_size = sub_block[i].compressed_size;
+ if (compressed_size<=0 || compressed_size>(1<<30)) {
+ r = toku_db_badformat();
+ goto exit;
+ }
+
+ uint32_t uncompressed_size = sub_block[i].uncompressed_size;
+ if (0) printf("Block %" PRId64 " Compressed size = %u, uncompressed size=%u\n", blocknum.b, compressed_size, uncompressed_size);
+ if (uncompressed_size<=0 || uncompressed_size>(1<<30)) {
+ r = toku_db_badformat();
+ goto exit;
+ }
+ }
+
+ // sum up the uncompressed size of the sub blocks
+ size_t uncompressed_size;
+ uncompressed_size = get_sum_uncompressed_size(n_sub_blocks, sub_block);
+
+ // allocate the uncompressed buffer
+ size_t size;
+ size = node_header_overhead + uncompressed_size;
+ unsigned char *buf;
+ XMALLOC_N(size, buf);
+ rbuf_init(rb, buf, size);
+
+ // copy the uncompressed node header to the uncompressed buffer
+ memcpy(rb->buf, raw_block, node_header_overhead);
+
+ // point at the start of the compressed data (past the node header, the sub block header, and the header checksum)
+ unsigned char *compressed_data;
+ compressed_data = raw_block + node_header_overhead + sub_block_header_size(n_sub_blocks) + sizeof (uint32_t);
+
+ // point at the start of the uncompressed data
+ unsigned char *uncompressed_data;
+ uncompressed_data = rb->buf + node_header_overhead;
+
+ // decompress all the compressed sub blocks into the uncompressed buffer
+ r = decompress_all_sub_blocks(n_sub_blocks, sub_block, compressed_data, uncompressed_data, num_cores, ft_pool);
+ if (r != 0) {
+ fprintf(stderr, "%s:%d block %" PRId64 " failed %d at %p size %lu\n", __FUNCTION__, __LINE__, blocknum.b, r, raw_block, raw_block_size);
+ dump_bad_block(raw_block, raw_block_size);
+ goto exit;
+ }
+
+ rb->ndone=0;
+exit:
+ return r;
+}
+
+static int decompress_from_raw_block_into_rbuf_versioned(uint32_t version, uint8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum) {
+    // This function exists solely to accommodate future changes in compression.
+ int r = 0;
+ if ((version == FT_LAYOUT_VERSION_13 || version == FT_LAYOUT_VERSION_14) ||
+ (FT_LAYOUT_VERSION_25 <= version && version <= FT_LAYOUT_VERSION_27) ||
+ version == FT_LAYOUT_VERSION) {
+ r = decompress_from_raw_block_into_rbuf(raw_block, raw_block_size, rb, blocknum);
+ } else {
+ abort();
+ }
+ return r;
+}
+
+static int
+read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum,
+ DISKOFF offset, DISKOFF size,
+ FT ft,
+ struct rbuf *rb,
+ /* out */ int *layout_version_p) {
+ int r = 0;
+ if (0) printf("Deserializing Block %" PRId64 "\n", blocknum.b);
+
+ DISKOFF size_aligned = roundup_to_multiple(512, size);
+ uint8_t *XMALLOC_N_ALIGNED(512, size_aligned, raw_block);
+ {
+ // read the (partially compressed) block
+ ssize_t rlen = toku_os_pread(fd, raw_block, size_aligned, offset);
+ lazy_assert((DISKOFF)rlen >= size);
+ lazy_assert((DISKOFF)rlen <= size_aligned);
+ }
+ // get the layout_version
+ int layout_version;
+ {
+ uint8_t *magic = raw_block + uncompressed_magic_offset;
+ if (memcmp(magic, "tokuleaf", 8)!=0 &&
+ memcmp(magic, "tokunode", 8)!=0 &&
+ memcmp(magic, "tokuroll", 8)!=0) {
+ r = toku_db_badformat();
+ goto cleanup;
+ }
+ uint8_t *version = raw_block + uncompressed_version_offset;
+ layout_version = toku_dtoh32(*(uint32_t*)version);
+ if (layout_version < FT_LAYOUT_MIN_SUPPORTED_VERSION || layout_version > FT_LAYOUT_VERSION) {
+ r = toku_db_badformat();
+ goto cleanup;
+ }
+ }
+
+ r = decompress_from_raw_block_into_rbuf_versioned(layout_version, raw_block, size, rb, blocknum);
+ if (r != 0) {
+        // We either failed the checksum, or there is a bad format in
+        // the buffer.
+ if (r == TOKUDB_BAD_CHECKSUM) {
+ fprintf(stderr,
+ "Checksum failure while reading raw block in file %s.\n",
+ toku_cachefile_fname_in_env(ft->cf));
+ abort();
+ } else {
+ r = toku_db_badformat();
+ goto cleanup;
+ }
+ }
+
+ *layout_version_p = layout_version;
+cleanup:
+ if (r!=0) {
+ if (rb->buf) toku_free(rb->buf);
+ rb->buf = NULL;
+ }
+ if (raw_block) {
+ toku_free(raw_block);
+ }
+ return r;
+}
+
+// Read rollback log node from file into struct.
+// Perform version upgrade if necessary.
+int toku_deserialize_rollback_log_from(int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE *logp, FT ft) {
+ int layout_version = 0;
+ int r;
+
+ struct rbuf rb;
+ rbuf_init(&rb, nullptr, 0);
+
+ // get the file offset and block size for the block
+ DISKOFF offset, size;
+ ft->blocktable.translate_blocknum_to_offset_size(blocknum, &offset, &size);
+
+ // if the size is 0, then the blocknum is unused
+ if (size == 0) {
+ // blocknum is unused, just create an empty one and get out
+ ROLLBACK_LOG_NODE XMALLOC(log);
+ rollback_empty_log_init(log);
+ log->blocknum.b = blocknum.b;
+ r = 0;
+ *logp = log;
+ goto cleanup;
+ }
+
+ r = read_and_decompress_block_from_fd_into_rbuf(fd, blocknum, offset, size, ft, &rb, &layout_version);
+ if (r!=0) goto cleanup;
+
+ {
+ uint8_t *magic = rb.buf + uncompressed_magic_offset;
+ if (memcmp(magic, "tokuroll", 8)!=0) {
+ r = toku_db_badformat();
+ goto cleanup;
+ }
+ }
+
+ r = deserialize_rollback_log_from_rbuf_versioned(layout_version, blocknum, logp, &rb);
+
+cleanup:
+ if (rb.buf) {
+ toku_free(rb.buf);
+ }
+ return r;
+}
+
+int
+toku_upgrade_subtree_estimates_to_stat64info(int fd, FT ft)
+{
+ int r = 0;
+ // 15 was the last version with subtree estimates
+ invariant(ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_15);
+
+ FTNODE unused_node = NULL;
+ FTNODE_DISK_DATA unused_ndd = NULL;
+ ftnode_fetch_extra bfe;
+ bfe.create_for_min_read(ft);
+ r = deserialize_ftnode_from_fd(fd, ft->h->root_blocknum, 0, &unused_node, &unused_ndd,
+ &bfe, &ft->h->on_disk_stats);
+ ft->in_memory_stats = ft->h->on_disk_stats;
+
+ if (unused_node) {
+ toku_ftnode_free(&unused_node);
+ }
+ if (unused_ndd) {
+ toku_free(unused_ndd);
+ }
+ return r;
+}
+
+int
+toku_upgrade_msn_from_root_to_header(int fd, FT ft)
+{
+ int r;
+ // 21 was the first version with max_msn_in_ft in the header
+ invariant(ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_20);
+
+ FTNODE node;
+ FTNODE_DISK_DATA ndd;
+ ftnode_fetch_extra bfe;
+ bfe.create_for_min_read(ft);
+ r = deserialize_ftnode_from_fd(fd, ft->h->root_blocknum, 0, &node, &ndd, &bfe, nullptr);
+ if (r != 0) {
+ goto exit;
+ }
+
+ ft->h->max_msn_in_ft = node->max_msn_applied_to_node_on_disk;
+ toku_ftnode_free(&node);
+ toku_free(ndd);
+ exit:
+ return r;
+}
+
+#undef UPGRADE_STATUS_VALUE
diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.h b/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.h
new file mode 100644
index 00000000000..3ad616053e9
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.h
@@ -0,0 +1,92 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "ft/ft.h"
+#include "ft/node.h"
+#include "ft/serialize/sub_block.h"
+#include "ft/serialize/rbuf.h"
+#include "ft/serialize/wbuf.h"
+#include "ft/serialize/block_table.h"
+
+unsigned int toku_serialize_ftnode_size(FTNODE node);
+int toku_serialize_ftnode_to_memory(FTNODE node, FTNODE_DISK_DATA *ndd,
+ unsigned int basementnodesize,
+ enum toku_compression_method compression_method,
+ bool do_rebalancing, bool in_parallel,
+ size_t *n_bytes_to_write, size_t *n_uncompressed_bytes,
+ char **bytes_to_write);
+int toku_serialize_ftnode_to(int fd, BLOCKNUM, FTNODE node, FTNODE_DISK_DATA *ndd, bool do_rebalancing, FT ft, bool for_checkpoint);
+int toku_serialize_rollback_log_to(int fd, ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized_log, bool is_serialized,
+ FT ft, bool for_checkpoint);
+void toku_serialize_rollback_log_to_memory_uncompressed(ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized);
+
+int toku_deserialize_rollback_log_from(int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE *logp, FT ft);
+int toku_deserialize_bp_from_disk(FTNODE node, FTNODE_DISK_DATA ndd, int childnum, int fd, ftnode_fetch_extra *bfe);
+int toku_deserialize_bp_from_compressed(FTNODE node, int childnum, ftnode_fetch_extra *bfe);
+int toku_deserialize_ftnode_from(int fd, BLOCKNUM off, uint32_t fullhash, FTNODE *node, FTNODE_DISK_DATA *ndd, ftnode_fetch_extra *bfe);
+
+void toku_serialize_set_parallel(bool);
+
+// used by nonleaf node partial eviction
+void toku_create_compressed_partition_from_available(FTNODE node, int childnum,
+ enum toku_compression_method compression_method, SUB_BLOCK sb);
+
+// <CER> For verifying old, non-upgraded nodes (versions 13 and 14).
+int decompress_from_raw_block_into_rbuf(uint8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum);
+
+// used by verify
+int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ft, uint32_t version);
+void read_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum, FT ft, struct rbuf *rb);
+int read_compressed_sub_block(struct rbuf *rb, struct sub_block *sb);
+int verify_ftnode_sub_block(struct sub_block *sb);
+void just_decompress_sub_block(struct sub_block *sb);
+
+// used by ft-node-deserialize.cc
+void initialize_ftnode(FTNODE node, BLOCKNUM blocknum);
+int read_and_check_magic(struct rbuf *rb);
+int read_and_check_version(FTNODE node, struct rbuf *rb);
+void read_node_info(FTNODE node, struct rbuf *rb, int version);
+void allocate_and_read_partition_offsets(FTNODE node, struct rbuf *rb, FTNODE_DISK_DATA *ndd);
+int check_node_info_checksum(struct rbuf *rb);
+void read_legacy_node_info(FTNODE node, struct rbuf *rb, int version);
+int check_legacy_end_checksum(struct rbuf *rb);
+
+// exported so the loader can dump bad blocks
+void dump_bad_block(unsigned char *vp, uint64_t size);
diff --git a/storage/tokudb/PerconaFT/ft/serialize/quicklz.cc b/storage/tokudb/PerconaFT/ft/serialize/quicklz.cc
new file mode 100644
index 00000000000..44f084f3475
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/quicklz.cc
@@ -0,0 +1,887 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+// Fast data compression library
+// Copyright (C) 2006-2011 Lasse Mikkel Reinhold
+// lar@quicklz.com
+//
+// QuickLZ can be used for free under the GPL 1, 2 or 3 license (where anything
+// released into public must be open source) or under a commercial license if such
+// has been acquired (see http://www.quicklz.com/order.html). The commercial license
+// does not cover derived or ported versions created by third parties under GPL.
+
+// 1.5.0 final
+
+#include "quicklz.h"
+
+#if QLZ_VERSION_MAJOR != 1 || QLZ_VERSION_MINOR != 5 || QLZ_VERSION_REVISION != 0
+ #error quicklz.c and quicklz.h have different versions
+#endif
+
+#if (defined(__X86__) || defined(__i386__) || defined(i386) || defined(_M_IX86) || defined(__386__) || defined(__x86_64__) || defined(_M_X64))
+ #define X86X64
+#endif
+
+#define MINOFFSET 2
+#define UNCONDITIONAL_MATCHLEN 6
+#define UNCOMPRESSED_END 4
+#define CWORD_LEN 4
+
+#if QLZ_COMPRESSION_LEVEL == 1 && defined QLZ_PTR_64 && QLZ_STREAMING_BUFFER == 0
+ #define OFFSET_BASE source
+ #define CAST (ui32)(size_t)
+#else
+ #define OFFSET_BASE 0
+ #define CAST
+#endif
+
+int qlz_get_setting(int setting)
+{
+ switch (setting)
+ {
+ case 0: return QLZ_COMPRESSION_LEVEL;
+ case 1: return sizeof(qlz_state_compress);
+ case 2: return sizeof(qlz_state_decompress);
+ case 3: return QLZ_STREAMING_BUFFER;
+#ifdef QLZ_MEMORY_SAFE
+ case 6: return 1;
+#else
+ case 6: return 0;
+#endif
+ case 7: return QLZ_VERSION_MAJOR;
+ case 8: return QLZ_VERSION_MINOR;
+ case 9: return QLZ_VERSION_REVISION;
+ }
+ return -1;
+}
+
+#if QLZ_COMPRESSION_LEVEL == 1
+static int same(const unsigned char *src, size_t n)
+{
+ while(n > 0 && *(src + n) == *src)
+ n--;
+ return n == 0 ? 1 : 0;
+}
+#endif
+
+static void reset_table_compress(qlz_state_compress *state)
+{
+ int i;
+ for(i = 0; i < QLZ_HASH_VALUES; i++)
+ {
+#if QLZ_COMPRESSION_LEVEL == 1
+ state->hash[i].offset = 0;
+#else
+ state->hash_counter[i] = 0;
+ state->hash[i].offset[0] = 0;
+#endif
+ }
+}
+
+static void reset_table_decompress(qlz_state_decompress *state)
+{
+ (void)state;
+#if QLZ_COMPRESSION_LEVEL == 2
+ for(int i = 0; i < QLZ_HASH_VALUES; i++)
+ {
+ state->hash_counter[i] = 0;
+ }
+#endif
+}
+
+static __inline ui32 hash_func(ui32 i)
+{
+#if QLZ_COMPRESSION_LEVEL == 2
+ return ((i >> 9) ^ (i >> 13) ^ i) & (QLZ_HASH_VALUES - 1);
+#else
+ return ((i >> 12) ^ i) & (QLZ_HASH_VALUES - 1);
+#endif
+}
+
+static __inline ui32 fast_read(void const *src, ui32 bytes)
+{
+#ifndef X86X64
+ unsigned char *p = (unsigned char*)src;
+ switch (bytes)
+ {
+ case 4:
+ return(*p | *(p + 1) << 8 | *(p + 2) << 16 | *(p + 3) << 24);
+ case 3:
+ return(*p | *(p + 1) << 8 | *(p + 2) << 16);
+ case 2:
+ return(*p | *(p + 1) << 8);
+ case 1:
+ return(*p);
+ }
+ return 0;
+#else
+ if (bytes >= 1 && bytes <= 4)
+ return *((ui32*)src);
+ else
+ return 0;
+#endif
+}
+
+static __inline ui32 hashat(const unsigned char *src)
+{
+ ui32 fetch, hash;
+ fetch = fast_read(src, 3);
+ hash = hash_func(fetch);
+ return hash;
+}
+
+static __inline void fast_write(ui32 f, void *dst, size_t bytes)
+{
+#ifndef X86X64
+ unsigned char *p = (unsigned char*)dst;
+
+ switch (bytes)
+ {
+ case 4:
+ *p = (unsigned char)f;
+ *(p + 1) = (unsigned char)(f >> 8);
+ *(p + 2) = (unsigned char)(f >> 16);
+ *(p + 3) = (unsigned char)(f >> 24);
+ return;
+ case 3:
+ *p = (unsigned char)f;
+ *(p + 1) = (unsigned char)(f >> 8);
+ *(p + 2) = (unsigned char)(f >> 16);
+ return;
+ case 2:
+ *p = (unsigned char)f;
+ *(p + 1) = (unsigned char)(f >> 8);
+ return;
+ case 1:
+ *p = (unsigned char)f;
+ return;
+ }
+#else
+ switch (bytes)
+ {
+ case 4:
+ *((ui32*)dst) = f;
+ return;
+ case 3:
+ *((ui32*)dst) = f;
+ return;
+ case 2:
+ *((ui16 *)dst) = (ui16)f;
+ return;
+ case 1:
+ *((unsigned char*)dst) = (unsigned char)f;
+ return;
+ }
+#endif
+}
+
+
+size_t qlz_size_decompressed(const char *source)
+{
+ ui32 n, r;
+ n = (((*source) & 2) == 2) ? 4 : 1;
+ r = fast_read(source + 1 + n, n);
+ r = r & (0xffffffff >> ((4 - n)*8));
+ return r;
+}
+
+size_t qlz_size_compressed(const char *source)
+{
+ ui32 n, r;
+ n = (((*source) & 2) == 2) ? 4 : 1;
+ r = fast_read(source + 1, n);
+ r = r & (0xffffffff >> ((4 - n)*8));
+ return r;
+}
+
+static
+size_t qlz_size_header(const char *source)
+{
+ size_t n = 2*((((*source) & 2) == 2) ? 4 : 1) + 1;
+ return n;
+}
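+
+// Worked example (not part of the original source): for a payload of
+// size >= 216 the first byte has bit 1 set, so n == 4 above and the header is
+// 9 bytes: flags, 4-byte compressed size, 4-byte decompressed size. For
+// smaller payloads n == 1 and the header is 3 bytes: flags, 1-byte compressed
+// size, 1-byte decompressed size.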
+
+
+static __inline void memcpy_up(unsigned char *dst, const unsigned char *src, ui32 n)
+{
+    // Caution if modifying memcpy_up! Overlapping dst and src must be handled specially.
+#ifndef X86X64
+ unsigned char *end = dst + n;
+ while(dst < end)
+ {
+ *dst = *src;
+ dst++;
+ src++;
+ }
+#else
+ ui32 f = 0;
+ do
+ {
+ *(ui32 *)(dst + f) = *(ui32 *)(src + f);
+ f += MINOFFSET + 1;
+ }
+ while (f < n);
+#endif
+}
+
+__attribute__((unused))
+static __inline void update_hash(qlz_state_decompress *state, const unsigned char *s)
+{
+#if QLZ_COMPRESSION_LEVEL == 1
+ ui32 hash;
+ hash = hashat(s);
+ state->hash[hash].offset = s;
+ state->hash_counter[hash] = 1;
+#elif QLZ_COMPRESSION_LEVEL == 2
+ ui32 hash;
+ unsigned char c;
+ hash = hashat(s);
+ c = state->hash_counter[hash];
+ state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = s;
+ c++;
+ state->hash_counter[hash] = c;
+#endif
+ (void)state;
+ (void)s;
+}
+
+#if QLZ_COMPRESSION_LEVEL <= 2
+static void update_hash_upto(qlz_state_decompress *state, unsigned char **lh, const unsigned char *max)
+{
+ while(*lh < max)
+ {
+ (*lh)++;
+ update_hash(state, *lh);
+ }
+}
+#endif
+
+static size_t qlz_compress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_compress *state)
+{
+ const unsigned char *last_byte = source + size - 1;
+ const unsigned char *src = source;
+ unsigned char *cword_ptr = destination;
+ unsigned char *dst = destination + CWORD_LEN;
+ ui32 cword_val = 1U << 31;
+ const unsigned char *last_matchstart = last_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END;
+ ui32 fetch = 0;
+ unsigned int lits = 0;
+
+ (void) lits;
+
+ if(src <= last_matchstart)
+ fetch = fast_read(src, 3);
+
+ while(src <= last_matchstart)
+ {
+ if ((cword_val & 1) == 1)
+ {
+ // store uncompressed if compression ratio is too low
+ if (src > source + (size >> 1) && dst - destination > src - source - ((src - source) >> 5))
+ return 0;
+
+ fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
+
+ cword_ptr = dst;
+ dst += CWORD_LEN;
+ cword_val = 1U << 31;
+ fetch = fast_read(src, 3);
+ }
+#if QLZ_COMPRESSION_LEVEL == 1
+ {
+ const unsigned char *o;
+ ui32 hash, cached;
+
+ hash = hash_func(fetch);
+ cached = fetch ^ state->hash[hash].cache;
+ state->hash[hash].cache = fetch;
+
+ o = state->hash[hash].offset + OFFSET_BASE;
+ state->hash[hash].offset = CAST(src - OFFSET_BASE);
+
+#ifdef X86X64
+ if ((cached & 0xffffff) == 0 && o != OFFSET_BASE && (src - o > MINOFFSET || (src == o + 1 && lits >= 3 && src > source + 3 && same(src - 3, 6))))
+ {
+ if(cached != 0)
+ {
+#else
+ if (cached == 0 && o != OFFSET_BASE && (src - o > MINOFFSET || (src == o + 1 && lits >= 3 && src > source + 3 && same(src - 3, 6))))
+ {
+ if (*(o + 3) != *(src + 3))
+ {
+#endif
+ hash <<= 4;
+ cword_val = (cword_val >> 1) | (1U << 31);
+ fast_write((3 - 2) | hash, dst, 2);
+ src += 3;
+ dst += 2;
+ }
+ else
+ {
+ const unsigned char *old_src = src;
+ size_t matchlen;
+ hash <<= 4;
+
+ cword_val = (cword_val >> 1) | (1U << 31);
+ src += 4;
+
+ if(*(o + (src - old_src)) == *src)
+ {
+ src++;
+ if(*(o + (src - old_src)) == *src)
+ {
+ size_t q = last_byte - UNCOMPRESSED_END - (src - 5) + 1;
+ size_t remaining = q > 255 ? 255 : q;
+ src++;
+ while(*(o + (src - old_src)) == *src && (size_t)(src - old_src) < remaining)
+ src++;
+ }
+ }
+
+ matchlen = src - old_src;
+ if (matchlen < 18)
+ {
+ fast_write((ui32)(matchlen - 2) | hash, dst, 2);
+ dst += 2;
+ }
+ else
+ {
+ fast_write((ui32)(matchlen << 16) | hash, dst, 3);
+ dst += 3;
+ }
+ }
+ fetch = fast_read(src, 3);
+ lits = 0;
+ }
+ else
+ {
+ lits++;
+ *dst = *src;
+ src++;
+ dst++;
+ cword_val = (cword_val >> 1);
+#ifdef X86X64
+ fetch = fast_read(src, 3);
+#else
+ fetch = (fetch >> 8 & 0xffff) | (*(src + 2) << 16);
+#endif
+ }
+ }
+#elif QLZ_COMPRESSION_LEVEL >= 2
+ {
+ const unsigned char *o, *offset2;
+ ui32 hash, matchlen, k, m, best_k = 0;
+ unsigned char c;
+ size_t remaining = (last_byte - UNCOMPRESSED_END - src + 1) > 255 ? 255 : (last_byte - UNCOMPRESSED_END - src + 1);
+ (void)best_k;
+
+
+ //hash = hashat(src);
+ fetch = fast_read(src, 3);
+ hash = hash_func(fetch);
+
+ c = state->hash_counter[hash];
+
+ offset2 = state->hash[hash].offset[0];
+ if(offset2 < src - MINOFFSET && c > 0 && ((fast_read(offset2, 3) ^ fetch) & 0xffffff) == 0)
+ {
+ matchlen = 3;
+ if(*(offset2 + matchlen) == *(src + matchlen))
+ {
+ matchlen = 4;
+ while(*(offset2 + matchlen) == *(src + matchlen) && matchlen < remaining)
+ matchlen++;
+ }
+ }
+ else
+ matchlen = 0;
+ for(k = 1; k < QLZ_POINTERS && c > k; k++)
+ {
+ o = state->hash[hash].offset[k];
+#if QLZ_COMPRESSION_LEVEL == 3
+ if(((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET)
+#elif QLZ_COMPRESSION_LEVEL == 2
+ if(*(src + matchlen) == *(o + matchlen) && ((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET)
+#endif
+ {
+ m = 3;
+ while(*(o + m) == *(src + m) && m < remaining)
+ m++;
+#if QLZ_COMPRESSION_LEVEL == 3
+ if ((m > matchlen) || (m == matchlen && o > offset2))
+#elif QLZ_COMPRESSION_LEVEL == 2
+ if (m > matchlen)
+#endif
+ {
+ offset2 = o;
+ matchlen = m;
+ best_k = k;
+ }
+ }
+ }
+ o = offset2;
+ state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src;
+ c++;
+ state->hash_counter[hash] = c;
+
+#if QLZ_COMPRESSION_LEVEL == 3
+ if(matchlen > 2 && src - o < 131071)
+ {
+ ui32 u;
+ size_t offset = src - o;
+
+ for(u = 1; u < matchlen; u++)
+ {
+ hash = hashat(src + u);
+ c = state->hash_counter[hash]++;
+ state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src + u;
+ }
+
+ cword_val = (cword_val >> 1) | (1U << 31);
+ src += matchlen;
+
+ if(matchlen == 3 && offset <= 63)
+ {
+ *dst = (unsigned char)(offset << 2);
+ dst++;
+ }
+ else if (matchlen == 3 && offset <= 16383)
+ {
+ ui32 f = (ui32)((offset << 2) | 1);
+ fast_write(f, dst, 2);
+ dst += 2;
+ }
+ else if (matchlen <= 18 && offset <= 1023)
+ {
+ ui32 f = ((matchlen - 3) << 2) | ((ui32)offset << 6) | 2;
+ fast_write(f, dst, 2);
+ dst += 2;
+ }
+
+ else if(matchlen <= 33)
+ {
+ ui32 f = ((matchlen - 2) << 2) | ((ui32)offset << 7) | 3;
+ fast_write(f, dst, 3);
+ dst += 3;
+ }
+ else
+ {
+ ui32 f = ((matchlen - 3) << 7) | ((ui32)offset << 15) | 3;
+ fast_write(f, dst, 4);
+ dst += 4;
+ }
+ }
+ else
+ {
+ *dst = *src;
+ src++;
+ dst++;
+ cword_val = (cword_val >> 1);
+ }
+#elif QLZ_COMPRESSION_LEVEL == 2
+
+ if(matchlen > 2)
+ {
+ cword_val = (cword_val >> 1) | (1U << 31);
+ src += matchlen;
+
+ if (matchlen < 10)
+ {
+ ui32 f = best_k | ((matchlen - 2) << 2) | (hash << 5);
+ fast_write(f, dst, 2);
+ dst += 2;
+ }
+ else
+ {
+ ui32 f = best_k | (matchlen << 16) | (hash << 5);
+ fast_write(f, dst, 3);
+ dst += 3;
+ }
+ }
+ else
+ {
+ *dst = *src;
+ src++;
+ dst++;
+ cword_val = (cword_val >> 1);
+ }
+#endif
+ }
+#endif
+ }
+ while (src <= last_byte)
+ {
+ if ((cword_val & 1) == 1)
+ {
+ fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
+ cword_ptr = dst;
+ dst += CWORD_LEN;
+ cword_val = 1U << 31;
+ }
+#if QLZ_COMPRESSION_LEVEL < 3
+ if (src <= last_byte - 3)
+ {
+#if QLZ_COMPRESSION_LEVEL == 1
+ ui32 hash, fetchv;
+ fetchv = fast_read(src, 3);
+            hash = hash_func(fetchv); // hash the bytes just read, not the stale outer fetch
+ state->hash[hash].offset = CAST(src - OFFSET_BASE);
+ state->hash[hash].cache = fetchv;
+#elif QLZ_COMPRESSION_LEVEL == 2
+ ui32 hash;
+ unsigned char c;
+ hash = hashat(src);
+ c = state->hash_counter[hash];
+ state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src;
+ c++;
+ state->hash_counter[hash] = c;
+#endif
+ }
+#endif
+ *dst = *src;
+ src++;
+ dst++;
+ cword_val = (cword_val >> 1);
+ }
+
+ while((cword_val & 1) != 1)
+ cword_val = (cword_val >> 1);
+
+ fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
+
+    // the minimum size must be 9 bytes so that the qlz_size functions can always read a full 9-byte header
+ return dst - destination < 9 ? 9 : dst - destination;
+}
+
+static size_t qlz_decompress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_decompress *state, const unsigned char *history)
+{
+ const unsigned char *src = source + qlz_size_header((const char *)source);
+ unsigned char *dst = destination;
+ const unsigned char *last_destination_byte = destination + size - 1;
+ ui32 cword_val = 1;
+ const unsigned char *last_matchstart = last_destination_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END;
+ unsigned char *last_hashed = destination - 1;
+ const unsigned char *last_source_byte = source + qlz_size_compressed((const char *)source) - 1;
+ static const ui32 bitlut[16] = {4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
+
+ (void) last_source_byte;
+ (void) last_hashed;
+ (void) state;
+ (void) history;
+
+ for(;;)
+ {
+ ui32 fetch;
+
+ if (cword_val == 1)
+ {
+#ifdef QLZ_MEMORY_SAFE
+ if(src + CWORD_LEN - 1 > last_source_byte)
+ return 0;
+#endif
+ cword_val = fast_read(src, CWORD_LEN);
+ src += CWORD_LEN;
+ }
+
+#ifdef QLZ_MEMORY_SAFE
+ if(src + 4 - 1 > last_source_byte)
+ return 0;
+#endif
+
+ fetch = fast_read(src, 4);
+
+ if ((cword_val & 1) == 1)
+ {
+ ui32 matchlen;
+ const unsigned char *offset2;
+
+#if QLZ_COMPRESSION_LEVEL == 1
+ ui32 hash;
+ cword_val = cword_val >> 1;
+ hash = (fetch >> 4) & 0xfff;
+ offset2 = (const unsigned char *)(size_t)state->hash[hash].offset;
+
+ if((fetch & 0xf) != 0)
+ {
+ matchlen = (fetch & 0xf) + 2;
+ src += 2;
+ }
+ else
+ {
+ matchlen = *(src + 2);
+ src += 3;
+ }
+
+#elif QLZ_COMPRESSION_LEVEL == 2
+ ui32 hash;
+ unsigned char c;
+ cword_val = cword_val >> 1;
+ hash = (fetch >> 5) & 0x7ff;
+ c = (unsigned char)(fetch & 0x3);
+ offset2 = state->hash[hash].offset[c];
+
+ if((fetch & (28)) != 0)
+ {
+ matchlen = ((fetch >> 2) & 0x7) + 2;
+ src += 2;
+ }
+ else
+ {
+ matchlen = *(src + 2);
+ src += 3;
+ }
+
+#elif QLZ_COMPRESSION_LEVEL == 3
+ ui32 offset;
+ cword_val = cword_val >> 1;
+ if ((fetch & 3) == 0)
+ {
+ offset = (fetch & 0xff) >> 2;
+ matchlen = 3;
+ src++;
+ }
+ else if ((fetch & 2) == 0)
+ {
+ offset = (fetch & 0xffff) >> 2;
+ matchlen = 3;
+ src += 2;
+ }
+ else if ((fetch & 1) == 0)
+ {
+ offset = (fetch & 0xffff) >> 6;
+ matchlen = ((fetch >> 2) & 15) + 3;
+ src += 2;
+ }
+ else if ((fetch & 127) != 3)
+ {
+ offset = (fetch >> 7) & 0x1ffff;
+ matchlen = ((fetch >> 2) & 0x1f) + 2;
+ src += 3;
+ }
+ else
+ {
+ offset = (fetch >> 15);
+ matchlen = ((fetch >> 7) & 255) + 3;
+ src += 4;
+ }
+
+ offset2 = dst - offset;
+#endif
+
+#ifdef QLZ_MEMORY_SAFE
+ if(offset2 < history || offset2 > dst - MINOFFSET - 1)
+ return 0;
+
+ if(matchlen > (ui32)(last_destination_byte - dst - UNCOMPRESSED_END + 1))
+ return 0;
+#endif
+
+ memcpy_up(dst, offset2, matchlen);
+ dst += matchlen;
+
+#if QLZ_COMPRESSION_LEVEL <= 2
+ update_hash_upto(state, &last_hashed, dst - matchlen);
+ last_hashed = dst - 1;
+#endif
+ }
+ else
+ {
+ if (dst < last_matchstart)
+ {
+ unsigned int n = bitlut[cword_val & 0xf];
+#ifdef X86X64
+ *(ui32 *)dst = *(ui32 *)src;
+#else
+ memcpy_up(dst, src, 4);
+#endif
+ cword_val = cword_val >> n;
+ dst += n;
+ src += n;
+#if QLZ_COMPRESSION_LEVEL <= 2
+ update_hash_upto(state, &last_hashed, dst - 3);
+#endif
+ }
+ else
+ {
+ while(dst <= last_destination_byte)
+ {
+ if (cword_val == 1)
+ {
+ src += CWORD_LEN;
+ cword_val = 1U << 31;
+ }
+#ifdef QLZ_MEMORY_SAFE
+ if(src >= last_source_byte + 1)
+ return 0;
+#endif
+ *dst = *src;
+ dst++;
+ src++;
+ cword_val = cword_val >> 1;
+ }
+
+#if QLZ_COMPRESSION_LEVEL <= 2
+ update_hash_upto(state, &last_hashed, last_destination_byte - 3); // todo, use constant
+#endif
+ return size;
+ }
+
+ }
+ }
+}
+
+size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state)
+{
+ size_t r;
+ ui32 compressed;
+ size_t base;
+
+ if(size == 0 || size > 0xffffffff - 400)
+ return 0;
+
+ if(size < 216)
+ base = 3;
+ else
+ base = 9;
+
+#if QLZ_STREAMING_BUFFER > 0
+ if (state->stream_counter + size - 1 >= QLZ_STREAMING_BUFFER)
+#endif
+ {
+ reset_table_compress(state);
+ r = base + qlz_compress_core((const unsigned char *)source, (unsigned char*)destination + base, size, state);
+#if QLZ_STREAMING_BUFFER > 0
+ reset_table_compress(state);
+#endif
+ if(r == base)
+ {
+ memcpy(destination + base, source, size);
+ r = size + base;
+ compressed = 0;
+ }
+ else
+ {
+ compressed = 1;
+ }
+ state->stream_counter = 0;
+ }
+#if QLZ_STREAMING_BUFFER > 0
+ else
+ {
+ unsigned char *src = state->stream_buffer + state->stream_counter;
+
+ memcpy(src, source, size);
+ r = base + qlz_compress_core(src, (unsigned char*)destination + base, size, state);
+
+ if(r == base)
+ {
+ memcpy(destination + base, src, size);
+ r = size + base;
+ compressed = 0;
+ reset_table_compress(state);
+ }
+ else
+ {
+ compressed = 1;
+ }
+ state->stream_counter += size;
+ }
+#endif
+ if(base == 3)
+ {
+ *destination = (unsigned char)(0 | compressed);
+ *(destination + 1) = (unsigned char)r;
+ *(destination + 2) = (unsigned char)size;
+ }
+ else
+ {
+ *destination = (unsigned char)(2 | compressed);
+ fast_write((ui32)r, destination + 1, 4);
+ fast_write((ui32)size, destination + 5, 4);
+ }
+
+ *destination |= (QLZ_COMPRESSION_LEVEL << 2);
+ *destination |= (1 << 6);
+ *destination |= ((QLZ_STREAMING_BUFFER == 0 ? 0 : (QLZ_STREAMING_BUFFER == 100000 ? 1 : (QLZ_STREAMING_BUFFER == 1000000 ? 2 : 3))) << 4);
+
+// First destination byte, bits 7..0: 01SSLLHC
+//   C  = compressed flag (0 means the payload is stored uncompressed)
+//   H  = header form (0: 1-byte size fields, 1: 4-byte size fields)
+//   LL = QLZ_COMPRESSION_LEVEL
+//   SS = QLZ_STREAMING_BUFFER setting (0, 100000, 1000000, or other)
+//   01 = constant marker bits
+
+ return r;
+}
+
+size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state)
+{
+ size_t dsiz = qlz_size_decompressed(source);
+
+#if QLZ_STREAMING_BUFFER > 0
+ if (state->stream_counter + qlz_size_decompressed(source) - 1 >= QLZ_STREAMING_BUFFER)
+#endif
+ {
+ if((*source & 1) == 1)
+ {
+ reset_table_decompress(state);
+ dsiz = qlz_decompress_core((const unsigned char *)source, (unsigned char *)destination, dsiz, state, (const unsigned char *)destination);
+ }
+ else
+ {
+ memcpy(destination, source + qlz_size_header(source), dsiz);
+ }
+ state->stream_counter = 0;
+ reset_table_decompress(state);
+ }
+#if QLZ_STREAMING_BUFFER > 0
+ else
+ {
+ unsigned char *dst = state->stream_buffer + state->stream_counter;
+ if((*source & 1) == 1)
+ {
+ dsiz = qlz_decompress_core((const unsigned char *)source, dst, dsiz, state, (const unsigned char *)state->stream_buffer);
+ }
+ else
+ {
+ memcpy(dst, source + qlz_size_header(source), dsiz);
+ reset_table_decompress(state);
+ }
+ memcpy(destination, dst, dsiz);
+ state->stream_counter += dsiz;
+ }
+#endif
+ return dsiz;
+}
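+
+// A minimal usage sketch (not part of the original source), assuming the
+// non-streaming configuration in quicklz.h (QLZ_STREAMING_BUFFER == 0) and the
+// documented destination overhead of 400 bytes for incompressible data:
+//
+//   qlz_state_compress *cs = (qlz_state_compress *)calloc(1, sizeof *cs);
+//   char *dst = (char *)malloc(len + 400);
+//   size_t clen = qlz_compress(src, dst, len, cs);
+//   ...
+//   qlz_state_decompress *ds = (qlz_state_decompress *)calloc(1, sizeof *ds);
+//   size_t dlen = qlz_decompress(dst, out, ds); // out sized via qlz_size_decompressed(dst)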
+
diff --git a/storage/tokudb/PerconaFT/ft/serialize/quicklz.h b/storage/tokudb/PerconaFT/ft/serialize/quicklz.h
new file mode 100644
index 00000000000..b9ce2f9913c
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/quicklz.h
@@ -0,0 +1,177 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// Fast data compression library
+// Copyright (C) 2006-2011 Lasse Mikkel Reinhold
+// lar@quicklz.com
+//
+// QuickLZ can be used for free under the GPL 1, 2 or 3 license (where anything
+// released into public must be open source) or under a commercial license if such
+// has been acquired (see http://www.quicklz.com/order.html). The commercial license
+// does not cover derived or ported versions created by third parties under GPL.
+
+// You can edit the following user settings. Data must be decompressed with the
+// same settings of QLZ_COMPRESSION_LEVEL and QLZ_STREAMING_BUFFER as it was
+// compressed with (see manual). If QLZ_STREAMING_BUFFER > 0, scratch buffers
+// must be zeroed out initially (see manual). The first #ifndef makes it possible
+// to define the settings from the outside, e.g. on the compiler command line.
+
+// 1.5.0 final
+
+#ifndef QLZ_COMPRESSION_LEVEL
+ //#define QLZ_COMPRESSION_LEVEL 1
+ //#define QLZ_COMPRESSION_LEVEL 2
+ #define QLZ_COMPRESSION_LEVEL 3
+
+ #define QLZ_STREAMING_BUFFER 0
+ //#define QLZ_STREAMING_BUFFER 100000
+ //#define QLZ_STREAMING_BUFFER 1000000
+
+ //#define QLZ_MEMORY_SAFE
+#endif
+
+#define QLZ_VERSION_MAJOR 1
+#define QLZ_VERSION_MINOR 5
+#define QLZ_VERSION_REVISION 0
+
+// Using size_t, memset() and memcpy()
+#include <string.h>
+
+// Verify compression level
+#if QLZ_COMPRESSION_LEVEL != 1 && QLZ_COMPRESSION_LEVEL != 2 && QLZ_COMPRESSION_LEVEL != 3
+#error QLZ_COMPRESSION_LEVEL must be 1, 2 or 3
+#endif
+
+typedef unsigned int ui32;
+typedef unsigned short int ui16;
+
+// Decrease QLZ_POINTERS for level 3 to increase compression speed. Do not touch any other values!
+#if QLZ_COMPRESSION_LEVEL == 1
+#define QLZ_POINTERS 1
+#define QLZ_HASH_VALUES 4096
+#elif QLZ_COMPRESSION_LEVEL == 2
+#define QLZ_POINTERS 4
+#define QLZ_HASH_VALUES 2048
+#elif QLZ_COMPRESSION_LEVEL == 3
+#define QLZ_POINTERS 16
+#define QLZ_HASH_VALUES 4096
+#endif
+
+// Detect if pointer size is 64-bit. It's not fatal if some 64-bit target is not detected because this is only for adding an optional 64-bit optimization.
+#if defined _LP64 || defined __LP64__ || defined __64BIT__ || _ADDR64 || defined _WIN64 || defined __arch64__ || __WORDSIZE == 64 || (defined __sparc && defined __sparcv9) || defined __x86_64 || defined __amd64 || defined __x86_64__ || defined _M_X64 || defined _M_IA64 || defined __ia64 || defined __IA64__
+ #define QLZ_PTR_64
+#endif
+
+// hash entry
+typedef struct
+{
+#if QLZ_COMPRESSION_LEVEL == 1
+ ui32 cache;
+#if defined QLZ_PTR_64 && QLZ_STREAMING_BUFFER == 0
+ unsigned int offset;
+#else
+ const unsigned char *offset;
+#endif
+#else
+ const unsigned char *offset[QLZ_POINTERS];
+#endif
+
+} qlz_hash_compress;
+
+typedef struct
+{
+#if QLZ_COMPRESSION_LEVEL == 1
+ const unsigned char *offset;
+#else
+ const unsigned char *offset[QLZ_POINTERS];
+#endif
+} qlz_hash_decompress;
+
+
+// states
+typedef struct
+{
+ #if QLZ_STREAMING_BUFFER > 0
+ unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
+ #endif
+ size_t stream_counter;
+ qlz_hash_compress hash[QLZ_HASH_VALUES];
+ unsigned char hash_counter[QLZ_HASH_VALUES];
+} qlz_state_compress;
+
+
+#if QLZ_COMPRESSION_LEVEL == 1 || QLZ_COMPRESSION_LEVEL == 2
+ typedef struct
+ {
+#if QLZ_STREAMING_BUFFER > 0
+ unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
+#endif
+ qlz_hash_decompress hash[QLZ_HASH_VALUES];
+ unsigned char hash_counter[QLZ_HASH_VALUES];
+ size_t stream_counter;
+ } qlz_state_decompress;
+#elif QLZ_COMPRESSION_LEVEL == 3
+ typedef struct
+ {
+#if QLZ_STREAMING_BUFFER > 0
+ unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
+#endif
+#if QLZ_COMPRESSION_LEVEL <= 2
+ qlz_hash_decompress hash[QLZ_HASH_VALUES];
+#endif
+ size_t stream_counter;
+ } qlz_state_decompress;
+#endif
+
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+// Public functions of QuickLZ
+size_t qlz_size_decompressed(const char *source);
+size_t qlz_size_compressed(const char *source);
+size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state);
+size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state);
+int qlz_get_setting(int setting);
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/storage/tokudb/PerconaFT/ft/serialize/rbuf.h b/storage/tokudb/PerconaFT/ft/serialize/rbuf.h
new file mode 100644
index 00000000000..c14dedbf992
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/rbuf.h
@@ -0,0 +1,156 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <string.h>
+
+#include "portability/memory.h"
+#include "portability/toku_assert.h"
+#include "portability/toku_htonl.h"
+#include "portability/toku_portability.h"
+#include "util/memarena.h"
+
+struct rbuf {
+ unsigned char *buf;
+ unsigned int size;
+ unsigned int ndone;
+};
+#define RBUF_INITIALIZER ((struct rbuf){.buf = NULL, .size=0, .ndone=0})
+
+static inline void rbuf_init(struct rbuf *r, unsigned char *buf, unsigned int size) {
+ r->buf = buf;
+ r->size = size;
+ r->ndone = 0;
+}
+
+static inline unsigned int rbuf_get_roffset(struct rbuf *r) {
+ return r->ndone;
+}
+
+static inline unsigned char rbuf_char (struct rbuf *r) {
+ assert(r->ndone<r->size);
+ return r->buf[r->ndone++];
+}
+
+static inline void rbuf_ma_uint8_t (struct rbuf *r, memarena *ma __attribute__((__unused__)), uint8_t *num) {
+ *num = rbuf_char(r);
+}
+
+static inline void rbuf_ma_bool (struct rbuf *r, memarena *ma __attribute__((__unused__)), bool *b) {
+ uint8_t n = rbuf_char(r);
+ *b = (n!=0);
+}
+
+//Read an int that MUST be in network order regardless of disk order
+static unsigned int rbuf_network_int (struct rbuf *r) __attribute__((__unused__));
+static unsigned int rbuf_network_int (struct rbuf *r) {
+ assert(r->ndone+4 <= r->size);
+ uint32_t result = toku_ntohl(*(uint32_t*)(r->buf+r->ndone)); // This only works on machines where unaligned loads are OK.
+ r->ndone+=4;
+ return result;
+}
+
+static unsigned int rbuf_int (struct rbuf *r) {
+#if 1
+ assert(r->ndone+4 <= r->size);
+ uint32_t result = toku_dtoh32(*(uint32_t*)(r->buf+r->ndone)); // This only works on machines where unaligned loads are OK.
+ r->ndone+=4;
+ return result;
+#else
+ unsigned char c0 = rbuf_char(r);
+ unsigned char c1 = rbuf_char(r);
+ unsigned char c2 = rbuf_char(r);
+ unsigned char c3 = rbuf_char(r);
+ return ((c0<<24)|
+ (c1<<16)|
+ (c2<<8)|
+ (c3<<0));
+#endif
+}
+
+static inline void rbuf_literal_bytes (struct rbuf *r, const void **bytes, unsigned int n_bytes) {
+ *bytes = &r->buf[r->ndone];
+ r->ndone+=n_bytes;
+ assert(r->ndone<=r->size);
+}
+
+/* Return a pointer into the middle of the buffer. */
+static inline void rbuf_bytes (struct rbuf *r, const void **bytes, unsigned int *n_bytes)
+{
+ *n_bytes = rbuf_int(r);
+ rbuf_literal_bytes(r, bytes, *n_bytes);
+}
+
+static inline unsigned long long rbuf_ulonglong (struct rbuf *r) {
+ unsigned i0 = rbuf_int(r);
+ unsigned i1 = rbuf_int(r);
+ return ((unsigned long long)(i0)<<32) | ((unsigned long long)(i1));
+}
+
+static inline signed long long rbuf_longlong (struct rbuf *r) {
+ return (signed long long)rbuf_ulonglong(r);
+}
+
+static inline void rbuf_ma_uint32_t (struct rbuf *r, memarena *ma __attribute__((__unused__)), uint32_t *num) {
+ *num = rbuf_int(r);
+}
+
+static inline void rbuf_ma_uint64_t (struct rbuf *r, memarena *ma __attribute__((__unused__)), uint64_t *num) {
+ *num = rbuf_ulonglong(r);
+}
+
+// Don't try to use the same space, malloc it
+static inline void rbuf_BYTESTRING (struct rbuf *r, BYTESTRING *bs) {
+ bs->len = rbuf_int(r);
+ uint32_t newndone = r->ndone + bs->len;
+ assert(newndone <= r->size);
+ bs->data = (char *) toku_memdup(&r->buf[r->ndone], (size_t)bs->len);
+ assert(bs->data);
+ r->ndone = newndone;
+}
+
+static inline void rbuf_ma_BYTESTRING (struct rbuf *r, memarena *ma, BYTESTRING *bs) {
+ bs->len = rbuf_int(r);
+ uint32_t newndone = r->ndone + bs->len;
+ assert(newndone <= r->size);
+ bs->data = (char *) ma->malloc_from_arena(bs->len);
+ assert(bs->data);
+ memcpy(bs->data, &r->buf[r->ndone], bs->len);
+ r->ndone = newndone;
+}
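+
+// A minimal usage sketch (not part of the original header): deserializing a
+// 4-byte count followed by that many literal bytes and a 64-bit sequence
+// number from a previously filled buffer.
+//
+//   struct rbuf rb;
+//   rbuf_init(&rb, buf, buflen);
+//   const void *bytes;
+//   unsigned int n_bytes;
+//   rbuf_bytes(&rb, &bytes, &n_bytes);            // length-prefixed bytes
+//   unsigned long long seq = rbuf_ulonglong(&rb); // two 4-byte reads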
diff --git a/storage/tokudb/PerconaFT/ft/serialize/sub_block.cc b/storage/tokudb/PerconaFT/ft/serialize/sub_block.cc
new file mode 100644
index 00000000000..c967d4b4c1c
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/sub_block.cc
@@ -0,0 +1,389 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <zlib.h>
+
+#include "portability/memory.h"
+#include "portability/toku_assert.h"
+#include "portability/toku_portability.h"
+
+#include "ft/serialize/compress.h"
+#include "ft/serialize/sub_block.h"
+#include "ft/serialize/quicklz.h"
+#include "util/threadpool.h"
+#include "util/x1764.h"
+
+SUB_BLOCK sub_block_creat(void) {
+ SUB_BLOCK XMALLOC(sb);
+ sub_block_init(sb);
+ return sb;
+}
+void sub_block_init(SUB_BLOCK sub_block) {
+ sub_block->uncompressed_ptr = 0;
+ sub_block->uncompressed_size = 0;
+
+ sub_block->compressed_ptr = 0;
+ sub_block->compressed_size_bound = 0;
+ sub_block->compressed_size = 0;
+
+ sub_block->xsum = 0;
+}
+
+// get the size of the compression header
+size_t
+sub_block_header_size(int n_sub_blocks) {
+ return sizeof (uint32_t) + n_sub_blocks * sizeof (struct stored_sub_block);
+}
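+
+// Worked example (not part of the original source), assuming stored_sub_block
+// holds the three uint32 fields written per sub block (compressed size,
+// uncompressed size, xsum): with 4 sub blocks the header is
+// 4 + 4 * 12 = 52 bytes, not counting the trailing header checksum.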
+
+void
+set_compressed_size_bound(struct sub_block *se, enum toku_compression_method method) {
+ se->compressed_size_bound = toku_compress_bound(method, se->uncompressed_size);
+}
+
+// get the sum of the sub block compressed sizes
+size_t
+get_sum_compressed_size_bound(int n_sub_blocks, struct sub_block sub_block[], enum toku_compression_method method) {
+ size_t compressed_size_bound = 0;
+ for (int i = 0; i < n_sub_blocks; i++) {
+ sub_block[i].compressed_size_bound = toku_compress_bound(method, sub_block[i].uncompressed_size);
+ compressed_size_bound += sub_block[i].compressed_size_bound;
+ }
+ return compressed_size_bound;
+}
+
+// get the sum of the sub block uncompressed sizes
+size_t
+get_sum_uncompressed_size(int n_sub_blocks, struct sub_block sub_block[]) {
+ size_t uncompressed_size = 0;
+ for (int i = 0; i < n_sub_blocks; i++)
+ uncompressed_size += sub_block[i].uncompressed_size;
+ return uncompressed_size;
+}
+
+// round a up to the next multiple of b
+static inline int
+alignup32(int a, int b) {
+ return ((a+b-1) / b) * b;
+}
+
+// Choose n_sub_blocks and sub_block_size such that their product is >= total_size
+// and sub_block_size is at least the target_sub_block_size.
+int
+choose_sub_block_size(int total_size, int n_sub_blocks_limit, int *sub_block_size_ret, int *n_sub_blocks_ret) {
+ if (total_size < 0 || n_sub_blocks_limit < 1)
+ return EINVAL;
+
+ const int alignment = 32;
+
+ int n_sub_blocks, sub_block_size;
+ n_sub_blocks = total_size / target_sub_block_size;
+ if (n_sub_blocks <= 1) {
+ if (total_size > 0 && n_sub_blocks_limit > 0)
+ n_sub_blocks = 1;
+ sub_block_size = total_size;
+ } else {
+ if (n_sub_blocks > n_sub_blocks_limit) // limit the number of sub-blocks
+ n_sub_blocks = n_sub_blocks_limit;
+ sub_block_size = alignup32(total_size / n_sub_blocks, alignment);
+ while (sub_block_size * n_sub_blocks < total_size) // round up the sub-block size until big enough
+ sub_block_size += alignment;
+ }
+
+ *sub_block_size_ret = sub_block_size;
+ *n_sub_blocks_ret = n_sub_blocks;
+
+ return 0;
+}
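+
+// Worked example (not part of the original source), assuming the
+// target_sub_block_size of 512 KiB and max_sub_blocks of 8 declared in
+// sub_block.h: for a total_size of 3 MiB, n_sub_blocks = 6 and
+// sub_block_size = alignup32(512 KiB, 32) = 512 KiB, whose product covers
+// total_size exactly; for a total_size of 300 KiB, n_sub_blocks rounds up
+// to a single sub block holding the full 300 KiB.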
+
+// Choose the right size of basement nodes. For now, just align up to
+// 256k blocks and hope it compresses well enough.
+int
+choose_basement_node_size(int total_size, int *sub_block_size_ret, int *n_sub_blocks_ret) {
+ if (total_size < 0)
+ return EINVAL;
+
+ *n_sub_blocks_ret = (total_size + max_basement_node_uncompressed_size - 1) / max_basement_node_uncompressed_size;
+ *sub_block_size_ret = max_basement_node_uncompressed_size;
+
+ return 0;
+}
+
+void
+set_all_sub_block_sizes(int total_size, int sub_block_size, int n_sub_blocks, struct sub_block sub_block[]) {
+ int size_left = total_size;
+ int i;
+ for (i = 0; i < n_sub_blocks-1; i++) {
+ sub_block[i].uncompressed_size = sub_block_size;
+ size_left -= sub_block_size;
+ }
+ if (i == 0 || size_left > 0)
+ sub_block[i].uncompressed_size = size_left;
+}
+
+// find the index of the sub block that contains the given offset
+// Returns the sub block index, or -1 if no sub block contains it
+int
+get_sub_block_index(int n_sub_blocks, struct sub_block sub_block[], size_t offset) {
+ size_t start_offset = 0;
+ for (int i = 0; i < n_sub_blocks; i++) {
+ size_t size = sub_block[i].uncompressed_size;
+ if (offset < start_offset + size)
+ return i;
+ start_offset += size;
+ }
+ return -1;
+}
+
+#include "workset.h"
+
+void
+compress_work_init(struct compress_work *w, enum toku_compression_method method, struct sub_block *sub_block) {
+ w->method = method;
+ w->sub_block = sub_block;
+}
+
+//
+// takes the uncompressed contents of sub_block
+// and compresses them into sb_compressed_ptr
+// cs_bound is the compressed size bound
+// Returns the size of the compressed data
+//
+uint32_t
+compress_nocrc_sub_block(
+ struct sub_block *sub_block,
+ void* sb_compressed_ptr,
+ uint32_t cs_bound,
+ enum toku_compression_method method
+ )
+{
+ // compress it
+ Bytef *uncompressed_ptr = (Bytef *) sub_block->uncompressed_ptr;
+ Bytef *compressed_ptr = (Bytef *) sb_compressed_ptr;
+ uLongf uncompressed_len = sub_block->uncompressed_size;
+ uLongf real_compressed_len = cs_bound;
+ toku_compress(method,
+ compressed_ptr, &real_compressed_len,
+ uncompressed_ptr, uncompressed_len);
+ return real_compressed_len;
+}
+
+void
+compress_sub_block(struct sub_block *sub_block, enum toku_compression_method method) {
+ sub_block->compressed_size = compress_nocrc_sub_block(
+ sub_block,
+ sub_block->compressed_ptr,
+ sub_block->compressed_size_bound,
+ method
+ );
+ // checksum it
+ sub_block->xsum = toku_x1764_memory(sub_block->compressed_ptr, sub_block->compressed_size);
+}
+
+void *
+compress_worker(void *arg) {
+ struct workset *ws = (struct workset *) arg;
+ while (1) {
+ struct compress_work *w = (struct compress_work *) workset_get(ws);
+ if (w == NULL)
+ break;
+ compress_sub_block(w->sub_block, w->method);
+ }
+ workset_release_ref(ws);
+ return arg;
+}
+
+size_t
+compress_all_sub_blocks(int n_sub_blocks, struct sub_block sub_block[], char *uncompressed_ptr, char *compressed_ptr, int num_cores, struct toku_thread_pool *pool, enum toku_compression_method method) {
+ char *compressed_base_ptr = compressed_ptr;
+ size_t compressed_len;
+
+ // This is a complex way to write a parallel loop. Cilk would be better.
+
+ if (n_sub_blocks == 1) {
+ // single sub-block
+ sub_block[0].uncompressed_ptr = uncompressed_ptr;
+ sub_block[0].compressed_ptr = compressed_ptr;
+ compress_sub_block(&sub_block[0], method);
+ compressed_len = sub_block[0].compressed_size;
+ } else {
+ // multiple sub-blocks
+ int T = num_cores; // T = min(num_cores, n_sub_blocks) - 1
+ if (T > n_sub_blocks)
+ T = n_sub_blocks;
+ if (T > 0)
+ T = T - 1; // threads in addition to the running thread
+
+ struct workset ws;
+ ZERO_STRUCT(ws);
+ workset_init(&ws);
+
+ struct compress_work work[n_sub_blocks];
+ workset_lock(&ws);
+ for (int i = 0; i < n_sub_blocks; i++) {
+ sub_block[i].uncompressed_ptr = uncompressed_ptr;
+ sub_block[i].compressed_ptr = compressed_ptr;
+ compress_work_init(&work[i], method, &sub_block[i]);
+ workset_put_locked(&ws, &work[i].base);
+ uncompressed_ptr += sub_block[i].uncompressed_size;
+ compressed_ptr += sub_block[i].compressed_size_bound;
+ }
+ workset_unlock(&ws);
+
+ // compress the sub-blocks
+ if (0) printf("%s:%d T=%d N=%d\n", __FUNCTION__, __LINE__, T, n_sub_blocks);
+ toku_thread_pool_run(pool, 0, &T, compress_worker, &ws);
+ workset_add_ref(&ws, T);
+ compress_worker(&ws);
+
+ // wait for all of the work to complete
+ workset_join(&ws);
+ workset_destroy(&ws);
+
+ // squeeze out the holes not used by the compress bound
+ compressed_ptr = compressed_base_ptr + sub_block[0].compressed_size;
+ for (int i = 1; i < n_sub_blocks; i++) {
+ memmove(compressed_ptr, sub_block[i].compressed_ptr, sub_block[i].compressed_size);
+ compressed_ptr += sub_block[i].compressed_size;
+ }
+
+ compressed_len = compressed_ptr - compressed_base_ptr;
+ }
+ return compressed_len;
+}
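+
+// Worked example (not part of the original source): with num_cores = 4 and
+// n_sub_blocks = 8, T = min(4, 8) - 1 = 3 helper threads are requested from
+// the pool while the calling thread also runs compress_worker, so up to 4
+// sub blocks compress in parallel; the memmove pass then closes the gaps that
+// the per-sub-block compressed_size_bound left between compressed outputs.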
+
+// initialize the decompression work
+void
+decompress_work_init(struct decompress_work *dw,
+ void *compress_ptr, uint32_t compress_size,
+ void *uncompress_ptr, uint32_t uncompress_size,
+ uint32_t xsum) {
+ dw->compress_ptr = compress_ptr;
+ dw->compress_size = compress_size;
+ dw->uncompress_ptr = uncompress_ptr;
+ dw->uncompress_size = uncompress_size;
+ dw->xsum = xsum;
+ dw->error = 0;
+}
+
+int verbose_decompress_sub_block = 1;
+
+// decompress one block
+int
+decompress_sub_block(void *compress_ptr, uint32_t compress_size, void *uncompress_ptr, uint32_t uncompress_size, uint32_t expected_xsum) {
+ int result = 0;
+
+ // verify checksum
+ uint32_t xsum = toku_x1764_memory(compress_ptr, compress_size);
+ if (xsum != expected_xsum) {
+ if (verbose_decompress_sub_block) fprintf(stderr, "%s:%d xsum %u expected %u\n", __FUNCTION__, __LINE__, xsum, expected_xsum);
+ result = EINVAL;
+ } else {
+ // decompress
+ toku_decompress((Bytef *) uncompress_ptr, uncompress_size, (Bytef *) compress_ptr, compress_size);
+ }
+ return result;
+}
+
+// decompress blocks until there is no more work to do
+void *
+decompress_worker(void *arg) {
+ struct workset *ws = (struct workset *) arg;
+ while (1) {
+ struct decompress_work *dw = (struct decompress_work *) workset_get(ws);
+ if (dw == NULL)
+ break;
+ dw->error = decompress_sub_block(dw->compress_ptr, dw->compress_size, dw->uncompress_ptr, dw->uncompress_size, dw->xsum);
+ }
+ workset_release_ref(ws);
+ return arg;
+}
+
+int
+decompress_all_sub_blocks(int n_sub_blocks, struct sub_block sub_block[], unsigned char *compressed_data, unsigned char *uncompressed_data, int num_cores, struct toku_thread_pool *pool) {
+ int r;
+
+ if (n_sub_blocks == 1) {
+ r = decompress_sub_block(compressed_data, sub_block[0].compressed_size, uncompressed_data, sub_block[0].uncompressed_size, sub_block[0].xsum);
+ } else {
+ // compute the number of additional threads needed for decompressing this node
+ int T = num_cores; // T = min(#cores, #blocks) - 1
+ if (T > n_sub_blocks)
+ T = n_sub_blocks;
+ if (T > 0)
+ T = T - 1; // threads in addition to the running thread
+
+ // init the decompression work set
+ struct workset ws;
+ ZERO_STRUCT(ws);
+ workset_init(&ws);
+
+ // initialize the decompression work and add to the work set
+ struct decompress_work decompress_work[n_sub_blocks];
+ workset_lock(&ws);
+ for (int i = 0; i < n_sub_blocks; i++) {
+ decompress_work_init(&decompress_work[i], compressed_data, sub_block[i].compressed_size, uncompressed_data, sub_block[i].uncompressed_size, sub_block[i].xsum);
+ workset_put_locked(&ws, &decompress_work[i].base);
+
+ uncompressed_data += sub_block[i].uncompressed_size;
+ compressed_data += sub_block[i].compressed_size;
+ }
+ workset_unlock(&ws);
+
+ // decompress the sub-blocks
+ if (0) printf("%s:%d Cores=%d Blocks=%d T=%d\n", __FUNCTION__, __LINE__, num_cores, n_sub_blocks, T);
+ toku_thread_pool_run(pool, 0, &T, decompress_worker, &ws);
+ workset_add_ref(&ws, T);
+ decompress_worker(&ws);
+
+ // cleanup
+ workset_join(&ws);
+ workset_destroy(&ws);
+
+ r = 0;
+ for (int i = 0; i < n_sub_blocks; i++) {
+ r = decompress_work[i].error;
+ if (r != 0)
+ break;
+ }
+ }
+
+ return r;
+}
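+
+// Round-trip sketch (illustrative; hypothetical locals, no error handling).
+// The same per-block metadata that compression filled in — compressed_size,
+// uncompressed_size and xsum — must be present in sub_block[] when
+// decompressing, whether carried in memory or re-read from the stored header.
+//
+//   size_t clen = compress_all_sub_blocks(n, sb, src, out, num_cores, pool, method);
+//   int r = decompress_all_sub_blocks(n, sb, (unsigned char *) out,
+//                                     (unsigned char *) dst, num_cores, pool);
+//   assert(r == 0 && memcmp(src, dst, total_size) == 0);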
diff --git a/storage/tokudb/PerconaFT/ft/serialize/sub_block.h b/storage/tokudb/PerconaFT/ft/serialize/sub_block.h
new file mode 100644
index 00000000000..2ae8a2a41bb
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/sub_block.h
@@ -0,0 +1,160 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "ft/serialize/compress.h"
+
+// TODO: Clean this abstraction up
+static const int max_sub_blocks = 8;
+static const int target_sub_block_size = 512 * 1024;
+static const int max_basement_nodes = 32;
+static const int max_basement_node_uncompressed_size = 256 * 1024;
+static const int max_basement_node_compressed_size = 64 * 1024;
+
+struct sub_block {
+ void *uncompressed_ptr;
+ uint32_t uncompressed_size;
+
+ void *compressed_ptr;
+    uint32_t compressed_size; // actual compressed size
+    uint32_t compressed_size_bound; // worst-case (upper bound) compressed size for this sub block
+
+ uint32_t xsum; // sub block checksum
+};
+typedef struct sub_block *SUB_BLOCK;
+
+struct stored_sub_block {
+ uint32_t uncompressed_size;
+ uint32_t compressed_size;
+ uint32_t xsum;
+};
+
+void sub_block_init(SUB_BLOCK);
+SUB_BLOCK sub_block_creat(void);
+
+// get the size of the compression header
+size_t
+sub_block_header_size(int n_sub_blocks);
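+// (presumably a leading sub-block count plus one fixed-size stored_sub_block
+// entry per sub block; see sub_block.cc for the authoritative layout)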
+
+void
+set_compressed_size_bound(struct sub_block *se, enum toku_compression_method method);
+
+// get the sum of the sub block compressed bound sizes
+size_t
+get_sum_compressed_size_bound(int n_sub_blocks, struct sub_block sub_block[], enum toku_compression_method method);
+
+// get the sum of the sub block uncompressed sizes
+size_t
+get_sum_uncompressed_size(int n_sub_blocks, struct sub_block sub_block[]);
+
+// Choose n_sub_blocks and sub_block_size such that their product is >= total_size
+// and sub_block_size is at least the target_sub_block_size.
+int
+choose_sub_block_size(int total_size, int n_sub_blocks_limit, int *sub_block_size_ret, int *n_sub_blocks_ret);
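+// Worked example of the contract (values illustrative): with total_size =
+// 1200000 and target_sub_block_size = 512*1024, two sub blocks of 600000
+// bytes satisfy 2 * 600000 >= 1200000 with each sub block >= the target.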
+
+int
+choose_basement_node_size(int total_size, int *sub_block_size_ret, int *n_sub_blocks_ret);
+
+void
+set_all_sub_block_sizes(int total_size, int sub_block_size, int n_sub_blocks, struct sub_block sub_block[]);
+
+// Find the index of the first sub block that contains the offset.
+// Returns the index if found, else returns -1.
+int
+get_sub_block_index(int n_sub_blocks, struct sub_block sub_block[], size_t offset);
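+// e.g., for three sub blocks with uncompressed sizes {100, 100, 100},
+// offset 150 lies in sub block 1, and offset 300 (past the end) yields -1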
+
+#include "workset.h"
+
+struct compress_work {
+ struct work base;
+ enum toku_compression_method method;
+ struct sub_block *sub_block;
+};
+
+void
+compress_work_init(struct compress_work *w, enum toku_compression_method method, struct sub_block *sub_block);
+
+uint32_t
+compress_nocrc_sub_block(
+ struct sub_block *sub_block,
+ void* sb_compressed_ptr,
+ uint32_t cs_bound,
+ enum toku_compression_method method
+ );
+
+void
+compress_sub_block(struct sub_block *sub_block, enum toku_compression_method method);
+
+void *
+compress_worker(void *arg);
+
+size_t
+compress_all_sub_blocks(int n_sub_blocks, struct sub_block sub_block[], char *uncompressed_ptr, char *compressed_ptr, int num_cores, struct toku_thread_pool *pool, enum toku_compression_method method);
+
+struct decompress_work {
+ struct work base;
+ void *compress_ptr;
+ void *uncompress_ptr;
+ uint32_t compress_size;
+ uint32_t uncompress_size;
+ uint32_t xsum;
+ int error;
+};
+
+// initialize the decompression work
+void
+decompress_work_init(struct decompress_work *dw,
+ void *compress_ptr, uint32_t compress_size,
+ void *uncompress_ptr, uint32_t uncompress_size,
+ uint32_t xsum);
+
+// decompress one block
+int
+decompress_sub_block(void *compress_ptr, uint32_t compress_size, void *uncompress_ptr, uint32_t uncompress_size, uint32_t expected_xsum);
+
+// decompress blocks until there is no more work to do
+void *
+decompress_worker(void *arg);
+
+// decompress all sub blocks from the compressed_data buffer to the uncompressed_data buffer
+// Returns 0 if success, otherwise an error
+int
+decompress_all_sub_blocks(int n_sub_blocks, struct sub_block sub_block[], unsigned char *compressed_data, unsigned char *uncompressed_data, int num_cores, struct toku_thread_pool *pool);
+
+extern int verbose_decompress_sub_block;
diff --git a/storage/tokudb/PerconaFT/ft/serialize/wbuf.h b/storage/tokudb/PerconaFT/ft/serialize/wbuf.h
new file mode 100644
index 00000000000..062294e2182
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/wbuf.h
@@ -0,0 +1,209 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <memory.h>
+#include <string.h>
+
+#include "portability/toku_htonl.h"
+
+#include "util/bytestring.h"
+#include "util/x1764.h"
+
+/* When serializing a value, write it into a buffer. */
+/* This code requires that the buffer be big enough to hold whatever you put into it. */
+/* This abstraction doesn't do a good job of hiding its internals.
+ * Why? The performance of this code is important, and we want to inline stuff. */
+// Why is size here an int instead of a DISKOFF like in the initializer?
+struct wbuf {
+ unsigned char *buf;
+ unsigned int size;
+ unsigned int ndone;
+ struct x1764 checksum; // The checksum state
+};
+
+static inline void wbuf_nocrc_init (struct wbuf *w, void *buf, unsigned int size) {
+ w->buf = (unsigned char *) buf;
+ w->size = size;
+ w->ndone = 0;
+}
+
+static inline void wbuf_init (struct wbuf *w, void *buf, unsigned int size) {
+ wbuf_nocrc_init(w, buf, size);
+ toku_x1764_init(&w->checksum);
+}
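+
+// Usage sketch (illustrative): write checksummed fields into a caller-owned
+// buffer. toku_x1764_finish() is assumed from util/x1764.h to finalize the
+// running checksum.
+//
+//   unsigned char buf[64];
+//   struct wbuf w;
+//   wbuf_init(&w, buf, sizeof buf);
+//   wbuf_int(&w, 42);          // 4 bytes in disk order, checksummed
+//   wbuf_bytes(&w, "key", 3);  // length-prefixed bytes, checksummed
+//   uint32_t xsum = toku_x1764_finish(&w.checksum);
+//   // buf[0 .. wbuf_get_woffset(&w)) plus xsum are ready to be written out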
+
+static inline size_t wbuf_get_woffset(struct wbuf *w) {
+ return w->ndone;
+}
+
+/* Write a character. */
+static inline void wbuf_nocrc_char (struct wbuf *w, unsigned char ch) {
+ assert(w->ndone<w->size);
+ w->buf[w->ndone++]=ch;
+}
+
+/* Write a single byte (uint8_t). */
+static inline void wbuf_nocrc_uint8_t (struct wbuf *w, uint8_t ch) {
+ assert(w->ndone<w->size);
+ w->buf[w->ndone++]=ch;
+}
+
+static inline void wbuf_char (struct wbuf *w, unsigned char ch) {
+ wbuf_nocrc_char (w, ch);
+ toku_x1764_add(&w->checksum, &w->buf[w->ndone-1], 1);
+}
+
+//Write an int that MUST be in network order regardless of disk order
+static void wbuf_network_int (struct wbuf *w, int32_t i) __attribute__((__unused__));
+static void wbuf_network_int (struct wbuf *w, int32_t i) {
+ assert(w->ndone + 4 <= w->size);
+ *(uint32_t*)(&w->buf[w->ndone]) = toku_htonl(i);
+ toku_x1764_add(&w->checksum, &w->buf[w->ndone], 4);
+ w->ndone += 4;
+}
+
+static inline void wbuf_nocrc_int (struct wbuf *w, int32_t i) {
+    assert(w->ndone + 4 <= w->size);
+    *(uint32_t*)(&w->buf[w->ndone]) = toku_htod32(i);
+    w->ndone += 4;
+}
+
+static inline void wbuf_int (struct wbuf *w, int32_t i) {
+ wbuf_nocrc_int(w, i);
+ toku_x1764_add(&w->checksum, &w->buf[w->ndone-4], 4);
+}
+
+static inline void wbuf_nocrc_uint (struct wbuf *w, uint32_t i) {
+ wbuf_nocrc_int(w, (int32_t)i);
+}
+
+static inline void wbuf_uint (struct wbuf *w, uint32_t i) {
+ wbuf_int(w, (int32_t)i);
+}
+
+static inline uint8_t* wbuf_nocrc_reserve_literal_bytes(struct wbuf *w, uint32_t nbytes) {
+ assert(w->ndone + nbytes <= w->size);
+ uint8_t * dest = w->buf + w->ndone;
+ w->ndone += nbytes;
+ return dest;
+}
+
+static inline void wbuf_nocrc_literal_bytes(struct wbuf *w, const void *bytes_bv, uint32_t nbytes) {
+    const unsigned char *bytes = (const unsigned char *) bytes_bv;
+    assert(w->ndone + nbytes <= w->size);
+    memcpy(w->buf + w->ndone, bytes, (size_t)nbytes);
+    w->ndone += nbytes;
+}
+
+static inline void wbuf_literal_bytes(struct wbuf *w, const void *bytes_bv, uint32_t nbytes) {
+ wbuf_nocrc_literal_bytes(w, bytes_bv, nbytes);
+ toku_x1764_add(&w->checksum, &w->buf[w->ndone-nbytes], nbytes);
+}
+
+static void wbuf_nocrc_bytes (struct wbuf *w, const void *bytes_bv, uint32_t nbytes) {
+ wbuf_nocrc_uint(w, nbytes);
+ wbuf_nocrc_literal_bytes(w, bytes_bv, nbytes);
+}
+
+static void wbuf_bytes (struct wbuf *w, const void *bytes_bv, uint32_t nbytes) {
+ wbuf_uint(w, nbytes);
+ wbuf_literal_bytes(w, bytes_bv, nbytes);
+}
+
+static void wbuf_nocrc_ulonglong (struct wbuf *w, uint64_t ull) {
+ wbuf_nocrc_uint(w, (uint32_t)(ull>>32));
+ wbuf_nocrc_uint(w, (uint32_t)(ull&0xFFFFFFFF));
+}
+
+static void wbuf_ulonglong (struct wbuf *w, uint64_t ull) {
+ wbuf_uint(w, (uint32_t)(ull>>32));
+ wbuf_uint(w, (uint32_t)(ull&0xFFFFFFFF));
+}
+
+static inline void wbuf_nocrc_uint64_t(struct wbuf *w, uint64_t ull) {
+ wbuf_nocrc_ulonglong(w, ull);
+}
+
+
+static inline void wbuf_uint64_t(struct wbuf *w, uint64_t ull) {
+ wbuf_ulonglong(w, ull);
+}
+
+static inline void wbuf_nocrc_bool (struct wbuf *w, bool b) {
+ wbuf_nocrc_uint8_t(w, (uint8_t)(b ? 1 : 0));
+}
+
+static inline void wbuf_nocrc_BYTESTRING (struct wbuf *w, BYTESTRING v) {
+ wbuf_nocrc_bytes(w, v.data, v.len);
+}
+
+static inline void wbuf_BYTESTRING (struct wbuf *w, BYTESTRING v) {
+ wbuf_bytes(w, v.data, v.len);
+}
+
+static inline void wbuf_uint8_t (struct wbuf *w, uint8_t v) {
+ wbuf_char(w, v);
+}
+
+static inline void wbuf_nocrc_uint32_t (struct wbuf *w, uint32_t v) {
+ wbuf_nocrc_uint(w, v);
+}
+
+static inline void wbuf_uint32_t (struct wbuf *w, uint32_t v) {
+ wbuf_uint(w, v);
+}
diff --git a/storage/tokudb/PerconaFT/ft/serialize/workset.h b/storage/tokudb/PerconaFT/ft/serialize/workset.h
new file mode 100644
index 00000000000..073741fccb1
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/workset.h
@@ -0,0 +1,135 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <toku_list.h>
+#include <toku_pthread.h>
+
+// The work struct is the base class for work to be done by some threads
+struct work {
+ struct toku_list next;
+};
+
+// The workset struct contains the set of work to be done by some threads
+struct workset {
+ toku_mutex_t lock;
+ struct toku_list worklist; // a list of work
+ int refs; // number of workers that have a reference on the workset
+    toku_cond_t worker_wait; // a condition variable used to wait for all of the workers to release their reference on the workset
+};
+
+static inline void
+workset_init(struct workset *ws) {
+ toku_mutex_init(&ws->lock, NULL);
+ toku_list_init(&ws->worklist);
+ ws->refs = 1; // the calling thread gets a reference
+ toku_cond_init(&ws->worker_wait, NULL);
+}
+
+static inline void
+workset_destroy(struct workset *ws) {
+ invariant(toku_list_empty(&ws->worklist));
+ toku_cond_destroy(&ws->worker_wait);
+ toku_mutex_destroy(&ws->lock);
+}
+
+static inline void
+workset_lock(struct workset *ws) {
+ toku_mutex_lock(&ws->lock);
+}
+
+static inline void
+workset_unlock(struct workset *ws) {
+ toku_mutex_unlock(&ws->lock);
+}
+
+// Put work in the workset. Assume the workset is already locked.
+static inline void
+workset_put_locked(struct workset *ws, struct work *w) {
+ toku_list_push(&ws->worklist, &w->next);
+}
+
+// Put work in the workset
+static inline void
+workset_put(struct workset *ws, struct work *w) {
+ workset_lock(ws);
+ workset_put_locked(ws, w);
+ workset_unlock(ws);
+}
+
+// Get work from the workset
+static inline struct work *
+workset_get(struct workset *ws) {
+ workset_lock(ws);
+ struct work *w = NULL;
+ if (!toku_list_empty(&ws->worklist)) {
+ struct toku_list *l = toku_list_pop_head(&ws->worklist);
+ w = toku_list_struct(l, struct work, next);
+ }
+ workset_unlock(ws);
+ return w;
+}
+
+// Add references to the workset
+static inline void
+workset_add_ref(struct workset *ws, int refs) {
+ workset_lock(ws);
+ ws->refs += refs;
+ workset_unlock(ws);
+}
+
+// Release a reference on the workset
+static inline void
+workset_release_ref(struct workset *ws) {
+ workset_lock(ws);
+ if (--ws->refs == 0) {
+ toku_cond_broadcast(&ws->worker_wait);
+ }
+ workset_unlock(ws);
+}
+
+// Wait until all of the worker threads have released their reference on the workset
+static inline void
+workset_join(struct workset *ws) {
+ workset_lock(ws);
+ while (ws->refs != 0) {
+ toku_cond_wait(&ws->worker_wait, &ws->lock);
+ }
+ workset_unlock(ws);
+}
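+
+// Usage sketch (illustrative), mirroring the compress/decompress callers in
+// sub_block.cc: the creating thread holds the initial reference, hands one to
+// each helper thread, does work itself, then joins. `worker' is a hypothetical
+// function shaped like compress_worker()/decompress_worker().
+//
+//   struct workset ws;
+//   workset_init(&ws);                        // refs = 1 (this thread)
+//   for (int i = 0; i < n; i++)
+//       workset_put(&ws, &work[i].base);
+//   int T = num_helpers;
+//   toku_thread_pool_run(pool, 0, &T, worker, &ws);
+//   workset_add_ref(&ws, T);                  // one ref per helper started
+//   worker(&ws);                              // participate; releases our ref
+//   workset_join(&ws);                        // wait for refs to drain to zero
+//   workset_destroy(&ws);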