diff options
author | Yuhong Zhang <danielzhangyh@gmail.com> | 2020-07-22 05:27:29 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-08-07 01:14:02 +0000 |
commit | bf95b1fabd4ee9b2154763f86eaa8f6c7ba3370c (patch) | |
tree | ad9ebaafbcf998a64323b0198dc732497e8b1dd8 /src/mongo/db | |
parent | 0607a6c291bf4cf4580a4444d826ed3c3ac3df47 (diff) | |
download | mongo-bf95b1fabd4ee9b2154763f86eaa8f6c7ba3370c.tar.gz |
SERVER-36709 Make Biggie store nodes adaptive to improve memory efficiency
server-36709
server-36709
server-36709
server-36709
server-36709
server-36709
server-36709
server-36709
server-36709
server-36709debug
server-36709
server-36709
server-36709
server-36709
server-36709
only 256
server-36709 48&256
server-36709 16-256 without shrink
server-36709 16-256
server-36709 4-256
server-36709 0-256
server-36709 cleanup
server-36709
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- | src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_radix_store.h | 1140 | ||||
-rw-r--r-- | src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_radix_store_test.cpp | 325 |
2 files changed, 1241 insertions, 224 deletions
diff --git a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_radix_store.h b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_radix_store.h index 9410577e442..2a20c94ee17 100644 --- a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_radix_store.h +++ b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_radix_store.h @@ -65,6 +65,7 @@ struct Metrics { totalMemory.fetchAndSubtract(size); } }; +enum class NodeType : uint8_t { LEAF, NODE4, NODE16, NODE48, NODE256 }; /** * RadixStore is a Trie data structure with the ability to share nodes among copies of trees to @@ -88,6 +89,10 @@ public: using size_type = std::size_t; using difference_type = std::ptrdiff_t; using uint8_t = std::uint8_t; + using node_ptr = boost::intrusive_ptr<Node>; + using head_ptr = boost::intrusive_ptr<Head>; + + static constexpr uint8_t maxByte = 255; template <class pointer_type, class reference_type> class radix_iterator { @@ -164,10 +169,9 @@ public: } private: - radix_iterator(const boost::intrusive_ptr<Head>& root) : _root(root), _current(nullptr) {} + radix_iterator(const head_ptr& root) : _root(root), _current(nullptr) {} - radix_iterator(const boost::intrusive_ptr<Head>& root, Node* current) - : _root(root), _current(current) {} + radix_iterator(const head_ptr& root, Node* current) : _root(root), _current(current) {} /** * This function traverses the tree to find the next left-most node with data. Modifies @@ -208,25 +212,20 @@ public: // Check the children right of the node that the iterator was at already. This way, // there will be no backtracking in the traversal. - for (auto iter = oldKey + 1 + node->_children.begin(); - iter != node->_children.end(); - ++iter) { - - // If the node has a child, then the sub-tree must have a node with data that - // has not yet been visited. - if (*iter != nullptr) { - - // If the current node has data, return it and exit. If not, continue - // following the nodes to find the next one with data. It is necessary to go - // to the left-most node in this sub-tree. - if ((*iter)->_data) { - _current = iter->get(); - return; - } - _current = iter->get(); - _traverseLeftSubtree(); - return; + bool res = _forEachChild(node, oldKey + 1, false, [this](node_ptr child) { + // If the current node has data, return it and exit. If not, find the + // left-most node with data in this sub-tree. + if (child->_data) { + _current = child.get(); + return false; } + + _current = child.get(); + _traverseLeftSubtree(); + return false; + }); + if (!res) { + return; } } return; @@ -237,12 +236,10 @@ public: // '_current' is root. However, it cannot return the root, and hence at least 1 // iteration of the while loop is required. do { - for (auto child : _current->_children) { - if (child != nullptr) { - _current = child.get(); - break; - } - } + _forEachChild(_current, 0, false, [this](node_ptr child) { + _current = child.get(); + return false; + }); } while (!_current->_data); } @@ -259,7 +256,7 @@ public: } // "_root" is a pointer to the root of the tree over which this is iterating. - boost::intrusive_ptr<Head> _root; + head_ptr _root; // "_current" is the node that the iterator is currently on. _current->_data will never be // boost::none (unless it is within the process of tree traversal), and _current will be @@ -384,10 +381,9 @@ public: } private: - reverse_radix_iterator(const boost::intrusive_ptr<Head>& root) - : _root(root), _current(nullptr) {} + reverse_radix_iterator(const head_ptr& root) : _root(root), _current(nullptr) {} - reverse_radix_iterator(const boost::intrusive_ptr<Head>& root, Node* current) + reverse_radix_iterator(const head_ptr& root, Node* current) : _root(root), _current(current) {} void _findNextReverse() { @@ -416,14 +412,15 @@ public: // After moving up in the tree, continue searching for neighboring nodes to see if // they have data, moving from right to left. - for (int i = oldKey - 1; i >= 0; i--) { - if (node->_children[i] != nullptr) { - // If there is a sub-tree found, it must have data, therefore it's necessary - // to traverse to the right most node. - _current = node->_children[i].get(); - _traverseRightSubtree(); - return; - } + bool res = _forEachChild(node, oldKey - 1, true, [this](node_ptr child) { + // If there is a sub-tree found, it must have data, therefore it's necessary + // to traverse to the right most node. + _current = child.get(); + _traverseRightSubtree(); + return false; + }); + if (!res) { + return; } // If there were no sub-trees that contained data, and the 'current' node has data, @@ -439,13 +436,10 @@ public: // This function traverses the given tree to the right most leaf of the subtree where // 'current' is the root. do { - for (auto iter = _current->_children.rbegin(); iter != _current->_children.rend(); - ++iter) { - if (*iter != nullptr) { - _current = iter->get(); - break; - } - } + _forEachChild(_current, maxByte, true, [this](node_ptr child) { + _current = child.get(); + return false; + }); } while (!_current->isLeaf()); } @@ -462,7 +456,7 @@ public: } // "_root" is a pointer to the root of the tree over which this is iterating. - boost::intrusive_ptr<Head> _root; + head_ptr _root; // "_current" is a the node that the iterator is currently on. _current->_data will never be // boost::none, and _current will be become a nullptr once there are no more nodes left to @@ -542,9 +536,9 @@ public: return _metrics.totalNodes.load(); } - static uint16_t averageChildren() { + static float averageChildren() { auto totalNodes = _metrics.totalNodes.load(); - return totalNodes ? _metrics.totalChildren.load() / totalNodes : 0; + return totalNodes ? _metrics.totalChildren.load() / static_cast<float>(totalNodes) : 0; } // Modifiers @@ -593,8 +587,8 @@ public: const uint8_t* charKey = reinterpret_cast<const uint8_t*>(key.data()); size_t depth = prev->_depth + prev->_trieKey.size(); while (depth < key.size()) { - uint8_t c = charKey[depth]; - node = prev->_children[c].get(); + auto nodePtr = _findChild(prev, charKey[depth]); + node = nodePtr.get(); if (node == nullptr) { return false; } @@ -605,7 +599,7 @@ public: return false; } - isUniquelyOwned = isUniquelyOwned && prev->_children[c]->refCount() == 1; + isUniquelyOwned = isUniquelyOwned && nodePtr->refCount() - 1 == 1; context.push_back(std::make_pair(node, isUniquelyOwned)); depth = node->_depth + node->_trieKey.size(); prev = node; @@ -643,27 +637,35 @@ public: size_t sizeOfRemovedData = node->_data->second.size(); _root->_dataSize -= sizeOfRemovedData; _root->_count--; - + Node* prevParent = nullptr; for (size_t depth = 1; depth < context.size(); depth++) { Node* child = context.at(depth).first; isUniquelyOwned = context.at(depth).second; uint8_t childFirstChar = child->_trieKey.front(); if (!isUniquelyOwned) { - parent->_children[childFirstChar] = make_intrusive_node<Node>(*child); - child = parent->_children[childFirstChar].get(); + auto childCopy = _copyNode(child); + _setChildPtr(parent, childFirstChar, childCopy); + child = childCopy.get(); } + prevParent = parent; parent = child; } // Handle the deleted node, as it is a leaf. - parent->_children[deleted->_trieKey.front()] = nullptr; - --parent->_numChildren; + _setChildPtr(parent, deleted->_trieKey.front(), nullptr); // 'parent' may only have one child, in which case we need to evaluate whether or not // this node is redundant. - _compressOnlyChild(parent); + if (auto newParent = _compressOnlyChild(prevParent, parent)) { + parent = newParent; + } + + // Don't shrink the root node. + if (prevParent && parent->needShrink()) { + parent = _shrink(prevParent, parent).get(); + } return true; } @@ -729,10 +731,11 @@ public: if (idx != UINT8_MAX) context.push_back(std::make_pair(node, idx + 1)); - if (!node->_children[idx]) + auto nodePtr = _findChild(node, idx); + if (!nodePtr) break; - node = node->_children[idx].get(); + node = nodePtr.get(); size_t mismatchIdx = _comparePrefix(node->_trieKey, charKey + depth, key.size() - depth); @@ -775,18 +778,17 @@ public: std::tie(node, idx) = context.back(); context.pop_back(); - for (auto iter = idx + node->_children.begin(); iter != node->_children.end(); ++iter) { - if (!(*iter)) - continue; - + bool exhausted = _forEachChild(node, idx, false, [&node, &context](node_ptr child) { + node = child.get(); + if (!node->_data) { + // Need to search this node's children for the next largest node. + context.push_back(std::make_pair(node, 0)); + } + return false; + }); + if (!exhausted && node->_data) { // There exists a node with a key larger than the one given. - node = iter->get(); - if (node->_data) - return const_iterator(_root, node); - - // Need to search this node's children for the next largest node. - context.push_back(std::make_pair(node, 0)); - break; + return const_iterator(_root, node); } if (node->_trieKey.empty() && context.empty()) { @@ -824,6 +826,9 @@ public: } private: + /* + * The base class of all other node types. + */ class Node { friend class RadixStore; friend class RadixStoreTest; @@ -833,32 +838,32 @@ private: addNodeMemory(); } - Node(std::vector<uint8_t> key) { + Node(NodeType type, std::vector<uint8_t> key) { + _nodeType = type; _trieKey = key; addNodeMemory(); } Node(const Node& other) { - _trieKey = other._trieKey; + _nodeType = other._nodeType; + _numChildren = other._numChildren; _depth = other._depth; + _trieKey = other._trieKey; if (other._data) { - addData(other._data); + addData(other._data.value()); } - _children = other._children; - _numChildren = other._numChildren; // _refCount is initialized to 0 and not copied from other. addNodeMemory(); } - Node(Node&& other) { - _depth = other._depth; - _trieKey = std::move(other._trieKey); - _data = std::move(other._data); - _children = std::move(other._children); - _numChildren = other._numChildren; + Node(Node&& other) + : _nodeType(other._nodeType), + _numChildren(other._numChildren), + _depth(other._depth), + _trieKey(std::move(other._trieKey)), + _data(std::move(other._data)) { // _refCount is initialized to 0 and not copied from other. - // The move constructor transfers the dynamic memory so only the static memory of Node // should be added. addNodeMemory(-_trieKey.capacity() * sizeof(uint8_t)); @@ -890,38 +895,354 @@ private: } } + protected: + NodeType _nodeType; + uint16_t _numChildren = 0; + unsigned int _depth = 0; + std::vector<uint8_t> _trieKey; + boost::optional<value_type> _data; + AtomicWord<uint32_t> _refCount{0}; + + private: + bool needGrow() const { + switch (_nodeType) { + case NodeType::LEAF: + return true; + case NodeType::NODE4: + return numChildren() == 4; + case NodeType::NODE16: + return numChildren() == 16; + case NodeType::NODE48: + return numChildren() == 48; + case NodeType::NODE256: + return false; + } + MONGO_UNREACHABLE; + } + + bool needShrink() const { + switch (_nodeType) { + case NodeType::LEAF: + return false; + case NodeType::NODE4: + return numChildren() < 1; + case NodeType::NODE16: + return numChildren() < 5; + case NodeType::NODE48: + return numChildren() < 17; + case NodeType::NODE256: + return numChildren() < 49; + } + MONGO_UNREACHABLE; + } + + /* + * Only adds non-empty value and handle the increment on memory metrics. + */ + void addData(value_type value) { + _data.emplace(value.first, value.second); + _metrics.addMemory(value.first.capacity() + value.second.capacity()); + } + void addNodeMemory(difference_type offset = 0) { _metrics.totalMemory.fetchAndAdd(sizeof(Node) + _trieKey.capacity() * sizeof(uint8_t) + offset); _metrics.totalNodes.fetchAndAdd(1); - _metrics.totalChildren.fetchAndAdd(_children.size()); } void subtractNodeMemory() { - // Todo SERVER-36709: Vector capacity changes are ignored for now. This might be - // handled when nodes are made adaptive. size_t memUsage = sizeof(Node) + _trieKey.capacity() * sizeof(uint8_t); if (_data) { memUsage += _data->first.capacity() + _data->second.capacity(); } _metrics.totalMemory.fetchAndSubtract(memUsage); _metrics.totalNodes.fetchAndSubtract(1); + } + }; + + class Node4; + class Node16; + class Node48; + class Node256; + + class NodeLeaf : public Node { + friend class RadixStore; + + public: + NodeLeaf(std::vector<uint8_t> key) : Node(NodeType::LEAF, key) {} + NodeLeaf(const NodeLeaf& other) : Node(other) {} + + NodeLeaf(Node4&& other) : Node(std::move(other)) { + this->_nodeType = NodeType::LEAF; + } + }; + + /* + * Node4 is used when the number of children is in the range of [1, 4]. + */ + class Node4 : public Node { + friend class RadixStore; + + public: + Node4(std::vector<uint8_t> key) : Node(NodeType::NODE4, key) { + std::fill(_childKey.begin(), _childKey.end(), 0); + addNodeMemory(); + } + + Node4(const Node4& other) + : Node(other), _childKey(other._childKey), _children(other._children) { + addNodeMemory(); + } + + /* + * This move constructor should only be used when growing NodeLeaf to Node4. + */ + Node4(NodeLeaf&& other) : Node(std::move(other)) { + this->_nodeType = NodeType::NODE4; + std::fill(_childKey.begin(), _childKey.end(), 0); + addNodeMemory(); + } + + /* + * This move constructor should only be used when shrinking Node16 to Node4. + */ + Node4(Node16&& other) : Node(std::move(other)) { + invariant(other.needShrink()); + this->_nodeType = NodeType::NODE4; + std::fill(_childKey.begin(), _childKey.end(), 0); + for (size_t i = 0; i < other.numChildren(); ++i) { + _childKey[i] = other._childKey[i]; + _children[i] = std::move(other._children[i]); + } + addNodeMemory(); + } + + ~Node4() { + _metrics.subtractMemory(sizeof(Node4) - sizeof(Node)); _metrics.totalChildren.fetchAndSubtract(_children.size()); } - void addData(boost::optional<value_type> value) { - _data.emplace(value->first, value->second); - _metrics.addMemory(value->first.capacity() + value->second.capacity()); + private: + void addNodeMemory() { + _metrics.totalMemory.fetchAndAdd(sizeof(Node4) - sizeof(Node)); + _metrics.totalChildren.fetchAndAdd(_children.size()); } - protected: - unsigned int _depth = 0; - std::vector<uint8_t> _trieKey; - boost::optional<value_type> _data; - std::array<boost::intrusive_ptr<Node>, 256> _children; - // TODO SERVER-36709: Updating to uint8_t after using adaptive nodes. - uint16_t _numChildren = 0; - AtomicWord<uint32_t> _refCount{0}; + // The first bytes of each child's key is stored in a sorted order. + std::array<uint8_t, 4> _childKey; + std::array<node_ptr, 4> _children; + }; + + /* + * Node16 is used when the number of children is in the range of [5, 16]. + */ + class Node16 : public Node { + friend class RadixStore; + + public: + Node16(std::vector<uint8_t> key) : Node(NodeType::NODE16, key) { + std::fill(_childKey.begin(), _childKey.end(), 0); + addNodeMemory(); + } + + Node16(const Node16& other) + : Node(other), _childKey(other._childKey), _children(other._children) { + addNodeMemory(); + } + + /* + * This move constructor should only be used when growing Node4 to Node16. + */ + Node16(Node4&& other) : Node(std::move(other)) { + invariant(other.needGrow()); + this->_nodeType = NodeType::NODE16; + auto n = other._childKey.size(); + for (size_t i = 0; i < n; ++i) { + _childKey[i] = other._childKey[i]; + _children[i] = std::move(other._children[i]); + } + std::fill(_childKey.begin() + n, _childKey.end(), 0); + addNodeMemory(); + } + + /* + * This move constructor should only be used when shrinking Node48 to Node16. + */ + Node16(Node48&& other) : Node(std::move(other)) { + invariant(other.needShrink()); + this->_nodeType = NodeType::NODE16; + std::fill(_childKey.begin(), _childKey.end(), 0); + size_t cur = 0; + for (size_t i = 0; i < other._childIndex.size(); ++i) { + auto index = other._childIndex[i]; + if (index != maxByte) { + _childKey[cur] = i; + _children[cur++] = std::move(other._children[index]); + } + } + addNodeMemory(); + } + + ~Node16() { + _metrics.subtractMemory(sizeof(Node16) - sizeof(Node)); + _metrics.totalChildren.fetchAndSubtract(_children.size()); + } + + private: + void addNodeMemory() { + _metrics.totalMemory.fetchAndAdd(sizeof(Node16) - sizeof(Node)); + _metrics.totalChildren.fetchAndAdd(_children.size()); + } + + // _childKey is sorted ascendingly. + std::array<uint8_t, 16> _childKey; + std::array<node_ptr, 16> _children; + }; + + /* + * Node48 is used when the number of children is in the range of [17, 48]. + */ + class Node48 : public Node { + friend class RadixStore; + + public: + Node48(std::vector<uint8_t> key) : Node(NodeType::NODE48, key) { + std::fill(_childIndex.begin(), _childIndex.end(), maxByte); + addNodeMemory(); + } + + Node48(const Node48& other) + : Node(other), _childIndex(other._childIndex), _children(other._children) { + addNodeMemory(); + } + + /* + * This move constructor should only be used when growing Node16 to Node48. + */ + Node48(Node16&& other) : Node(std::move(other)) { + invariant(other.needGrow()); + this->_nodeType = NodeType::NODE48; + std::fill(_childIndex.begin(), _childIndex.end(), maxByte); + for (uint8_t i = 0; i < other._children.size(); ++i) { + _childIndex[other._childKey[i]] = i; + _children[i] = std::move(other._children[i]); + } + addNodeMemory(); + } + + /* + * This move constructor should only be used when shrinking Node256 to Node48. + */ + Node48(Node256&& other) : Node(std::move(other)) { + invariant(other.needShrink()); + this->_nodeType = NodeType::NODE48; + std::fill(_childIndex.begin(), _childIndex.end(), maxByte); + size_t cur = 0; + for (size_t i = 0; i < other._children.size(); ++i) { + auto child = other._children[i]; + if (child) { + _childIndex[i] = cur; + _children[cur++] = child; + } + } + addNodeMemory(); + } + + ~Node48() { + _metrics.subtractMemory(sizeof(Node48) - sizeof(Node)); + _metrics.totalChildren.fetchAndSubtract(_children.size()); + } + + private: + void addNodeMemory() { + _metrics.totalMemory.fetchAndAdd(sizeof(Node48) - sizeof(Node)); + _metrics.totalChildren.fetchAndAdd(_children.size()); + } + + // A lookup table for child pointers. It has values from 0 to 48, where 0 + // represents empty, and all other index i maps to index i - 1 in _children. + std::array<uint8_t, 256> _childIndex; + std::array<node_ptr, 48> _children; + }; + + /* + * Node256 is used when the number of children is in the range of [49, 256]. + */ + class Node256 : public Node { + friend class RadixStore; + + public: + Node256() { + this->_nodeType = NodeType::NODE256; + addNodeMemory(); + } + + Node256(std::vector<uint8_t> key) : Node(NodeType::NODE256, key) { + addNodeMemory(); + } + + Node256(const NodeLeaf& other) : Node(other) { + addNodeMemory(); + } + + Node256(const Node4& other) : Node(other) { + this->_nodeType = NodeType::NODE256; + for (size_t i = 0; i < other.numChildren(); ++i) { + this->_children[other._childKey[i]] = other._children[i]; + } + addNodeMemory(); + } + + Node256(const Node16& other) : Node(other) { + this->_nodeType = NodeType::NODE256; + for (size_t i = 0; i < other.numChildren(); ++i) { + this->_children[other._childKey[i]] = other._children[i]; + } + addNodeMemory(); + } + + Node256(const Node48& other) : Node(other) { + this->_nodeType = NodeType::NODE256; + for (size_t i = 0; i < other._childIndex.size(); ++i) { + auto index = other._childIndex[i]; + if (index != maxByte) { + this->_children[i] = other._children[index]; + } + } + addNodeMemory(); + } + + Node256(const Node256& other) : Node(other), _children(other._children) { + addNodeMemory(); + } + + /* + * This move constructor should only be used when growing Node48 to Node256. + */ + Node256(Node48&& other) : Node(std::move(other)) { + invariant(other.needGrow()); + this->_nodeType = NodeType::NODE256; + for (size_t i = 0; i < other._childIndex.size(); ++i) { + auto index = other._childIndex[i]; + if (index != maxByte) { + _children[i] = std::move(other._children[index]); + } + } + addNodeMemory(); + } + + ~Node256() { + _metrics.subtractMemory(sizeof(Node256) - sizeof(Node)); + _metrics.totalChildren.fetchAndSubtract(_children.size()); + } + + private: + void addNodeMemory() { + _metrics.totalMemory.fetchAndAdd(sizeof(Node256) - sizeof(Node)); + _metrics.totalChildren.fetchAndAdd(_children.size()); + } + + std::array<node_ptr, 256> _children; }; template <typename Node, typename... Args> @@ -935,34 +1256,38 @@ private: * be able to see when the tree is modified and to respond to these changes by ensuring they are * not iterating over stale trees. */ - class Head : public Node { + class Head : public Node256 { friend class RadixStore; public: Head() { - _metrics.addMemory(sizeof(Head) - sizeof(Node)); + addNodeMemory(); } - Head(std::vector<uint8_t> key) : Node(key) { - _metrics.addMemory(sizeof(Head) - sizeof(Node)); + Head(std::vector<uint8_t> key) : Node256(key) { + addNodeMemory(); } - Head(const Node& other) : Node(other) { - _metrics.addMemory(sizeof(Head) - sizeof(Node)); + Head(const Head& other) : Node256(other), _count(other._count), _dataSize(other._dataSize) { + addNodeMemory(); } - Head(const Head& other) : Node(other), _count(other._count), _dataSize(other._dataSize) { - _metrics.addMemory(sizeof(Head) - sizeof(Node)); + // Copy constructor template for the five node types. + template <typename NodeT> + Head(const NodeT& other) : Node256(other) { + addNodeMemory(); } ~Head() { if (_nextVersion) _nextVersion->_hasPreviousVersion = false; - _metrics.subtractMemory(sizeof(Head) - sizeof(Node)); + _metrics.subtractMemory(sizeof(Head) - sizeof(Node256)); } Head(Head&& other) - : Node(std::move(other)), _count(other._count), _dataSize(other._dataSize) {} + : Node256(std::move(other)), _count(other._count), _dataSize(other._dataSize) { + addNodeMemory(); + } bool hasPreviousVersion() const { return _hasPreviousVersion; @@ -971,7 +1296,7 @@ private: protected: // Forms a singly linked list of versions that is needed to reposition cursors after // modifications have been made. - boost::intrusive_ptr<Head> _nextVersion; + head_ptr _nextVersion; // While we have cursors that haven't been repositioned to the latest tree, this will be // true to help us understand when to copy on modifications due to the extra shared pointer @@ -979,6 +1304,10 @@ private: bool _hasPreviousVersion = false; private: + void addNodeMemory() { + _metrics.totalMemory.fetchAndAdd(sizeof(Head) - sizeof(Node256)); + } + size_type _count = 0; size_type _dataSize = 0; }; @@ -1008,12 +1337,160 @@ private: } ret.push_back('\n'); - for (auto child : node->_children) { - if (child != nullptr) { - ret.append(_walkTree(child.get(), depth + 1)); + _forEachChild(node, 0, false, [this, &ret, depth](node_ptr child) { + ret.append(_walkTree(child.get(), depth + 1)); + return true; + }); + return ret; + } + + /* + * Helper function to iterate through _children array for different node types and execute the + * given function on each child. The given function returns false when it intends to break the + * iteration. + * Returns false when the given function returns false on an element. Returns true + * when the end of the array is reached. + */ + static bool _forEachChild(Node* node, + uint16_t startKey, + bool reverse, + const std::function<bool(node_ptr)>& func) { + if (startKey > maxByte || !node->_numChildren) { + return true; + } + // Sets the step depending on the direction of iteration. + int16_t step = reverse ? -1 : 1; + switch (node->_nodeType) { + case NodeType::LEAF: + return true; + case NodeType::NODE4: { + Node4* node4 = static_cast<Node4*>(node); + // Locates the actual starting position first. + auto first = node4->_childKey.begin(); + auto numChildren = node4->numChildren(); + auto pos = std::find_if(first, first + numChildren, [&startKey](uint8_t keyByte) { + return keyByte >= startKey; + }); + if (reverse) { + if (pos == first && *pos != startKey) { + return true; + } + if (pos == first + numChildren || *pos != startKey) { + --pos; + } + } + + // No qualified elements. + if (pos == first + numChildren) { + return true; + } + + auto end = reverse ? -1 : numChildren; + for (auto cur = pos - first; cur != end; cur += step) { + // All children within the range will not be null since they are stored + // consecutively. + if (!func(node4->_children[cur])) { + // Breaks the loop if the given function decides to. + return false; + } + } + return true; + } + case NodeType::NODE16: { + Node16* node16 = static_cast<Node16*>(node); + // Locates the actual starting position first. + auto first = node16->_childKey.begin(); + auto numChildren = node16->numChildren(); + auto pos = std::find_if(first, first + numChildren, [&startKey](uint8_t keyByte) { + return keyByte >= startKey; + }); + if (reverse) { + if (pos == first && *pos != startKey) { + return true; + } + if (pos == first + numChildren || *pos != startKey) { + --pos; + } + } + + // No qualified elements. + if (pos == first + numChildren) { + return true; + } + + int16_t end = reverse ? -1 : numChildren; + for (auto cur = pos - first; cur != end; cur += step) { + // All children within the range will not be null since they are stored + // consecutively. + if (!func(node16->_children[cur])) { + return false; + } + } + return true; + } + case NodeType::NODE48: { + Node48* node48 = static_cast<Node48*>(node); + size_t end = reverse ? -1 : node48->_childIndex.size(); + for (size_t cur = startKey; cur != end; cur += step) { + auto index = node48->_childIndex[cur]; + if (index != maxByte && !func(node48->_children[index])) { + return false; + } + } + return true; + } + case NodeType::NODE256: { + Node256* node256 = static_cast<Node256*>(node); + size_t end = reverse ? -1 : node256->_children.size(); + for (size_t cur = startKey; cur != end; cur += step) { + auto child = node256->_children[cur]; + if (child && !func(child)) { + return false; + } + } + return true; } } - return ret; + MONGO_UNREACHABLE; + } + + /* + * Gets the child mapped by the key for different node types. Node4 just does a simple linear + * search. Node16 uses binary search. Node48 does one extra lookup with the index table. Node256 + * has direct mapping. + */ + static node_ptr _findChild(const Node* node, uint8_t key) { + switch (node->_nodeType) { + case NodeType::LEAF: + return node_ptr(nullptr); + case NodeType::NODE4: { + const Node4* node4 = static_cast<const Node4*>(node); + auto start = node4->_childKey.begin(); + auto end = start + node4->numChildren(); + auto pos = std::find(start, end, key); + return pos != end ? node4->_children[pos - start] : node_ptr(nullptr); + } + case NodeType::NODE16: { + const Node16* node16 = static_cast<const Node16*>(node); + auto start = node16->_childKey.begin(); + auto end = start + node16->numChildren(); + auto pos = std::find(start, end, key); + return pos != end ? node16->_children[pos - start] : node_ptr(nullptr); + } + case NodeType::NODE48: { + const Node48* node48 = static_cast<const Node48*>(node); + auto index = node48->_childIndex[key]; + if (index != maxByte) { + return node48->_children[index]; + } + return node_ptr(nullptr); + } + case NodeType::NODE256: { + const Node256* node256 = static_cast<const Node256*>(node); + return node256->_children[key]; + } + } + MONGO_UNREACHABLE; } Node* _findNode(const Key& key) const { @@ -1038,7 +1515,7 @@ private: depth = _root->_depth + _root->_trieKey.size(); uint8_t childFirstChar = charKey[depth]; - auto node = _root->_children[childFirstChar]; + auto node = _findChild(_root.get(), childFirstChar); while (node != nullptr) { @@ -1055,7 +1532,7 @@ private: depth = node->_depth + node->_trieKey.size(); childFirstChar = charKey[depth]; - node = node->_children[childFirstChar]; + node = _findChild(node.get(), childFirstChar); } return nullptr; @@ -1086,6 +1563,78 @@ private: _root->_hasPreviousVersion = true; } + /* + * Moves a smaller node's contents to a larger one. The method should only be called when the + * node is full. + */ + node_ptr _grow(Node* parent, Node* node) { + invariant(node->_nodeType != NodeType::NODE256); + auto key = node->_trieKey.front(); + switch (node->_nodeType) { + case NodeType::NODE256: + return nullptr; + case NodeType::LEAF: { + NodeLeaf* leaf = static_cast<NodeLeaf*>(node); + auto newNode = make_intrusive_node<Node4>(std::move(*leaf)); + _setChildPtr(parent, key, newNode); + return newNode; + } + case NodeType::NODE4: { + Node4* node4 = static_cast<Node4*>(node); + auto newNode = make_intrusive_node<Node16>(std::move(*node4)); + _setChildPtr(parent, key, newNode); + return newNode; + } + case NodeType::NODE16: { + Node16* node16 = static_cast<Node16*>(node); + auto newNode = make_intrusive_node<Node48>(std::move(*node16)); + _setChildPtr(parent, key, newNode); + return newNode; + } + case NodeType::NODE48: { + Node48* node48 = static_cast<Node48*>(node); + auto newNode = make_intrusive_node<Node256>(std::move(*node48)); + _setChildPtr(parent, key, newNode); + return newNode; + } + } + MONGO_UNREACHABLE; + } + + node_ptr _shrink(Node* parent, Node* node) { + invariant(node->_nodeType != NodeType::LEAF); + auto key = node->_trieKey.front(); + switch (node->_nodeType) { + case NodeType::LEAF: + return nullptr; + case NodeType::NODE4: { + Node4* node4 = static_cast<Node4*>(node); + auto newNode = make_intrusive_node<NodeLeaf>(std::move(*node4)); + _setChildPtr(parent, key, newNode); + return newNode; + } + case NodeType::NODE16: { + Node16* node16 = static_cast<Node16*>(node); + auto newNode = make_intrusive_node<Node4>(std::move(*node16)); + _setChildPtr(parent, key, newNode); + return newNode; + } + case NodeType::NODE48: { + Node48* node48 = static_cast<Node48*>(node); + auto newNode = make_intrusive_node<Node16>(std::move(*node48)); + _setChildPtr(parent, key, newNode); + return newNode; + } + case NodeType::NODE256: { + Node256* node256 = static_cast<Node256*>(node); + auto newNode = make_intrusive_node<Node48>(std::move(*node256)); + _setChildPtr(parent, key, newNode); + return newNode; + } + } + MONGO_UNREACHABLE; + } + /** * _upsertWithCopyOnSharedNodes is a helper function to help manage copy on modification for the * tree. This function follows the path for the to-be modified node using the keystring. If at @@ -1106,13 +1655,14 @@ private: _makeRootUnique(); + Node* prevParent = nullptr; Node* prev = _root.get(); - boost::intrusive_ptr<Node> node = prev->_children[childFirstChar]; + node_ptr node = _findChild(prev, childFirstChar); while (node != nullptr) { if (node->refCount() - 1 > 1) { // Copy node on a modifying operation when it isn't owned uniquely. - node = make_intrusive_node<Node>(*node); - prev->_children[childFirstChar] = node; + node = _copyNode(node.get()); + _setChildPtr(prev, childFirstChar, node); } // 'node' is uniquely owned at this point, so we are free to modify it. @@ -1126,8 +1676,7 @@ private: // Make a new node with whatever prefix is shared between node->_trieKey // and the new key. This will replace the current node in the tree. std::vector<uint8_t> newKey = _makeKey(node->_trieKey, 0, mismatchIdx); - Node* newNode = _addChild(prev, newKey, boost::none); - + Node* newNode = _addChild(prev, newKey, boost::none, NodeType::NODE4); depth += mismatchIdx; const_iterator it(_root, newNode); if (key.size() - depth != 0) { @@ -1138,15 +1687,14 @@ private: } else { // The new key is a prefix of an existing key, and has its own node, so we don't // need to add any new nodes. - newNode->addData(value); + newNode->addData(value.value()); } _root->_count++; _root->_dataSize += value->second.size(); // Change the current node's trieKey and make a child of the new node. newKey = _makeKey(node->_trieKey, mismatchIdx, node->_trieKey.size() - mismatchIdx); - newNode->_children[newKey.front()] = node; - ++newNode->_numChildren; + _setChildPtr(newNode, newKey.front(), node); // Handle key size change to the new key. _metrics.subtractMemory(sizeof(uint8_t) * @@ -1167,11 +1715,14 @@ private: // Update an internal node. if (!value) { data = boost::none; - _compressOnlyChild(node.get()); + auto keyByte = node->_trieKey.front(); + if (_compressOnlyChild(prev, node.get())) { + node = _findChild(prev, keyByte); + } } else { _root->_count++; _root->_dataSize += value->second.size(); - node->addData(value); + node->addData(value.value()); } const_iterator it(_root, node.get()); @@ -1181,13 +1732,17 @@ private: depth = node->_depth + node->_trieKey.size(); childFirstChar = charKey[depth]; + prevParent = prev; prev = node.get(); - node = node->_children[childFirstChar]; + node = _findChild(node.get(), childFirstChar); } // Add a completely new child to a node. The new key at this depth does not // share a prefix with any existing keys. std::vector<uint8_t> newKey = _makeKey(charKey + depth, key.size() - depth); + if (prev->needGrow()) { + prev = _grow(prevParent, prev).get(); + } Node* newNode = _addChild(prev, newKey, value); _root->_count++; _root->_dataSize += value->second.size(); @@ -1220,23 +1775,193 @@ private: return key; } - /** - * Add a child with trieKey 'key' and value 'value' to 'node'. + /* + * Wraps around the copy constructor for different node types. */ - Node* _addChild(Node* node, std::vector<uint8_t> key, boost::optional<value_type> value) { + node_ptr _copyNode(Node* node) { + switch (node->_nodeType) { + case NodeType::LEAF: { + NodeLeaf* leaf = static_cast<NodeLeaf*>(node); + return make_intrusive_node<NodeLeaf>(*leaf); + } + case NodeType::NODE4: { + Node4* node4 = static_cast<Node4*>(node); + return make_intrusive_node<Node4>(*node4); + } + case NodeType::NODE16: { + Node16* node16 = static_cast<Node16*>(node); + return make_intrusive_node<Node16>(*node16); + } + case NodeType::NODE48: { + Node48* node48 = static_cast<Node48*>(node); + return make_intrusive_node<Node48>(*node48); + } + case NodeType::NODE256: { + Node256* node256 = static_cast<Node256*>(node); + return make_intrusive_node<Node256>(*node256); + } + } + MONGO_UNREACHABLE; + } + + head_ptr _makeHead(Node* node) { + switch (node->_nodeType) { + case NodeType::LEAF: { + NodeLeaf* leaf = static_cast<NodeLeaf*>(node); + return make_intrusive_node<Head>(*leaf); + } + case NodeType::NODE4: { + Node4* node4 = static_cast<Node4*>(node); + return make_intrusive_node<Head>(*node4); + } + case NodeType::NODE16: { + Node16* node16 = static_cast<Node16*>(node); + return make_intrusive_node<Head>(*node16); + } + case NodeType::NODE48: { + Node48* node48 = static_cast<Node48*>(node); + return make_intrusive_node<Head>(*node48); + } + case NodeType::NODE256: { + Node256* node256 = static_cast<Node256*>(node); + return make_intrusive_node<Head>(*node256); + } + } + MONGO_UNREACHABLE; + } - boost::intrusive_ptr<Node> newNode = make_intrusive_node<Node>(key); + /** + * Add a child with trieKey 'key' and value 'value' to 'node'. The new child node created should + * only be the type NodeLeaf or Node4. + */ + Node* _addChild(Node* node, + std::vector<uint8_t> key, + boost::optional<value_type> value, + NodeType childType = NodeType::LEAF) { + invariant(childType == NodeType::LEAF || childType == NodeType::NODE4); + node_ptr newNode; + if (childType == NodeType::LEAF) { + newNode = make_intrusive_node<NodeLeaf>(key); + } else if (childType == NodeType::NODE4) { + newNode = make_intrusive_node<Node4>(key); + } newNode->_depth = node->_depth + node->_trieKey.size(); if (value) { - newNode->addData(value); + newNode->addData(value.value()); } - auto& child = node->_children[key.front()]; - if (!child) { - // Only increment the number of children if the node didn't have one at the position. + auto newNodeRaw = newNode.get(); + _setChildPtr(node, key.front(), std::move(newNode)); + return newNodeRaw; + } + + /* + * Inserts, updates, or deletes a child node at index and then maintains the key and children + * array for Node4 and Node16 in _setChildPtr(). + */ + template <class Node4Or16> + bool _setChildAtIndex(Node4Or16* node, node_ptr child, uint8_t key, size_t index) { + // Adds to the end of the array. + if (!node->_childKey[index] && !node->_children[index]) { + node->_childKey[index] = key; + node->_children[index] = child; ++node->_numChildren; + return true; + } + if (node->_childKey[index] == key) { + if (!child) { + // Deletes a child and shifts + auto n = node->numChildren(); + for (int j = index; j < n - 1; ++j) { + node->_childKey[j] = node->_childKey[j + 1]; + node->_children[j] = node->_children[j + 1]; + } + node->_childKey[n - 1] = 0; + node->_children[n - 1] = nullptr; + --node->_numChildren; + return true; + } + node->_children[index] = child; + return true; + } + if (node->_childKey[index] > key) { + // _children is guaranteed not to be full. + // Inserts to 'index' and shift larger keys to the right. + for (size_t j = node->numChildren(); j > index; --j) { + node->_childKey[j] = node->_childKey[j - 1]; + node->_children[j] = node->_children[j - 1]; + } + node->_childKey[index] = key; + node->_children[index] = child; + ++node->_numChildren; + return true; + } + return false; + } + + /* + * Sets the child pointer of 'key' to the 'newNode' in 'node'. + */ + void _setChildPtr(Node* node, uint8_t key, node_ptr newNode) { + switch (node->_nodeType) { + case NodeType::LEAF: + return; + case NodeType::NODE4: { + Node4* node4 = static_cast<Node4*>(node); + auto start = node4->_childKey.begin(); + // Find the position of the first larger or equal key to insert. + auto pos = std::find_if(start, + start + node4->numChildren(), + [&key](uint8_t keyByte) { return keyByte >= key; }); + _setChildAtIndex(node4, newNode, key, pos - start); + return; + } + case NodeType::NODE16: { + Node16* node16 = static_cast<Node16*>(node); + auto start = node16->_childKey.begin(); + auto pos = std::find_if(start, + start + node16->numChildren(), + [&key](uint8_t keyByte) { return keyByte >= key; }); + _setChildAtIndex(node16, newNode, key, pos - start); + return; + } + case NodeType::NODE48: { + Node48* node48 = static_cast<Node48*>(node); + auto index = node48->_childIndex[key]; + if (index != maxByte) { + // Pointer already exists. Delete or update it. + if (!newNode) { + node48->_childIndex[key] = maxByte; + --node48->_numChildren; + } + node48->_children[index] = newNode; + return; + } else { + // Finds the first empty slot to insert the newNode. + for (size_t i = 0; i < node48->_children.size(); ++i) { + auto& child = node48->_children[i]; + if (!child) { + child = newNode; + node48->_childIndex[key] = i; + ++node48->_numChildren; + return; + } + } + } + return; + } + case NodeType::NODE256: { + Node256* node256 = static_cast<Node256*>(node); + auto& child = node256->_children[key]; + if (!child) { + ++node256->_numChildren; + } else if (!newNode) { + --node256->_numChildren; + } + child = newNode; + return; + } } - child = newNode; - return newNode.get(); + MONGO_UNREACHABLE; } /** @@ -1255,8 +1980,7 @@ private: size_t depth = node->_depth + node->_trieKey.size(); while (depth < key.size()) { - uint8_t c = charKey[depth]; - node = node->_children[c].get(); + node = _findChild(node, charKey[depth]).get(); context.push_back(node); depth = node->_depth + node->_trieKey.size(); } @@ -1285,34 +2009,41 @@ private: * in a node with no value and only one child. * Returns true if compression occurred and false otherwise. */ - bool _compressOnlyChild(Node* node) { - // Don't compress if this node has an actual value associated with it or is the root + Node* _compressOnlyChild(Node* parent, Node* node) { + // Don't compress if this node is not of type Node4, has an actual value associated with it, // or doesn't have only one child. - if (node->_data || node->_trieKey.empty() || node->numChildren() != 1) { - return false; + if (node->_nodeType != NodeType::NODE4 || node->_data || node->_trieKey.empty() || + node->numChildren() != 1) { + return nullptr; } - // Determine if this node has only one child. - boost::intrusive_ptr<Node> onlyChild = nullptr; - - for (size_t i = 0; i < node->_children.size(); ++i) { - if (node->_children[i] != nullptr) { - onlyChild = node->_children[i]; - break; - } - } + Node4* node4 = static_cast<Node4*>(node); + node_ptr onlyChild = std::move(node4->_children[0]); + node4->_childKey[0] = 0; + node4->_numChildren = 0; + auto oldCapacity = node4->_trieKey.capacity(); - // Append the child's key onto the parent. for (char item : onlyChild->_trieKey) { - node->_trieKey.push_back(item); + node4->_trieKey.push_back(item); } - + _metrics.addMemory(sizeof(uint8_t) * (node4->_trieKey.capacity() - oldCapacity)); if (onlyChild->_data) { - node->addData(onlyChild->_data); + node4->addData(onlyChild->_data.value()); } - node->_children = onlyChild->_children; - node->_numChildren = onlyChild->_numChildren; - return true; + _forEachChild(onlyChild.get(), 0, false, [this, &parent, &node](node_ptr child) { + if (node->needGrow()) { + node = _grow(parent, node).get(); + } + auto keyByte = child->_trieKey.front(); + _setChildPtr(node, keyByte, std::move(child)); + return true; + }); + + if (node->needShrink()) { + node = _shrink(parent, node).get(); + } + + return node; } /** @@ -1325,7 +2056,7 @@ private: context[0] = replaceNode; for (size_t node = 1; node < context.size(); node++) { - replaceNode = replaceNode->_children[trieKeyIndex[node - 1]].get(); + replaceNode = _findChild(replaceNode, trieKeyIndex[node - 1]).get(); context[node] = replaceNode; } } @@ -1350,13 +2081,14 @@ private: for (size_t idx = 1; idx < context.size(); idx++) { node = context[idx]; - if (prev->_children[node->_trieKey.front()]->refCount() > 1) { - boost::intrusive_ptr<Node> nodeCopy = make_intrusive_node<Node>(*node); - prev->_children[nodeCopy->_trieKey.front()] = nodeCopy; + auto next = _findChild(prev, node->_trieKey.front()); + if (next->refCount() - 1 > 1) { + node_ptr nodeCopy = _copyNode(node); + _setChildPtr(prev, nodeCopy->_trieKey.front(), nodeCopy); context[idx] = nodeCopy.get(); prev = nodeCopy.get(); } else { - prev = prev->_children[node->_trieKey.front()].get(); + prev = next.get(); } } @@ -1367,14 +2099,14 @@ private: * Resolves conflicts within subtrees due to the complicated structure of path-compressed radix * tries. */ - void _mergeResolveConflict(const Node* current, const Node* baseNode, const Node* otherNode) { + void _mergeResolveConflict(Node* current, Node* baseNode, Node* otherNode) { // Merges all differences between this and other, using base to determine whether operations // are allowed or should throw a merge conflict. RadixStore base, other, node; - node._root = make_intrusive_node<Head>(*current); - base._root = make_intrusive_node<Head>(*baseNode); - other._root = make_intrusive_node<Head>(*otherNode); + node._root = _makeHead(current); + base._root = _makeHead(baseNode); + other._root = _makeHead(otherNode); // Merges insertions and updates from the master tree into the working tree, if possible. for (const value_type otherVal : other) { @@ -1437,11 +2169,11 @@ private: * Merges elements from the master tree into the working copy if they have no presence in the * working copy, otherwise we throw a merge conflict. */ - void _mergeTwoBranches(const Node* current, const Node* otherNode) { + void _mergeTwoBranches(Node* current, Node* otherNode) { RadixStore other, node; - node._root = make_intrusive_node<Head>(*current); - other._root = make_intrusive_node<Head>(*otherNode); + node._root = _makeHead(current); + other._root = _makeHead(otherNode); for (const value_type otherVal : other) { RadixStore::const_iterator thisIter = node.find(otherVal.first); @@ -1489,22 +2221,21 @@ private: auto key = current->_trieKey.front(); auto newTrieKeyBegin = current->_trieKey.begin(); auto newTrieKeyEnd = current->_trieKey.begin() + mismatchIdx; - auto shared_current = std::move(parent->_children[key]); - --parent->_numChildren; + auto shared_current = std::move(_findChild(parent, key)); auto newTrieKey = std::vector<uint8_t>(newTrieKeyBegin, newTrieKeyEnd); // Replace current with a new node with no data - auto newNode = _addChild(parent, newTrieKey, boost::none); + auto newNode = _addChild(parent, newTrieKey, boost::none, NodeType::NODE4); // Remove the part of the trieKey that is used by the new node. current->_trieKey.erase(current->_trieKey.begin(), current->_trieKey.begin() + mismatchIdx); + _metrics.subtractMemory(sizeof(uint8_t) * mismatchIdx); current->_depth += mismatchIdx; // Add what was the current node as a child to the new internal node key = current->_trieKey.front(); - newNode->_children[key] = std::move(shared_current); - newNode->_numChildren = 1; + _setChildPtr(newNode, key, std::move(shared_current)); // Update current pointer and context child = current; @@ -1518,9 +2249,9 @@ private: // Since _makeBranchUnique may make changes to the pointer addresses in recursive calls. current = context.back(); - Node* node = current->_children[key].get(); - Node* baseNode = base->_children[key].get(); - Node* otherNode = other->_children[key].get(); + Node* node = _findChild(current, key).get(); + Node* baseNode = _findChild(base, key).get(); + Node* otherNode = _findChild(other, key).get(); if (!node && !baseNode && !otherNode) continue; @@ -1535,12 +2266,14 @@ private: // merge in the other's branch. current = _makeBranchUnique(context); + if (current->needGrow()) { + current = _grow(context.at(context.size() - 2), current).get(); + } + // Need to rebuild our context to have updated pointers due to the // modifications that go on in _makeBranchUnique. _rebuildContext(context, trieKeyIndex); - - current->_children[key] = other->_children[key]; - ++current->_numChildren; + _setChildPtr(current, key, _findChild(other, key)); } else if (!otherNode || (baseNode && baseNode != otherNode)) { // Either the master tree and working tree remove the same branch, or the master // tree updated the branch while the working tree removed the branch, resulting @@ -1552,18 +2285,23 @@ private: node = splitCurrentBeforeWriteIfNeeded(node); // Other has a deleted branch that must also be removed from current tree. - current = _makeBranchUnique(context); + int prevIdx = context.size() - 2; + _setChildPtr(current, key, nullptr); + if (prevIdx >= 0 && current->needShrink()) { + current = _shrink(context.at(prevIdx), current).get(); + } _rebuildContext(context, trieKeyIndex); - current->_children[key] = nullptr; - --current->_numChildren; } else if (baseNode && otherNode && baseNode == node) { node = splitCurrentBeforeWriteIfNeeded(node); // If base and current point to the same node, then master changed. current = _makeBranchUnique(context); + if (current->needGrow()) { + current = _grow(context.at(context.size() - 2), current).get(); + } _rebuildContext(context, trieKeyIndex); - current->_children[key] = other->_children[key]; + _setChildPtr(current, key, _findChild(other, key)); } } else if (baseNode && otherNode && baseNode != otherNode) { // If all three are unique and leaf nodes with different data, then it is a merge @@ -1579,7 +2317,7 @@ private: // Only other changed the data. Take that node current = _makeBranchUnique(context); _rebuildContext(context, trieKeyIndex); - current->_children[key] = other->_children[key]; + _setChildPtr(current, key, _findChild(other, key)); } continue; } @@ -1605,15 +2343,27 @@ private: // Drop if leaf node without data, that is not valid. Otherwise we might // need to compress if we have only one child. if (node->isLeaf()) { - current->_children[key] = nullptr; - --current->_numChildren; + _setChildPtr(current, key, nullptr); + int prevIdx = context.size() - 2; + // Don't shrink the root node. + if (prevIdx >= 0 && current->needShrink()) { + current = _shrink(context.at(prevIdx), current).get(); + } + _rebuildContext(context, trieKeyIndex); } else { - _compressOnlyChild(node); + if (auto compressedNode = _compressOnlyChild(current, node)) { + node = compressedNode; + } } } } if (resolveConflict) { - Node* nodeToResolve = _compressOnlyChild(current) ? current : node; + auto curParent = context.size() > 1 ? context.at(context.size() - 2) : nullptr; + Node* nodeToResolve = node; + if (auto compressed = _compressOnlyChild(curParent, current)) { + current = compressed; + nodeToResolve = current; + } _mergeResolveConflict(nodeToResolve, baseNode, otherNode); _rebuildContext(context, trieKeyIndex); // If we compressed above, resolving the conflict can result in erasing current. @@ -1650,17 +2400,15 @@ private: Node* _begin(Node* root) const noexcept { Node* node = root; while (!node->_data) { - for (auto child : node->_children) { - if (child != nullptr) { - node = child.get(); - break; - } - } + _forEachChild(node, 0, false, [&node](node_ptr child) { + node = child.get(); + return false; + }); } return node; } - boost::intrusive_ptr<Head> _root = nullptr; + head_ptr _root = nullptr; static Metrics _metrics; }; diff --git a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_radix_store_test.cpp b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_radix_store_test.cpp index 1dfe224f020..46b16109d96 100644 --- a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_radix_store_test.cpp +++ b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_radix_store_test.cpp @@ -81,18 +81,17 @@ public: /** * Returns all nodes in "store" with level order traversal. */ - std::vector<boost::intrusive_ptr<node_type>> allNodes(StringStore& store) const { - std::deque<boost::intrusive_ptr<node_type>> level(1, store._root); - std::vector<boost::intrusive_ptr<node_type>> result(1, store._root); + std::vector<node_type*> allNodes(StringStore& store) const { + std::deque<node_type*> level(1, store._root.get()); + std::vector<node_type*> result(1, store._root.get()); while (!level.empty()) { - auto node = level.front().get(); - for (int i = 0; i < 256; ++i) { - auto child = node->_children[i]; - if (child.get()) { - level.push_back(child); - result.push_back(child); - } - } + auto node = level.front(); + StringStore::_forEachChild( + node, 0, false, [&level, &result](boost::intrusive_ptr<node_type> child) { + level.push_back(child.get()); + result.push_back(child.get()); + return true; + }); level.pop_front(); } return result; @@ -105,16 +104,21 @@ public: auto nodes = allNodes(store); for (const auto& node : nodes) { uint16_t numChildren = 0; - auto children = node.get()->_children; - for (uint16_t i = 0; i < children.size(); ++i) { - if (children[i]) { + StringStore::_forEachChild( + node, 0, false, [&numChildren](boost::intrusive_ptr<node_type> child) { ++numChildren; - } - } - ASSERT_EQ(numChildren, node.get()->numChildren()); + return true; + }); + ASSERT_EQ(numChildren, node->numChildren()); } } + void debug(StringStore& store) const { + std::cout << "Memory: " << store._metrics.totalMemory.load() << std::endl + << "Nodes: " << store._metrics.totalNodes.load() << std::endl + << "Children: " << store._metrics.totalChildren.load() << std::endl; + } + protected: StringStore thisStore; StringStore parallelStore; @@ -2107,20 +2111,139 @@ TEST_F(RadixStoreTest, BasicInsertFindDeleteNullCharacter) { ASSERT_EQ(iter->first, value2.first); } +TEST_F(RadixStoreTest, IteratorTest) { + uint8_t keyArr[] = {97, 0, 0}; + int v = 0; + int cur = v; + + // Node4 + for (size_t i = 0; i < 4; ++i) { + ++keyArr[1]; + std::string keyStr(reinterpret_cast<char*>(keyArr)); + value_type value = std::make_pair(keyStr, std::to_string(++v)); + + thisStore.insert(value_type(value)); + } + + cur = 0; + for (auto iter = thisStore.begin(); iter != thisStore.end(); ++iter) { + ++cur; + ASSERT_EQ(iter->second, std::to_string(cur)); + } + ASSERT_EQ(cur, v); + + // Node16 + for (size_t i = 0; i < 12; ++i) { + ++keyArr[1]; + std::string keyStr(reinterpret_cast<char*>(keyArr)); + value_type value = std::make_pair(keyStr, std::to_string(++v)); + + thisStore.insert(value_type(value)); + } + + cur = 0; + for (auto iter = thisStore.begin(); iter != thisStore.end(); ++iter) { + ++cur; + ASSERT_EQ(iter->second, std::to_string(cur)); + } + ASSERT_EQ(cur, v); + + // Node48 + for (size_t i = 0; i < 32; ++i) { + ++keyArr[1]; + std::string keyStr(reinterpret_cast<char*>(keyArr)); + value_type value = std::make_pair(keyStr, std::to_string(++v)); + + thisStore.insert(value_type(value)); + } + + cur = 0; + for (auto iter = thisStore.begin(); iter != thisStore.end(); ++iter) { + ++cur; + ASSERT_EQ(iter->second, std::to_string(cur)); + } + ASSERT_EQ(cur, v); + + // Node256 + for (size_t i = 0; i < 207; ++i) { + ++keyArr[1]; + std::string keyStr(reinterpret_cast<char*>(keyArr)); + value_type value = std::make_pair(keyStr, std::to_string(++v)); + + thisStore.insert(value_type(value)); + } + + cur = 0; + for (auto iter = thisStore.begin(); iter != thisStore.end(); ++iter) { + ++cur; + ASSERT_EQ(iter->second, std::to_string(cur)); + } + ASSERT_EQ(cur, v); +} + TEST_F(RadixStoreTest, ReverseIteratorTest) { - value_type value1 = std::make_pair("foo", "3"); - value_type value2 = std::make_pair("bar", "1"); - value_type value3 = std::make_pair("baz", "2"); - value_type value4 = std::make_pair("fools", "5"); - value_type value5 = std::make_pair("foods", "4"); + uint8_t keyArr[] = {97, 0, 0}; + int v = 0; + int cur = v; - thisStore.insert(value_type(value4)); - thisStore.insert(value_type(value5)); - thisStore.insert(value_type(value1)); - thisStore.insert(value_type(value3)); - thisStore.insert(value_type(value2)); + // Node4 + for (size_t i = 0; i < 4; ++i) { + ++keyArr[1]; + std::string keyStr(reinterpret_cast<char*>(keyArr)); + value_type value = std::make_pair(keyStr, std::to_string(++v)); - int cur = 5; + thisStore.insert(value_type(value)); + } + + cur = v; + for (auto iter = thisStore.rbegin(); iter != thisStore.rend(); ++iter) { + ASSERT_EQ(iter->second, std::to_string(cur)); + --cur; + } + ASSERT_EQ(cur, 0); + + // Node16 + for (size_t i = 0; i < 12; ++i) { + ++keyArr[1]; + std::string keyStr(reinterpret_cast<char*>(keyArr)); + value_type value = std::make_pair(keyStr, std::to_string(++v)); + + thisStore.insert(value_type(value)); + } + + cur = v; + for (auto iter = thisStore.rbegin(); iter != thisStore.rend(); ++iter) { + ASSERT_EQ(iter->second, std::to_string(cur)); + --cur; + } + ASSERT_EQ(cur, 0); + + // Node48 + for (size_t i = 0; i < 32; ++i) { + ++keyArr[1]; + std::string keyStr(reinterpret_cast<char*>(keyArr)); + value_type value = std::make_pair(keyStr, std::to_string(++v)); + + thisStore.insert(value_type(value)); + } + + cur = v; + for (auto iter = thisStore.rbegin(); iter != thisStore.rend(); ++iter) { + ASSERT_EQ(iter->second, std::to_string(cur)); + --cur; + } + ASSERT_EQ(cur, 0); + + // Node256 + for (size_t i = 0; i < 207; ++i) { + ++keyArr[1]; + std::string keyStr(reinterpret_cast<char*>(keyArr)); + value_type value = std::make_pair(keyStr, std::to_string(++v)); + + thisStore.insert(value_type(value)); + } + + cur = v; for (auto iter = thisStore.rbegin(); iter != thisStore.rend(); ++iter) { ASSERT_EQ(iter->second, std::to_string(cur)); --cur; @@ -2793,5 +2916,151 @@ TEST_F(RadixStoreTest, LowerBoundEndpoint) { ASSERT_TRUE(it == thisStore.end()); } +TEST_F(RadixStoreTest, SimpleGrowTest) { + std::pair<StringStore::const_iterator, bool> res; + uint8_t keyArr[] = {97, 0, 0}; + + for (size_t i = 0; i < 255; ++i) { + ++keyArr[1]; + std::string keyStr(reinterpret_cast<char*>(keyArr)); + value_type value = std::make_pair(keyStr, "1"); + + res = thisStore.insert(value_type(value)); + ASSERT_TRUE(res.second); + ASSERT_TRUE(*res.first == value); + } + + ASSERT_EQ(thisStore.size(), 255); +} + +TEST_F(RadixStoreTest, SimpleShrinkTest) { + uint8_t keyArr[] = {97, 0, 0}; + + for (size_t i = 0; i < 255; ++i) { + ++keyArr[1]; + std::string keyStr(reinterpret_cast<char*>(keyArr)); + value_type value = std::make_pair(keyStr, "1"); + + thisStore.insert(value_type(value)); + } + + for (size_t i = 0; i < 255; ++i) { + std::string keyStr(reinterpret_cast<char*>(keyArr)); + thisStore.erase(keyStr); + --keyArr[1]; + } + + ASSERT_EQ(thisStore.size(), 0); +} + +TEST_F(RadixStoreTest, UpdateGrowTest) { + std::pair<StringStore::const_iterator, bool> res; + uint8_t keyArr[] = {97, 0, 0}; + + for (size_t i = 0; i < 255; ++i) { + ++keyArr[1]; + std::string keyStr(reinterpret_cast<char*>(keyArr)); + value_type value = std::make_pair(keyStr, "1"); + + thisStore.insert(value_type(value)); + + value_type update = std::make_pair(keyStr, "2"); + res = thisStore.update(value_type(update)); + ASSERT_TRUE(res.second); + ASSERT_TRUE(*res.first == update); + } +} + +TEST_F(RadixStoreTest, MergeGrowOneTest) { + // !baseNode && otherNode + thisStore = baseStore; + + uint8_t keyArr[] = {97, 0, 0}; + for (size_t i = 0; i < 16; ++i) { + ++keyArr[1]; + std::string keyStr(reinterpret_cast<char*>(keyArr)); + value_type value = std::make_pair(keyStr, "1"); + thisStore.insert((value_type(value))); + } + + otherStore = baseStore; + auto keyArrOther = keyArr; + ++keyArrOther[1]; + std::string keyStr1(reinterpret_cast<char*>(keyArrOther)); + value_type value1 = std::make_pair(keyStr1, "1"); + otherStore.insert((value_type(value1))); + + thisStore.merge3(baseStore, otherStore); + + ASSERT_EQ(baseStore.size(), 0); + ASSERT_EQ(otherStore.size(), 1); + ASSERT_EQ(thisStore.size(), 17); + ASSERT_EQ(thisStore.dataSize(), 17); + + for (size_t i = 0; i < 31; ++i) { + ++keyArr[1]; + std::string keyStr(reinterpret_cast<char*>(keyArr)); + value_type value = std::make_pair(keyStr, "1"); + thisStore.insert((value_type(value))); + } + + otherStore = baseStore; + keyArrOther = keyArr; + ++keyArrOther[1]; + std::string keyStr2(reinterpret_cast<char*>(keyArrOther)); + value_type value2 = std::make_pair(keyStr2, "1"); + otherStore.insert((value_type(value2))); + + thisStore.merge3(baseStore, otherStore); + + ASSERT_EQ(baseStore.size(), 0); + ASSERT_EQ(otherStore.size(), 1); + ASSERT_EQ(thisStore.size(), 49); + ASSERT_EQ(thisStore.dataSize(), 49); +} + +TEST_F(RadixStoreTest, MergeGrowTwoTest) { + // !node && !baseNode && otherNode + value_type value1 = std::make_pair("a", "1"); + value_type value2 = std::make_pair("aa", "2"); + + baseStore.insert(value_type(value1)); + + thisStore = baseStore; + otherStore = baseStore; + + otherStore.insert(value_type(value2)); + + thisStore.merge3(baseStore, otherStore); + + ASSERT_EQ(baseStore.size(), 1); + ASSERT_EQ(otherStore.size(), 2); + ASSERT_EQ(thisStore.size(), 2); +} + +TEST_F(RadixStoreTest, MergeCompressTest) { + value_type value1 = std::make_pair("a", "1"); + value_type value2 = std::make_pair("aaa", "2"); + value_type value3 = std::make_pair("aab", "3"); + value_type value4 = std::make_pair("aac", "4"); + + baseStore.insert(value_type(value1)); + baseStore.insert(value_type(value2)); + baseStore.insert(value_type(value3)); + baseStore.insert(value_type(value4)); + + thisStore = baseStore; + otherStore = baseStore; + + thisStore.erase("aac"); + otherStore.erase("aab"); + + thisStore.merge3(baseStore, otherStore); + + ASSERT_EQ(baseStore.size(), 4); + ASSERT_EQ(otherStore.size(), 3); + ASSERT_EQ(thisStore.size(), 2); +} + } // namespace ephemeral_for_test } // namespace mongo |