diff options
author | Henrik Edin <henrik.edin@mongodb.com> | 2021-08-11 11:17:19 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-08-24 02:41:29 +0000 |
commit | 2873b99f1f7477d7053504dcd3f0caa68745f506 (patch) | |
tree | f2fc0df393b67a118b3c3203df2b6edd2ef7e7ff /src/mongo | |
parent | a241a584d81ade104f7a2e37ca7edfdc974a47cc (diff) | |
download | mongo-2873b99f1f7477d7053504dcd3f0caa68745f506.tar.gz |
SERVER-58577 Add BSONColumn to interpret BSON Binary Subtype 7 data.
BSONColumn provides forward iteration over BSON Binary Subtype 7 data
Decompressed BSONElement from deltas need to be re-materialized.
BSONColumn manage this memory and any BSONElement reference has the same lifetime as BSONColumn.
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/bson/util/SConscript | 1 | ||||
-rw-r--r-- | src/mongo/bson/util/bsoncolumn.cpp | 465 | ||||
-rw-r--r-- | src/mongo/bson/util/bsoncolumn.h | 223 | ||||
-rw-r--r-- | src/mongo/bson/util/bsoncolumn_test.cpp | 568 | ||||
-rw-r--r-- | src/mongo/bson/util/bsoncolumn_util.cpp | 64 | ||||
-rw-r--r-- | src/mongo/bson/util/bsoncolumn_util.h | 45 | ||||
-rw-r--r-- | src/mongo/bson/util/bsoncolumnbuilder.cpp | 84 | ||||
-rw-r--r-- | src/mongo/bson/util/bsoncolumnbuilder.h | 1 | ||||
-rw-r--r-- | src/mongo/bson/util/simple8b.cpp | 25 | ||||
-rw-r--r-- | src/mongo/bson/util/simple8b.h | 6 | ||||
-rw-r--r-- | src/mongo/bson/util/simple8b_type_util.cpp | 33 | ||||
-rw-r--r-- | src/mongo/bson/util/simple8b_type_util.h | 2 | ||||
-rw-r--r-- | src/mongo/bson/util/simple8b_type_util_test.cpp | 10 |
13 files changed, 1347 insertions, 180 deletions
diff --git a/src/mongo/bson/util/SConscript b/src/mongo/bson/util/SConscript index f5fb665919d..7be350eb69d 100644 --- a/src/mongo/bson/util/SConscript +++ b/src/mongo/bson/util/SConscript @@ -19,6 +19,7 @@ env.Library( source=[ 'bsoncolumn.cpp', 'bsoncolumnbuilder.cpp', + 'bsoncolumn_util.cpp', 'simple8b.cpp', 'simple8b_type_util.cpp', ], diff --git a/src/mongo/bson/util/bsoncolumn.cpp b/src/mongo/bson/util/bsoncolumn.cpp index a0e48c1fbfa..4ad5f16738c 100644 --- a/src/mongo/bson/util/bsoncolumn.cpp +++ b/src/mongo/bson/util/bsoncolumn.cpp @@ -28,3 +28,468 @@ */ #include "mongo/bson/util/bsoncolumn.h" +#include "mongo/bson/util/bsoncolumn_util.h" +#include "mongo/bson/util/simple8b_type_util.h" + +#include <algorithm> + +namespace mongo { +using namespace bsoncolumn; + +namespace { +// Start capacity for memory blocks allocated by ElementStorage +constexpr int kStartCapacity = 128; + +// Max capacity for memory blocks allocated by ElementStorage +constexpr int kMaxCapacity = 1024 * 32; + +// Memory offset to get to BSONElement value when field name is an empty string. +constexpr int kElementValueOffset = 2; + +// Sentinel to indicate end index for BSONColumn Iterator. +constexpr size_t kEndIndex = 0xFFFFFFFFFFFFFFFF; + +// Lookup table to go from Control byte (high 4 bits) to scale index. +constexpr uint8_t kInvalidScaleIndex = 0xFF; +constexpr std::array<uint8_t, 16> kControlToScaleIndex = { + kInvalidScaleIndex, + kInvalidScaleIndex, + kInvalidScaleIndex, + kInvalidScaleIndex, + kInvalidScaleIndex, + kInvalidScaleIndex, + kInvalidScaleIndex, + kInvalidScaleIndex, + Simple8bTypeUtil::kMemoryAsInteger, // 0b1000 + 0, // 0b1001 + 1, // 0b1010 + 2, // 0b1011 + 3, // 0b1100 + 4, // 0b1101 + kInvalidScaleIndex, + kInvalidScaleIndex}; + +} // namespace + +BSONColumn::ElementStorage::Element::Element(char* buffer, int valueSize) + : _buffer(buffer), _valueSize(valueSize) {} + +char* BSONColumn::ElementStorage::Element::value() { + // Skip over type byte and null terminator for field name + return _buffer + kElementValueOffset; +} + +int BSONColumn::ElementStorage::Element::size() const { + return _valueSize; +} + +BSONElement BSONColumn::ElementStorage::Element::element() const { + return {_buffer, 1, _valueSize + kElementValueOffset, BSONElement::CachedSizeTag{}}; +} + +BSONColumn::ElementStorage::Element BSONColumn::ElementStorage::allocate(BSONType type, + int valueSize) { + // Size needed for this BSONElement + int size = valueSize + kElementValueOffset; + + // If current block don't have enough capacity we need to allocate a new one + if (_capacity - _pos < size) { + // Keep track of current block if it exists. + if (_block) { + _blocks.push_back(std::move(_block)); + } + + // Double block size while keeping it within [kStartCapacity, kMaxCapacity] range, unless a + // size larger than kMaxCapacity is requested. + _capacity = std::max(std::clamp(_capacity * 2, kStartCapacity, kMaxCapacity), size); + _block = std::make_unique<char[]>(_capacity); + _pos = 0; + } + + // Write type and null terminator in the first two bytes + _block[_pos] = type; + _block[_pos + 1] = '\0'; + + // Construct the Element, current block will have enough size at this point + Element elem(_block.get() + _pos, valueSize); + + // Increment the used size and return + _pos += size; + return elem; +} + +BSONColumn::Iterator::Iterator(BSONColumn& column, const char* pos, const char* end) + : _column(column), _control(pos), _end(end) {} + +BSONColumn::Iterator& BSONColumn::Iterator::operator++() { + // We need to setup iterator state even if this is not the first time we iterate in case we need + // to decompress elements further along + ++_index; + + // Get reference to last non-skipped element. Needed to apply delta. + const auto& lastVal = _column._decompressed.at(_lastValueIndex); + + // Traverse current Simple8b block for 64bit values if it exists + if (_decoder64 && ++_decoder64->pos != _decoder64->end) { + _loadDelta(lastVal, *_decoder64->pos); + return *this; + } + + // Traverse current Simple8b block for 128bit values if it exists + if (_decoder128 && ++_decoder128->pos != _decoder128->end) { + _loadDelta(lastVal, *_decoder128->pos); + return *this; + } + + // We don't have any more delta values in current block so we need to move to the next control + // byte. + if (_literal(*_control)) { + // If we were positioned on a literal, move to the byte after + _control += lastVal.size(); + } else { + // If we were positioned on Simple-8b blocks, move to the byte after then + uint8_t blocks = (*_control & 0x0F) + 1; + _control += sizeof(uint64_t) * blocks + 1; + } + + // Validate that we are not reading out of bounds + uassert(ErrorCodes::BadValue, "Invalid BSON Column encoding", _control < _end); + + // Load new control byte + _loadControl(lastVal); + + return *this; +} + +BSONColumn::Iterator BSONColumn::Iterator::operator++(int) { + auto ret = *this; + operator++(); + return ret; +} + +bool BSONColumn::Iterator::operator==(const Iterator& rhs) const { + return _index == rhs._index; +} +bool BSONColumn::Iterator::operator!=(const Iterator& rhs) const { + return !operator==(rhs); +} + +void BSONColumn::Iterator::_loadControl(const BSONElement& prev) { + // Load current control byte, it can be either a literal or Simple-8b deltas + uint8_t control = *_control; + if (_literal(control)) { + // If we detect EOO we are done, set values to be the same as an end iterator + if (control == EOO) { + _control += 1; + _index = kEndIndex; + _column._fullyDecompressed = true; + return; + } + + // Load BSONElement from the literal and set last encoded in case we need to calculate + // deltas from this literal + BSONElement literalElem(_control, 1, -1, BSONElement::CachedSizeTag{}); + switch (literalElem.type()) { + case BinData: { + int size; + const char* binary = literalElem.binData(size); + if (size <= 16) { + _lastEncodedValue128 = Simple8bTypeUtil::encodeBinary(binary, size); + } + break; + } + case jstOID: + _lastEncodedValue64 = Simple8bTypeUtil::encodeObjectId(literalElem.__oid()); + break; + case Date: + _lastEncodedValue64 = literalElem.date().toMillisSinceEpoch(); + break; + case Bool: + _lastEncodedValue64 = literalElem.boolean(); + break; + case NumberInt: + _lastEncodedValue64 = literalElem._numberInt(); + break; + case NumberLong: + _lastEncodedValue64 = literalElem._numberLong(); + break; + case bsonTimestamp: + _lastEncodedValue64 = 0; + _lastEncodedValueForDeltaOfDelta = literalElem.timestamp().asULL(); + break; + case NumberDecimal: + _lastEncodedValue128 = + Simple8bTypeUtil::encodeDecimal128(literalElem._numberDecimal()); + break; + default: + break; + }; + + // Store at and and reset any previous decoders + _storeElementIfNeeded(literalElem); + + _lastValueIndex = _index; + _decoder64 = boost::none; + _decoder128 = boost::none; + + // Remember index to last literal to speed up "random access". + if (_column._indexLastLiteral < _index) { + _column._controlLastLiteral = _control; + _column._indexLastLiteral = _index; + } + + return; + } + + // Simple-8b delta block, load its scale factor and validate for sanity + _scaleIndex = kControlToScaleIndex[(control & 0xF0) >> 4]; + uassert(ErrorCodes::BadValue, + "Invalid control byte in BSON Column", + _scaleIndex != kInvalidScaleIndex); + + // If Double, scale last value according to this scale factor + auto type = prev.type(); + if (type == NumberDouble) { + auto encoded = Simple8bTypeUtil::encodeDouble(prev._numberDouble(), _scaleIndex); + uassert(ErrorCodes::BadValue, "Invalid double encoding in BSON Column", encoded); + _lastEncodedValue64 = *encoded; + } + + // Setup decoder for this range of Simple-8b blocks + uint8_t blocks = _numSimple8bBlocks(control); + auto size = sizeof(uint64_t) * blocks; + uassert(ErrorCodes::BadValue, "Invalid BSON Column encoding", _control + size + 1 < _end); + + // Instantiate decoder and load first value, every Simple-8b block should have at least one + // value + if (!uses128bit(type)) { + _decoder64.emplace(_control + 1, size); + _loadDelta(prev, *_decoder64->pos); + } else { + _decoder128.emplace(_control + 1, size); + _loadDelta(prev, *_decoder128->pos); + } +} + +void BSONColumn::Iterator::_loadDelta(const BSONElement& prev, + const boost::optional<uint64_t>& delta) { + // boost::none represent skip, just append EOO BSONElement. + if (!delta) { + _storeElementIfNeeded(BSONElement()); + return; + } + + // We have an actual value, remember this index for future delta lookups. + _lastValueIndex = _index; + BSONType type = prev.type(); + + // If we have a zero delta no need to allocate a new Element, we can just use previous. + bool deltaOfDelta = usesDeltaOfDelta(type); + if (!deltaOfDelta && *delta == 0) { + _storeElementIfNeeded(prev); + return; + } + + // Expand delta or delta-of-delta as last encoded. + _lastEncodedValue64 = expandDelta(_lastEncodedValue64, Simple8bTypeUtil::decodeInt64(*delta)); + if (deltaOfDelta) { + _lastEncodedValueForDeltaOfDelta = + expandDelta(_lastEncodedValueForDeltaOfDelta, _lastEncodedValue64); + } + + // Decoder state is now setup, no need to create BSONElement if already exist decompressed + if (!_needStoreElement()) { + return; + } + + // Allocate a new BSONElement that fits same value size as previous + ElementStorage::Element elem = _column._elementStorage.allocate(type, prev.valuesize()); + + // Write value depending on type + switch (type) { + case NumberDouble: + DataView(elem.value()) + .write<LittleEndian<double>>( + Simple8bTypeUtil::decodeDouble(_lastEncodedValue64, _scaleIndex)); + break; + case jstOID: { + Simple8bTypeUtil::decodeObjectIdInto( + elem.value(), _lastEncodedValue64, prev.__oid().getInstanceUnique()); + } break; + case Date: + case NumberLong: + DataView(elem.value()).write<LittleEndian<long long>>(_lastEncodedValue64); + break; + case Bool: + DataView(elem.value()).write<LittleEndian<char>>(_lastEncodedValue64); + break; + case NumberInt: + DataView(elem.value()).write<LittleEndian<int>>(_lastEncodedValue64); + break; + case bsonTimestamp: { + DataView(elem.value()).write<LittleEndian<long long>>(_lastEncodedValueForDeltaOfDelta); + } break; + default: + // unhandled for now + break; + } + + // Append our written BSONElement to decompressed values + _column._decompressed.push_back(elem.element()); +} + +void BSONColumn::Iterator::_loadDelta(const BSONElement& prev, + const boost::optional<uint128_t>& delta) { + // boost::none represent skip, just append EOO BSONElement. + if (!delta) { + _storeElementIfNeeded(BSONElement()); + return; + } + + // We have an actual value, remember this index for future delta lookups. + _lastValueIndex = _index; + BSONType type = prev.type(); + + // If we have a zero delta no need to allocate a new Element, we can just use previous. + if (*delta == 0) { + _storeElementIfNeeded(prev); + return; + } + + // Expand delta as last encoded. + _lastEncodedValue128 = + expandDelta(_lastEncodedValue128, Simple8bTypeUtil::decodeInt128(*delta)); + + // Decoder state is now setup, no need to create BSONElement if already exist decompressed + if (!_needStoreElement()) { + return; + } + + // Allocate a new BSONElement that fits same value size as previous + ElementStorage::Element elem = _column._elementStorage.allocate(type, prev.valuesize()); + + // Write value depending on type + switch (type) { + case BinData: + // The first 5 bytes in binData is a count and subType, copy them from previous + memcpy(elem.value(), prev.value(), 5); + Simple8bTypeUtil::decodeBinary( + _lastEncodedValue128, elem.value() + 5, prev.valuestrsize()); + break; + case NumberDecimal: { + Decimal128 d128 = Simple8bTypeUtil::decodeDecimal128(_lastEncodedValue128); + Decimal128::Value d128Val = d128.getValue(); + DataView(elem.value()).write<LittleEndian<long long>>(d128Val.low64); + DataView(elem.value() + sizeof(long long)) + .write<LittleEndian<long long>>(d128Val.high64); + break; + } + default: + // unhandled for now + break; + } + + _column._decompressed.push_back(elem.element()); +} + +bool BSONColumn::Iterator::_needStoreElement() const { + return _index == _column._decompressed.size(); +} + +void BSONColumn::Iterator::_storeElementIfNeeded(BSONElement elem) { + if (_needStoreElement()) { + _column._decompressed.emplace_back(elem); + } +} + +BSONColumn::BSONColumn(BSONElement bin) { + tassert(5857700, + "Invalid BSON type for column", + bin.type() == BSONType::BinData && bin.binDataType() == BinDataType::Column); + _binary = bin.binData(_size); + _controlLastLiteral = _binary; + uassert(ErrorCodes::BadValue, "Invalid BSON Column encoding", _size > 0); +} + +BSONColumn::Iterator BSONColumn::begin() { + Iterator it{*this, _binary, _binary + _size}; + it._loadControl(BSONElement()); + return it; +} + +BSONColumn::Iterator BSONColumn::end() { + Iterator it{*this, _binary + _size, _binary + _size}; + it._index = kEndIndex; + return it; +} + +BSONElement BSONColumn::operator[](size_t index) { + // If index is already decompressed, we can just return the element + if (index < _decompressed.size()) { + return _decompressed[index]; + } + + // No more elements to be found if we are fully decompressed, return EOO + if (_fullyDecompressed) + return BSONElement(); + + // We can begin iterating from last known literal + Iterator it{*this, _controlLastLiteral, _binary + _size}; + it._index = _indexLastLiteral; + it._loadControl(BSONElement()); // previous doesn't matter when we load literals + + // Traverse until we reach desired index or end + auto e = end(); + for (size_t i = _indexLastLiteral; it != e && i < index; ++it, ++i) { + } + + // Return EOO if not found + if (it == e) + return BSONElement(); + + return *it; +} + +size_t BSONColumn::size() const { + // Size is known when we are fully decompressed + if (_fullyDecompressed) + return _decompressed.size(); + + // We can begin iterating from last known literal + auto e = _binary + _size; + size_t num = _indexLastLiteral; + + // Iterate as long as we are within the Column binary + const char* pos = _controlLastLiteral; + while (pos < e) { + uint8_t control = *pos; + if (Iterator::_literal(control)) { + // We are done at EOO + if (control == EOO) { + break; + } + + // Count our literal and set our position to the byte after + ++num; + pos += BSONElement(pos, 1, -1, BSONElement::CachedSizeTag{}).size(); + } else { + // Count elements in Simple-8b blocks, no need to decompress + uint8_t blocks = Iterator::_numSimple8bBlocks(control); + int blockSize = blocks * sizeof(uint64_t); + uassert( + ErrorCodes::BadValue, "Invalid BSON Column encoding", (pos + blockSize + 1) < e); + Simple8b<uint128_t> s8b(pos + 1, blockSize); + for (auto it = s8b.begin(), s8bEnd = s8b.end(); it != s8bEnd; it.advanceBlock()) { + num += it.blockSize(); + } + + // Set our position to the byte after the Simple-8b blocks + pos += blockSize + 1; + } + } + + return num; +} + + +} // namespace mongo diff --git a/src/mongo/bson/util/bsoncolumn.h b/src/mongo/bson/util/bsoncolumn.h index d3f38aaf98d..c43143521d1 100644 --- a/src/mongo/bson/util/bsoncolumn.h +++ b/src/mongo/bson/util/bsoncolumn.h @@ -28,3 +28,226 @@ */ #pragma once + +#include "mongo/bson/bsonelement.h" +#include "mongo/bson/util/simple8b.h" + +#include <deque> +#include <vector> + +namespace mongo { + +/** + * The BSONColumn class represents a reference to a BSONElement of BinDataType 7, which can + * efficiently store any BSONArray and also allows for missing values. At a high level, two + * optimizations are applied: + * - implied field names: do not store decimal keys representing index keys. + * - delta compression using Simple-8b: store difference between subsequent scalars of the same + * type + * + * The BSONColumn will not take ownership of the BinData element, but otherwise implements + * an interface similar to BSONObj. Because iterators over the BSONColumn need to rematerialize + * deltas, they use additional storage owned by the BSONColumn for this. As all iterators will + * create new delta's in the same order, they share a single ElementStore, with a worst-case memory + * usage bounded to a total size on the order of that of the size of the expanded BSONColumn. + */ +class BSONColumn { +public: + BSONColumn(BSONElement bin); + + /** + * Forward iterator type to access BSONElement from BSONColumn. + * + * Default-constructed BSONElement (EOO type) represent missing value. + * Returned BSONElement are owned by BSONColumn instance and should not be kept after the + * BSONColumn instance goes out of scope. + */ + class Iterator { + public: + friend class BSONColumn; + + // typedefs expected in iterators + using iterator_category = std::forward_iterator_tag; + using difference_type = ptrdiff_t; + using value_type = BSONElement; + using pointer = const BSONElement*; + using reference = const BSONElement&; + + reference operator*() const { + return _column._decompressed.at(_index); + } + pointer operator->() const { + return &operator*(); + } + + // pre-increment operator + Iterator& operator++(); + + // post-increment operator + Iterator operator++(int); + + bool operator==(const Iterator& rhs) const; + bool operator!=(const Iterator& rhs) const; + + private: + Iterator(BSONColumn& column, const char* pos, const char* end); + + // Loads current control byte + void _loadControl(const BSONElement& prev); + + // Loads delta value + void _loadDelta(const BSONElement& prev, const boost::optional<uint64_t>& delta); + void _loadDelta(const BSONElement& prev, const boost::optional<uint128_t>& delta); + + // Helpers to determine if we need to store uncompressed element when advancing iterator + bool _needStoreElement() const; + void _storeElementIfNeeded(BSONElement elem); + + // Checks if control byte is literal + static bool _literal(uint8_t control) { + return (control & 0xE0) == 0; + } + + // Returns number of Simple-8b blocks from control byte + static uint8_t _numSimple8bBlocks(uint8_t control) { + return (control & 0x0F) + 1; + } + + // Reference to BSONColumn this Iterator is created from + BSONColumn& _column; + + // Current iterator position + size_t _index = 0; + + // Last index observed with a non-skipped value + size_t _lastValueIndex = 0; + + // Last encoded values used to calculate delta and delta-of-delta + int64_t _lastEncodedValue64 = 0; + int64_t _lastEncodedValueForDeltaOfDelta = 0; + int128_t _lastEncodedValue128 = 0; + + // Current control byte on iterator position + const char* _control; + + // End of BSONColumn memory block, we may not dereference any memory passed this. + const char* _end; + + // Helper to create Simple8b decoding iterators for 64bit and 128bit value types. + template <typename T> + struct Decoder { + Decoder(const char* buf, size_t size) + : s8b(buf, size), pos(s8b.begin()), end(s8b.end()) {} + + Simple8b<T> s8b; + typename Simple8b<T>::Iterator pos; + typename Simple8b<T>::Iterator end; + }; + + // Decoders, only one should be instantiated at a time. + boost::optional<Decoder<uint64_t>> _decoder64; + boost::optional<Decoder<uint128_t>> _decoder128; + + // Current scale index + uint8_t _scaleIndex; + }; + + /** + * Forward iterator access. + * + * Iterator value is EOO + * + * Iterators materialize compressed BSONElement as they iterate over the compressed binary. + * It is NOT safe to do this from multiple threads concurrently. + * + * Throws BadValue if invalid encoding is encountered. + */ + Iterator begin(); + Iterator end(); + + /** + * Element lookup by index + * + * Returns EOO if index represent skipped element or is out of bounds. + * O(1) time complexity if element has been previously accessed + * O(N) time complexity otherwise + * + * Materializes compressed BSONElement as needed. It is NOT safe to do this from multiple + * threads concurrently. + * + * Throws BadValue if invalid encoding is encountered. + */ + BSONElement operator[](size_t index); + + /** + * Number of elements stored in this BSONColumn + * + * O(1) time complexity if BSONColumn is fully decompressed + * O(N) time complexity otherwise + * + * Throws BadValue if invalid encoding is encountered + */ + size_t size() const; + +private: + /** + * BSONElement storage, owns materialised BSONElement returned by BSONColumn. + * Allocates memory in blocks which double in size as they grow. + */ + class ElementStorage { + public: + class Element { + public: + Element(char* buffer, int valueSize); + + /** + * Returns a pointer for writing a BSONElement value. + */ + char* value(); + + /** + * Size for the pointer returned by value() + */ + int size() const; + + /** + * Constructs a BSONElement from the owned buffer. + */ + BSONElement element() const; + + private: + char* _buffer; + int _valueSize; + }; + + /** + * Allocates a BSONElement of provided type and value size. Field name is set to empty + * string. + */ + Element allocate(BSONType type, int valueSize); + + private: + // Full memory blocks that are kept alive. + std::vector<std::unique_ptr<char[]>> _blocks; + + // Current memory block + std::unique_ptr<char[]> _block; + + // Capacity of current memory block + int _capacity = 0; + + // Position to first unused byte in current memory block + int _pos = 0; + }; + + std::deque<BSONElement> _decompressed; + ElementStorage _elementStorage; + + const char* _binary; + int _size; + + const char* _controlLastLiteral; + size_t _indexLastLiteral = 0; + bool _fullyDecompressed = false; +}; +} // namespace mongo diff --git a/src/mongo/bson/util/bsoncolumn_test.cpp b/src/mongo/bson/util/bsoncolumn_test.cpp index f1aa7228dfa..a1e790e46e9 100644 --- a/src/mongo/bson/util/bsoncolumn_test.cpp +++ b/src/mongo/bson/util/bsoncolumn_test.cpp @@ -27,6 +27,7 @@ * it in the license file. */ +#include "mongo/bson/util/bsoncolumn.h" #include "mongo/bson/util/bsoncolumnbuilder.h" #include "mongo/bson/util/simple8b_type_util.h" @@ -39,6 +40,13 @@ namespace { class BSONColumnTest : public unittest::Test { public: + BSONElement createBSONColumn(const char* buffer, int size) { + BSONObjBuilder ob; + ob.appendBinData(""_sd, size, BinDataType::Column, buffer); + _elementMemory.emplace_front(ob.obj()); + return _elementMemory.front().firstElement(); + } + template <typename T> BSONElement _createElement(T val) { BSONObjBuilder ob; @@ -137,13 +145,21 @@ public: return _elementMemory.front().firstElement(); } - static uint128_t deltaBinData(BSONElement val, BSONElement prev) { + static boost::optional<uint128_t> deltaBinData(BSONElement val, BSONElement prev) { if (val.binaryEqualValues(prev)) { - return 0; + return uint128_t(0); } - return Simple8bTypeUtil::encodeInt128( - Simple8bTypeUtil::encodeBinary(val.valuestr(), val.valuestrsize()) - - Simple8bTypeUtil::encodeBinary(prev.valuestr(), prev.valuestrsize())); + + int valSize; + int prevSize; + const char* valBinary = val.binData(valSize); + const char* prevBinary = prev.binData(prevSize); + if (valSize != prevSize || valSize > 16) { + return boost::none; + } + + return Simple8bTypeUtil::encodeInt128(Simple8bTypeUtil::encodeBinary(valBinary, valSize) - + Simple8bTypeUtil::encodeBinary(prevBinary, prevSize)); } static uint64_t deltaInt32(BSONElement val, BSONElement prev) { @@ -318,6 +334,85 @@ public: ASSERT_EQ(memcmp(columnBinary.data, buf, columnBinary.length), 0); } + static void verifyDecompression(BSONBinData columnBinary, + const std::vector<BSONElement>& expected) { + BSONObjBuilder obj; + obj.append(""_sd, columnBinary); + BSONElement columnElement = obj.done().firstElement(); + + // Verify that we can traverse BSONColumn twice and extract values on the second pass + { + BSONColumn col(columnElement); + ASSERT_EQ(col.size(), expected.size()); + ASSERT_EQ(std::distance(col.begin(), col.end()), expected.size()); + ASSERT_EQ(col.size(), expected.size()); + + auto it = col.begin(); + for (auto elem : expected) { + BSONElement other = *(it++); + ASSERT(elem.binaryEqualValues(other)); + } + } + + // Verify that we can traverse BSONColumn and extract values on the first pass + { + BSONColumn col(columnElement); + + auto it = col.begin(); + for (auto elem : expected) { + BSONElement other = *(it++); + ASSERT(elem.binaryEqualValues(other)); + } + } + + // Verify operator[] when accessing in order + { + BSONColumn col(columnElement); + + for (size_t i = 0; i < expected.size(); ++i) { + ASSERT(expected[i].binaryEqualValues(col[i])); + } + } + + // Verify operator[] when accessing in reverse order + { + BSONColumn col(columnElement); + + for (int i = (int)expected.size() - 1; i >= 0; --i) { + ASSERT(expected[i].binaryEqualValues(col[i])); + } + } + + // Verify that we can continue traverse with new iterators when we stop before end + { + BSONColumn col(columnElement); + + for (size_t e = 0; e < expected.size(); ++e) { + auto it = col.begin(); + for (size_t i = 0; i < e; ++i, ++it) { + ASSERT(expected[i].binaryEqualValues(*it)); + } + ASSERT_EQ(col.size(), expected.size()); + } + } + + // Verify that we can have multiple iterators on the same thread + { + BSONColumn col(columnElement); + + auto it1 = col.begin(); + auto it2 = col.begin(); + auto itEnd = col.end(); + + for (; it1 != itEnd && it2 != itEnd; ++it1, ++it2) { + ASSERT(it1->binaryEqualValues(*it2)); + ASSERT_EQ(&*it1, &*it2); // Iterators should point to same reference + } + + ASSERT(it1 == it2); + } + } + const boost::optional<uint64_t> kDeltaForBinaryEqualValues = Simple8bTypeUtil::encodeInt64(0); private: @@ -337,7 +432,9 @@ TEST_F(BSONColumnTest, BasicValue) { appendSimple8bBlock64(expected, 0); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elem, elem}); } TEST_F(BSONColumnTest, BasicSkip) { @@ -353,7 +450,9 @@ TEST_F(BSONColumnTest, BasicSkip) { appendSimple8bBlock64(expected, boost::none); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elem, BSONElement()}); } TEST_F(BSONColumnTest, OnlySkip) { @@ -366,7 +465,9 @@ TEST_F(BSONColumnTest, OnlySkip) { appendSimple8bBlock64(expected, boost::none); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {BSONElement()}); } TEST_F(BSONColumnTest, ValueAfterSkip) { @@ -382,7 +483,9 @@ TEST_F(BSONColumnTest, ValueAfterSkip) { appendLiteral(expected, elem); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {BSONElement(), elem}); } @@ -400,7 +503,9 @@ TEST_F(BSONColumnTest, LargeDeltaIsLiteral) { appendLiteral(expected, second); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second}); } TEST_F(BSONColumnTest, LargeDeltaIsLiteralAfterSimple8b) { @@ -423,7 +528,9 @@ TEST_F(BSONColumnTest, LargeDeltaIsLiteralAfterSimple8b) { appendSimple8bBlock64(expected, deltaInt64(large, large)); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {zero, zero, large, large}); } TEST_F(BSONColumnTest, OverBlockCount) { @@ -454,7 +561,9 @@ TEST_F(BSONColumnTest, OverBlockCount) { appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, elems); } TEST_F(BSONColumnTest, TypeChangeAfterLiteral) { @@ -471,7 +580,9 @@ TEST_F(BSONColumnTest, TypeChangeAfterLiteral) { appendLiteral(expected, elemInt64); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemInt32, elemInt64}); } TEST_F(BSONColumnTest, TypeChangeAfterSimple8b) { @@ -491,7 +602,9 @@ TEST_F(BSONColumnTest, TypeChangeAfterSimple8b) { appendLiteral(expected, elemInt64); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemInt32, elemInt32, elemInt64}); } TEST_F(BSONColumnTest, Simple8bAfterTypeChange) { @@ -511,38 +624,9 @@ TEST_F(BSONColumnTest, Simple8bAfterTypeChange) { appendSimple8bBlock64(expected, 0); appendEOO(expected); - verifyBinary(cb.finalize(), expected); -} - -TEST_F(BSONColumnTest, Decimal128Base) { - BSONColumnBuilder cb("test"_sd); - - auto elemDec128 = createElementDecimal128(Decimal128()); - - cb.append(elemDec128); - - BufBuilder expected; - appendLiteral(expected, elemDec128); - appendEOO(expected); - - verifyBinary(cb.finalize(), expected); -} - -TEST_F(BSONColumnTest, Decimal128Delta) { - BSONColumnBuilder cb("test"_sd); - - auto elemDec128 = createElementDecimal128(Decimal128(1)); - - cb.append(elemDec128); - cb.append(elemDec128); - - BufBuilder expected; - appendLiteral(expected, elemDec128); - appendSimple8bControl(expected, 0b1000, 0b0000); - appendSimple8bBlock128(expected, deltaDecimal128(elemDec128, elemDec128)); - appendEOO(expected); - - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemInt32, elemInt64, elemInt64}); } TEST_F(BSONColumnTest, BasicDouble) { @@ -559,7 +643,9 @@ TEST_F(BSONColumnTest, BasicDouble) { appendSimple8bBlock64(expected, deltaDouble(d2, d1, 1)); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {d1, d2}); } TEST_F(BSONColumnTest, DoubleSameScale) { @@ -581,7 +667,9 @@ TEST_F(BSONColumnTest, DoubleSameScale) { expected, deltaDouble(elems.begin() + 1, elems.end(), elems.front(), 1), 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, elems); } TEST_F(BSONColumnTest, DoubleIncreaseScaleFromLiteral) { @@ -598,7 +686,9 @@ TEST_F(BSONColumnTest, DoubleIncreaseScaleFromLiteral) { appendSimple8bBlock64(expected, deltaDouble(d2, d1, 10)); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {d1, d2}); } TEST_F(BSONColumnTest, DoubleLiteralAndScaleAfterSkip) { @@ -618,7 +708,9 @@ TEST_F(BSONColumnTest, DoubleLiteralAndScaleAfterSkip) { appendSimple8bBlock64(expected, deltaDouble(d2, d1, 10)); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {BSONElement(), d1, d2}); } TEST_F(BSONColumnTest, DoubleIncreaseScaleFromLiteralAfterSkip) { @@ -640,7 +732,9 @@ TEST_F(BSONColumnTest, DoubleIncreaseScaleFromLiteralAfterSkip) { appendSimple8bBlocks64(expected, expectedVals, 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {d1, BSONElement(), BSONElement(), d2}); } TEST_F(BSONColumnTest, DoubleIncreaseScaleFromDeltaWithRescale) { @@ -663,7 +757,9 @@ TEST_F(BSONColumnTest, DoubleIncreaseScaleFromDeltaWithRescale) { expected, deltaDouble(elems.begin() + 1, elems.end(), elems.front(), 10), 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, elems); } TEST_F(BSONColumnTest, DoubleIncreaseScaleFromDeltaNoRescale) { @@ -693,7 +789,9 @@ TEST_F(BSONColumnTest, DoubleIncreaseScaleFromDeltaNoRescale) { expected, deltaDouble(deltaEnd, elems.end(), *(deltaEnd - 1), 100000000), 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, elems); } TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlock) { @@ -722,7 +820,9 @@ TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlock) { appendSimple8bBlocks64(expected, deltaDouble(deltaEnd, elems.end(), *(deltaEnd - 1), 1), 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, elems); } TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlockUsingSkip) { @@ -756,7 +856,9 @@ TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlockUsingSkip) { appendSimple8bBlocks64(expected, deltaDouble(deltaEnd, elems.end(), *deltaBegin, 1), 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, elems); } TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlockThenScaleBackUp) { @@ -780,7 +882,9 @@ TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlockThenScaleBackUp) { expected, deltaDouble(elems.begin() + 1, elems.end(), elems.front(), 100000000), 2); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, elems); } TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlockUsingSkipThenScaleBackUp) { @@ -808,7 +912,9 @@ TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlockUsingSkipThenScaleBackUp) { expected, deltaDouble(elems.begin() + 1, elems.end(), elems.front(), 100000000), 2); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, elems); } TEST_F(BSONColumnTest, DoubleUnscalable) { @@ -833,7 +939,9 @@ TEST_F(BSONColumnTest, DoubleUnscalable) { appendSimple8bBlocks64(expected, expectedVals, 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, elems); } TEST_F(BSONColumnTest, DoubleSignalingNaN) { @@ -857,7 +965,9 @@ TEST_F(BSONColumnTest, DoubleSignalingNaN) { appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elem, nan}); } TEST_F(BSONColumnTest, DoubleQuietNaN) { @@ -879,7 +989,9 @@ TEST_F(BSONColumnTest, DoubleQuietNaN) { } appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elem, nan}); } TEST_F(BSONColumnTest, DoubleInfinity) { @@ -901,7 +1013,9 @@ TEST_F(BSONColumnTest, DoubleInfinity) { } appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elem, inf}); } TEST_F(BSONColumnTest, DoubleDenorm) { @@ -923,7 +1037,9 @@ TEST_F(BSONColumnTest, DoubleDenorm) { } appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elem, denorm}); } TEST_F(BSONColumnTest, DoubleIntegerOverflow) { @@ -944,6 +1060,45 @@ TEST_F(BSONColumnTest, DoubleIntegerOverflow) { appendSimple8bControl(expected, 0b1000, 0b0000); appendSimple8bBlock64(expected, deltaDoubleMemory(e2, e1)); appendEOO(expected); + + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {e1, e2}); +} + +TEST_F(BSONColumnTest, Decimal128Base) { + BSONColumnBuilder cb("test"_sd); + + auto elemDec128 = createElementDecimal128(Decimal128()); + + cb.append(elemDec128); + + BufBuilder expected; + appendLiteral(expected, elemDec128); + appendEOO(expected); + + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemDec128}); +} + +TEST_F(BSONColumnTest, Decimal128Delta) { + BSONColumnBuilder cb("test"_sd); + + auto elemDec128 = createElementDecimal128(Decimal128(1)); + + cb.append(elemDec128); + cb.append(elemDec128); + + BufBuilder expected; + appendLiteral(expected, elemDec128); + appendSimple8bControl(expected, 0b1000, 0b0000); + appendSimple8bBlock128(expected, deltaDecimal128(elemDec128, elemDec128)); + appendEOO(expected); + + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemDec128, elemDec128}); } TEST_F(BSONColumnTest, DecimalNonZeroDelta) { @@ -959,7 +1114,10 @@ TEST_F(BSONColumnTest, DecimalNonZeroDelta) { appendSimple8bControl(expected, 0b1000, 0b0000); appendSimple8bBlock128(expected, deltaDecimal128(elemDec128Max, elemDec128Zero)); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemDec128Zero, elemDec128Max}); } TEST_F(BSONColumnTest, DecimalMaxMin) { @@ -975,7 +1133,10 @@ TEST_F(BSONColumnTest, DecimalMaxMin) { appendSimple8bControl(expected, 0b1000, 0b0000); appendSimple8bBlock128(expected, deltaDecimal128(elemDec128Max, elemDec128Zero)); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemDec128Zero, elemDec128Max}); } TEST_F(BSONColumnTest, DecimalMultiElement) { @@ -999,7 +1160,11 @@ TEST_F(BSONColumnTest, DecimalMultiElement) { deltaDecimal128(elemDec128One, elemDec128Zero)}; appendSimple8bBlocks128(expected, valuesToAppend, 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression( + binData, {elemDec128Zero, elemDec128Max, elemDec128Zero, elemDec128Zero, elemDec128One}); } TEST_F(BSONColumnTest, DecimalMultiElementSkips) { @@ -1027,7 +1192,17 @@ TEST_F(BSONColumnTest, DecimalMultiElementSkips) { deltaDecimal128(elemDec128One, elemDec128Zero)}; appendSimple8bBlocks128(expected, valuesToAppend, 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, + {elemDec128Zero, + elemDec128Max, + BSONElement(), + BSONElement(), + elemDec128Zero, + elemDec128Zero, + elemDec128One}); } TEST_F(BSONColumnTest, BasicObjectId) { @@ -1051,7 +1226,9 @@ TEST_F(BSONColumnTest, BasicObjectId) { appendSimple8bBlocks64(expected, expectedDeltas, 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second, second, third}); } TEST_F(BSONColumnTest, ObjectIdDifferentProcessUnique) { @@ -1068,7 +1245,9 @@ TEST_F(BSONColumnTest, ObjectIdDifferentProcessUnique) { appendLiteral(expected, second); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second}); } TEST_F(BSONColumnTest, ObjectIdAfterChangeBack) { @@ -1099,7 +1278,9 @@ TEST_F(BSONColumnTest, ObjectIdAfterChangeBack) { appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second, elemInt32, first, second}); } TEST_F(BSONColumnTest, Simple8bTimestamp) { @@ -1120,7 +1301,9 @@ TEST_F(BSONColumnTest, Simple8bTimestamp) { appendSimple8bBlocks64(expected, expectedDeltaOfDeltas, 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second, second}); } TEST_F(BSONColumnTest, Simple8bTimestampNegativeDeltaOfDelta) { @@ -1142,7 +1325,9 @@ TEST_F(BSONColumnTest, Simple8bTimestampNegativeDeltaOfDelta) { appendSimple8bBlocks64(expected, expectedDeltaOfDeltas, 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second, third}); } TEST_F(BSONColumnTest, Simple8bTimestampAfterChangeBack) { @@ -1172,7 +1357,9 @@ TEST_F(BSONColumnTest, Simple8bTimestampAfterChangeBack) { appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second, elemInt32, first, second}); } TEST_F(BSONColumnTest, LargeDeltaOfDeltaTimestamp) { @@ -1189,7 +1376,9 @@ TEST_F(BSONColumnTest, LargeDeltaOfDeltaTimestamp) { appendLiteral(expected, second); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second}); } TEST_F(BSONColumnTest, LargeDeltaOfDeltaIsLiteralAfterSimple8bTimestamp) { @@ -1218,7 +1407,9 @@ TEST_F(BSONColumnTest, LargeDeltaOfDeltaIsLiteralAfterSimple8bTimestamp) { appendSimple8bBlocks64(expected, expectedDeltaOfDeltas, 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {zero, zero, large, large, semiLarge}); } TEST_F(BSONColumnTest, DateBasic) { @@ -1238,7 +1429,9 @@ TEST_F(BSONColumnTest, DateBasic) { _appendSimple8bBlocks(expected, expectedDeltaOfDeltas, 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second, second}); } TEST_F(BSONColumnTest, DateAfterChangeBack) { @@ -1259,7 +1452,9 @@ TEST_F(BSONColumnTest, DateAfterChangeBack) { appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemInt32, date, date}); } TEST_F(BSONColumnTest, DateLargeDelta) { @@ -1276,7 +1471,9 @@ TEST_F(BSONColumnTest, DateLargeDelta) { appendLiteral(expected, second); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second}); } TEST_F(BSONColumnTest, BoolBasic) { @@ -1298,7 +1495,9 @@ TEST_F(BSONColumnTest, BoolBasic) { _appendSimple8bBlocks(expected, expectedDeltaOfDeltas, 1); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {trueBson, trueBson, falseBson, trueBson}); } TEST_F(BSONColumnTest, BoolAfterChangeBack) { @@ -1319,7 +1518,9 @@ TEST_F(BSONColumnTest, BoolAfterChangeBack) { appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemInt32, trueBson, trueBson}); } TEST_F(BSONColumnTest, UndefinedBasic) { @@ -1335,7 +1536,9 @@ TEST_F(BSONColumnTest, UndefinedBasic) { appendSimple8bBlock64(expected, kDeltaForBinaryEqualValues); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, first}); } TEST_F(BSONColumnTest, UndefinedAfterChangeBack) { @@ -1356,7 +1559,9 @@ TEST_F(BSONColumnTest, UndefinedAfterChangeBack) { appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemInt32, undefined, undefined}); } TEST_F(BSONColumnTest, NullBasic) { @@ -1372,7 +1577,9 @@ TEST_F(BSONColumnTest, NullBasic) { appendSimple8bBlock64(expected, kDeltaForBinaryEqualValues); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, first}); } TEST_F(BSONColumnTest, NullAfterChangeBack) { @@ -1393,7 +1600,9 @@ TEST_F(BSONColumnTest, NullAfterChangeBack) { appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemInt32, null, null}); } TEST_F(BSONColumnTest, RegexBasic) { @@ -1412,7 +1621,9 @@ TEST_F(BSONColumnTest, RegexBasic) { appendSimple8bBlock64(expected, kDeltaForBinaryEqualValues); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second, second}); } TEST_F(BSONColumnTest, RegexAfterChangeBack) { @@ -1433,7 +1644,9 @@ TEST_F(BSONColumnTest, RegexAfterChangeBack) { appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemInt32, regex, regex}); } TEST_F(BSONColumnTest, DBRefBasic) { @@ -1453,7 +1666,9 @@ TEST_F(BSONColumnTest, DBRefBasic) { appendSimple8bBlock64(expected, kDeltaForBinaryEqualValues); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second, second}); } TEST_F(BSONColumnTest, DBRefAfterChangeBack) { @@ -1475,7 +1690,9 @@ TEST_F(BSONColumnTest, DBRefAfterChangeBack) { appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemInt32, dbRef, dbRef}); } TEST_F(BSONColumnTest, CodeWScopeBasic) { @@ -1494,7 +1711,9 @@ TEST_F(BSONColumnTest, CodeWScopeBasic) { appendSimple8bBlock64(expected, kDeltaForBinaryEqualValues); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second, second}); } TEST_F(BSONColumnTest, CodeWScopeAfterChangeBack) { @@ -1515,7 +1734,9 @@ TEST_F(BSONColumnTest, CodeWScopeAfterChangeBack) { appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemInt32, codeWScope, codeWScope}); } TEST_F(BSONColumnTest, SymbolBasic) { @@ -1534,7 +1755,9 @@ TEST_F(BSONColumnTest, SymbolBasic) { appendSimple8bBlock64(expected, kDeltaForBinaryEqualValues); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {first, second, second}); } TEST_F(BSONColumnTest, SymbolAfterChangeBack) { @@ -1555,7 +1778,9 @@ TEST_F(BSONColumnTest, SymbolAfterChangeBack) { appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemInt32, symbol, symbol}); } TEST_F(BSONColumnTest, BinDataBase) { @@ -1569,7 +1794,9 @@ TEST_F(BSONColumnTest, BinDataBase) { appendLiteral(expected, elemBinData); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemBinData}); } TEST_F(BSONColumnTest, BinDataOdd) { @@ -1583,7 +1810,9 @@ TEST_F(BSONColumnTest, BinDataOdd) { appendLiteral(expected, elemBinData); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemBinData}); } TEST_F(BSONColumnTest, BinDataDelta) { @@ -1600,7 +1829,9 @@ TEST_F(BSONColumnTest, BinDataDelta) { appendSimple8bBlock128(expected, deltaBinData(elemBinData, elemBinData)); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemBinData, elemBinData}); } TEST_F(BSONColumnTest, BinDataDeltaShouldFail) { @@ -1619,7 +1850,9 @@ TEST_F(BSONColumnTest, BinDataDeltaShouldFail) { appendLiteral(expected, elemBinDataLong); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemBinData, elemBinDataLong}); } TEST_F(BSONColumnTest, BinDataDeltaCheckSkips) { @@ -1637,15 +1870,17 @@ TEST_F(BSONColumnTest, BinDataDeltaCheckSkips) { BufBuilder expected; appendLiteral(expected, elemBinData); - appendSimple8bControl(expected, 0b1000, 0b0000); + appendSimple8bControl(expected, 0b1000, 0b0001); std::vector<boost::optional<uint128_t>> expectedValues = { deltaBinData(elemBinDataLong, elemBinData), boost::none, deltaBinData(elemBinData, elemBinDataLong)}; - appendSimple8bBlocks128(expected, expectedValues, 1); + appendSimple8bBlocks128(expected, expectedValues, 2); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemBinData, elemBinDataLong, BSONElement(), elemBinData}); } TEST_F(BSONColumnTest, BinDataLargerThan16) { @@ -1666,7 +1901,9 @@ TEST_F(BSONColumnTest, BinDataLargerThan16) { appendLiteral(expected, elemBinDataLong); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemBinData, elemBinDataLong}); } TEST_F(BSONColumnTest, BinDataEqualTo16) { @@ -1688,7 +1925,9 @@ TEST_F(BSONColumnTest, BinDataEqualTo16) { appendSimple8bBlock128(expected, deltaBinData(elemBinDataLong, elemBinData)); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemBinData, elemBinDataLong}); } TEST_F(BSONColumnTest, BinDataLargerThan16SameValue) { @@ -1706,8 +1945,135 @@ TEST_F(BSONColumnTest, BinDataLargerThan16SameValue) { appendSimple8bBlock128(expected, deltaBinData(elemBinData, elemBinData)); appendEOO(expected); - verifyBinary(cb.finalize(), expected); + auto binData = cb.finalize(); + verifyBinary(binData, expected); + verifyDecompression(binData, {elemBinData, elemBinData}); +} + +TEST_F(BSONColumnTest, InvalidControlByte) { + auto elem = createElementInt32(0); + + BufBuilder expected; + appendLiteral(expected, elem); + appendSimple8bControl(expected, 0b0010, 0b0000); + appendSimple8bBlock64(expected, deltaInt32(elem, elem)); + appendEOO(expected); + + try { + BSONColumn col(createBSONColumn(expected.buf(), expected.len())); + for (auto it = col.begin(), e = col.end(); it != e; ++it) { + } + ASSERT(false); + + } catch (DBException&) { + } } +TEST_F(BSONColumnTest, InvalidSize) { + auto elem = createElementInt32(0); + + BufBuilder expected; + appendLiteral(expected, elem); + appendSimple8bControl(expected, 0b1000, 0b0000); + appendSimple8bBlock64(expected, deltaInt32(elem, elem)); + appendEOO(expected); + + try { + BSONColumn col(createBSONColumn(expected.buf(), expected.len() - 2)); + for (auto it = col.begin(), e = col.end(); it != e; ++it) { + } + ASSERT(false); + + } catch (DBException&) { + } +} + +TEST_F(BSONColumnTest, InvalidDoubleScale) { + auto d1 = createElementDouble(1.11); + auto d2 = createElementDouble(1.12); + + BufBuilder expected; + appendLiteral(expected, d1); + appendSimple8bControl(expected, 0b1001, 0b0000); + appendSimple8bBlock64(expected, deltaDouble(d2, d1, 100)); + appendEOO(expected); + + try { + BSONColumn col(createBSONColumn(expected.buf(), expected.len())); + for (auto it = col.begin(), e = col.end(); it != e; ++it) { + } + ASSERT(false); + + } catch (DBException&) { + } +} + +TEST_F(BSONColumnTest, MissingEOO) { + auto elem = createElementInt32(0); + + BufBuilder expected; + appendLiteral(expected, elem); + + try { + BSONColumn col(createBSONColumn(expected.buf(), expected.len())); + for (auto it = col.begin(), e = col.end(); it != e; ++it) { + } + ASSERT(false); + + } catch (DBException&) { + } +} + +TEST_F(BSONColumnTest, EmptyBuffer) { + BufBuilder expected; + + try { + BSONColumn col(createBSONColumn(expected.buf(), expected.len())); + for (auto it = col.begin(), e = col.end(); it != e; ++it) { + } + ASSERT(false); + + } catch (DBException&) { + } +} + +TEST_F(BSONColumnTest, InvalidSimple8b) { + // A Simple8b block with an invalid selector doesn't throw an error, but make sure we can handle + // it gracefully. Check so we don't read out of bounds and can iterate. + + auto elem = createElementInt32(0); + + BufBuilder expected; + appendLiteral(expected, elem); + appendSimple8bControl(expected, 0b1000, 0b0000); + uint64_t invalidSimple8b = 0; + expected.appendNum(invalidSimple8b); + appendEOO(expected); + + BSONColumn col(createBSONColumn(expected.buf(), expected.len())); + for (auto it = col.begin(), e = col.end(); it != e; ++it) { + } +} + +TEST_F(BSONColumnTest, NoLiteralStart) { + // Starting the stream with a delta block doesn't throw an error. Make sure we handle it + // gracefully even if the values we extracted may not be meaningful. Check so we don't read out + // of bounds and can iterate. + + auto elem = createElementInt32(0); + + BufBuilder expected; + appendLiteral(expected, elem); + appendSimple8bControl(expected, 0b1000, 0b0000); + uint64_t invalidSimple8b = 0; + expected.appendNum(invalidSimple8b); + appendEOO(expected); + + BSONColumn col(createBSONColumn(expected.buf(), expected.len())); + for (auto it = col.begin(), e = col.end(); it != e; ++it) { + } +} + + } // namespace } // namespace mongo diff --git a/src/mongo/bson/util/bsoncolumn_util.cpp b/src/mongo/bson/util/bsoncolumn_util.cpp new file mode 100644 index 00000000000..921df59e921 --- /dev/null +++ b/src/mongo/bson/util/bsoncolumn_util.cpp @@ -0,0 +1,64 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/bson/util/bsoncolumn_util.h" + +namespace mongo::bsoncolumn { +bool usesDeltaOfDelta(BSONType type) { + return type == bsonTimestamp; +} + +bool uses128bit(BSONType type) { + return type == NumberDecimal || type == BinData; +} + +int64_t calcDelta(int64_t val, int64_t prev) { + // Do the subtraction as unsigned and cast back to signed to get overflow defined to wrapped + // around instead of undefined behavior. + return static_cast<int64_t>(static_cast<uint64_t>(val) - static_cast<uint64_t>(prev)); +} + +int128_t calcDelta(int128_t val, int128_t prev) { + // Do the subtraction as unsigned and cast back to signed to get overflow defined to wrapped + // around instead of undefined behavior. + return static_cast<int128_t>(static_cast<uint128_t>(val) - static_cast<uint128_t>(prev)); +} + +int64_t expandDelta(int64_t prev, int64_t delta) { + // Do the addition as unsigned and cast back to signed to get overflow defined to wrapped around + // instead of undefined behavior. + return static_cast<int64_t>(static_cast<uint64_t>(prev) + static_cast<uint64_t>(delta)); +} + +int128_t expandDelta(int128_t prev, int128_t delta) { + // Do the addition as unsigned and cast back to signed to get overflow defined to wrapped around + // instead of undefined behavior. + return static_cast<int128_t>(static_cast<uint128_t>(prev) + static_cast<uint128_t>(delta)); +} +} // namespace mongo::bsoncolumn diff --git a/src/mongo/bson/util/bsoncolumn_util.h b/src/mongo/bson/util/bsoncolumn_util.h new file mode 100644 index 00000000000..89be5b2bb4b --- /dev/null +++ b/src/mongo/bson/util/bsoncolumn_util.h @@ -0,0 +1,45 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/bson/bsontypes.h" +#include "mongo/platform/int128.h" + +namespace mongo::bsoncolumn { +bool usesDeltaOfDelta(BSONType type); +bool uses128bit(BSONType type); + +int64_t calcDelta(int64_t val, int64_t prev); +int128_t calcDelta(int128_t val, int128_t prev); + +int64_t expandDelta(int64_t prev, int64_t delta); +int128_t expandDelta(int128_t prev, int128_t delta); + +} // namespace mongo::bsoncolumn diff --git a/src/mongo/bson/util/bsoncolumnbuilder.cpp b/src/mongo/bson/util/bsoncolumnbuilder.cpp index 73a2f4b7431..3e35b3fda29 100644 --- a/src/mongo/bson/util/bsoncolumnbuilder.cpp +++ b/src/mongo/bson/util/bsoncolumnbuilder.cpp @@ -28,12 +28,15 @@ */ #include "mongo/bson/util/bsoncolumnbuilder.h" +#include "mongo/bson/util/bsoncolumn_util.h" #include "mongo/bson/util/simple8b_type_util.h" #include <memory> namespace mongo { +using namespace bsoncolumn; + namespace { static constexpr uint8_t kMaxCount = 16; static constexpr uint8_t kCountMask = 0x0F; @@ -42,18 +45,6 @@ static constexpr uint8_t kControlMask = 0xF0; static constexpr std::array<uint8_t, Simple8bTypeUtil::kMemoryAsInteger + 1> kControlByteForScaleIndex = {0x90, 0xA0, 0xB0, 0xC0, 0xD0, 0x80}; -int64_t calcDelta(int64_t val, int64_t prev) { - // Do the subtraction as unsigned and cast back to signed to get overflow defined to wrapped - // around instead of undefined behavior. - return static_cast<int64_t>(static_cast<uint64_t>(val) - static_cast<uint64_t>(prev)); -} - -int64_t expandDelta(int64_t prev, int64_t delta) { - // Do the addition as unsigned and cast back to signed to get overflow defined to wrapped around - // instead of undefined behavior. - return static_cast<int64_t>(static_cast<uint64_t>(prev) + static_cast<uint64_t>(delta)); -} - // Encodes the double with the lowest possible scale index. In worst case we will interpret the // memory as integer which is guaranteed to succeed. std::pair<int64_t, uint8_t> scaleAndEncodeDouble(double value, uint8_t minScaleIndex) { @@ -91,13 +82,13 @@ BSONColumnBuilder& BSONColumnBuilder::append(BSONElement elem) { _storePrevious(elem); _simple8bBuilder128.flush(); _simple8bBuilder64.flush(); - _storeWith128 = elem.type() == NumberDecimal || elem.type() == BinData; + _storeWith128 = uses128bit(type); _writeLiteralFromPrevious(); return *this; } // Store delta in Simple-8b if types match - bool compressed = !_usesDeltaOfDelta(type) && elem.binaryEqualValues(previous); + bool compressed = !usesDeltaOfDelta(type) && elem.binaryEqualValues(previous); if (compressed) { if (_storeWith128) { _simple8bBuilder128.append(0); @@ -112,18 +103,19 @@ BSONColumnBuilder& BSONColumnBuilder::append(BSONElement elem) { int128_t delta = 0; switch (type) { case BinData: { - encodingPossible = - elem.valuestrsize() == previous.valuestrsize() && elem.valuestrsize() <= 16; + int size; + const char* binary = elem.binData(size); + encodingPossible = size == previous.valuestrsize() && elem.valuestrsize() <= 16; if (!encodingPossible) break; - int128_t curEncoded = - Simple8bTypeUtil::encodeBinary(elem.valuestr(), elem.valuestrsize()); - delta = curEncoded - _prevEncoded128; + + int128_t curEncoded = Simple8bTypeUtil::encodeBinary(binary, size); + delta = calcDelta(curEncoded, _prevEncoded128); _prevEncoded128 = curEncoded; } break; case NumberDecimal: { int128_t curEncoded = Simple8bTypeUtil::encodeDecimal128(elem._numberDecimal()); - delta = curEncoded - _prevEncoded128; + delta = calcDelta(curEncoded, _prevEncoded128); _prevEncoded128 = curEncoded; } break; default: @@ -144,12 +136,16 @@ BSONColumnBuilder& BSONColumnBuilder::append(BSONElement elem) { case NumberLong: value = calcDelta(elem._numberLong(), previous._numberLong()); break; - case jstOID: + case jstOID: { encodingPossible = _objectIdDeltaPossible(elem, previous); - if (encodingPossible) - value = calcDelta(Simple8bTypeUtil::encodeObjectId(elem.OID()), - Simple8bTypeUtil::encodeObjectId(previous.OID())); + if (!encodingPossible) + break; + + int64_t curEncoded = Simple8bTypeUtil::encodeObjectId(elem.OID()); + value = calcDelta(curEncoded, _prevEncoded64); + _prevEncoded64 = curEncoded; break; + } case bsonTimestamp: { int64_t currTimestampDelta = calcDelta(elem.timestamp().asULL(), previous.timestamp().asULL()); @@ -158,10 +154,11 @@ BSONColumnBuilder& BSONColumnBuilder::append(BSONElement elem) { break; } case Date: - value = elem.date().toMillisSinceEpoch() - previous.date().toMillisSinceEpoch(); + value = calcDelta(elem.date().toMillisSinceEpoch(), + previous.date().toMillisSinceEpoch()); break; case Bool: - value = elem.boolean() - previous.boolean(); + value = calcDelta(elem.boolean(), previous.boolean()); break; case Undefined: case jstNULL: @@ -374,29 +371,34 @@ void BSONColumnBuilder::_writeLiteralFromPrevious() { // appending next value. auto prevElem = _previous(); _bufBuilder.appendBuf(_prev.get(), _prevSize); + + // Reset state _controlByteOffset = 0; - // There is no previous timestamp delta. Set to default. + _scaleIndex = Simple8bTypeUtil::kMemoryAsInteger; _prevDelta = 0; + + // Initialize previous encoded when needed switch (prevElem.type()) { - case BinData: - if (prevElem.valuestrsize() <= 16) - _prevEncoded128 = - Simple8bTypeUtil::encodeBinary(prevElem.valuestr(), prevElem.valuestrsize()); + case NumberDouble: + _lastValueInPrevBlock = prevElem._numberDouble(); + std::tie(_prevEncoded64, _scaleIndex) = scaleAndEncodeDouble(_lastValueInPrevBlock, 0); + break; + case BinData: { + int size; + const char* binary = prevElem.binData(size); + if (size <= 16) + _prevEncoded128 = Simple8bTypeUtil::encodeBinary(binary, size); break; + } case NumberDecimal: _prevEncoded128 = Simple8bTypeUtil::encodeDecimal128(prevElem._numberDecimal()); break; + case jstOID: + _prevEncoded64 = Simple8bTypeUtil::encodeObjectId(prevElem.__oid()); + break; default: break; } - - // Set scale factor for this literal and values needed to append values - if (_prev[0] == NumberDouble) { - _lastValueInPrevBlock = prevElem._numberDouble(); - std::tie(_prevEncoded64, _scaleIndex) = scaleAndEncodeDouble(_lastValueInPrevBlock, 0); - } else { - _scaleIndex = Simple8bTypeUtil::kMemoryAsInteger; - } } void BSONColumnBuilder::_incrementSimple8bCount() { @@ -448,10 +450,6 @@ Simple8bWriteFn BSONColumnBuilder::_createBufferWriter() { }; } -bool BSONColumnBuilder::_usesDeltaOfDelta(BSONType type) { - return type == bsonTimestamp; -} - bool BSONColumnBuilder::_objectIdDeltaPossible(BSONElement elem, BSONElement prev) { return !memcmp(prev.OID().getInstanceUnique().bytes, elem.OID().getInstanceUnique().bytes, diff --git a/src/mongo/bson/util/bsoncolumnbuilder.h b/src/mongo/bson/util/bsoncolumnbuilder.h index d4d76c0e89a..70f26f43e37 100644 --- a/src/mongo/bson/util/bsoncolumnbuilder.h +++ b/src/mongo/bson/util/bsoncolumnbuilder.h @@ -80,7 +80,6 @@ private: void _storePrevious(BSONElement elem); void _writeLiteralFromPrevious(); void _incrementSimple8bCount(); - bool _usesDeltaOfDelta(BSONType type); bool _objectIdDeltaPossible(BSONElement elem, BSONElement prev); // Helper to append doubles to this Column builder. Returns true if append was successful and diff --git a/src/mongo/bson/util/simple8b.cpp b/src/mongo/bson/util/simple8b.cpp index 4b0de312a66..98fca714193 100644 --- a/src/mongo/bson/util/simple8b.cpp +++ b/src/mongo/bson/util/simple8b.cpp @@ -212,11 +212,12 @@ constexpr std::array<std::array<uint64_t, 16>, 4> kDecodeMask = { 0}}; // The number of meaningful bits for each selector. This does not include any trailing zero bits. +// We use 64 bits for all invalid selectors, this is to make sure iteration does not get stuck. constexpr std::array<std::array<uint8_t, 16>, 4> kBitsPerIntForSelector = { - std::array<uint8_t, 16>{0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20, 30, 60, 0}, - std::array<uint8_t, 16>{0, 2, 3, 4, 5, 7, 10, 14, 24, 52, 0, 0, 0, 0, 0, 0}, - std::array<uint8_t, 16>{0, 4, 5, 7, 10, 14, 24, 52, 0, 0, 0, 0, 0, 0, 0, 0}, - std::array<uint8_t, 16>{0, 0, 0, 0, 0, 0, 0, 0, 4, 6, 9, 13, 23, 51, 0, 0}}; + std::array<uint8_t, 16>{64, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20, 30, 60, 64}, + std::array<uint8_t, 16>{64, 2, 3, 4, 5, 7, 10, 14, 24, 52, 64, 64, 64, 64, 64, 64}, + std::array<uint8_t, 16>{64, 4, 5, 7, 10, 14, 24, 52, 0, 0, 64, 64, 64, 64, 64, 64}, + std::array<uint8_t, 16>{64, 0, 0, 0, 0, 0, 0, 0, 4, 6, 9, 13, 23, 51, 64, 64}}; // The number of integers coded for each selector. constexpr std::array<std::array<uint8_t, 16>, 4> kIntsStoreForSelector = { @@ -723,7 +724,7 @@ void Simple8bBuilder<T>::setWriteCallback(Simple8bWriteFn writer) { } template <typename T> -Simple8b<T>::Iterator::Iterator(const uint64_t* pos, const uint64_t* end) +Simple8b<T>::Iterator::Iterator(const char* pos, const char* end) : _pos(pos), _end(end), _value(0), _rleRemaining(0), _shift(0) { if (pos != end) { _loadBlock(); @@ -732,7 +733,7 @@ Simple8b<T>::Iterator::Iterator(const uint64_t* pos, const uint64_t* end) template <typename T> void Simple8b<T>::Iterator::_loadBlock() { - _current = LittleEndian<uint64_t>::load(*_pos); + _current = ConstDataView(_pos).read<LittleEndian<uint64_t>>(); _selector = _current & kBaseSelectorMask; uint8_t selectorExtension = ((_current >> kSelectorBits) & kBaseSelectorMask); @@ -823,7 +824,7 @@ typename Simple8b<T>::Iterator& Simple8b<T>::Iterator::operator++() { template <typename T> typename Simple8b<T>::Iterator& Simple8b<T>::Iterator::advanceBlock() { - ++_pos; + _pos += sizeof(uint64_t); if (_pos == _end) { _rleRemaining = 0; _shift = 0; @@ -845,18 +846,18 @@ bool Simple8b<T>::Iterator::operator!=(const Simple8b::Iterator& rhs) const { } template <typename T> -Simple8b<T>::Simple8b(const char* buffer, int size) : _buffer(buffer), _size(size) {} +Simple8b<T>::Simple8b(const char* buffer, int size) : _buffer(buffer), _size(size) { + invariant(size % sizeof(uint64_t) == 0); +} template <typename T> typename Simple8b<T>::Iterator Simple8b<T>::begin() const { - return {reinterpret_cast<const uint64_t*>(_buffer), - reinterpret_cast<const uint64_t*>(_buffer + _size)}; + return {_buffer, _buffer + _size}; } template <typename T> typename Simple8b<T>::Iterator Simple8b<T>::end() const { - return {reinterpret_cast<const uint64_t*>(_buffer + _size), - reinterpret_cast<const uint64_t*>(_buffer + _size)}; + return {_buffer + _size, _buffer + _size}; } template class Simple8b<uint64_t>; diff --git a/src/mongo/bson/util/simple8b.h b/src/mongo/bson/util/simple8b.h index 46beaa26627..9fad90c6f28 100644 --- a/src/mongo/bson/util/simple8b.h +++ b/src/mongo/bson/util/simple8b.h @@ -322,7 +322,7 @@ public: bool operator!=(const Iterator& rhs) const; private: - Iterator(const uint64_t* pos, const uint64_t* end); + Iterator(const char* pos, const char* end); /** * Loads the current Simple8b block into the iterator @@ -335,8 +335,8 @@ public: */ uint16_t _rleCountInCurrent(uint8_t selectorExtension) const; - const uint64_t* _pos; - const uint64_t* _end; + const char* _pos; + const char* _end; // Current Simple8b block in native endian uint64_t _current; diff --git a/src/mongo/bson/util/simple8b_type_util.cpp b/src/mongo/bson/util/simple8b_type_util.cpp index 32b9fe4b558..fec46ad80be 100644 --- a/src/mongo/bson/util/simple8b_type_util.cpp +++ b/src/mongo/bson/util/simple8b_type_util.cpp @@ -82,26 +82,30 @@ int64_t Simple8bTypeUtil::encodeObjectId(const OID& oid) { return LittleEndian<uint64_t>::load(encoded); } -OID Simple8bTypeUtil::decodeObjectId(int64_t val, OID::InstanceUnique processUnique) { - unsigned char objId[OID::kOIDSize]; - +void Simple8bTypeUtil::decodeObjectIdInto(char* buffer, + int64_t val, + OID::InstanceUnique processUnique) { val = LittleEndian<uint64_t>::store(val); uint8_t* encodedBytes = reinterpret_cast<uint8_t*>(&val); // Set Timestamp and Counter variables together. - objId[0] = encodedBytes[6]; // Timestamp index 0. - objId[1] = encodedBytes[4]; // Timestamp index 1. - objId[2] = encodedBytes[2]; // Timestamp index 2. - objId[3] = encodedBytes[0]; // Timestamp index 3. - objId[9] = encodedBytes[5]; // Counter index 0; - objId[10] = encodedBytes[3]; // Counter index 1. - objId[11] = encodedBytes[1]; // Counter index 2. + buffer[0] = encodedBytes[6]; // Timestamp index 0. + buffer[1] = encodedBytes[4]; // Timestamp index 1. + buffer[2] = encodedBytes[2]; // Timestamp index 2. + buffer[3] = encodedBytes[0]; // Timestamp index 3. + buffer[9] = encodedBytes[5]; // Counter index 0; + buffer[10] = encodedBytes[3]; // Counter index 1. + buffer[11] = encodedBytes[1]; // Counter index 2. // Finally set Process Unique. std::copy(processUnique.bytes, processUnique.bytes + OID::kInstanceUniqueSize, - objId + OID::kTimestampSize); + buffer + OID::kTimestampSize); +} +OID Simple8bTypeUtil::decodeObjectId(int64_t val, OID::InstanceUnique processUnique) { + unsigned char objId[OID::kOIDSize]; + decodeObjectIdInto(reinterpret_cast<char*>(objId), val, processUnique); return OID(objId); } @@ -186,8 +190,9 @@ Decimal128 Simple8bTypeUtil::decodeDecimal128(int128_t val) { } int128_t Simple8bTypeUtil::encodeBinary(const char* val, size_t size) { - char arr[16] = {}; + char arr[16]; memcpy(arr, val, size); + memset(arr + size, 0, sizeof(arr) - size); uint64_t low = ConstDataView(arr).read<LittleEndian<uint64_t>>(); uint64_t high = ConstDataView(arr + 8).read<LittleEndian<uint64_t>>(); return absl::MakeInt128(high, low); @@ -202,10 +207,6 @@ void Simple8bTypeUtil::decodeBinary(int128_t val, char* result, size_t size) { } else { memcpy(result, &low, size); } - if (size < 16) { - // Set the position at end of binary to be always one. - result[size] = 1; - } } } // namespace mongo diff --git a/src/mongo/bson/util/simple8b_type_util.h b/src/mongo/bson/util/simple8b_type_util.h index 345e66ce81a..bf59d48b47a 100644 --- a/src/mongo/bson/util/simple8b_type_util.h +++ b/src/mongo/bson/util/simple8b_type_util.h @@ -61,7 +61,9 @@ public: // then rearrange the bytes to: // | Byte Usage | TS3 | C2 | TS2 | C1 | TS1 | C0 | TS0 | // | Byte Index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | + // The buffer passed to decodeObjectIdInto() must have at least OID::kOIDSize size. static int64_t encodeObjectId(const OID& oid); + static void decodeObjectIdInto(char* buffer, int64_t val, OID::InstanceUnique processUnique); static OID decodeObjectId(int64_t val, OID::InstanceUnique processUnique); // These methods add floating point support for encoding and decoding with simple8b up to 8 diff --git a/src/mongo/bson/util/simple8b_type_util_test.cpp b/src/mongo/bson/util/simple8b_type_util_test.cpp index 90ff2a4dd71..9b3ae3e19de 100644 --- a/src/mongo/bson/util/simple8b_type_util_test.cpp +++ b/src/mongo/bson/util/simple8b_type_util_test.cpp @@ -57,12 +57,14 @@ void assertDecimal128Equal(Decimal128 val) { void assertBinaryEqual(char* val, size_t size, int128_t expected) { int128_t encodeResult = Simple8bTypeUtil::encodeBinary(val, size); ASSERT_EQUALS(encodeResult, expected); - char charPtr[16] = {1}; + + // Initialize to something non zero so we can verify that we did not write out of bounds + char charPtr[17]; + char unused = 'x'; + memset(charPtr, unused, sizeof(charPtr)); Simple8bTypeUtil::decodeBinary(encodeResult, charPtr, size); ASSERT_EQUALS(std::memcmp(charPtr, val, size), 0); - if (size <= 16) { - ASSERT_EQUALS(charPtr[size], 1); - } + ASSERT_EQUALS(charPtr[size], unused); } TEST(Simple8bTypeUtil, EncodeAndDecodePositiveSignedInt) { |