diff options
author | Henrik Edin <henrik.edin@mongodb.com> | 2021-08-04 09:30:00 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-08-17 18:01:47 +0000 |
commit | a64ddda7ade5423803695ef1dc126230bc7458f9 (patch) | |
tree | bf177df6c7680a5ea5979e138aec0ec6bc11b92d /src/mongo/bson | |
parent | 7a852abddcf218fa4d0b854c63179a1807996246 (diff) | |
download | mongo-a64ddda7ade5423803695ef1dc126230bc7458f9.tar.gz |
SERVER-58560 Double support in BSONColumnBuilder
Doubles are scaled and rounded to closest integer.
When a double is appended that needs to be scaled differently the encoder validates if flushing simple8b or re-scaling compresses best.
After writing a finalized simple8b block, the scaling is reduced to be as low as possible.
Diffstat (limited to 'src/mongo/bson')
-rw-r--r-- | src/mongo/bson/util/bsoncolumn_test.cpp | 479 | ||||
-rw-r--r-- | src/mongo/bson/util/bsoncolumnbuilder.cpp | 277 | ||||
-rw-r--r-- | src/mongo/bson/util/bsoncolumnbuilder.h | 16 | ||||
-rw-r--r-- | src/mongo/bson/util/simple8b.cpp | 9 | ||||
-rw-r--r-- | src/mongo/bson/util/simple8b.h | 43 | ||||
-rw-r--r-- | src/mongo/bson/util/simple8b_type_util.cpp | 6 | ||||
-rw-r--r-- | src/mongo/bson/util/simple8b_type_util.h | 4 | ||||
-rw-r--r-- | src/mongo/bson/util/simple8b_type_util_test.cpp | 24 |
8 files changed, 789 insertions, 69 deletions
diff --git a/src/mongo/bson/util/bsoncolumn_test.cpp b/src/mongo/bson/util/bsoncolumn_test.cpp index 755b67b705e..c966a0a4856 100644 --- a/src/mongo/bson/util/bsoncolumn_test.cpp +++ b/src/mongo/bson/util/bsoncolumn_test.cpp @@ -53,6 +53,13 @@ public: return _elementMemory.front().firstElement(); } + BSONElement createElementDouble(double val) { + BSONObjBuilder ob; + ob.append("0"_sd, val); + _elementMemory.emplace_front(ob.obj()); + return _elementMemory.front().firstElement(); + } + BSONElement createObjectId(OID val) { BSONObjBuilder ob; ob.append("0"_sd, val); @@ -75,6 +82,30 @@ public: return Simple8bTypeUtil::encodeInt64(val.Long() - prev.Long()); } + static uint64_t deltaDouble(BSONElement val, BSONElement prev, double scaleFactor) { + uint8_t scaleIndex = 0; + for (; scaleIndex < Simple8bTypeUtil::kScaleMultiplier.size(); ++scaleIndex) { + if (Simple8bTypeUtil::kScaleMultiplier[scaleIndex] == scaleFactor) + break; + } + return Simple8bTypeUtil::encodeInt64( + *Simple8bTypeUtil::encodeDouble(val.Double(), scaleIndex) - + *Simple8bTypeUtil::encodeDouble(prev.Double(), scaleIndex)); + } + + static uint64_t deltaDoubleMemory(BSONElement val, BSONElement prev) { + return Simple8bTypeUtil::encodeInt64( + static_cast<int64_t>(static_cast<uint64_t>(*Simple8bTypeUtil::encodeDouble( + val.Double(), Simple8bTypeUtil::kMemoryAsInteger)) - + static_cast<uint64_t>(*Simple8bTypeUtil::encodeDouble( + prev.Double(), Simple8bTypeUtil::kMemoryAsInteger)))); + } + + static bool simple8bPossible(uint64_t val) { + Simple8bBuilder<uint64_t> b([](uint64_t block) {}); + return b.append(val); + } + static uint64_t deltaObjectId(BSONElement val, BSONElement prev) { return Simple8bTypeUtil::encodeInt64(Simple8bTypeUtil::encodeObjectId(val.OID()) - Simple8bTypeUtil::encodeObjectId(prev.OID())); @@ -91,8 +122,8 @@ public: } template <typename It> - static std::vector<uint64_t> deltaInt64(It begin, It end, BSONElement prev) { - std::vector<uint64_t> deltas; + static std::vector<boost::optional<uint64_t>> deltaInt64(It begin, It end, BSONElement prev) { + std::vector<boost::optional<uint64_t>> deltas; for (; begin != end; ++begin) { deltas.push_back(deltaInt64(*begin, prev)); prev = *begin; @@ -100,6 +131,23 @@ public: return deltas; } + template <typename It> + static std::vector<boost::optional<uint64_t>> deltaDouble(It begin, + It end, + BSONElement prev, + double scaleFactor) { + std::vector<boost::optional<uint64_t>> deltas; + for (; begin != end; ++begin) { + if (!begin->eoo()) { + deltas.push_back(deltaDouble(*begin, prev, scaleFactor)); + prev = *begin; + } else { + deltas.push_back(boost::none); + } + } + return deltas; + } + static void appendLiteral(BufBuilder& builder, BSONElement elem) { // BSON Type byte builder.appendChar(elem.type()); @@ -132,7 +180,7 @@ public: } static void appendSimple8bBlocks(BufBuilder& builder, - const std::vector<uint64_t>& vals, + const std::vector<boost::optional<uint64_t>>& vals, uint32_t expectedNum) { auto prev = builder.len(); Simple8bBuilder<uint64_t> s8bBuilder([&builder](uint64_t block) { @@ -140,7 +188,11 @@ public: return true; }); for (auto val : vals) { - s8bBuilder.append(val); + if (val) { + s8bBuilder.append(*val); + } else { + s8bBuilder.skip(); + } } s8bBuilder.flush(); ASSERT_EQ((builder.len() - prev) / sizeof(uint64_t), expectedNum); @@ -153,7 +205,9 @@ public: static void verifyBinary(BSONBinData columnBinary, const BufBuilder& expected) { ASSERT_EQ(columnBinary.type, BinDataType::Column); ASSERT_EQ(columnBinary.length, expected.len()); - ASSERT_EQ(memcmp(columnBinary.data, expected.buf(), columnBinary.length), 0); + + auto buf = expected.buf(); + ASSERT_EQ(memcmp(columnBinary.data, buf, columnBinary.length), 0); } private: @@ -350,6 +404,407 @@ TEST_F(BSONColumnTest, Simple8bAfterTypeChange) { verifyBinary(cb.finalize(), expected); } +TEST_F(BSONColumnTest, BasicDouble) { + BSONColumnBuilder cb("test"_sd); + + auto d1 = createElementDouble(1.0); + auto d2 = createElementDouble(2.0); + cb.append(d1); + cb.append(d2); + + BufBuilder expected; + appendLiteral(expected, d1); + appendSimple8bControl(expected, 0b1001, 0b0000); + appendSimple8bBlock(expected, deltaDouble(d2, d1, 1)); + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleSameScale) { + BSONColumnBuilder cb("test"_sd); + + std::vector<BSONElement> elems; + elems.push_back(createElementDouble(1.0)); + elems.push_back(createElementDouble(2.0)); + elems.push_back(createElementDouble(3.0)); + + for (const auto& elem : elems) { + cb.append(elem); + } + + BufBuilder expected; + appendLiteral(expected, elems.front()); + appendSimple8bControl(expected, 0b1001, 0b0000); + appendSimple8bBlocks( + expected, deltaDouble(elems.begin() + 1, elems.end(), elems.front(), 1), 1); + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleIncreaseScaleFromLiteral) { + BSONColumnBuilder cb("test"_sd); + + auto d1 = createElementDouble(1.0); + auto d2 = createElementDouble(1.1); + cb.append(d1); + cb.append(d2); + + BufBuilder expected; + appendLiteral(expected, d1); + appendSimple8bControl(expected, 0b1010, 0b0000); + appendSimple8bBlock(expected, deltaDouble(d2, d1, 10)); + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleLiteralAndScaleAfterSkip) { + BSONColumnBuilder cb("test"_sd); + + auto d1 = createElementDouble(1.0); + auto d2 = createElementDouble(1.1); + cb.skip(); + cb.append(d1); + cb.append(d2); + + BufBuilder expected; + appendSimple8bControl(expected, 0b1000, 0b0000); + appendSimple8bBlock(expected, boost::none); + appendLiteral(expected, d1); + appendSimple8bControl(expected, 0b1010, 0b0000); + appendSimple8bBlock(expected, deltaDouble(d2, d1, 10)); + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleIncreaseScaleFromLiteralAfterSkip) { + BSONColumnBuilder cb("test"_sd); + + auto d1 = createElementDouble(1); + auto d2 = createElementDouble(1.1); + cb.append(d1); + cb.skip(); + cb.skip(); + cb.append(d2); + + BufBuilder expected; + appendLiteral(expected, d1); + appendSimple8bControl(expected, 0b1010, 0b0000); + + std::vector<boost::optional<uint64_t>> expectedVals(2, boost::none); + expectedVals.push_back(deltaDouble(d2, d1, 10)); + appendSimple8bBlocks(expected, expectedVals, 1); + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleIncreaseScaleFromDeltaWithRescale) { + BSONColumnBuilder cb("test"_sd); + + std::vector<BSONElement> elems; + elems.push_back(createElementDouble(1.0)); + elems.push_back(createElementDouble(2.0)); + elems.push_back(createElementDouble(2.1)); + elems.push_back(createElementDouble(2.2)); + + for (const auto& elem : elems) { + cb.append(elem); + } + + BufBuilder expected; + appendLiteral(expected, elems.front()); + appendSimple8bControl(expected, 0b1010, 0b0000); + appendSimple8bBlocks( + expected, deltaDouble(elems.begin() + 1, elems.end(), elems.front(), 10), 1); + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleIncreaseScaleFromDeltaNoRescale) { + BSONColumnBuilder cb("test"_sd); + + std::vector<BSONElement> elems; + elems.push_back(createElementDouble(1.1)); + elems.push_back(createElementDouble(2.1)); + elems.push_back(createElementDouble(2.2)); + elems.push_back(createElementDouble(2.3)); + elems.push_back(createElementDouble(3.12345678)); + + for (const auto& elem : elems) { + cb.append(elem); + } + + BufBuilder expected; + appendLiteral(expected, elems.front()); + + auto deltaBegin = elems.begin() + 1; + auto deltaEnd = deltaBegin + 3; + appendSimple8bControl(expected, 0b1010, 0b0000); + appendSimple8bBlocks(expected, deltaDouble(deltaBegin, deltaEnd, elems.front(), 10), 1); + + appendSimple8bControl(expected, 0b1101, 0b0000); + appendSimple8bBlocks( + expected, deltaDouble(deltaEnd, elems.end(), *(deltaEnd - 1), 100000000), 1); + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlock) { + BSONColumnBuilder cb("test"_sd); + + std::vector<BSONElement> elems; + elems.push_back(createElementDouble(1.12345671)); + elems.push_back(createElementDouble(1.12345672)); + elems.push_back(createElementDouble(2)); + elems.push_back(createElementDouble(3)); + + for (const auto& elem : elems) { + cb.append(elem); + } + + BufBuilder expected; + appendLiteral(expected, elems.front()); + + auto deltaBegin = elems.begin() + 1; + auto deltaEnd = deltaBegin + 2; + appendSimple8bControl(expected, 0b1101, 0b0000); + appendSimple8bBlocks(expected, deltaDouble(deltaBegin, deltaEnd, elems.front(), 100000000), 1); + + appendSimple8bControl(expected, 0b1001, 0b0000); + appendSimple8bBlocks(expected, deltaDouble(deltaEnd, elems.end(), *(deltaEnd - 1), 1), 1); + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlockUsingSkip) { + BSONColumnBuilder cb("test"_sd); + + std::vector<BSONElement> elems; + elems.push_back(createElementDouble(1.12345671)); + elems.push_back(createElementDouble(2)); + elems.push_back(BSONElement()); + elems.push_back(BSONElement()); + elems.push_back(createElementDouble(3)); + + for (const auto& elem : elems) { + if (!elem.eoo()) { + cb.append(elem); + } else { + cb.skip(); + } + } + + BufBuilder expected; + appendLiteral(expected, elems.front()); + + auto deltaBegin = elems.begin() + 1; + auto deltaEnd = deltaBegin + 2; + appendSimple8bControl(expected, 0b1101, 0b0000); + appendSimple8bBlocks(expected, deltaDouble(deltaBegin, deltaEnd, elems.front(), 100000000), 1); + + appendSimple8bControl(expected, 0b1001, 0b0000); + appendSimple8bBlocks(expected, deltaDouble(deltaEnd, elems.end(), *deltaBegin, 1), 1); + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlockThenScaleBackUp) { + BSONColumnBuilder cb("test"_sd); + + std::vector<BSONElement> elems; + elems.push_back(createElementDouble(1.12345671)); + elems.push_back(createElementDouble(1.12345672)); + elems.push_back(createElementDouble(2)); + elems.push_back(createElementDouble(3)); + elems.push_back(createElementDouble(1.12345672)); + + for (const auto& elem : elems) { + cb.append(elem); + } + + BufBuilder expected; + appendLiteral(expected, elems.front()); + appendSimple8bControl(expected, 0b1101, 0b0001); + appendSimple8bBlocks( + expected, deltaDouble(elems.begin() + 1, elems.end(), elems.front(), 100000000), 2); + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlockUsingSkipThenScaleBackUp) { + BSONColumnBuilder cb("test"_sd); + + std::vector<BSONElement> elems; + elems.push_back(createElementDouble(1.12345671)); + elems.push_back(createElementDouble(2)); + elems.push_back(BSONElement()); + elems.push_back(BSONElement()); + elems.push_back(createElementDouble(1.12345671)); + + for (const auto& elem : elems) { + if (!elem.eoo()) { + cb.append(elem); + } else { + cb.skip(); + } + } + + BufBuilder expected; + appendLiteral(expected, elems.front()); + appendSimple8bControl(expected, 0b1101, 0b0001); + appendSimple8bBlocks( + expected, deltaDouble(elems.begin() + 1, elems.end(), elems.front(), 100000000), 2); + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleUnscalable) { + BSONColumnBuilder cb("test"_sd); + + std::vector<BSONElement> elems; + elems.push_back(createElementDouble(1.0)); + elems.push_back(createElementDouble(1.0 + std::numeric_limits<double>::epsilon())); + elems.push_back(createElementDouble(1.0 + std::numeric_limits<double>::epsilon() * 2)); + + for (const auto& elem : elems) { + cb.append(elem); + } + + BufBuilder expected; + appendLiteral(expected, elems.front()); + + std::vector<boost::optional<uint64_t>> expectedVals; + expectedVals.push_back(deltaDoubleMemory(elems[1], elems[0])); + expectedVals.push_back(deltaDoubleMemory(elems[2], elems[1])); + appendSimple8bControl(expected, 0b1000, 0b0000); + appendSimple8bBlocks(expected, expectedVals, 1); + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleSignalingNaN) { + BSONColumnBuilder cb("test"_sd); + + auto elem = createElementDouble(0.0); + auto nan = createElementDouble(std::numeric_limits<double>::signaling_NaN()); + + cb.append(elem); + cb.append(nan); + + BufBuilder expected; + appendLiteral(expected, elem); + + if (auto delta = deltaDoubleMemory(nan, elem); simple8bPossible(delta)) { + appendSimple8bControl(expected, 0b1000, 0b0000); + appendSimple8bBlock(expected, delta); + } else { + appendLiteral(expected, nan); + } + + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleQuietNaN) { + BSONColumnBuilder cb("test"_sd); + + auto elem = createElementDouble(0.0); + auto nan = createElementDouble(std::numeric_limits<double>::quiet_NaN()); + + cb.append(elem); + cb.append(nan); + + BufBuilder expected; + appendLiteral(expected, elem); + if (auto delta = deltaDoubleMemory(nan, elem); simple8bPossible(delta)) { + appendSimple8bControl(expected, 0b1000, 0b0000); + appendSimple8bBlock(expected, delta); + } else { + appendLiteral(expected, nan); + } + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleInfinity) { + BSONColumnBuilder cb("test"_sd); + + auto elem = createElementDouble(0.0); + auto inf = createElementDouble(std::numeric_limits<double>::infinity()); + + cb.append(elem); + cb.append(inf); + + BufBuilder expected; + appendLiteral(expected, elem); + if (auto delta = deltaDoubleMemory(inf, elem); simple8bPossible(delta)) { + appendSimple8bControl(expected, 0b1000, 0b0000); + appendSimple8bBlock(expected, delta); + } else { + appendLiteral(expected, inf); + } + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleDenorm) { + BSONColumnBuilder cb("test"_sd); + + auto elem = createElementDouble(0.0); + auto denorm = createElementDouble(std::numeric_limits<double>::denorm_min()); + + cb.append(elem); + cb.append(denorm); + + BufBuilder expected; + appendLiteral(expected, elem); + if (auto delta = deltaDoubleMemory(denorm, elem); simple8bPossible(delta)) { + appendSimple8bControl(expected, 0b1000, 0b0000); + appendSimple8bBlock(expected, delta); + } else { + appendLiteral(expected, denorm); + } + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + +TEST_F(BSONColumnTest, DoubleIntegerOverflow) { + BSONColumnBuilder cb("test"_sd); + + // std::numeric_limits<int64_t>::min() - 0x1000 will cause an overflow if performed as signed, + // make sure it is handled correctly + auto e1 = createElementDouble( + Simple8bTypeUtil::decodeDouble(0x1000, Simple8bTypeUtil::kMemoryAsInteger)); + auto e2 = createElementDouble(Simple8bTypeUtil::decodeDouble( + std::numeric_limits<int64_t>::min(), Simple8bTypeUtil::kMemoryAsInteger)); + + cb.append(e1); + cb.append(e2); + + BufBuilder expected; + appendLiteral(expected, e1); + appendSimple8bControl(expected, 0b1000, 0b0000); + appendSimple8bBlock(expected, deltaDoubleMemory(e2, e1)); + appendEOO(expected); + + verifyBinary(cb.finalize(), expected); +} + TEST_F(BSONColumnTest, BasicObjectId) { BSONColumnBuilder cb("test"_sd); @@ -366,7 +821,7 @@ TEST_F(BSONColumnTest, BasicObjectId) { BufBuilder expected; appendLiteral(expected, first); appendSimple8bControl(expected, 0b1000, 0b0000); - std::vector<uint64_t> expectedDeltas{ + std::vector<boost::optional<uint64_t>> expectedDeltas{ deltaObjectId(second, first), deltaObjectId(second, second), deltaObjectId(third, second)}; appendSimple8bBlocks(expected, expectedDeltas, 1); appendEOO(expected); @@ -435,8 +890,8 @@ TEST_F(BSONColumnTest, Simple8bTimestamp) { BufBuilder expected; appendLiteral(expected, first); appendSimple8bControl(expected, 0b1000, 0b0000); - std::vector<uint64_t> expectedDeltaOfDeltas{deltaOfDeltaTimestamp(second, first), - deltaOfDeltaTimestamp(second, second, first)}; + std::vector<boost::optional<uint64_t>> expectedDeltaOfDeltas{ + deltaOfDeltaTimestamp(second, first), deltaOfDeltaTimestamp(second, second, first)}; appendSimple8bBlocks(expected, expectedDeltaOfDeltas, 1); appendEOO(expected); @@ -457,8 +912,8 @@ TEST_F(BSONColumnTest, Simple8bTimestampNegativeDeltaOfDelta) { BufBuilder expected; appendLiteral(expected, first); appendSimple8bControl(expected, 0b1000, 0b0000); - std::vector<uint64_t> expectedDeltaOfDeltas{deltaOfDeltaTimestamp(second, first), - deltaOfDeltaTimestamp(third, second, first)}; + std::vector<boost::optional<uint64_t>> expectedDeltaOfDeltas{ + deltaOfDeltaTimestamp(second, first), deltaOfDeltaTimestamp(third, second, first)}; appendSimple8bBlocks(expected, expectedDeltaOfDeltas, 1); appendEOO(expected); @@ -533,8 +988,8 @@ TEST_F(BSONColumnTest, LargeDeltaOfDeltaIsLiteralAfterSimple8bTimestamp) { appendSimple8bBlock(expected, deltaOfDeltaTimestamp(zero, zero)); appendLiteral(expected, large); appendSimple8bControl(expected, 0b1000, 0b0000); - std::vector<uint64_t> expectedDeltaOfDeltas{deltaOfDeltaTimestamp(large, large), - deltaOfDeltaTimestamp(semiLarge, large, large)}; + std::vector<boost::optional<uint64_t>> expectedDeltaOfDeltas{ + deltaOfDeltaTimestamp(large, large), deltaOfDeltaTimestamp(semiLarge, large, large)}; appendSimple8bBlocks(expected, expectedDeltaOfDeltas, 1); appendEOO(expected); diff --git a/src/mongo/bson/util/bsoncolumnbuilder.cpp b/src/mongo/bson/util/bsoncolumnbuilder.cpp index eeb05ac1372..d54ade92c72 100644 --- a/src/mongo/bson/util/bsoncolumnbuilder.cpp +++ b/src/mongo/bson/util/bsoncolumnbuilder.cpp @@ -34,12 +34,44 @@ #include <memory> namespace mongo { +namespace { static constexpr uint8_t kMaxCount = 16; static constexpr uint8_t kCountMask = 0x0F; -static constexpr uint8_t kNoScaleControl = 0x80; +static constexpr uint8_t kControlMask = 0xF0; + +static constexpr std::array<uint8_t, Simple8bTypeUtil::kMemoryAsInteger + 1> + kControlByteForScaleIndex = {0x90, 0xA0, 0xB0, 0xC0, 0xD0, 0x80}; + +int64_t calcDelta(int64_t val, int64_t prev) { + // Do the subtraction as unsigned and cast back to signed to get overflow defined to wrapped + // around instead of undefined behavior. + return static_cast<int64_t>(static_cast<uint64_t>(val) - static_cast<uint64_t>(prev)); +} + +int64_t expandDelta(int64_t prev, int64_t delta) { + // Do the addition as unsigned and cast back to signed to get overflow defined to wrapped around + // instead of undefined behavior. + return static_cast<int64_t>(static_cast<uint64_t>(prev) + static_cast<uint64_t>(delta)); +} + +// Encodes the double with the lowest possible scale index. In worst case we will interpret the +// memory as integer which is guaranteed to succeed. +std::pair<int64_t, uint8_t> scaleAndEncodeDouble(double value, uint8_t minScaleIndex) { + boost::optional<int64_t> encoded; + for (; !encoded; ++minScaleIndex) { + encoded = Simple8bTypeUtil::encodeDouble(value, minScaleIndex); + } + + // Subtract the last scale that was added in the loop before returning + return {*encoded, minScaleIndex - 1}; +} + +} // namespace BSONColumnBuilder::BSONColumnBuilder(StringData fieldName) - : _simple8bBuilder(_createSimple8bBuilder()), _fieldName(fieldName) { + : _simple8bBuilder(_createBufferWriter()), + _scaleIndex(Simple8bTypeUtil::kMemoryAsInteger), + _fieldName(fieldName) { // Store EOO type with empty field name as previous. _storePrevious(BSONElement()); @@ -59,46 +91,57 @@ BSONColumnBuilder& BSONColumnBuilder::append(BSONElement elem) { _storePrevious(elem); _simple8bBuilder.flush(); _writeLiteralFromPrevious(); + return *this; } - // Depending on the type, delta or delta-of-delta is stored in Simple-8b. - int64_t value = 0; - bool deltaPossible = true; - if (_usesDeltaOfDelta(type) || !elem.binaryEqualValues(previous)) { - switch (type) { - case NumberInt: - value = elem._numberInt() - previous._numberInt(); - break; - case NumberLong: - value = elem._numberLong() - previous._numberLong(); - break; - case jstOID: - deltaPossible = _objectIdDeltaPossible(elem, previous); - if (deltaPossible) - value = Simple8bTypeUtil::encodeObjectId(elem.OID()) - - Simple8bTypeUtil::encodeObjectId(previous.OID()); - break; - case bsonTimestamp: { - int64_t currTimestampDelta = - elem.timestamp().asULL() - previous.timestamp().asULL(); - value = currTimestampDelta - _prevDelta; - _prevDelta = currTimestampDelta; - break; + // Store delta in Simple-8b if types match + bool compressed = !_usesDeltaOfDelta(type) && elem.binaryEqualValues(previous); + if (compressed) { + _simple8bBuilder.append(0); + } + + if (!compressed) { + if (type == NumberDouble) { + compressed = _appendDouble(elem._numberDouble(), previous._numberDouble()); + + } else { + bool encodingPossible = true; + int64_t value = 0; + switch (type) { + case NumberInt: + value = calcDelta(elem._numberInt(), previous._numberInt()); + break; + case NumberLong: + value = calcDelta(elem._numberLong(), previous._numberLong()); + break; + case jstOID: + encodingPossible = _objectIdDeltaPossible(elem, previous); + if (encodingPossible) + value = calcDelta(Simple8bTypeUtil::encodeObjectId(elem.OID()), + Simple8bTypeUtil::encodeObjectId(previous.OID())); + break; + case bsonTimestamp: { + int64_t currTimestampDelta = + calcDelta(elem.timestamp().asULL(), previous.timestamp().asULL()); + value = calcDelta(currTimestampDelta, _prevDelta); + _prevDelta = currTimestampDelta; + break; + } + default: + // Nothing else is implemented yet + invariant(false); + }; + if (encodingPossible) { + compressed = _simple8bBuilder.append(Simple8bTypeUtil::encodeInt64(value)); } - default: - // Nothing else is implemented yet - invariant(false); - }; + } } - bool result = false; - if (deltaPossible) - result = _simple8bBuilder.append(Simple8bTypeUtil::encodeInt64(value)); _storePrevious(elem); // Store uncompressed literal if value is outside of range of encodable values. - if (!result) { + if (!compressed) { _simple8bBuilder.flush(); _writeLiteralFromPrevious(); } @@ -106,8 +149,143 @@ BSONColumnBuilder& BSONColumnBuilder::append(BSONElement elem) { return *this; } +boost::optional<Simple8bBuilder<uint64_t>> BSONColumnBuilder::_tryRescalePending( + int64_t encoded, uint8_t newScaleIndex) { + // Encode last value in the previous block with old and new scale index. We know that scaling + // with the old index is possible. + int64_t prev = *Simple8bTypeUtil::encodeDouble(_lastValueInPrevBlock, _scaleIndex); + boost::optional<int64_t> prevRescaled = + Simple8bTypeUtil::encodeDouble(_lastValueInPrevBlock, newScaleIndex); + + // Fail if we could not rescale + bool possible = prevRescaled.has_value(); + if (!possible) + return boost::none; + + // Create a new Simple8bBuilder for the rescaled values. If any Simple8b block is finalized when + // adding the new values then rescaling is less optimal than flushing with the current scale. So + // we just record if this happens in our write callback. + Simple8bBuilder<uint64_t> builder([&possible](uint64_t block) { possible = false; }); + + // Iterate over our pending values, decode them back into double, rescale and append to our new + // Simple8b builder + for (const auto& pending : _simple8bBuilder) { + if (!pending) { + builder.skip(); + continue; + } + + // Apply delta to previous, decode to double and rescale + prev = expandDelta(prev, Simple8bTypeUtil::decodeInt64(*pending)); + auto rescaled = Simple8bTypeUtil::encodeDouble( + Simple8bTypeUtil::decodeDouble(prev, _scaleIndex), newScaleIndex); + + // Fail if we could not rescale + if (!rescaled || !prevRescaled) + return boost::none; + + // Append the scaled delta + auto appended = + builder.append(Simple8bTypeUtil::encodeInt64(calcDelta(*rescaled, *prevRescaled))); + + // Fail if are out of range for Simple8b or a block was written + if (!appended || !possible) + return boost::none; + + // Remember previous for next value + prevRescaled = rescaled; + } + + // Last add our new value + auto appended = + builder.append(Simple8bTypeUtil::encodeInt64(calcDelta(encoded, *prevRescaled))); + if (!appended || !possible) + return boost::none; + + // We managed to add all re-scaled values, this will thus compress better. Set write callback to + // our buffer writer and return + builder.setWriteCallback(_createBufferWriter()); + return builder; +} + +bool BSONColumnBuilder::_appendDouble(double value, double previous) { + // Scale with lowest possible scale index + auto [encoded, scaleIndex] = scaleAndEncodeDouble(value, _scaleIndex); + + if (scaleIndex != _scaleIndex) { + // New value need higher scale index. We have two choices: + // (1) Re-scale pending values to use this larger scale factor + // (2) Flush pending and start a new block with this higher scale factor + // We try both options and select the one that compresses best + auto rescaled = _tryRescalePending(encoded, scaleIndex); + if (rescaled) { + // Re-scale possible, use this Simple8b builder + std::swap(_simple8bBuilder, *rescaled); + _prevEncoded = encoded; + _scaleIndex = scaleIndex; + return true; + } + + // Re-scale not possible, flush and start new block with the higher scale factor + _simple8bBuilder.flush(); + _controlByteOffset = 0; + + // Make sure value and previous are using the same scale factor. + uint8_t prevScaleIndex; + std::tie(_prevEncoded, prevScaleIndex) = scaleAndEncodeDouble(previous, scaleIndex); + if (scaleIndex != prevScaleIndex) { + std::tie(encoded, scaleIndex) = scaleAndEncodeDouble(value, prevScaleIndex); + std::tie(_prevEncoded, prevScaleIndex) = scaleAndEncodeDouble(previous, scaleIndex); + } + + // Record our new scale factor + _scaleIndex = scaleIndex; + } + + // Append delta and check if we wrote a Simple8b block. If we did we may be able to reduce the + // scale factor when starting a new block + auto before = _bufBuilder.len(); + if (!_simple8bBuilder.append(Simple8bTypeUtil::encodeInt64(calcDelta(encoded, _prevEncoded)))) + return false; + + if (_bufBuilder.len() != before) { + // Reset the scale factor to 0 and append all pending values to a new Simple8bBuilder. In + // the worse case we will end up with an identical scale factor. + auto prevScale = _scaleIndex; + std::tie(_prevEncoded, _scaleIndex) = scaleAndEncodeDouble(_lastValueInPrevBlock, 0); + + // Create a new Simple8bBuilder. + Simple8bBuilder<uint64_t> builder(_createBufferWriter()); + std::swap(_simple8bBuilder, builder); + + // Iterate over previous pending values and re-add them recursively. That will increase the + // scale factor as needed. + auto prev = _lastValueInPrevBlock; + auto prevEncoded = *Simple8bTypeUtil::encodeDouble(prev, prevScale); + for (const auto& pending : builder) { + if (pending) { + prevEncoded = expandDelta(prevEncoded, Simple8bTypeUtil::decodeInt64(*pending)); + auto val = Simple8bTypeUtil::decodeDouble(prevEncoded, prevScale); + _appendDouble(val, prev); + prev = val; + } else { + _simple8bBuilder.skip(); + } + } + } + + _prevEncoded = encoded; + return true; +} + BSONColumnBuilder& BSONColumnBuilder::skip() { + auto before = _bufBuilder.len(); _simple8bBuilder.skip(); + + // Rescale previous known value if this skip caused Simple-8b blocks to be written + if (before != _bufBuilder.len() && _previous().type() == NumberDouble) { + std::tie(_prevEncoded, _scaleIndex) = scaleAndEncodeDouble(_lastValueInPrevBlock, 0); + } return *this; } @@ -148,11 +326,20 @@ void BSONColumnBuilder::_writeLiteralFromPrevious() { _controlByteOffset = 0; // There is no previous timestamp delta. Set to default. _prevDelta = 0; + + // Set scale factor for this literal and values needed to append values + if (_prev[0] == NumberDouble) { + _lastValueInPrevBlock = _previous()._numberDouble(); + std::tie(_prevEncoded, _scaleIndex) = scaleAndEncodeDouble(_lastValueInPrevBlock, 0); + } else { + _scaleIndex = Simple8bTypeUtil::kMemoryAsInteger; + } } void BSONColumnBuilder::_incrementSimple8bCount() { char* byte; uint8_t count; + uint8_t control = kControlByteForScaleIndex[_scaleIndex]; if (_controlByteOffset == 0) { // Allocate new control byte if we don't already have one. Record its offset so we can find @@ -163,25 +350,39 @@ void BSONColumnBuilder::_incrementSimple8bCount() { } else { // Read current count from previous control byte byte = _bufBuilder.buf() + _controlByteOffset; + + // If previous byte was written with a different control byte then we can't re-use and need + // to start a new one + if ((*byte & kControlMask) != control) { + _controlByteOffset = 0; + _incrementSimple8bCount(); + return; + } count = (*byte & kCountMask) + 1; } // Write back new count and clear offset if we have reached max count - *byte = kNoScaleControl | (count & kCountMask); + *byte = control | (count & kCountMask); if (count + 1 == kMaxCount) { _controlByteOffset = 0; } } -Simple8bBuilder<uint64_t> BSONColumnBuilder::_createSimple8bBuilder() { - return Simple8bBuilder<uint64_t>([this](uint64_t block) { +Simple8bWriteFn BSONColumnBuilder::_createBufferWriter() { + return [this](uint64_t block) { // Write/update block count _incrementSimple8bCount(); - // Write Simple-8b block + // Write Simple-8b block in little endian byte order _bufBuilder.appendNum(block); + + auto previous = _previous(); + if (previous.type() == NumberDouble) { + _lastValueInPrevBlock = previous._numberDouble(); + } + return true; - }); + }; } bool BSONColumnBuilder::_usesDeltaOfDelta(BSONType type) { diff --git a/src/mongo/bson/util/bsoncolumnbuilder.h b/src/mongo/bson/util/bsoncolumnbuilder.h index 36d6fc1672f..ce36228bb41 100644 --- a/src/mongo/bson/util/bsoncolumnbuilder.h +++ b/src/mongo/bson/util/bsoncolumnbuilder.h @@ -82,7 +82,16 @@ private: bool _usesDeltaOfDelta(BSONType type); bool _objectIdDeltaPossible(BSONElement elem, BSONElement prev); - Simple8bBuilder<uint64_t> _createSimple8bBuilder(); + // Helper to append doubles to this Column builder. Returns true if append was successful and + // false if the value needs to be stored uncompressed. + bool _appendDouble(double value, double previous); + + // Tries to rescale current pending values + one additional value into a new Simple8bBuilder. + // Returns the new Simple8bBuilder if rescaling was possible and none otherwise. + boost::optional<Simple8bBuilder<uint64_t>> _tryRescalePending(int64_t encoded, + uint8_t newScaleIndex); + + Simple8bWriteFn _createBufferWriter(); // Storage for the previously appended BSONElement std::unique_ptr<char[]> _prev; @@ -97,6 +106,11 @@ private: // Offset to last Simple-8b control byte std::ptrdiff_t _controlByteOffset = 0; + // Additional variables needed for previous state + int64_t _prevEncoded; + double _lastValueInPrevBlock = 0; + uint8_t _scaleIndex; + // Buffer for the BSON Column binary BufBuilder _bufBuilder; diff --git a/src/mongo/bson/util/simple8b.cpp b/src/mongo/bson/util/simple8b.cpp index d0cf8c577f9..594af3375f1 100644 --- a/src/mongo/bson/util/simple8b.cpp +++ b/src/mongo/bson/util/simple8b.cpp @@ -383,7 +383,7 @@ bool Simple8bBuilder<T>::PendingIterator::operator!=( } template <typename T> -Simple8bBuilder<T>::Simple8bBuilder(WriteFn writeFunc) : _writeFn(std::move(writeFunc)) {} +Simple8bBuilder<T>::Simple8bBuilder(Simple8bWriteFn writeFunc) : _writeFn(std::move(writeFunc)) {} template <typename T> Simple8bBuilder<T>::~Simple8bBuilder() = default; @@ -391,7 +391,7 @@ Simple8bBuilder<T>::~Simple8bBuilder() = default; template <typename T> bool Simple8bBuilder<T>::append(T value) { if (_rlePossible()) { - if (_lastValueInPrevWord.value() == value) { + if (_lastValueInPrevWord.val == value) { ++_rleCount; return true; } @@ -712,6 +712,11 @@ typename Simple8bBuilder<T>::PendingIterator Simple8bBuilder<T>::end() const { } template <typename T> +void Simple8bBuilder<T>::setWriteCallback(Simple8bWriteFn writer) { + _writeFn = std::move(writer); +} + +template <typename T> Simple8b<T>::Iterator::Iterator(const uint64_t* pos, const uint64_t* end) : _pos(pos), _end(end), _value(0), _rleRemaining(0), _shift(0) { if (pos != end) { diff --git a/src/mongo/bson/util/simple8b.h b/src/mongo/bson/util/simple8b.h index 4e345638359..46beaa26627 100644 --- a/src/mongo/bson/util/simple8b.h +++ b/src/mongo/bson/util/simple8b.h @@ -39,7 +39,14 @@ namespace mongo { /** - * Simple8bBuilder compresses a series of integers into chains of 64 bit Simple8b word. + * Callback type to implement writing of 64 bit Simple8b words. + */ +using Simple8bWriteFn = std::function<void(uint64_t)>; + +/** + * Simple8bBuilder compresses a series of integers into chains of 64 bit Simple8b blocks. + * + * T may be uint64_t and uint128_t only. */ template <typename T> class Simple8bBuilder { @@ -47,29 +54,30 @@ private: struct PendingValue; public: - // Callback to handle writing of finalized Simple-8b blocks. Machine Endian byte order. - using WriteFn = std::function<void(uint64_t)>; - Simple8bBuilder(WriteFn writeFunc); + // Callback to handle writing of finalized Simple-8b blocks. Machine Endian byte order, the + // value need to be converted to Little Endian before persisting. + Simple8bBuilder(Simple8bWriteFn writeFunc); ~Simple8bBuilder(); /** - * Checks if we can append val to an existing RLE and handles the ending of a RLE. - * The default RLE value at the beginning is 0. - * Otherwise, Appends a value to the Simple8b chain of words. - * Return true if successfully appended and false otherwise. + * Appends val to Simple8b. Returns true if the append was successful and false if the value was + * outside the range of possible values we can store in Simple8b. + * + * A call to append may result in multiple Simple8b blocks being finalized. */ bool append(T val); /** - * Appends an empty bucket to handle missing values. This works by incrementing an underlying - * simple8b index by one and encoding a "missing value" in the simple8b block as all 1s. + * Appends a missing value to Simple8b. + * + * May result in a single Simple8b being finalized. */ void skip(); /** - * Stores all values for RLE or in _pendingValues into _buffered even if the Simple8b word will - * not be opimtal and use a larger selector than necessary because we don't have enough integers - * to use one wiht more slots. + * Flushes all buffered values into finalized Simple8b blocks. + * + * It is allowed to continue to append values after this call. */ void flush(); @@ -114,6 +122,11 @@ public: PendingIterator begin() const; PendingIterator end() const; + /** + * Set write callback + */ + void setWriteCallback(Simple8bWriteFn writer); + private: // Number of different type of selectors and their extensions available static constexpr uint8_t kNumOfSelectorTypes = 4; @@ -259,7 +272,7 @@ private: std::deque<PendingValue> _pendingValues; // User-defined callback to handle writing of finalized Simple-8b blocks - WriteFn _writeFn; + Simple8bWriteFn _writeFn; }; /** @@ -286,7 +299,7 @@ public: size_t blockSize() const; /** - * Returns the value in Little Endian byte order at the current iterator position. + * Returns the value in at the current iterator position. */ pointer operator->() const { return &_value; diff --git a/src/mongo/bson/util/simple8b_type_util.cpp b/src/mongo/bson/util/simple8b_type_util.cpp index a225fd5afe9..7ac55dae78a 100644 --- a/src/mongo/bson/util/simple8b_type_util.cpp +++ b/src/mongo/bson/util/simple8b_type_util.cpp @@ -133,6 +133,9 @@ boost::optional<uint8_t> Simple8bTypeUtil::calculateDecimalShiftMultiplier(doubl } boost::optional<int64_t> Simple8bTypeUtil::encodeDouble(double val, uint8_t scaleIndex) { + if (scaleIndex == kMemoryAsInteger) + return *reinterpret_cast<int64_t*>(&val); + // Checks for both overflow and handles NaNs // We use 2^53 because this is the max integer that we can guarentee can be // exactly represented by a floating point decimal since there are 53 value bits @@ -157,6 +160,9 @@ boost::optional<int64_t> Simple8bTypeUtil::encodeDouble(double val, uint8_t scal } double Simple8bTypeUtil::decodeDouble(int64_t val, uint8_t scaleIndex) { + if (scaleIndex == kMemoryAsInteger) + return *reinterpret_cast<double*>(&val); + return val / kScaleMultiplier[scaleIndex]; } diff --git a/src/mongo/bson/util/simple8b_type_util.h b/src/mongo/bson/util/simple8b_type_util.h index e03fa958501..5d472960db3 100644 --- a/src/mongo/bson/util/simple8b_type_util.h +++ b/src/mongo/bson/util/simple8b_type_util.h @@ -74,6 +74,8 @@ public: // Array is a double as it will always be multiplied by a double and we don't want to do an // extra cast for - static constexpr std::array<double, 5> kScaleMultiplier = {1, 10, 100, 10000, 100000000}; + static constexpr uint8_t kMemoryAsInteger = 5; + static constexpr std::array<double, kMemoryAsInteger> kScaleMultiplier = { + 1, 10, 100, 10000, 100000000}; }; } // namespace mongo diff --git a/src/mongo/bson/util/simple8b_type_util_test.cpp b/src/mongo/bson/util/simple8b_type_util_test.cpp index 7129638025e..77e7d6729f6 100644 --- a/src/mongo/bson/util/simple8b_type_util_test.cpp +++ b/src/mongo/bson/util/simple8b_type_util_test.cpp @@ -239,6 +239,30 @@ TEST(Simple8b, TestMinDoubleShouldFail) { ASSERT_FALSE(result.has_value()); } +TEST(Simple8b, InterpretAsMemory) { + std::vector<double> vals = {0.0, + 1.12345678, + std::numeric_limits<double>::max(), + std::numeric_limits<double>::min(), + std::numeric_limits<double>::infinity(), + std::numeric_limits<double>::signaling_NaN(), + std::numeric_limits<double>::quiet_NaN(), + std::numeric_limits<double>::denorm_min()}; + + for (double val : vals) { + boost::optional<int64_t> result = + Simple8bTypeUtil::encodeDouble(val, Simple8bTypeUtil::kMemoryAsInteger); + ASSERT_TRUE(result); + ASSERT_EQ(*result, *reinterpret_cast<const int64_t*>(&val)); + + // Some of the special values above does not compare equal with themselves (signaling NaN). + // Verify that we end up with the same memory after decoding + double decoded = + Simple8bTypeUtil::decodeDouble(*result, Simple8bTypeUtil::kMemoryAsInteger); + ASSERT_EQ(memcmp(&decoded, &val, sizeof(val)), 0); + } +} + TEST(Simple8b, TestMaxInt) { // max int that can be stored as a double without losing precision double val = std::pow(2, 53); |