summaryrefslogtreecommitdiff
path: root/src/mongo/bson
diff options
context:
space:
mode:
authorHenrik Edin <henrik.edin@mongodb.com>2021-08-04 09:30:00 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-08-17 18:01:47 +0000
commita64ddda7ade5423803695ef1dc126230bc7458f9 (patch)
treebf177df6c7680a5ea5979e138aec0ec6bc11b92d /src/mongo/bson
parent7a852abddcf218fa4d0b854c63179a1807996246 (diff)
downloadmongo-a64ddda7ade5423803695ef1dc126230bc7458f9.tar.gz
SERVER-58560 Double support in BSONColumnBuilder
Doubles are scaled and rounded to closest integer. When a double is appended that needs to be scaled differently the encoder validates if flushing simple8b or re-scaling compresses best. After writing a finalized simple8b block, the scaling is reduced to be as low as possible.
Diffstat (limited to 'src/mongo/bson')
-rw-r--r--src/mongo/bson/util/bsoncolumn_test.cpp479
-rw-r--r--src/mongo/bson/util/bsoncolumnbuilder.cpp277
-rw-r--r--src/mongo/bson/util/bsoncolumnbuilder.h16
-rw-r--r--src/mongo/bson/util/simple8b.cpp9
-rw-r--r--src/mongo/bson/util/simple8b.h43
-rw-r--r--src/mongo/bson/util/simple8b_type_util.cpp6
-rw-r--r--src/mongo/bson/util/simple8b_type_util.h4
-rw-r--r--src/mongo/bson/util/simple8b_type_util_test.cpp24
8 files changed, 789 insertions, 69 deletions
diff --git a/src/mongo/bson/util/bsoncolumn_test.cpp b/src/mongo/bson/util/bsoncolumn_test.cpp
index 755b67b705e..c966a0a4856 100644
--- a/src/mongo/bson/util/bsoncolumn_test.cpp
+++ b/src/mongo/bson/util/bsoncolumn_test.cpp
@@ -53,6 +53,13 @@ public:
return _elementMemory.front().firstElement();
}
+ BSONElement createElementDouble(double val) {
+ BSONObjBuilder ob;
+ ob.append("0"_sd, val);
+ _elementMemory.emplace_front(ob.obj());
+ return _elementMemory.front().firstElement();
+ }
+
BSONElement createObjectId(OID val) {
BSONObjBuilder ob;
ob.append("0"_sd, val);
@@ -75,6 +82,30 @@ public:
return Simple8bTypeUtil::encodeInt64(val.Long() - prev.Long());
}
+ static uint64_t deltaDouble(BSONElement val, BSONElement prev, double scaleFactor) {
+ uint8_t scaleIndex = 0;
+ for (; scaleIndex < Simple8bTypeUtil::kScaleMultiplier.size(); ++scaleIndex) {
+ if (Simple8bTypeUtil::kScaleMultiplier[scaleIndex] == scaleFactor)
+ break;
+ }
+ return Simple8bTypeUtil::encodeInt64(
+ *Simple8bTypeUtil::encodeDouble(val.Double(), scaleIndex) -
+ *Simple8bTypeUtil::encodeDouble(prev.Double(), scaleIndex));
+ }
+
+ static uint64_t deltaDoubleMemory(BSONElement val, BSONElement prev) {
+ return Simple8bTypeUtil::encodeInt64(
+ static_cast<int64_t>(static_cast<uint64_t>(*Simple8bTypeUtil::encodeDouble(
+ val.Double(), Simple8bTypeUtil::kMemoryAsInteger)) -
+ static_cast<uint64_t>(*Simple8bTypeUtil::encodeDouble(
+ prev.Double(), Simple8bTypeUtil::kMemoryAsInteger))));
+ }
+
+ static bool simple8bPossible(uint64_t val) {
+ Simple8bBuilder<uint64_t> b([](uint64_t block) {});
+ return b.append(val);
+ }
+
static uint64_t deltaObjectId(BSONElement val, BSONElement prev) {
return Simple8bTypeUtil::encodeInt64(Simple8bTypeUtil::encodeObjectId(val.OID()) -
Simple8bTypeUtil::encodeObjectId(prev.OID()));
@@ -91,8 +122,8 @@ public:
}
template <typename It>
- static std::vector<uint64_t> deltaInt64(It begin, It end, BSONElement prev) {
- std::vector<uint64_t> deltas;
+ static std::vector<boost::optional<uint64_t>> deltaInt64(It begin, It end, BSONElement prev) {
+ std::vector<boost::optional<uint64_t>> deltas;
for (; begin != end; ++begin) {
deltas.push_back(deltaInt64(*begin, prev));
prev = *begin;
@@ -100,6 +131,23 @@ public:
return deltas;
}
+ template <typename It>
+ static std::vector<boost::optional<uint64_t>> deltaDouble(It begin,
+ It end,
+ BSONElement prev,
+ double scaleFactor) {
+ std::vector<boost::optional<uint64_t>> deltas;
+ for (; begin != end; ++begin) {
+ if (!begin->eoo()) {
+ deltas.push_back(deltaDouble(*begin, prev, scaleFactor));
+ prev = *begin;
+ } else {
+ deltas.push_back(boost::none);
+ }
+ }
+ return deltas;
+ }
+
static void appendLiteral(BufBuilder& builder, BSONElement elem) {
// BSON Type byte
builder.appendChar(elem.type());
@@ -132,7 +180,7 @@ public:
}
static void appendSimple8bBlocks(BufBuilder& builder,
- const std::vector<uint64_t>& vals,
+ const std::vector<boost::optional<uint64_t>>& vals,
uint32_t expectedNum) {
auto prev = builder.len();
Simple8bBuilder<uint64_t> s8bBuilder([&builder](uint64_t block) {
@@ -140,7 +188,11 @@ public:
return true;
});
for (auto val : vals) {
- s8bBuilder.append(val);
+ if (val) {
+ s8bBuilder.append(*val);
+ } else {
+ s8bBuilder.skip();
+ }
}
s8bBuilder.flush();
ASSERT_EQ((builder.len() - prev) / sizeof(uint64_t), expectedNum);
@@ -153,7 +205,9 @@ public:
static void verifyBinary(BSONBinData columnBinary, const BufBuilder& expected) {
ASSERT_EQ(columnBinary.type, BinDataType::Column);
ASSERT_EQ(columnBinary.length, expected.len());
- ASSERT_EQ(memcmp(columnBinary.data, expected.buf(), columnBinary.length), 0);
+
+ auto buf = expected.buf();
+ ASSERT_EQ(memcmp(columnBinary.data, buf, columnBinary.length), 0);
}
private:
@@ -350,6 +404,407 @@ TEST_F(BSONColumnTest, Simple8bAfterTypeChange) {
verifyBinary(cb.finalize(), expected);
}
+TEST_F(BSONColumnTest, BasicDouble) {
+ BSONColumnBuilder cb("test"_sd);
+
+ auto d1 = createElementDouble(1.0);
+ auto d2 = createElementDouble(2.0);
+ cb.append(d1);
+ cb.append(d2);
+
+ BufBuilder expected;
+ appendLiteral(expected, d1);
+ appendSimple8bControl(expected, 0b1001, 0b0000);
+ appendSimple8bBlock(expected, deltaDouble(d2, d1, 1));
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleSameScale) {
+ BSONColumnBuilder cb("test"_sd);
+
+ std::vector<BSONElement> elems;
+ elems.push_back(createElementDouble(1.0));
+ elems.push_back(createElementDouble(2.0));
+ elems.push_back(createElementDouble(3.0));
+
+ for (const auto& elem : elems) {
+ cb.append(elem);
+ }
+
+ BufBuilder expected;
+ appendLiteral(expected, elems.front());
+ appendSimple8bControl(expected, 0b1001, 0b0000);
+ appendSimple8bBlocks(
+ expected, deltaDouble(elems.begin() + 1, elems.end(), elems.front(), 1), 1);
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleIncreaseScaleFromLiteral) {
+ BSONColumnBuilder cb("test"_sd);
+
+ auto d1 = createElementDouble(1.0);
+ auto d2 = createElementDouble(1.1);
+ cb.append(d1);
+ cb.append(d2);
+
+ BufBuilder expected;
+ appendLiteral(expected, d1);
+ appendSimple8bControl(expected, 0b1010, 0b0000);
+ appendSimple8bBlock(expected, deltaDouble(d2, d1, 10));
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleLiteralAndScaleAfterSkip) {
+ BSONColumnBuilder cb("test"_sd);
+
+ auto d1 = createElementDouble(1.0);
+ auto d2 = createElementDouble(1.1);
+ cb.skip();
+ cb.append(d1);
+ cb.append(d2);
+
+ BufBuilder expected;
+ appendSimple8bControl(expected, 0b1000, 0b0000);
+ appendSimple8bBlock(expected, boost::none);
+ appendLiteral(expected, d1);
+ appendSimple8bControl(expected, 0b1010, 0b0000);
+ appendSimple8bBlock(expected, deltaDouble(d2, d1, 10));
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleIncreaseScaleFromLiteralAfterSkip) {
+ BSONColumnBuilder cb("test"_sd);
+
+ auto d1 = createElementDouble(1);
+ auto d2 = createElementDouble(1.1);
+ cb.append(d1);
+ cb.skip();
+ cb.skip();
+ cb.append(d2);
+
+ BufBuilder expected;
+ appendLiteral(expected, d1);
+ appendSimple8bControl(expected, 0b1010, 0b0000);
+
+ std::vector<boost::optional<uint64_t>> expectedVals(2, boost::none);
+ expectedVals.push_back(deltaDouble(d2, d1, 10));
+ appendSimple8bBlocks(expected, expectedVals, 1);
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleIncreaseScaleFromDeltaWithRescale) {
+ BSONColumnBuilder cb("test"_sd);
+
+ std::vector<BSONElement> elems;
+ elems.push_back(createElementDouble(1.0));
+ elems.push_back(createElementDouble(2.0));
+ elems.push_back(createElementDouble(2.1));
+ elems.push_back(createElementDouble(2.2));
+
+ for (const auto& elem : elems) {
+ cb.append(elem);
+ }
+
+ BufBuilder expected;
+ appendLiteral(expected, elems.front());
+ appendSimple8bControl(expected, 0b1010, 0b0000);
+ appendSimple8bBlocks(
+ expected, deltaDouble(elems.begin() + 1, elems.end(), elems.front(), 10), 1);
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleIncreaseScaleFromDeltaNoRescale) {
+ BSONColumnBuilder cb("test"_sd);
+
+ std::vector<BSONElement> elems;
+ elems.push_back(createElementDouble(1.1));
+ elems.push_back(createElementDouble(2.1));
+ elems.push_back(createElementDouble(2.2));
+ elems.push_back(createElementDouble(2.3));
+ elems.push_back(createElementDouble(3.12345678));
+
+ for (const auto& elem : elems) {
+ cb.append(elem);
+ }
+
+ BufBuilder expected;
+ appendLiteral(expected, elems.front());
+
+ auto deltaBegin = elems.begin() + 1;
+ auto deltaEnd = deltaBegin + 3;
+ appendSimple8bControl(expected, 0b1010, 0b0000);
+ appendSimple8bBlocks(expected, deltaDouble(deltaBegin, deltaEnd, elems.front(), 10), 1);
+
+ appendSimple8bControl(expected, 0b1101, 0b0000);
+ appendSimple8bBlocks(
+ expected, deltaDouble(deltaEnd, elems.end(), *(deltaEnd - 1), 100000000), 1);
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlock) {
+ BSONColumnBuilder cb("test"_sd);
+
+ std::vector<BSONElement> elems;
+ elems.push_back(createElementDouble(1.12345671));
+ elems.push_back(createElementDouble(1.12345672));
+ elems.push_back(createElementDouble(2));
+ elems.push_back(createElementDouble(3));
+
+ for (const auto& elem : elems) {
+ cb.append(elem);
+ }
+
+ BufBuilder expected;
+ appendLiteral(expected, elems.front());
+
+ auto deltaBegin = elems.begin() + 1;
+ auto deltaEnd = deltaBegin + 2;
+ appendSimple8bControl(expected, 0b1101, 0b0000);
+ appendSimple8bBlocks(expected, deltaDouble(deltaBegin, deltaEnd, elems.front(), 100000000), 1);
+
+ appendSimple8bControl(expected, 0b1001, 0b0000);
+ appendSimple8bBlocks(expected, deltaDouble(deltaEnd, elems.end(), *(deltaEnd - 1), 1), 1);
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlockUsingSkip) {
+ BSONColumnBuilder cb("test"_sd);
+
+ std::vector<BSONElement> elems;
+ elems.push_back(createElementDouble(1.12345671));
+ elems.push_back(createElementDouble(2));
+ elems.push_back(BSONElement());
+ elems.push_back(BSONElement());
+ elems.push_back(createElementDouble(3));
+
+ for (const auto& elem : elems) {
+ if (!elem.eoo()) {
+ cb.append(elem);
+ } else {
+ cb.skip();
+ }
+ }
+
+ BufBuilder expected;
+ appendLiteral(expected, elems.front());
+
+ auto deltaBegin = elems.begin() + 1;
+ auto deltaEnd = deltaBegin + 2;
+ appendSimple8bControl(expected, 0b1101, 0b0000);
+ appendSimple8bBlocks(expected, deltaDouble(deltaBegin, deltaEnd, elems.front(), 100000000), 1);
+
+ appendSimple8bControl(expected, 0b1001, 0b0000);
+ appendSimple8bBlocks(expected, deltaDouble(deltaEnd, elems.end(), *deltaBegin, 1), 1);
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlockThenScaleBackUp) {
+ BSONColumnBuilder cb("test"_sd);
+
+ std::vector<BSONElement> elems;
+ elems.push_back(createElementDouble(1.12345671));
+ elems.push_back(createElementDouble(1.12345672));
+ elems.push_back(createElementDouble(2));
+ elems.push_back(createElementDouble(3));
+ elems.push_back(createElementDouble(1.12345672));
+
+ for (const auto& elem : elems) {
+ cb.append(elem);
+ }
+
+ BufBuilder expected;
+ appendLiteral(expected, elems.front());
+ appendSimple8bControl(expected, 0b1101, 0b0001);
+ appendSimple8bBlocks(
+ expected, deltaDouble(elems.begin() + 1, elems.end(), elems.front(), 100000000), 2);
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleDecreaseScaleAfterBlockUsingSkipThenScaleBackUp) {
+ BSONColumnBuilder cb("test"_sd);
+
+ std::vector<BSONElement> elems;
+ elems.push_back(createElementDouble(1.12345671));
+ elems.push_back(createElementDouble(2));
+ elems.push_back(BSONElement());
+ elems.push_back(BSONElement());
+ elems.push_back(createElementDouble(1.12345671));
+
+ for (const auto& elem : elems) {
+ if (!elem.eoo()) {
+ cb.append(elem);
+ } else {
+ cb.skip();
+ }
+ }
+
+ BufBuilder expected;
+ appendLiteral(expected, elems.front());
+ appendSimple8bControl(expected, 0b1101, 0b0001);
+ appendSimple8bBlocks(
+ expected, deltaDouble(elems.begin() + 1, elems.end(), elems.front(), 100000000), 2);
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleUnscalable) {
+ BSONColumnBuilder cb("test"_sd);
+
+ std::vector<BSONElement> elems;
+ elems.push_back(createElementDouble(1.0));
+ elems.push_back(createElementDouble(1.0 + std::numeric_limits<double>::epsilon()));
+ elems.push_back(createElementDouble(1.0 + std::numeric_limits<double>::epsilon() * 2));
+
+ for (const auto& elem : elems) {
+ cb.append(elem);
+ }
+
+ BufBuilder expected;
+ appendLiteral(expected, elems.front());
+
+ std::vector<boost::optional<uint64_t>> expectedVals;
+ expectedVals.push_back(deltaDoubleMemory(elems[1], elems[0]));
+ expectedVals.push_back(deltaDoubleMemory(elems[2], elems[1]));
+ appendSimple8bControl(expected, 0b1000, 0b0000);
+ appendSimple8bBlocks(expected, expectedVals, 1);
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleSignalingNaN) {
+ BSONColumnBuilder cb("test"_sd);
+
+ auto elem = createElementDouble(0.0);
+ auto nan = createElementDouble(std::numeric_limits<double>::signaling_NaN());
+
+ cb.append(elem);
+ cb.append(nan);
+
+ BufBuilder expected;
+ appendLiteral(expected, elem);
+
+ if (auto delta = deltaDoubleMemory(nan, elem); simple8bPossible(delta)) {
+ appendSimple8bControl(expected, 0b1000, 0b0000);
+ appendSimple8bBlock(expected, delta);
+ } else {
+ appendLiteral(expected, nan);
+ }
+
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleQuietNaN) {
+ BSONColumnBuilder cb("test"_sd);
+
+ auto elem = createElementDouble(0.0);
+ auto nan = createElementDouble(std::numeric_limits<double>::quiet_NaN());
+
+ cb.append(elem);
+ cb.append(nan);
+
+ BufBuilder expected;
+ appendLiteral(expected, elem);
+ if (auto delta = deltaDoubleMemory(nan, elem); simple8bPossible(delta)) {
+ appendSimple8bControl(expected, 0b1000, 0b0000);
+ appendSimple8bBlock(expected, delta);
+ } else {
+ appendLiteral(expected, nan);
+ }
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleInfinity) {
+ BSONColumnBuilder cb("test"_sd);
+
+ auto elem = createElementDouble(0.0);
+ auto inf = createElementDouble(std::numeric_limits<double>::infinity());
+
+ cb.append(elem);
+ cb.append(inf);
+
+ BufBuilder expected;
+ appendLiteral(expected, elem);
+ if (auto delta = deltaDoubleMemory(inf, elem); simple8bPossible(delta)) {
+ appendSimple8bControl(expected, 0b1000, 0b0000);
+ appendSimple8bBlock(expected, delta);
+ } else {
+ appendLiteral(expected, inf);
+ }
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleDenorm) {
+ BSONColumnBuilder cb("test"_sd);
+
+ auto elem = createElementDouble(0.0);
+ auto denorm = createElementDouble(std::numeric_limits<double>::denorm_min());
+
+ cb.append(elem);
+ cb.append(denorm);
+
+ BufBuilder expected;
+ appendLiteral(expected, elem);
+ if (auto delta = deltaDoubleMemory(denorm, elem); simple8bPossible(delta)) {
+ appendSimple8bControl(expected, 0b1000, 0b0000);
+ appendSimple8bBlock(expected, delta);
+ } else {
+ appendLiteral(expected, denorm);
+ }
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
+TEST_F(BSONColumnTest, DoubleIntegerOverflow) {
+ BSONColumnBuilder cb("test"_sd);
+
+ // std::numeric_limits<int64_t>::min() - 0x1000 will cause an overflow if performed as signed,
+ // make sure it is handled correctly
+ auto e1 = createElementDouble(
+ Simple8bTypeUtil::decodeDouble(0x1000, Simple8bTypeUtil::kMemoryAsInteger));
+ auto e2 = createElementDouble(Simple8bTypeUtil::decodeDouble(
+ std::numeric_limits<int64_t>::min(), Simple8bTypeUtil::kMemoryAsInteger));
+
+ cb.append(e1);
+ cb.append(e2);
+
+ BufBuilder expected;
+ appendLiteral(expected, e1);
+ appendSimple8bControl(expected, 0b1000, 0b0000);
+ appendSimple8bBlock(expected, deltaDoubleMemory(e2, e1));
+ appendEOO(expected);
+
+ verifyBinary(cb.finalize(), expected);
+}
+
TEST_F(BSONColumnTest, BasicObjectId) {
BSONColumnBuilder cb("test"_sd);
@@ -366,7 +821,7 @@ TEST_F(BSONColumnTest, BasicObjectId) {
BufBuilder expected;
appendLiteral(expected, first);
appendSimple8bControl(expected, 0b1000, 0b0000);
- std::vector<uint64_t> expectedDeltas{
+ std::vector<boost::optional<uint64_t>> expectedDeltas{
deltaObjectId(second, first), deltaObjectId(second, second), deltaObjectId(third, second)};
appendSimple8bBlocks(expected, expectedDeltas, 1);
appendEOO(expected);
@@ -435,8 +890,8 @@ TEST_F(BSONColumnTest, Simple8bTimestamp) {
BufBuilder expected;
appendLiteral(expected, first);
appendSimple8bControl(expected, 0b1000, 0b0000);
- std::vector<uint64_t> expectedDeltaOfDeltas{deltaOfDeltaTimestamp(second, first),
- deltaOfDeltaTimestamp(second, second, first)};
+ std::vector<boost::optional<uint64_t>> expectedDeltaOfDeltas{
+ deltaOfDeltaTimestamp(second, first), deltaOfDeltaTimestamp(second, second, first)};
appendSimple8bBlocks(expected, expectedDeltaOfDeltas, 1);
appendEOO(expected);
@@ -457,8 +912,8 @@ TEST_F(BSONColumnTest, Simple8bTimestampNegativeDeltaOfDelta) {
BufBuilder expected;
appendLiteral(expected, first);
appendSimple8bControl(expected, 0b1000, 0b0000);
- std::vector<uint64_t> expectedDeltaOfDeltas{deltaOfDeltaTimestamp(second, first),
- deltaOfDeltaTimestamp(third, second, first)};
+ std::vector<boost::optional<uint64_t>> expectedDeltaOfDeltas{
+ deltaOfDeltaTimestamp(second, first), deltaOfDeltaTimestamp(third, second, first)};
appendSimple8bBlocks(expected, expectedDeltaOfDeltas, 1);
appendEOO(expected);
@@ -533,8 +988,8 @@ TEST_F(BSONColumnTest, LargeDeltaOfDeltaIsLiteralAfterSimple8bTimestamp) {
appendSimple8bBlock(expected, deltaOfDeltaTimestamp(zero, zero));
appendLiteral(expected, large);
appendSimple8bControl(expected, 0b1000, 0b0000);
- std::vector<uint64_t> expectedDeltaOfDeltas{deltaOfDeltaTimestamp(large, large),
- deltaOfDeltaTimestamp(semiLarge, large, large)};
+ std::vector<boost::optional<uint64_t>> expectedDeltaOfDeltas{
+ deltaOfDeltaTimestamp(large, large), deltaOfDeltaTimestamp(semiLarge, large, large)};
appendSimple8bBlocks(expected, expectedDeltaOfDeltas, 1);
appendEOO(expected);
diff --git a/src/mongo/bson/util/bsoncolumnbuilder.cpp b/src/mongo/bson/util/bsoncolumnbuilder.cpp
index eeb05ac1372..d54ade92c72 100644
--- a/src/mongo/bson/util/bsoncolumnbuilder.cpp
+++ b/src/mongo/bson/util/bsoncolumnbuilder.cpp
@@ -34,12 +34,44 @@
#include <memory>
namespace mongo {
+namespace {
static constexpr uint8_t kMaxCount = 16;
static constexpr uint8_t kCountMask = 0x0F;
-static constexpr uint8_t kNoScaleControl = 0x80;
+static constexpr uint8_t kControlMask = 0xF0;
+
+static constexpr std::array<uint8_t, Simple8bTypeUtil::kMemoryAsInteger + 1>
+ kControlByteForScaleIndex = {0x90, 0xA0, 0xB0, 0xC0, 0xD0, 0x80};
+
+int64_t calcDelta(int64_t val, int64_t prev) {
+ // Do the subtraction as unsigned and cast back to signed to get overflow defined to wrapped
+ // around instead of undefined behavior.
+ return static_cast<int64_t>(static_cast<uint64_t>(val) - static_cast<uint64_t>(prev));
+}
+
+int64_t expandDelta(int64_t prev, int64_t delta) {
+ // Do the addition as unsigned and cast back to signed to get overflow defined to wrapped around
+ // instead of undefined behavior.
+ return static_cast<int64_t>(static_cast<uint64_t>(prev) + static_cast<uint64_t>(delta));
+}
+
+// Encodes the double with the lowest possible scale index. In worst case we will interpret the
+// memory as integer which is guaranteed to succeed.
+std::pair<int64_t, uint8_t> scaleAndEncodeDouble(double value, uint8_t minScaleIndex) {
+ boost::optional<int64_t> encoded;
+ for (; !encoded; ++minScaleIndex) {
+ encoded = Simple8bTypeUtil::encodeDouble(value, minScaleIndex);
+ }
+
+ // Subtract the last scale that was added in the loop before returning
+ return {*encoded, minScaleIndex - 1};
+}
+
+} // namespace
BSONColumnBuilder::BSONColumnBuilder(StringData fieldName)
- : _simple8bBuilder(_createSimple8bBuilder()), _fieldName(fieldName) {
+ : _simple8bBuilder(_createBufferWriter()),
+ _scaleIndex(Simple8bTypeUtil::kMemoryAsInteger),
+ _fieldName(fieldName) {
// Store EOO type with empty field name as previous.
_storePrevious(BSONElement());
@@ -59,46 +91,57 @@ BSONColumnBuilder& BSONColumnBuilder::append(BSONElement elem) {
_storePrevious(elem);
_simple8bBuilder.flush();
_writeLiteralFromPrevious();
+
return *this;
}
- // Depending on the type, delta or delta-of-delta is stored in Simple-8b.
- int64_t value = 0;
- bool deltaPossible = true;
- if (_usesDeltaOfDelta(type) || !elem.binaryEqualValues(previous)) {
- switch (type) {
- case NumberInt:
- value = elem._numberInt() - previous._numberInt();
- break;
- case NumberLong:
- value = elem._numberLong() - previous._numberLong();
- break;
- case jstOID:
- deltaPossible = _objectIdDeltaPossible(elem, previous);
- if (deltaPossible)
- value = Simple8bTypeUtil::encodeObjectId(elem.OID()) -
- Simple8bTypeUtil::encodeObjectId(previous.OID());
- break;
- case bsonTimestamp: {
- int64_t currTimestampDelta =
- elem.timestamp().asULL() - previous.timestamp().asULL();
- value = currTimestampDelta - _prevDelta;
- _prevDelta = currTimestampDelta;
- break;
+ // Store delta in Simple-8b if types match
+ bool compressed = !_usesDeltaOfDelta(type) && elem.binaryEqualValues(previous);
+ if (compressed) {
+ _simple8bBuilder.append(0);
+ }
+
+ if (!compressed) {
+ if (type == NumberDouble) {
+ compressed = _appendDouble(elem._numberDouble(), previous._numberDouble());
+
+ } else {
+ bool encodingPossible = true;
+ int64_t value = 0;
+ switch (type) {
+ case NumberInt:
+ value = calcDelta(elem._numberInt(), previous._numberInt());
+ break;
+ case NumberLong:
+ value = calcDelta(elem._numberLong(), previous._numberLong());
+ break;
+ case jstOID:
+ encodingPossible = _objectIdDeltaPossible(elem, previous);
+ if (encodingPossible)
+ value = calcDelta(Simple8bTypeUtil::encodeObjectId(elem.OID()),
+ Simple8bTypeUtil::encodeObjectId(previous.OID()));
+ break;
+ case bsonTimestamp: {
+ int64_t currTimestampDelta =
+ calcDelta(elem.timestamp().asULL(), previous.timestamp().asULL());
+ value = calcDelta(currTimestampDelta, _prevDelta);
+ _prevDelta = currTimestampDelta;
+ break;
+ }
+ default:
+ // Nothing else is implemented yet
+ invariant(false);
+ };
+ if (encodingPossible) {
+ compressed = _simple8bBuilder.append(Simple8bTypeUtil::encodeInt64(value));
}
- default:
- // Nothing else is implemented yet
- invariant(false);
- };
+ }
}
- bool result = false;
- if (deltaPossible)
- result = _simple8bBuilder.append(Simple8bTypeUtil::encodeInt64(value));
_storePrevious(elem);
// Store uncompressed literal if value is outside of range of encodable values.
- if (!result) {
+ if (!compressed) {
_simple8bBuilder.flush();
_writeLiteralFromPrevious();
}
@@ -106,8 +149,143 @@ BSONColumnBuilder& BSONColumnBuilder::append(BSONElement elem) {
return *this;
}
+boost::optional<Simple8bBuilder<uint64_t>> BSONColumnBuilder::_tryRescalePending(
+ int64_t encoded, uint8_t newScaleIndex) {
+ // Encode last value in the previous block with old and new scale index. We know that scaling
+ // with the old index is possible.
+ int64_t prev = *Simple8bTypeUtil::encodeDouble(_lastValueInPrevBlock, _scaleIndex);
+ boost::optional<int64_t> prevRescaled =
+ Simple8bTypeUtil::encodeDouble(_lastValueInPrevBlock, newScaleIndex);
+
+ // Fail if we could not rescale
+ bool possible = prevRescaled.has_value();
+ if (!possible)
+ return boost::none;
+
+ // Create a new Simple8bBuilder for the rescaled values. If any Simple8b block is finalized when
+ // adding the new values then rescaling is less optimal than flushing with the current scale. So
+ // we just record if this happens in our write callback.
+ Simple8bBuilder<uint64_t> builder([&possible](uint64_t block) { possible = false; });
+
+ // Iterate over our pending values, decode them back into double, rescale and append to our new
+ // Simple8b builder
+ for (const auto& pending : _simple8bBuilder) {
+ if (!pending) {
+ builder.skip();
+ continue;
+ }
+
+ // Apply delta to previous, decode to double and rescale
+ prev = expandDelta(prev, Simple8bTypeUtil::decodeInt64(*pending));
+ auto rescaled = Simple8bTypeUtil::encodeDouble(
+ Simple8bTypeUtil::decodeDouble(prev, _scaleIndex), newScaleIndex);
+
+ // Fail if we could not rescale
+ if (!rescaled || !prevRescaled)
+ return boost::none;
+
+ // Append the scaled delta
+ auto appended =
+ builder.append(Simple8bTypeUtil::encodeInt64(calcDelta(*rescaled, *prevRescaled)));
+
+ // Fail if are out of range for Simple8b or a block was written
+ if (!appended || !possible)
+ return boost::none;
+
+ // Remember previous for next value
+ prevRescaled = rescaled;
+ }
+
+ // Last add our new value
+ auto appended =
+ builder.append(Simple8bTypeUtil::encodeInt64(calcDelta(encoded, *prevRescaled)));
+ if (!appended || !possible)
+ return boost::none;
+
+ // We managed to add all re-scaled values, this will thus compress better. Set write callback to
+ // our buffer writer and return
+ builder.setWriteCallback(_createBufferWriter());
+ return builder;
+}
+
+bool BSONColumnBuilder::_appendDouble(double value, double previous) {
+ // Scale with lowest possible scale index
+ auto [encoded, scaleIndex] = scaleAndEncodeDouble(value, _scaleIndex);
+
+ if (scaleIndex != _scaleIndex) {
+ // New value need higher scale index. We have two choices:
+ // (1) Re-scale pending values to use this larger scale factor
+ // (2) Flush pending and start a new block with this higher scale factor
+ // We try both options and select the one that compresses best
+ auto rescaled = _tryRescalePending(encoded, scaleIndex);
+ if (rescaled) {
+ // Re-scale possible, use this Simple8b builder
+ std::swap(_simple8bBuilder, *rescaled);
+ _prevEncoded = encoded;
+ _scaleIndex = scaleIndex;
+ return true;
+ }
+
+ // Re-scale not possible, flush and start new block with the higher scale factor
+ _simple8bBuilder.flush();
+ _controlByteOffset = 0;
+
+ // Make sure value and previous are using the same scale factor.
+ uint8_t prevScaleIndex;
+ std::tie(_prevEncoded, prevScaleIndex) = scaleAndEncodeDouble(previous, scaleIndex);
+ if (scaleIndex != prevScaleIndex) {
+ std::tie(encoded, scaleIndex) = scaleAndEncodeDouble(value, prevScaleIndex);
+ std::tie(_prevEncoded, prevScaleIndex) = scaleAndEncodeDouble(previous, scaleIndex);
+ }
+
+ // Record our new scale factor
+ _scaleIndex = scaleIndex;
+ }
+
+ // Append delta and check if we wrote a Simple8b block. If we did we may be able to reduce the
+ // scale factor when starting a new block
+ auto before = _bufBuilder.len();
+ if (!_simple8bBuilder.append(Simple8bTypeUtil::encodeInt64(calcDelta(encoded, _prevEncoded))))
+ return false;
+
+ if (_bufBuilder.len() != before) {
+ // Reset the scale factor to 0 and append all pending values to a new Simple8bBuilder. In
+ // the worse case we will end up with an identical scale factor.
+ auto prevScale = _scaleIndex;
+ std::tie(_prevEncoded, _scaleIndex) = scaleAndEncodeDouble(_lastValueInPrevBlock, 0);
+
+ // Create a new Simple8bBuilder.
+ Simple8bBuilder<uint64_t> builder(_createBufferWriter());
+ std::swap(_simple8bBuilder, builder);
+
+ // Iterate over previous pending values and re-add them recursively. That will increase the
+ // scale factor as needed.
+ auto prev = _lastValueInPrevBlock;
+ auto prevEncoded = *Simple8bTypeUtil::encodeDouble(prev, prevScale);
+ for (const auto& pending : builder) {
+ if (pending) {
+ prevEncoded = expandDelta(prevEncoded, Simple8bTypeUtil::decodeInt64(*pending));
+ auto val = Simple8bTypeUtil::decodeDouble(prevEncoded, prevScale);
+ _appendDouble(val, prev);
+ prev = val;
+ } else {
+ _simple8bBuilder.skip();
+ }
+ }
+ }
+
+ _prevEncoded = encoded;
+ return true;
+}
+
BSONColumnBuilder& BSONColumnBuilder::skip() {
+ auto before = _bufBuilder.len();
_simple8bBuilder.skip();
+
+ // Rescale previous known value if this skip caused Simple-8b blocks to be written
+ if (before != _bufBuilder.len() && _previous().type() == NumberDouble) {
+ std::tie(_prevEncoded, _scaleIndex) = scaleAndEncodeDouble(_lastValueInPrevBlock, 0);
+ }
return *this;
}
@@ -148,11 +326,20 @@ void BSONColumnBuilder::_writeLiteralFromPrevious() {
_controlByteOffset = 0;
// There is no previous timestamp delta. Set to default.
_prevDelta = 0;
+
+ // Set scale factor for this literal and values needed to append values
+ if (_prev[0] == NumberDouble) {
+ _lastValueInPrevBlock = _previous()._numberDouble();
+ std::tie(_prevEncoded, _scaleIndex) = scaleAndEncodeDouble(_lastValueInPrevBlock, 0);
+ } else {
+ _scaleIndex = Simple8bTypeUtil::kMemoryAsInteger;
+ }
}
void BSONColumnBuilder::_incrementSimple8bCount() {
char* byte;
uint8_t count;
+ uint8_t control = kControlByteForScaleIndex[_scaleIndex];
if (_controlByteOffset == 0) {
// Allocate new control byte if we don't already have one. Record its offset so we can find
@@ -163,25 +350,39 @@ void BSONColumnBuilder::_incrementSimple8bCount() {
} else {
// Read current count from previous control byte
byte = _bufBuilder.buf() + _controlByteOffset;
+
+ // If previous byte was written with a different control byte then we can't re-use and need
+ // to start a new one
+ if ((*byte & kControlMask) != control) {
+ _controlByteOffset = 0;
+ _incrementSimple8bCount();
+ return;
+ }
count = (*byte & kCountMask) + 1;
}
// Write back new count and clear offset if we have reached max count
- *byte = kNoScaleControl | (count & kCountMask);
+ *byte = control | (count & kCountMask);
if (count + 1 == kMaxCount) {
_controlByteOffset = 0;
}
}
-Simple8bBuilder<uint64_t> BSONColumnBuilder::_createSimple8bBuilder() {
- return Simple8bBuilder<uint64_t>([this](uint64_t block) {
+Simple8bWriteFn BSONColumnBuilder::_createBufferWriter() {
+ return [this](uint64_t block) {
// Write/update block count
_incrementSimple8bCount();
- // Write Simple-8b block
+ // Write Simple-8b block in little endian byte order
_bufBuilder.appendNum(block);
+
+ auto previous = _previous();
+ if (previous.type() == NumberDouble) {
+ _lastValueInPrevBlock = previous._numberDouble();
+ }
+
return true;
- });
+ };
}
bool BSONColumnBuilder::_usesDeltaOfDelta(BSONType type) {
diff --git a/src/mongo/bson/util/bsoncolumnbuilder.h b/src/mongo/bson/util/bsoncolumnbuilder.h
index 36d6fc1672f..ce36228bb41 100644
--- a/src/mongo/bson/util/bsoncolumnbuilder.h
+++ b/src/mongo/bson/util/bsoncolumnbuilder.h
@@ -82,7 +82,16 @@ private:
bool _usesDeltaOfDelta(BSONType type);
bool _objectIdDeltaPossible(BSONElement elem, BSONElement prev);
- Simple8bBuilder<uint64_t> _createSimple8bBuilder();
+ // Helper to append doubles to this Column builder. Returns true if append was successful and
+ // false if the value needs to be stored uncompressed.
+ bool _appendDouble(double value, double previous);
+
+ // Tries to rescale current pending values + one additional value into a new Simple8bBuilder.
+ // Returns the new Simple8bBuilder if rescaling was possible and none otherwise.
+ boost::optional<Simple8bBuilder<uint64_t>> _tryRescalePending(int64_t encoded,
+ uint8_t newScaleIndex);
+
+ Simple8bWriteFn _createBufferWriter();
// Storage for the previously appended BSONElement
std::unique_ptr<char[]> _prev;
@@ -97,6 +106,11 @@ private:
// Offset to last Simple-8b control byte
std::ptrdiff_t _controlByteOffset = 0;
+ // Additional variables needed for previous state
+ int64_t _prevEncoded;
+ double _lastValueInPrevBlock = 0;
+ uint8_t _scaleIndex;
+
// Buffer for the BSON Column binary
BufBuilder _bufBuilder;
diff --git a/src/mongo/bson/util/simple8b.cpp b/src/mongo/bson/util/simple8b.cpp
index d0cf8c577f9..594af3375f1 100644
--- a/src/mongo/bson/util/simple8b.cpp
+++ b/src/mongo/bson/util/simple8b.cpp
@@ -383,7 +383,7 @@ bool Simple8bBuilder<T>::PendingIterator::operator!=(
}
template <typename T>
-Simple8bBuilder<T>::Simple8bBuilder(WriteFn writeFunc) : _writeFn(std::move(writeFunc)) {}
+Simple8bBuilder<T>::Simple8bBuilder(Simple8bWriteFn writeFunc) : _writeFn(std::move(writeFunc)) {}
template <typename T>
Simple8bBuilder<T>::~Simple8bBuilder() = default;
@@ -391,7 +391,7 @@ Simple8bBuilder<T>::~Simple8bBuilder() = default;
template <typename T>
bool Simple8bBuilder<T>::append(T value) {
if (_rlePossible()) {
- if (_lastValueInPrevWord.value() == value) {
+ if (_lastValueInPrevWord.val == value) {
++_rleCount;
return true;
}
@@ -712,6 +712,11 @@ typename Simple8bBuilder<T>::PendingIterator Simple8bBuilder<T>::end() const {
}
template <typename T>
+void Simple8bBuilder<T>::setWriteCallback(Simple8bWriteFn writer) {
+ _writeFn = std::move(writer);
+}
+
+template <typename T>
Simple8b<T>::Iterator::Iterator(const uint64_t* pos, const uint64_t* end)
: _pos(pos), _end(end), _value(0), _rleRemaining(0), _shift(0) {
if (pos != end) {
diff --git a/src/mongo/bson/util/simple8b.h b/src/mongo/bson/util/simple8b.h
index 4e345638359..46beaa26627 100644
--- a/src/mongo/bson/util/simple8b.h
+++ b/src/mongo/bson/util/simple8b.h
@@ -39,7 +39,14 @@
namespace mongo {
/**
- * Simple8bBuilder compresses a series of integers into chains of 64 bit Simple8b word.
+ * Callback type to implement writing of 64 bit Simple8b words.
+ */
+using Simple8bWriteFn = std::function<void(uint64_t)>;
+
+/**
+ * Simple8bBuilder compresses a series of integers into chains of 64 bit Simple8b blocks.
+ *
+ * T may be uint64_t and uint128_t only.
*/
template <typename T>
class Simple8bBuilder {
@@ -47,29 +54,30 @@ private:
struct PendingValue;
public:
- // Callback to handle writing of finalized Simple-8b blocks. Machine Endian byte order.
- using WriteFn = std::function<void(uint64_t)>;
- Simple8bBuilder(WriteFn writeFunc);
+ // Callback to handle writing of finalized Simple-8b blocks. Machine Endian byte order, the
+ // value need to be converted to Little Endian before persisting.
+ Simple8bBuilder(Simple8bWriteFn writeFunc);
~Simple8bBuilder();
/**
- * Checks if we can append val to an existing RLE and handles the ending of a RLE.
- * The default RLE value at the beginning is 0.
- * Otherwise, Appends a value to the Simple8b chain of words.
- * Return true if successfully appended and false otherwise.
+ * Appends val to Simple8b. Returns true if the append was successful and false if the value was
+ * outside the range of possible values we can store in Simple8b.
+ *
+ * A call to append may result in multiple Simple8b blocks being finalized.
*/
bool append(T val);
/**
- * Appends an empty bucket to handle missing values. This works by incrementing an underlying
- * simple8b index by one and encoding a "missing value" in the simple8b block as all 1s.
+ * Appends a missing value to Simple8b.
+ *
+ * May result in a single Simple8b being finalized.
*/
void skip();
/**
- * Stores all values for RLE or in _pendingValues into _buffered even if the Simple8b word will
- * not be opimtal and use a larger selector than necessary because we don't have enough integers
- * to use one wiht more slots.
+ * Flushes all buffered values into finalized Simple8b blocks.
+ *
+ * It is allowed to continue to append values after this call.
*/
void flush();
@@ -114,6 +122,11 @@ public:
PendingIterator begin() const;
PendingIterator end() const;
+ /**
+ * Set write callback
+ */
+ void setWriteCallback(Simple8bWriteFn writer);
+
private:
// Number of different type of selectors and their extensions available
static constexpr uint8_t kNumOfSelectorTypes = 4;
@@ -259,7 +272,7 @@ private:
std::deque<PendingValue> _pendingValues;
// User-defined callback to handle writing of finalized Simple-8b blocks
- WriteFn _writeFn;
+ Simple8bWriteFn _writeFn;
};
/**
@@ -286,7 +299,7 @@ public:
size_t blockSize() const;
/**
- * Returns the value in Little Endian byte order at the current iterator position.
+ * Returns the value in at the current iterator position.
*/
pointer operator->() const {
return &_value;
diff --git a/src/mongo/bson/util/simple8b_type_util.cpp b/src/mongo/bson/util/simple8b_type_util.cpp
index a225fd5afe9..7ac55dae78a 100644
--- a/src/mongo/bson/util/simple8b_type_util.cpp
+++ b/src/mongo/bson/util/simple8b_type_util.cpp
@@ -133,6 +133,9 @@ boost::optional<uint8_t> Simple8bTypeUtil::calculateDecimalShiftMultiplier(doubl
}
boost::optional<int64_t> Simple8bTypeUtil::encodeDouble(double val, uint8_t scaleIndex) {
+ if (scaleIndex == kMemoryAsInteger)
+ return *reinterpret_cast<int64_t*>(&val);
+
// Checks for both overflow and handles NaNs
// We use 2^53 because this is the max integer that we can guarentee can be
// exactly represented by a floating point decimal since there are 53 value bits
@@ -157,6 +160,9 @@ boost::optional<int64_t> Simple8bTypeUtil::encodeDouble(double val, uint8_t scal
}
double Simple8bTypeUtil::decodeDouble(int64_t val, uint8_t scaleIndex) {
+ if (scaleIndex == kMemoryAsInteger)
+ return *reinterpret_cast<double*>(&val);
+
return val / kScaleMultiplier[scaleIndex];
}
diff --git a/src/mongo/bson/util/simple8b_type_util.h b/src/mongo/bson/util/simple8b_type_util.h
index e03fa958501..5d472960db3 100644
--- a/src/mongo/bson/util/simple8b_type_util.h
+++ b/src/mongo/bson/util/simple8b_type_util.h
@@ -74,6 +74,8 @@ public:
// Array is a double as it will always be multiplied by a double and we don't want to do an
// extra cast for
- static constexpr std::array<double, 5> kScaleMultiplier = {1, 10, 100, 10000, 100000000};
+ static constexpr uint8_t kMemoryAsInteger = 5;
+ static constexpr std::array<double, kMemoryAsInteger> kScaleMultiplier = {
+ 1, 10, 100, 10000, 100000000};
};
} // namespace mongo
diff --git a/src/mongo/bson/util/simple8b_type_util_test.cpp b/src/mongo/bson/util/simple8b_type_util_test.cpp
index 7129638025e..77e7d6729f6 100644
--- a/src/mongo/bson/util/simple8b_type_util_test.cpp
+++ b/src/mongo/bson/util/simple8b_type_util_test.cpp
@@ -239,6 +239,30 @@ TEST(Simple8b, TestMinDoubleShouldFail) {
ASSERT_FALSE(result.has_value());
}
+TEST(Simple8b, InterpretAsMemory) {
+ std::vector<double> vals = {0.0,
+ 1.12345678,
+ std::numeric_limits<double>::max(),
+ std::numeric_limits<double>::min(),
+ std::numeric_limits<double>::infinity(),
+ std::numeric_limits<double>::signaling_NaN(),
+ std::numeric_limits<double>::quiet_NaN(),
+ std::numeric_limits<double>::denorm_min()};
+
+ for (double val : vals) {
+ boost::optional<int64_t> result =
+ Simple8bTypeUtil::encodeDouble(val, Simple8bTypeUtil::kMemoryAsInteger);
+ ASSERT_TRUE(result);
+ ASSERT_EQ(*result, *reinterpret_cast<const int64_t*>(&val));
+
+ // Some of the special values above does not compare equal with themselves (signaling NaN).
+ // Verify that we end up with the same memory after decoding
+ double decoded =
+ Simple8bTypeUtil::decodeDouble(*result, Simple8bTypeUtil::kMemoryAsInteger);
+ ASSERT_EQ(memcmp(&decoded, &val, sizeof(val)), 0);
+ }
+}
+
TEST(Simple8b, TestMaxInt) {
// max int that can be stored as a double without losing precision
double val = std::pow(2, 53);