summaryrefslogtreecommitdiff
path: root/src/mongo
diff options
context:
space:
mode:
authorMathias Stearn <redbeard0531@gmail.com>2022-04-12 16:10:28 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-04-12 22:04:04 +0000
commit851e346893f82a134e0bf8f3a4f393798340cd53 (patch)
tree0c6bcf644542dc0eb91930723c17fa7fece8bf31 /src/mongo
parent4db485642f9d1d3675308e8415da27c4ef212b79 (diff)
downloadmongo-851e346893f82a134e0bf8f3a4f393798340cd53.tar.gz
SERVER-65192 Implement ColumnShredder to extract paths and cells for a document
Diffstat (limited to 'src/mongo')
-rw-r--r--src/mongo/db/index/SConscript2
-rw-r--r--src/mongo/db/index/column_key_generator.cpp595
-rw-r--r--src/mongo/db/index/column_key_generator.h111
-rw-r--r--src/mongo/db/index/column_key_generator_test.cpp734
-rw-r--r--src/mongo/db/storage/column_store.h17
5 files changed, 1459 insertions, 0 deletions
diff --git a/src/mongo/db/index/SConscript b/src/mongo/db/index/SConscript
index 410c0a2825c..48ab6f723cc 100644
--- a/src/mongo/db/index/SConscript
+++ b/src/mongo/db/index/SConscript
@@ -23,6 +23,7 @@ env.Library(
target='key_generator',
source=[
'btree_key_generator.cpp',
+ 'column_key_generator.cpp',
'expression_keys_private.cpp',
'sort_key_generator.cpp',
'wildcard_key_generator.cpp',
@@ -150,6 +151,7 @@ env.CppUnitTest(
source=[
'2d_key_generator_test.cpp',
'btree_key_generator_test.cpp',
+ 'column_key_generator_test.cpp',
'hash_key_generator_test.cpp',
's2_key_generator_test.cpp',
's2_bucket_key_generator_test.cpp',
diff --git a/src/mongo/db/index/column_key_generator.cpp b/src/mongo/db/index/column_key_generator.cpp
new file mode 100644
index 00000000000..6e348179da8
--- /dev/null
+++ b/src/mongo/db/index/column_key_generator.cpp
@@ -0,0 +1,595 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kIndex
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/index/column_key_generator.h"
+
+#include <iomanip>
+#include <ostream>
+#include <vector>
+
+#include "mongo/db/storage/column_store.h"
+#include "mongo/util/functional.h"
+#include "mongo/util/string_map.h"
+
+namespace mongo::column_keygen {
+namespace {
+
+/**
+ * Special handling for vals because BSONElement doesn't support ==, and we want binaryEqualValues
+ * rather than woCompare equality anyway.
+ */
+bool identicalBSONElementArrays(const std::vector<BSONElement>& lhs,
+ const std::vector<BSONElement>& rhs) {
+ if (lhs.size() != rhs.size())
+ return false;
+ for (size_t i = 0; i < lhs.size(); i++) {
+ if (!lhs[i].binaryEqualValues(rhs[i]))
+ return false;
+ }
+ return true;
+}
+
+/**
+ * This class handles the logic of key generation for columnar indexes. It produces
+ * UnencodedCellViews, which have all of the data that should be put in the index values, but it is
+ * not responsible for encoding that data into a flat buffer. That is handled by XXX.
+ *
+ * TODO once the code for that that is written (SERVER-64766) update this to replace the XXX.
+ *
+ * "Shredding" is an informal term for taking a single BSON document and splitting it into the data
+ * for each unique path. The data at each path should be sufficient to reconstruct the object in
+ * full fidelity, possibly considering parent paths. When determining the path, array index are not
+ * considered, only field names are.
+ *
+ * This class is private to this file, and only exposed via the free-functions in the header.
+ */
+class ColumnShredder {
+public:
+ /**
+ * Option to constructor to avoid overhead when you only need to know about the paths in an
+ * object, such as for deletes.
+ */
+ enum PathsAndCells : bool { kOnlyPaths = false, kPathsAndCells = true };
+
+ explicit ColumnShredder(const BSONObj& obj, PathsAndCells pathsAndCells = kPathsAndCells)
+ : _pathsAndCells(pathsAndCells) {
+ walkObj(_paths[ColumnStore::kRowIdPath], obj, /*isRoot=*/true);
+ }
+
+ void visitPaths(function_ref<void(PathView)> cb) const {
+ for (auto&& [path, _] : _paths) {
+ cb(path);
+ }
+ }
+
+ void visitCells(function_ref<void(PathView, const UnencodedCellView&)> cb) {
+ // This function requires that the shredder was constructed to record cell values.
+ invariant(_pathsAndCells);
+
+ for (auto&& [path, cell] : _paths) {
+ cb(path, makeCellView(path, cell));
+ }
+ }
+
+ static void visitDiff(const BSONObj& oldObj,
+ const BSONObj& newObj,
+ function_ref<void(DiffAction, PathView, const UnencodedCellView*)> cb) {
+ auto oldShredder = ColumnShredder(oldObj);
+ auto newShredder = ColumnShredder(newObj);
+
+ for (auto&& [path, rawCellNew] : newShredder._paths) {
+ auto itOld = oldShredder._paths.find(path);
+ if (itOld == oldShredder._paths.end()) {
+ auto cell = newShredder.makeCellView(path, rawCellNew);
+ cb(kInsert, path, &cell);
+ continue;
+ }
+
+ auto&& [_, rawCellOld] = *itOld; // For symmetry with rawCellNew.
+
+ // Need to compute sparseness prior to checking cell equality.
+ newShredder.computeIsSparse(path, &rawCellNew);
+ oldShredder.computeIsSparse(path, &rawCellOld);
+ if (rawCellNew == rawCellOld)
+ continue; // No change for this path.
+
+
+ auto cell = newShredder.makeCellView(path, rawCellNew);
+ cb(kUpdate, path, &cell);
+ }
+
+ for (auto&& [path, _] : oldShredder._paths) {
+ if (!newShredder._paths.contains(path)) {
+ cb(kDelete, path, nullptr);
+ }
+ }
+ }
+
+
+private:
+ enum Sparseness : int8_t { kUnknown, kIsSparse, kNotSparse };
+
+ struct RawCellValue {
+ // This is where we incrementally build up the full arrayInfo for this cell. While each
+ // position is essentially a walk from the root to somewhere in the document, this is a walk
+ // that starts at the root and then passes through each value or subobject at this path
+ // taking (hopefully) the shortest distance between each value. First we build it up in
+ // "uncompressed" format, then we call compressArrayInfo() to remove redundant information.
+ std::string arrayInfoBuf;
+
+ // This is used when building up arrayInfoBuf. It records the absolute position of the last
+ // value or subobject appended to arrayInfoBuf, so that we can easily compute the relative
+ // path to the next value. Equivalently, it is the value of _currentArrayInfo at the time of
+ // the last call to appendToArrayInfo(). It never has any '|' or 'o' bytes.
+ std::string lastPosition;
+
+ std::vector<BSONElement> vals;
+ int nSeen = 0; // Number of times this field has been seen.
+ int nNonEmptySubobjects = 0; // Number of values that are non-empty objects
+ Sparseness sparseness = kUnknown; // Memoized in computeIsSparse()
+ bool childrenMustBeSparse = false;
+ bool hasDoubleNestedArrays = false;
+ bool hasDuplicateFields = false;
+
+ bool operator==(const RawCellValue& rhs) const {
+ const RawCellValue& lhs = *this;
+
+ // Sparseness must have already been resolved.
+ invariant(lhs.sparseness != kUnknown);
+ invariant(rhs.sparseness != kUnknown);
+
+ // Intentionally not checking nSeen or childrenMustBeSparse, since they are just inputs
+ // to sparseness computation.
+
+ // Ordered to test cheaper stuff before more expensive stuff.
+ return lhs.sparseness == rhs.sparseness && //
+ lhs.hasDoubleNestedArrays == rhs.hasDoubleNestedArrays && //
+ lhs.nNonEmptySubobjects == rhs.nNonEmptySubobjects && //
+ lhs.hasDuplicateFields == rhs.hasDuplicateFields && //
+ lhs.arrayInfoBuf == rhs.arrayInfoBuf && //
+ identicalBSONElementArrays(lhs.vals, rhs.vals);
+ }
+ };
+
+ UnencodedCellView makeCellView(PathView path, RawCellValue& cell) {
+ compressArrayInfo(cell.arrayInfoBuf);
+ return {
+ cell.vals,
+ cell.arrayInfoBuf,
+ cell.hasDuplicateFields,
+ /*hasSubPaths*/ cell.nNonEmptySubobjects != 0,
+ computeIsSparse(path, &cell),
+ cell.hasDoubleNestedArrays,
+ };
+ }
+
+ // Memoized, so amortized O(1) when called on all paths, even though it must check all parent
+ // paths.
+ bool computeIsSparse(PathView path, RawCellValue* cell) {
+ if (cell->sparseness != kUnknown)
+ return cell->sparseness == kIsSparse;
+
+ auto parentPath = parentPathOf(path);
+ if (!parentPath) {
+ // Top level fields are never considered sparse.
+ cell->sparseness = kNotSparse;
+ return false;
+ }
+
+ auto parentIt = _paths.find(*parentPath);
+ invariant(parentIt != _paths.end());
+ auto& parent = parentIt->second;
+ const bool isSparse = parent.childrenMustBeSparse ||
+ cell->nSeen != parent.nNonEmptySubobjects || computeIsSparse(*parentPath, &parent);
+ cell->sparseness = isSparse ? kIsSparse : kNotSparse;
+ return isSparse;
+ }
+
+ void walkObj(RawCellValue& cell, const BSONObj& obj, bool isRoot = false) {
+ if (_pathsAndCells) {
+ cell.nNonEmptySubobjects++;
+ if (!isRoot) {
+ appendToArrayInfo(cell, 'o');
+ }
+ }
+
+ ON_BLOCK_EXIT([&, oldArrInfoSize = _currentArrayInfo.size()] {
+ if (_pathsAndCells)
+ _currentArrayInfo.resize(oldArrInfoSize);
+ });
+ if (_pathsAndCells && !isRoot)
+ _currentArrayInfo += '{';
+
+ for (auto [name, elem] : obj) {
+ // We skip fields in some edge cases such as dots in the field name. This may also throw
+ // if the field name contains invalid UTF-8 in a way that would break the index.
+ if (shouldSkipField(name))
+ continue;
+
+ ON_BLOCK_EXIT([&, oldPathSize = _currentPath.size()] { //
+ _currentPath.resize(oldPathSize);
+ });
+ if (!isRoot)
+ _currentPath += '.';
+ _currentPath += name;
+
+ auto& subCell = _paths[_currentPath];
+ subCell.nSeen++;
+ if (_inDoubleNestedArray)
+ subCell.hasDoubleNestedArrays = true;
+ handleElem(subCell, elem);
+ }
+ }
+
+ void walkArray(RawCellValue& cell, const BSONObj& arr) {
+ DecimalCounter<unsigned> index;
+
+ for (auto elem : arr) {
+ ON_BLOCK_EXIT([&,
+ oldArrInfoSize = _currentArrayInfo.size(),
+ oldInDoubleNested = _inDoubleNestedArray] {
+ if (_pathsAndCells) {
+ _currentArrayInfo.resize(oldArrInfoSize);
+ _inDoubleNestedArray = oldInDoubleNested;
+ }
+ });
+
+ if (_pathsAndCells) {
+ _currentArrayInfo += '[';
+ if (index == 0) {
+ // Zero is common so make it implied rather than explicit.
+ } else {
+ // Theoretically, index should be the same as elem.fieldNameStringData(),
+ // however, since we don't validate the array indexes, they cannot be trusted.
+ // The logic to encode array info relies on "sane" array indexes (at the very
+ // least they must be monotonically increasing), so we create a new index string
+ // here.
+ _currentArrayInfo += StringData(index);
+ }
+ ++index;
+
+ // [] doesn't start a double nesting since {a:{$eq: []}} matches {a: [[]]}
+ if (elem.type() == Array && !elem.Obj().isEmpty())
+ _inDoubleNestedArray = true;
+ }
+
+ // Note: always same cell, since array traversal never changes path.
+ handleElem(cell, elem);
+ }
+ }
+
+ void handleElem(RawCellValue& cell, const BSONElement& elem) {
+ // Only recurse on non-empty objects and arrays. Empty objects and arrays are handled as
+ // scalars.
+ if (elem.type() == Object) {
+ if (auto obj = elem.Obj(); !obj.isEmpty())
+ return walkObj(cell, obj);
+ } else if (elem.type() == Array) {
+ if (auto obj = elem.Obj(); !obj.isEmpty())
+ return walkArray(cell, obj);
+ }
+
+ if (_pathsAndCells) {
+ // If we get here, then this walk will not have any children. This means that there is
+ // at least one path where all children (if any) will be missing structural information,
+ // so they will need to consult the parent path.
+ cell.childrenMustBeSparse = true;
+
+ if (_inDoubleNestedArray)
+ cell.hasDoubleNestedArrays = true;
+
+ cell.vals.push_back(elem);
+ appendToArrayInfo(cell, '|');
+ }
+ }
+
+ void appendToArrayInfo(RawCellValue& rcd, char finalByte) {
+ dassert(finalByte == '|' || finalByte == 'o');
+
+ auto foundDuplicateField = [&] {
+ rcd.arrayInfoBuf.clear();
+ rcd.hasDuplicateFields = true;
+ };
+
+ if (rcd.hasDuplicateFields) {
+ // arrayInfo should be left empty in this case.
+ invariant(rcd.arrayInfoBuf.empty());
+ return;
+ }
+
+ ON_BLOCK_EXIT([&] { rcd.lastPosition = _currentArrayInfo; });
+
+ if (rcd.arrayInfoBuf.empty()) {
+ // The first time we get a position for this path, just record it verbatim. The first
+ // is special because we are essentially recording the absolute path to this value,
+ // while every other time we append the path relative to the prior value.
+ invariant(rcd.lastPosition.empty());
+ rcd.arrayInfoBuf.reserve(_currentArrayInfo.size() + 1);
+ rcd.arrayInfoBuf += _currentArrayInfo;
+ rcd.arrayInfoBuf += finalByte;
+ return;
+ }
+
+ // Make better names for symmetry (and to prevent accidental modifications):
+ StringData oldPosition = rcd.lastPosition;
+ StringData newPosition = _currentArrayInfo;
+
+ if (MONGO_unlikely(oldPosition.empty() || newPosition.empty())) {
+ // This can only happen if there is a duplicate field at the top level.
+ return foundDuplicateField();
+ }
+
+ auto [oldIt, newIt] = std::mismatch(oldPosition.begin(), //
+ oldPosition.end(),
+ newPosition.begin(),
+ newPosition.end());
+ if (MONGO_unlikely(newIt == newPosition.end())) {
+ // This can only happen if there is a duplicate field in an array, because otherwise
+ // the raw array infos must differ by an index in some array.
+ return foundDuplicateField();
+ }
+
+ // Walk back to start of differing elem. Important to use newIt here because if they are
+ // in the same array, oldIt may have an implicitly encoded 0 index, while newIt must
+ // have a higher index.
+ while (*newIt != '[') {
+ if (MONGO_unlikely(!(*newIt >= '0' && *newIt <= '9'))) {
+ // Non-index difference can only happen if there are duplicate fields in an array.
+ return foundDuplicateField();
+ }
+ invariant(newIt > newPosition.begin());
+ dassert(oldIt > oldPosition.begin()); // oldIt and newIt are at same index.
+ --newIt;
+ --oldIt;
+ }
+ invariant(oldIt < oldPosition.end());
+ if (MONGO_unlikely(*oldIt != '[')) {
+ // This is another type of non-index difference.
+ return foundDuplicateField();
+ }
+
+ // Close out arrays past the first mismatch in LIFO order.
+ for (auto revOldIt = oldPosition.end() - 1; revOldIt != oldIt; --revOldIt) {
+ if (*revOldIt == '[') {
+ rcd.arrayInfoBuf += ']';
+ } else {
+ dassert((*revOldIt >= '0' && *revOldIt <= '9') || *revOldIt == '{');
+ }
+ }
+
+ // Now process the mismatch. It must be a difference in array index (checked above).
+ dassert(*oldIt == '[' && *newIt == '[');
+ ++oldIt;
+ ++newIt;
+ const auto oldIx = ColumnStore::readArrInfoNumber(&oldIt, oldPosition.end());
+ const auto newIx = ColumnStore::readArrInfoNumber(&newIt, newPosition.end());
+
+ if (MONGO_unlikely(newIx <= oldIx)) {
+ // If this element is at a same or lower index, we must have hit a duplicate field
+ // above and restarted the array indexing.
+ return foundDuplicateField();
+ }
+ const auto delta = newIx - oldIx;
+ const auto skips = delta - 1;
+ if (skips == 0) {
+ // Nothing. Increment by one (skipping zero) is implicit.
+ } else {
+ rcd.arrayInfoBuf += '+';
+ rcd.arrayInfoBuf += ItoA(skips);
+ }
+
+ // Now put the rest of the new array info into the output buffer.
+ rcd.arrayInfoBuf += StringData(newIt, newPosition.end());
+ rcd.arrayInfoBuf += finalByte;
+ }
+
+ static void compressArrayInfo(std::string& arrayInfo) {
+ // NOTE: all operations in this function either shrink the array info or keep it the same
+ // size, so they are able to work in-place in the arrayInfo buffer.
+
+ // Logic below assumes arrayInfo is null terminated as a simplification.
+ dassert(strlen(arrayInfo.data()) == arrayInfo.size());
+
+ const char* in = arrayInfo.data();
+ char* out = arrayInfo.data();
+ bool anyArrays = false;
+
+ // Remove all '{' immediately before a '|' or 'o', and check for arrays
+ char* lastNonBrace = nullptr;
+ while (auto c = *in++) {
+ if (c == '[') {
+ anyArrays = true;
+ } else if ((c == '|' || c == 'o') && lastNonBrace) {
+ // Rewind output to just past last non-brace, since the last set of opens are
+ // encoded implicitly.
+ dassert(lastNonBrace + 1 <= out);
+ out = lastNonBrace + 1;
+ }
+
+ if (c != '{')
+ lastNonBrace = out;
+
+ *out++ = c;
+ }
+
+ // If there were no arrays, we don't need any array info.
+ if (!anyArrays) {
+ arrayInfo.clear();
+ return;
+ }
+
+ invariant(size_t(out - arrayInfo.data()) <= arrayInfo.size());
+
+ // Remove final run of '|' since end implies an infinite sequence of them.
+ while (out - arrayInfo.data() >= 1 && out[-1] == '|') {
+ out--;
+ }
+ arrayInfo.resize(out - arrayInfo.data()); // Reestablishes null termination.
+
+ // Now do a final pass to RLE remaining runs of '|' or 'o'. It may be possible to integrate
+ // this with the first loop above, but it would be a bit more complicated because we need
+ // to have removed any implicit runs of '{' prior to each '|' or 'o' before looking for
+ // runs.
+ out = arrayInfo.data();
+ in = arrayInfo.data();
+ while (auto c = *in++) {
+ *out++ = c;
+ if (c != '|' && c != 'o')
+ continue;
+
+ size_t repeats = 0;
+ while (*in == c) {
+ repeats++;
+ in++;
+ }
+
+ if (repeats) {
+ auto buf = ItoA(repeats); // Must be on own line so the buffer outlives numStr.
+ auto numStr = StringData(buf);
+
+ // We know there is room because a number > 0 takes up no more space than the number
+ // of '|' that it is replacing. repeats == 1 is the worst case because it isn't
+ // saving any space. However we still replace here since it should make decoding
+ // slightly faster.
+ dassert(numStr.size() <= repeats);
+ for (char c : numStr) {
+ dassert(out < in); // Note: `in` has already been advanced.
+ *out++ = c;
+ }
+ }
+ }
+
+ invariant(size_t(out - arrayInfo.data()) <= arrayInfo.size());
+ arrayInfo.resize(out - arrayInfo.data());
+ }
+
+ static bool shouldSkipField(PathView fieldName) {
+ // TODO consider skipping when the nesting depths exceed our documented limits of 100
+ // documents. This would allow decoding to use fixed-size bitsets without needing to check
+ // for overflow.
+
+ // WARNING: do not include the field name in error messages because if we throw from this
+ // function then the field name isn't valid utf-8, and could poison the whole error message!
+
+ // We only care about \xFF at the beginning, even though utf-8 bans it everywhere.
+ static_assert(ColumnStore::kRowIdPath == "\xFF"_sd);
+ uassert(6519200,
+ "Field name contains '\\xFF' which isn't valid in UTF-8",
+ fieldName[0] != '\xFF'); // We know field names are nul-terminated so this is safe.
+
+ // Fields with dots are not illegal, but we skip them because a) the query optimizer can't
+ // correctly track dependencies with them, and b) the current storage format uses dots as a
+ // separator and everything (including the array info encoder) would get confused with
+ // documents like {a: {b:1}, a.b: 2} because that breaks some of the assumptions.
+ return fieldName.find('.') != std::string::npos;
+ }
+
+ static boost::optional<PathView> parentPathOf(PathView path) {
+ auto sep = path.rfind('.');
+ if (sep == std::string::npos)
+ return {};
+
+ return path.substr(0, sep);
+ }
+
+ // This is the same as StringMap<V> but with node_hash_map. We need the reference stability
+ // guarantee because we have recursive functions that both insert into _paths and hold a
+ // reference to one of its values. It seems better to use a node-based hash table than to redo
+ // the lookup every time we want to touch a cell.
+ template <typename V>
+ using NodeStringMap = absl::node_hash_map<std::string, V, StringMapHasher, StringMapEq>;
+
+ PathValue _currentPath;
+ std::string _currentArrayInfo; // Describes a walk from root to current pos. No '|' or 'o'.
+ NodeStringMap<RawCellValue> _paths;
+ bool _inDoubleNestedArray = false;
+
+ const PathsAndCells _pathsAndCells = kPathsAndCells;
+};
+} // namespace
+
+void visitCellsForInsert(const BSONObj& obj,
+ function_ref<void(PathView, const UnencodedCellView&)> cb) {
+ ColumnShredder(obj).visitCells(cb);
+}
+
+void visitPathsForDelete(const BSONObj& obj, function_ref<void(PathView)> cb) {
+ ColumnShredder(obj, ColumnShredder::kOnlyPaths).visitPaths(cb);
+}
+
+void visitDiffForUpdate(const BSONObj& oldObj,
+ const BSONObj& newObj,
+ function_ref<void(DiffAction, PathView, const UnencodedCellView*)> cb) {
+ ColumnShredder::visitDiff(oldObj, newObj, cb);
+}
+
+bool operator==(const UnencodedCellView& lhs, const UnencodedCellView& rhs) {
+ if (lhs.hasDuplicateFields || rhs.hasDuplicateFields) {
+ // As a special case, if either is true, we only care about comparing this field, since all
+ // other fields are suspect. This simplifies testing, because we don't need to guess what
+ // the output will be (and lock it into a correctness test!) for the case with duplicate
+ // fields.
+ return lhs.hasDuplicateFields == rhs.hasDuplicateFields;
+ }
+
+ return identicalBSONElementArrays(lhs.vals, rhs.vals) && //
+ lhs.arrayInfo == rhs.arrayInfo && //
+ lhs.hasSubPaths == rhs.hasSubPaths && //
+ lhs.isSparse == rhs.isSparse && //
+ lhs.hasDoubleNestedArrays == rhs.hasDoubleNestedArrays;
+}
+
+std::ostream& operator<<(std::ostream& os, const UnencodedCellView& cell) {
+ if (cell.hasDuplicateFields) {
+ // As a special case just output this info, since other fields don't matter.
+ return os << "{duplicateFields: 1}";
+ }
+
+ os << "{vals: [";
+ for (auto&& elem : cell.vals) {
+ if (&elem != &cell.vals.front())
+ os << ", ";
+ os << elem.toString(/*includeFieldName*/ false);
+ }
+ os << "], arrayInfo: '" << cell.arrayInfo;
+ os << "', hasSubPaths: " << cell.hasSubPaths;
+ os << ", isSparse: " << cell.isSparse;
+ os << ", hasDoubleNestedArrays: " << cell.hasDoubleNestedArrays;
+ os << '}';
+
+ return os;
+}
+std::ostream& operator<<(std::ostream& os, const UnencodedCellView* cell) {
+ return cell ? (os << *cell) : (os << "(no cell)");
+}
+} // namespace mongo::column_keygen
diff --git a/src/mongo/db/index/column_key_generator.h b/src/mongo/db/index/column_key_generator.h
new file mode 100644
index 00000000000..d88e642ff0d
--- /dev/null
+++ b/src/mongo/db/index/column_key_generator.h
@@ -0,0 +1,111 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+#include <iosfwd>
+
+#include "mongo/db/storage/column_store.h"
+#include "mongo/util/functional.h"
+
+namespace mongo::column_keygen {
+/**
+ * This is a representation of the cell prior to flattening it out into a buffer which is passed to
+ * visitor callbacks.
+ *
+ * All memory within the UnencodedCellView should only be assumed valid within the callback. If you
+ * need it longer, you must copy it yourself. Non-test callers will generally immediately encode
+ * this to a flat buffer, so this shouldn't be a problem.
+ */
+struct UnencodedCellView {
+ const std::vector<BSONElement>& vals;
+ StringData arrayInfo;
+
+ // If true, this path has multiple values in a single (possibly nested) object with the same
+ // field name. In this case, arrayInfo will be empty and this cell must not be used to
+ // reconstruct an object. We should probably not attempt to encode vals in the index either, and
+ // just put a marker that causes us to either skip the row (because it breaks the rules) or go
+ // to the row store.
+ //
+ // Note that this detection is best-effort and will only detect cases that would result in
+ // corrupt array info. We have decided that query results do not need to be precise for objects
+ // with duplicate fields, so it is OK if we don't detect every case, as long as we don't crash
+ // or cause corruption on the undetected cases.
+ bool hasDuplicateFields;
+
+ // If true, this cell omits values that are stored in subpaths.
+ bool hasSubPaths;
+
+ // If true, when reconstructing an object, you will need to visit the parent path in order to
+ // match current semantics for projections and field-path expressions.
+ bool isSparse;
+
+ // If true, at least one of the values in vals is inside of a directly-double-nested array, or
+ // the field name was encountered while already inside of a directly-double-nested array, so
+ // arrayInfo must be consulted to know which values to skip when matching. If false, simple
+ // matches can ignore the array info and just match against each value independently.
+ bool hasDoubleNestedArrays;
+
+ // These are only used in tests and for debugging.
+ friend bool operator==(const UnencodedCellView&, const UnencodedCellView&);
+ friend std::ostream& operator<<(std::ostream&, const UnencodedCellView&);
+ friend std::ostream& operator<<(std::ostream&, const UnencodedCellView*);
+};
+
+/**
+ * Visits all paths within obj and provides their cell values.
+ * Path visit order is unspecified.
+ */
+void visitCellsForInsert(const BSONObj& obj,
+ function_ref<void(PathView, const UnencodedCellView&)> cb);
+
+/**
+ * Visits all paths within obj. When deleting, you do not need to know about values.
+ * Path visit order is unspecified.
+ */
+void visitPathsForDelete(const BSONObj& obj, function_ref<void(PathView)> cb);
+
+/**
+ * See visitDiffForUpdate().
+ */
+enum DiffAction { kInsert, kUpdate, kDelete };
+
+/**
+ * Computes differences between oldObj and newObj, and invokes cb() with the required actions to
+ * take to update the columnar index.
+ *
+ * For kInsert and kUpdate, the UnencodedCellView will point to the cell data for newObj (you
+ * don't need to know the value for oldObj).
+ *
+ * For kDelete, the UnencodedCellView pointer will be null.
+ *
+ * Path visit order is unspecified.
+ */
+void visitDiffForUpdate(const BSONObj& oldObj,
+ const BSONObj& newObj,
+ function_ref<void(DiffAction, PathView, const UnencodedCellView*)> cb);
+
+} // namespace mongo::column_keygen
diff --git a/src/mongo/db/index/column_key_generator_test.cpp b/src/mongo/db/index/column_key_generator_test.cpp
new file mode 100644
index 00000000000..e299bb1a68d
--- /dev/null
+++ b/src/mongo/db/index/column_key_generator_test.cpp
@@ -0,0 +1,734 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/bson/bson_depth.h"
+#include "mongo/bson/json.h"
+#include "mongo/db/index/column_key_generator.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo::column_keygen {
+namespace {
+// Bit flags mirroring the four booleans of UnencodedCellView, so expected cells can be written
+// compactly as an OR of flags (see UnencodedCellValue_ForTest::toView()).
+enum Flags_ForTest {
+    kNoFlags = 0,
+    kHasDuplicateFields = 1 << 0,
+    kHasSubPath = 1 << 1,
+    kIsSparse = 1 << 2,
+    kHasDoubleNestedArrays = 1 << 3,
+};
+
+// Compact JSON-based description of an expected cell, convertible to a real UnencodedCellView
+// via toView().
+struct UnencodedCellValue_ForTest {
+    StringData valsJson;  // Comma-separated values
+    std::string arrayInfo;
+    int flags = kNoFlags;
+
+    // Materializes this description as an UnencodedCellView. The parsed BSON is stored into
+    // 'owner' and the element list into 'elems', so both out-params must outlive the returned
+    // view (the view only points into them).
+    UnencodedCellView toView(BSONObj& owner, std::vector<BSONElement>& elems) const {
+        owner = fromjson(std::string("{x:[") + valsJson + "]}");
+        elems = owner.firstElement().Array();
+        return {
+            elems,
+            arrayInfo,
+            bool(flags & kHasDuplicateFields),
+            bool(flags & kHasSubPath),
+            bool(flags & kIsSparse),
+            bool(flags & kHasDoubleNestedArrays),
+        };
+    }
+};
+
+// Runs visitCellsForInsert() over 'doc' and checks the visited (path, cell) pairs match
+// 'expected' exactly: no unexpected paths, no missing paths, and each cell compares equal.
+// 'line' is the caller's __LINE__, used to identify the failing case in test output.
+void insertTest(int line,
+                const BSONObj& doc,
+                const StringMap<UnencodedCellValue_ForTest>& expected) {
+    BSONObj owner;
+    std::vector<BSONElement> elems;
+
+    StringSet seenPaths;
+    visitCellsForInsert(doc, [&](PathView path, const UnencodedCellView& cell) {
+        seenPaths.insert(path.toString());
+
+        auto it = expected.find(path);
+        if (it == expected.end()) {
+            FAIL("Unexpected path in insert") << "test:" << line << " path:" << path;
+        }
+        auto expectedCell = it->second.toView(owner, elems);
+        ASSERT_EQ(cell, expectedCell) << "test:" << line << " path:" << path;
+    });
+
+    // Every expected path must actually have been visited.
+    for (auto [path, _] : expected) {
+        if (seenPaths.contains(path))
+            continue;
+
+        FAIL("Expected to see path in insert, but didn't") << "test:" << line << " path:" << path;
+    }
+}
+
+// Runs visitPathsForDelete() over 'doc' and checks the visited paths are exactly the keys of
+// 'expected' (only the keys matter for deletes; cell values are ignored).
+void deleteTest(int line,
+                const BSONObj& doc,
+                const StringMap<UnencodedCellValue_ForTest>& expected) {
+    StringSet seenPaths;
+    visitPathsForDelete(doc, [&](PathView path) {
+        seenPaths.insert(path.toString());
+
+        auto it = expected.find(path);
+        if (it == expected.end()) {
+            FAIL("Unexpected path in delete") << "test:" << line << " path:" << path;
+        }
+    });
+
+    // Every expected path must actually have been visited.
+    for (auto [path, _] : expected) {
+        if (seenPaths.contains(path))
+            continue;
+
+        FAIL("Expected to see path in delete, but didn't") << "test:" << line << " path:" << path;
+    }
+}
+
+// Diffs 'doc' against an empty document: every expected path must be reported exactly once as a
+// kDelete with a null cell pointer.
+void updateToEmptyTest(int line,
+                       const BSONObj& doc,
+                       const StringMap<UnencodedCellValue_ForTest>& expected) {
+    StringSet seenPaths;
+    visitDiffForUpdate(
+        doc, BSONObj(), [&](DiffAction action, PathView path, const UnencodedCellView* cell) {
+            ASSERT_EQ(action, kDelete) << "test:" << line << " path:" << path;
+
+            // Each path may be reported at most once.
+            ASSERT(!seenPaths.contains(path)) << "test:" << line << " path:" << path;
+            seenPaths.insert(path.toString());
+
+            auto it = expected.find(path);
+            if (it == expected.end()) {
+                FAIL("Unexpected path in updateToEmpty") << "action:" << action << " cell:" << cell
+                                                         << " test:" << line << " path:" << path;
+            }
+
+            // Deletes never carry cell data.
+            ASSERT(!cell) << "test:" << line << " path:" << path;
+        });
+
+    // Every expected path must actually have been visited.
+    for (auto [path, _] : expected) {
+        if (seenPaths.contains(path))
+            continue;
+
+        FAIL("Expected to see path in updateToEmpty, but didn't")
+            << "test:" << line << " path:" << path;
+    }
+}
+
+// Diffs an empty document against 'doc': every expected path must be reported exactly once as a
+// kInsert whose cell matches the expected value.
+void updateFromEmptyTest(int line,
+                         const BSONObj& doc,
+                         const StringMap<UnencodedCellValue_ForTest>& expected) {
+    BSONObj owner;
+    std::vector<BSONElement> elems;
+
+    StringSet seenPaths;
+    visitDiffForUpdate(
+        BSONObj(), doc, [&](DiffAction action, PathView path, const UnencodedCellView* cell) {
+            ASSERT_EQ(action, kInsert) << "test:" << line << " path:" << path;
+
+            // Each path may be reported at most once.
+            ASSERT(!seenPaths.contains(path)) << "test:" << line << " path:" << path;
+            seenPaths.insert(path.toString());
+
+            auto it = expected.find(path);
+            if (it == expected.end()) {
+                FAIL("Unexpected path in updateFromEmpty")
+                    << "action:" << action << " cell:" << cell << " test:" << line
+                    << " path:" << path;
+            }
+
+            // Inserts always carry cell data.
+            ASSERT(cell) << "test:" << line << " path:" << path;
+            auto expectedCell = it->second.toView(owner, elems);
+            ASSERT_EQ(*cell, expectedCell) << "test:" << line << " path:" << path;
+        });
+
+    // Every expected path must actually have been visited.
+    for (auto [path, _] : expected) {
+        if (seenPaths.contains(path))
+            continue;
+
+        FAIL("Expected to see path in updateFromEmpty, but didn't")
+            << "test:" << line << " path:" << path;
+    }
+}
+
+// Diffs 'doc' against itself: no actions of any kind may be produced, so any callback
+// invocation is an immediate failure. 'line' is the caller's __LINE__, used to identify the
+// failing case in test output.
+void updateWithNoChange(int line, const BSONObj& doc) {
+    // Unlike the other helpers, no 'owner'/'elems'/'seenPaths' state is needed here: the
+    // callback never legitimately fires, so there is nothing to collect or compare.
+    visitDiffForUpdate(
+        doc, doc, [&](DiffAction action, PathView path, const UnencodedCellView* cell) {
+            FAIL("Unexpected path in updateNoChange")
+                << "action:" << action << " cell:" << cell << " test:" << line << " path:" << path;
+        });
+}
+
+
+// Shared driver: parses 'json' and runs every visitation API (insert, delete, and the three
+// update-diff variants) against it, validating each against 'expected'. The RowID column is
+// added/removed here so individual test cases never need to mention it.
+void basicTests(int line, std::string json, StringMap<UnencodedCellValue_ForTest> expected) {
+    const BSONObj doc = fromjson(json);
+
+    // Add in the RowID column. Since it is always the same, tests shouldn't include it.
+    // We always expect to see it in inserts and deletes, and never in updates.
+    expected.insert({ColumnStore::kRowIdPath.toString(), {"", "", kHasSubPath}});
+
+    insertTest(line, doc, expected);
+    deleteTest(line, doc, expected);
+
+    expected.erase(ColumnStore::kRowIdPath);
+    updateToEmptyTest(line, doc, expected);
+    updateFromEmptyTest(line, doc, expected);
+    updateWithNoChange(line, doc);
+}
+
+// Covers the core shredding behaviors: scalars, flat and nested arrays, subobjects, sparse
+// paths, dotted field names (which are not indexable as paths), empty field names, and arrays
+// long enough to require counts in the array info.
+TEST(ColKeyGen, BasicTests) {
+    basicTests(__LINE__, R"({})", {});
+    basicTests(__LINE__,
+               R"({a: 1})",
+               {
+                   {"a", {"1", ""}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, b:2})",
+               {
+                   {"a", {"1", ""}},
+                   {"b", {"2", ""}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1, 2]})",
+               {
+                   {"a", {"1, 2", "["}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1, 1]})",  // Identical
+               {
+                   {"a", {"1, 1", "["}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1, [2]]})",
+               {
+                   {"a", {"1, 2", "[|[", kHasDoubleNestedArrays}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1, []]})",  // Empty array isn't "double nested" (unless it is...)
+               {
+                   {"a", {"1, []", "["}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1, [[]]]})",  // ... now it is
+               {
+                   {"a", {"1, []", "[|[", kHasDoubleNestedArrays}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}, {b:2}]})",
+               {
+                   {"a", {"", "[o1", kHasSubPath}},
+                   {"a.b", {"1, 2", "["}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}, {c:2}]})",
+               {
+                   {"a", {"", "[o1", kHasSubPath}},
+                   {"a.b", {"1", "[", kIsSparse}},
+                   {"a.c", {"2", "[1", kIsSparse}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}, {c:[2, 3]}, null]})",
+               {
+                   {"a", {"null", "[o1", kHasSubPath}},
+                   {"a.b", {"1", "[", kIsSparse}},
+                   {"a.c", {"2, 3", "[1{[", kIsSparse}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}, {b:[2, 3]}]})",
+               {
+                   {"a", {"", "[o1", kHasSubPath}},
+                   {"a.b", {"1,2,3", "[|{["}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}, {b:[2, 3]}, null]})",
+               {
+                   {"a", {"null", "[o1", kHasSubPath}},
+                   {"a.b", {"1,2,3", "[|{[", kIsSparse}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}, [{b:[2, 3]}], null]})",
+               {
+                   {"a", {"null", "[o[o]", kHasSubPath}},
+                   {"a.b", {"1,2,3", "[|[{[", kIsSparse | kHasDoubleNestedArrays}},
+               });
+    basicTests(__LINE__,
+               R"({a: [[{b: {c: 1}}]]})",
+               {
+                   {"a", {"", "[[o", kHasSubPath}},
+                   {"a.b", {"", "[[o", kHasSubPath | kHasDoubleNestedArrays}},
+                   {"a.b.c", {"1", "[[", kHasDoubleNestedArrays}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, 'dotted.field': 1})",
+               {
+                   {"a", {"1", ""}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, b: {'dotted.field': 1}})",
+               {
+                   {"a", {"1", ""}},
+                   {"b", {"", "", kHasSubPath}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, b: [{'dotted.field': 1}]})",
+               {
+                   {"a", {"1", ""}},
+                   {"b", {"", "[o", kHasSubPath}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, b: [{'dotted.field': 1, c: 1}]})",
+               {
+                   {"a", {"1", ""}},
+                   {"b", {"", "[o", kHasSubPath}},
+                   {"b.c", {"1", "["}},
+               });
+    basicTests(__LINE__,
+               R"({'': 1})",
+               {
+                   {"", {"1", ""}},
+               });
+    basicTests(__LINE__,
+               R"({'': {'': 1}})",
+               {
+                   {"", {"", "", kHasSubPath}},
+                   {".", {"1", ""}},
+               });
+    basicTests(__LINE__,
+               R"({'': {'': {'': 1}}})",
+               {
+                   {"", {"", "", kHasSubPath}},
+                   {".", {"", "", kHasSubPath}},
+                   {"..", {"1", ""}},
+               });
+    basicTests(__LINE__,
+               R"({'': [{'': [{'': [1]}]}]})",
+               {
+                   {"", {"", "[o", kHasSubPath}},
+                   {".", {"", "[{[o", kHasSubPath}},
+                   {"..", {"1", "[{[{["}},
+               });
+    basicTests(__LINE__,
+               R"({'a': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]})",
+               {
+                   {"a", {"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "["}},
+               });
+    basicTests(__LINE__,
+               R"({'a': [[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], [99]]})",
+               {
+                   {"a",
+                    {
+                        "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,99",
+                        "[[|19][",
+                        kHasDoubleNestedArrays,
+                    }},
+               });
+    basicTests(
+        __LINE__,
+        R"({'a': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20, {b: 1}]})",
+        {
+            {"a", {"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "[|19o", kHasSubPath}},
+            {"a.b", {"1", "[20", kIsSparse}},
+        });
+    basicTests(
+        __LINE__,
+        R"({'a': [{b:1}, 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20, {b: 1}]})",
+        {
+            {"a", {"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "[o|19o", kHasSubPath}},
+            {"a.b", {"1,1", "[|+20", kIsSparse}},
+        });
+    basicTests(__LINE__,
+               R"({
+                   'a': [
+                       {b:1},{b:2},{b:3},{b:4},{b:5},{b:6},{b:7},{b:8},{b:9},{b:10},
+                       {b:11},{b:12},{b:13},{b:14},{b:15},{b:16},{b:17},{b:18},{b:19},{b:20}
+                   ]
+               })",
+               {
+                   {"a", {"", "[o19", kHasSubPath}},
+                   {"a.b", {"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "["}},
+               });
+}
+
+// Stress-tests shredding at the maximum supported BSON nesting depth, for pure object nesting,
+// pure array nesting, and object/array alternation with either kind innermost.
+TEST(ColKeyGen, DeepObjectTests) {
+    // Can't use lambdas because they can't be recursive (especially mutually recursive), but I'd
+    // still like to keep these local to this test
+    struct funcs {
+        // Builds {x:{x:...{x:1}...}} nested 'depthLeft' levels deep.
+        static void addObjToStr(std::string& str, int depthLeft) {
+            if (!depthLeft) {
+                str += '1';
+                return;
+            }
+
+            str += "{x:";
+            addObjToStr(str, depthLeft - 1);
+            str += "}";
+        }
+
+        // Builds [[...[1]...]] nested 'depthLeft' levels deep.
+        static void addArrToStr(std::string& str, int depthLeft) {
+            if (!depthLeft) {
+                str += '1';
+                return;
+            }
+
+            str += "[";
+            addArrToStr(str, depthLeft - 1);
+            str += "]";
+        }
+
+        // Returns "x.x.....x" with 'length' components.
+        static std::string dottedPath(int length) {
+            invariant(length >= 1);
+            std::string out = "x";
+            for (int i = 1; i < length; i++) {
+                out += ".x";
+            }
+            return out;
+        }
+
+        // Builds {x:[{x:[...]}]} alternation starting with an object layer.
+        static void addAlternatingObjToStr(std::string& str, int depthLeft) {
+            if (!depthLeft) {
+                str += '1';
+                return;
+            }
+
+            str += "{x:";
+            addAlternatingArrToStr(str, depthLeft - 1);
+            str += "}";
+        }
+
+        // Builds [{x:[{x:...}]}] alternation starting with an array layer.
+        static void addAlternatingArrToStr(std::string& str, int depthLeft) {
+            if (!depthLeft) {
+                str += '1';
+                return;
+            }
+
+            str += "[";
+            addAlternatingObjToStr(str, depthLeft - 1);
+            str += "]";
+        }
+
+        // Array info for a path 'pathDepth' components deep in the alternating shape: "[{[{[...".
+        static std::string alternatingArrayInfo(int pathDepth) {
+            invariant(pathDepth >= 1);
+            std::string out = "[";
+            for (int i = 1; i < pathDepth; i++) {
+                out += "{[";
+            }
+            return out;
+        }
+    };
+
+    // The full name is quite the line-full.
+    constexpr int kDepth = BSONDepth::kBSONDepthParameterCeiling;
+
+    {  // Just object nesting
+        std::string obj;
+        funcs::addObjToStr(obj, kDepth);
+        StringMap<UnencodedCellValue_ForTest> expected;
+        for (int i = 1; i < kDepth; i++) {
+            expected[funcs::dottedPath(i)] = {"", "", kHasSubPath};
+        }
+        expected[funcs::dottedPath(kDepth)] = {"1", ""};
+        basicTests(__LINE__, obj, expected);
+    }
+
+    {  // Just array nesting
+        std::string arr;
+        funcs::addArrToStr(arr, kDepth);
+        basicTests(__LINE__,
+                   "{x: " + arr + "}",
+                   {
+                       {"x", {"1", std::string(kDepth, '['), kHasDoubleNestedArrays}},
+                   });
+    }
+
+    // The next two tests cover a mix of object and array nesting, but differ only in which is the
+    // innermost. Since the constant was even when this was written, the first tests with an
+    // innermost array, and the second tests an innermost object. There may need to be slight
+    // adjustments to the tests in the unlikely event that the constant ever becomes odd.
+    static_assert(BSONDepth::kBSONDepthParameterCeiling % 2 == 0);  // See above if this fails.
+
+    {  // Innermost array.
+        std::string obj;
+        funcs::addAlternatingObjToStr(obj, kDepth);
+        StringMap<UnencodedCellValue_ForTest> expected;
+        constexpr auto kPathLen = kDepth / 2;
+        for (int i = 1; i < kPathLen; i++) {
+            expected[funcs::dottedPath(i)] = {
+                "", funcs::alternatingArrayInfo(i) + 'o', kHasSubPath};
+        }
+        expected[funcs::dottedPath(kPathLen)] = {"1", funcs::alternatingArrayInfo(kPathLen)};
+        basicTests(__LINE__, obj, expected);
+    }
+    {  // Innermost object.
+        std::string obj;
+        funcs::addAlternatingObjToStr(obj, kDepth + 1);
+        StringMap<UnencodedCellValue_ForTest> expected;
+        constexpr auto kPathLen = kDepth / 2 + 1;
+        for (int i = 1; i < kPathLen; i++) {
+            expected[funcs::dottedPath(i)] = {
+                "", funcs::alternatingArrayInfo(i) + 'o', kHasSubPath};
+        }
+        expected[funcs::dottedPath(kPathLen)] = {"1", funcs::alternatingArrayInfo(kPathLen - 1)};
+        basicTests(__LINE__, obj, expected);
+    }
+}
+
+// Documents with duplicate field names (legal in BSON) generally can't be shredded into simple
+// cells: any path touched by the duplication is marked kHasDuplicateFields with no values, so
+// readers must fall back to the row store. Paths reachable through only ONE of the duplicates
+// still get normal cells (see the cases at the bottom).
+TEST(ColKeyGen, DuplicateFieldTests) {
+    basicTests(__LINE__,
+               R"({a: 1, a: 2})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1], a: 2})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, a: [2]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1], a: [2]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: {b:1}, a: {b:2}})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: {b:[1]}, a: {b:[2]}})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:[1]}], a: [{b:[2]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: {b:[1]}, a: [{b:[2]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:[1]}], a: {b:[2]}})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: {b:[1]}, a: [null, {b:[2]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [null, {b:[1]}], a: {b:[2]}})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:[1, 3]}], a: [{b:[2]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:[1, 3]}, null], a: [{b:[2]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [null, {b:[1, 3]}], a: [{b:[2]}]})",  // No index in second a.b
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [null, null, {b:[1, 3]}], a: [null, {b:[2]}]})",  // Index in second a.b
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [null, {b:[{c:1}, {c:3}]}], a: [{b:[{c:2}]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+                   {"a.b.c", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [null, null, {b:[{c:1}, {c:3}]}], a: [null, {b:[{c:2}]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+                   {"a.b.c", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({"": 1, "": 2})",
+               {
+                   {"", {"", "", kHasDuplicateFields}},
+               });
+
+    // This behaves the same as {a:[{b:[1,3]}, {b:2}]} as far as a.b can tell.
+    basicTests(__LINE__,
+               R"({a: [{b:[1, 3]}], a: [null, {b:[2]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"1,3,2", "[{[|1]{[", kIsSparse}},
+               });
+
+    // These tests only have one a.b path.
+    basicTests(__LINE__,
+               R"({a: [{b:1}], a: 2})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"1", "[", kIsSparse}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, a: [{b:2}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"2", "[", kIsSparse}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}], a: [2]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"1", "[", kIsSparse}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1], a: [{b:2}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"2", "[", kIsSparse}},
+               });
+}
+
+// Diffs jsonOld against jsonNew and checks that the emitted (action, path, cell) triples match
+// 'expected' exactly: each path reported at most once, kDelete carries a null cell, and
+// kInsert/kUpdate cells compare equal to the expected values (which describe the NEW document).
+void updateTest(int line,
+                std::string jsonOld,
+                std::string jsonNew,
+                StringMap<std::pair<DiffAction, UnencodedCellValue_ForTest>> expected) {
+    BSONObj owner;
+    std::vector<BSONElement> elems;
+
+    StringSet seenPaths;
+    visitDiffForUpdate(fromjson(jsonOld),
+                       fromjson(jsonNew),
+                       [&](DiffAction action, PathView path, const UnencodedCellView* cell) {
+                           // Each path may be reported at most once.
+                           ASSERT(!seenPaths.contains(path)) << "test:" << line << " path:" << path;
+                           seenPaths.insert(path.toString());
+
+                           auto it = expected.find(path);
+                           if (it == expected.end()) {
+                               FAIL("Unexpected path in updateTest")
+                                   << "action:" << action << " cell:" << cell << " test:" << line
+                                   << " path:" << path;
+                           }
+
+                           auto expectedAction = it->second.first;
+                           ASSERT_EQ(action, expectedAction) << "test:" << line << " path:" << path;
+                           if (action == kDelete) {
+                               // Deletes never carry cell data.
+                               ASSERT(!cell) << "test:" << line << " path:" << path;
+                           } else {
+                               ASSERT(cell) << "test:" << line << " path:" << path;
+                               auto expectedCell = it->second.second.toView(owner, elems);
+                               ASSERT_EQ(*cell, expectedCell)
+                                   << "test:" << line << " path:" << path;
+                           }
+                       });
+
+    // Every expected path must actually have been visited.
+    for (auto [path, _] : expected) {
+        if (seenPaths.contains(path))
+            continue;
+
+        FAIL("Expected to see path in updateTest, but didn't")
+            << "test:" << line << " path:" << path;
+    }
+}
+
+// Spot checks for diff-based updates: value changes, a sibling path appearing (making an
+// existing path sparse), a sibling path disappearing (making it dense again), and empty field
+// names.
+TEST(ColKeyGen, UpdateTests) {
+
+    updateTest(__LINE__,
+               R"({a: [1, {b: 1}]})",
+               R"({a: [1, {b: 2}]})",
+               {
+                   {"a.b", {kUpdate, {"2", "[1", kIsSparse}}},
+               });
+    updateTest(__LINE__,
+               R"({a: [1, {b: 1}]})",
+               R"({a: [2, {b: 1}]})",
+               {
+                   {"a", {kUpdate, {"2", "[|o", kHasSubPath}}},
+
+               });
+    updateTest(__LINE__,
+               R"({a: [{b: 1}]})",  // a.b becomes isSparse
+               R"({a: [{b: 1}, {c:1}]})",
+               {
+                   {"a", {kUpdate, {"", "[o1", kHasSubPath}}},
+                   {"a.b", {kUpdate, {"1", "[", kIsSparse}}},
+                   {"a.c", {kInsert, {"1", "[1", kIsSparse}}},
+               });
+    updateTest(__LINE__,
+               R"({a: [{b: 1}, {c:1}]})",  // a.b becomes not isSparse
+               R"({a: [{b: 1}]})",
+               {
+                   {"a", {kUpdate, {"", "[o", kHasSubPath}}},
+                   {"a.b", {kUpdate, {"1", "["}}},
+                   {"a.c", {kDelete, {}}},
+               });
+    updateTest(__LINE__,
+               R"({'': 1})",
+               R"({'': 2})",
+               {
+                   {"", {kUpdate, {"2", ""}}},
+               });
+    updateTest(__LINE__,
+               R"({'': [1, {'': 1}]})",
+               R"({'': [1, {'': 2}]})",
+               {
+                   {".", {kUpdate, {"2", "[1", kIsSparse}}},
+               });
+}
+
+// TODO more tests, of course! In particular, the testing of complex update changes is a bit light.
+} // namespace
+} // namespace mongo::column_keygen
diff --git a/src/mongo/db/storage/column_store.h b/src/mongo/db/storage/column_store.h
index 268fc1280d1..aab7a719634 100644
--- a/src/mongo/db/storage/column_store.h
+++ b/src/mongo/db/storage/column_store.h
@@ -302,6 +302,23 @@ public:
return x;
}
+    /**
+     * If the range [*itInOut, end) begins with a number, returns it and positions *itInOut after
+     * the last byte of the number. If there is no number, returns 0 (which is typically encoded
+     * by omitting an optional number) and does not reposition *itInOut.
+     */
+    static int readArrInfoNumber(StringData::const_iterator* itInOut,
+                                 StringData::const_iterator end) {
+        auto it = *itInOut;  // Use local to allow compiler to assume it doesn't point to itself.
+        // Accumulate into an int to match the declared return type (the old size_t accumulator
+        // was implicitly narrowed on return).
+        int res = 0;
+        while (it != end && *it >= '0' && *it <= '9') {
+            res *= 10;  // noop first pass.
+            res += (*it++) - '0';
+        }
+        *itInOut = it;
+        return res;
+    }
+
protected:
class Cursor {
public: