diff options
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/index/SConscript | 2 | ||||
-rw-r--r-- | src/mongo/db/index/column_key_generator.cpp | 595 | ||||
-rw-r--r-- | src/mongo/db/index/column_key_generator.h | 111 | ||||
-rw-r--r-- | src/mongo/db/index/column_key_generator_test.cpp | 734 | ||||
-rw-r--r-- | src/mongo/db/storage/column_store.h | 17 |
5 files changed, 1459 insertions, 0 deletions
diff --git a/src/mongo/db/index/SConscript b/src/mongo/db/index/SConscript index 410c0a2825c..48ab6f723cc 100644 --- a/src/mongo/db/index/SConscript +++ b/src/mongo/db/index/SConscript @@ -23,6 +23,7 @@ env.Library( target='key_generator', source=[ 'btree_key_generator.cpp', + 'column_key_generator.cpp', 'expression_keys_private.cpp', 'sort_key_generator.cpp', 'wildcard_key_generator.cpp', @@ -150,6 +151,7 @@ env.CppUnitTest( source=[ '2d_key_generator_test.cpp', 'btree_key_generator_test.cpp', + 'column_key_generator_test.cpp', 'hash_key_generator_test.cpp', 's2_key_generator_test.cpp', 's2_bucket_key_generator_test.cpp', diff --git a/src/mongo/db/index/column_key_generator.cpp b/src/mongo/db/index/column_key_generator.cpp new file mode 100644 index 00000000000..6e348179da8 --- /dev/null +++ b/src/mongo/db/index/column_key_generator.cpp @@ -0,0 +1,595 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kIndex + +#include "mongo/platform/basic.h" + +#include "mongo/db/index/column_key_generator.h" + +#include <iomanip> +#include <ostream> +#include <vector> + +#include "mongo/db/storage/column_store.h" +#include "mongo/util/functional.h" +#include "mongo/util/string_map.h" + +namespace mongo::column_keygen { +namespace { + +/** + * Special handling for vals because BSONElement doesn't support ==, and we want binaryEqualValues + * rather than woCompare equality anyway. + */ +bool identicalBSONElementArrays(const std::vector<BSONElement>& lhs, + const std::vector<BSONElement>& rhs) { + if (lhs.size() != rhs.size()) + return false; + for (size_t i = 0; i < lhs.size(); i++) { + if (!lhs[i].binaryEqualValues(rhs[i])) + return false; + } + return true; +} + +/** + * This class handles the logic of key generation for columnar indexes. It produces + * UnencodedCellViews, which have all of the data that should be put in the index values, but it is + * not responsible for encoding that data into a flat buffer. That is handled by XXX. + * + * TODO once the code for that that is written (SERVER-64766) update this to replace the XXX. + * + * "Shredding" is an informal term for taking a single BSON document and splitting it into the data + * for each unique path. The data at each path should be sufficient to reconstruct the object in + * full fidelity, possibly considering parent paths. When determining the path, array index are not + * considered, only field names are. + * + * This class is private to this file, and only exposed via the free-functions in the header. + */ +class ColumnShredder { +public: + /** + * Option to constructor to avoid overhead when you only need to know about the paths in an + * object, such as for deletes. + */ + enum PathsAndCells : bool { kOnlyPaths = false, kPathsAndCells = true }; + + explicit ColumnShredder(const BSONObj& obj, PathsAndCells pathsAndCells = kPathsAndCells) + : _pathsAndCells(pathsAndCells) { + walkObj(_paths[ColumnStore::kRowIdPath], obj, /*isRoot=*/true); + } + + void visitPaths(function_ref<void(PathView)> cb) const { + for (auto&& [path, _] : _paths) { + cb(path); + } + } + + void visitCells(function_ref<void(PathView, const UnencodedCellView&)> cb) { + // This function requires that the shredder was constructed to record cell values. + invariant(_pathsAndCells); + + for (auto&& [path, cell] : _paths) { + cb(path, makeCellView(path, cell)); + } + } + + static void visitDiff(const BSONObj& oldObj, + const BSONObj& newObj, + function_ref<void(DiffAction, PathView, const UnencodedCellView*)> cb) { + auto oldShredder = ColumnShredder(oldObj); + auto newShredder = ColumnShredder(newObj); + + for (auto&& [path, rawCellNew] : newShredder._paths) { + auto itOld = oldShredder._paths.find(path); + if (itOld == oldShredder._paths.end()) { + auto cell = newShredder.makeCellView(path, rawCellNew); + cb(kInsert, path, &cell); + continue; + } + + auto&& [_, rawCellOld] = *itOld; // For symmetry with rawCellNew. + + // Need to compute sparseness prior to checking cell equality. + newShredder.computeIsSparse(path, &rawCellNew); + oldShredder.computeIsSparse(path, &rawCellOld); + if (rawCellNew == rawCellOld) + continue; // No change for this path. + + + auto cell = newShredder.makeCellView(path, rawCellNew); + cb(kUpdate, path, &cell); + } + + for (auto&& [path, _] : oldShredder._paths) { + if (!newShredder._paths.contains(path)) { + cb(kDelete, path, nullptr); + } + } + } + + +private: + enum Sparseness : int8_t { kUnknown, kIsSparse, kNotSparse }; + + struct RawCellValue { + // This is where we incrementally build up the full arrayInfo for this cell. While each + // position is essentially a walk from the root to somewhere in the document, this is a walk + // that starts at the root and then passes through each value or subobject at this path + // taking (hopefully) the shortest distance between each value. First we build it up in + // "uncompressed" format, then we call compressArrayInfo() to remove redundant information. + std::string arrayInfoBuf; + + // This is used when building up arrayInfoBuf. It records the absolute position of the last + // value or subobject appended to arrayInfoBuf, so that we can easily compute the relative + // path to the next value. Equivalently, it is the value of _currentArrayInfo at the time of + // the last call to appendToArrayInfo(). It never has any '|' or 'o' bytes. + std::string lastPosition; + + std::vector<BSONElement> vals; + int nSeen = 0; // Number of times this field has been seen. + int nNonEmptySubobjects = 0; // Number of values that are non-empty objects + Sparseness sparseness = kUnknown; // Memoized in computeIsSparse() + bool childrenMustBeSparse = false; + bool hasDoubleNestedArrays = false; + bool hasDuplicateFields = false; + + bool operator==(const RawCellValue& rhs) const { + const RawCellValue& lhs = *this; + + // Sparseness must have already been resolved. + invariant(lhs.sparseness != kUnknown); + invariant(rhs.sparseness != kUnknown); + + // Intentionally not checking nSeen or childrenMustBeSparse, since they are just inputs + // to sparseness computation. + + // Ordered to test cheaper stuff before more expensive stuff. + return lhs.sparseness == rhs.sparseness && // + lhs.hasDoubleNestedArrays == rhs.hasDoubleNestedArrays && // + lhs.nNonEmptySubobjects == rhs.nNonEmptySubobjects && // + lhs.hasDuplicateFields == rhs.hasDuplicateFields && // + lhs.arrayInfoBuf == rhs.arrayInfoBuf && // + identicalBSONElementArrays(lhs.vals, rhs.vals); + } + }; + + UnencodedCellView makeCellView(PathView path, RawCellValue& cell) { + compressArrayInfo(cell.arrayInfoBuf); + return { + cell.vals, + cell.arrayInfoBuf, + cell.hasDuplicateFields, + /*hasSubPaths*/ cell.nNonEmptySubobjects != 0, + computeIsSparse(path, &cell), + cell.hasDoubleNestedArrays, + }; + } + + // Memoized, so amortized O(1) when called on all paths, even though it must check all parent + // paths. + bool computeIsSparse(PathView path, RawCellValue* cell) { + if (cell->sparseness != kUnknown) + return cell->sparseness == kIsSparse; + + auto parentPath = parentPathOf(path); + if (!parentPath) { + // Top level fields are never considered sparse. + cell->sparseness = kNotSparse; + return false; + } + + auto parentIt = _paths.find(*parentPath); + invariant(parentIt != _paths.end()); + auto& parent = parentIt->second; + const bool isSparse = parent.childrenMustBeSparse || + cell->nSeen != parent.nNonEmptySubobjects || computeIsSparse(*parentPath, &parent); + cell->sparseness = isSparse ? kIsSparse : kNotSparse; + return isSparse; + } + + void walkObj(RawCellValue& cell, const BSONObj& obj, bool isRoot = false) { + if (_pathsAndCells) { + cell.nNonEmptySubobjects++; + if (!isRoot) { + appendToArrayInfo(cell, 'o'); + } + } + + ON_BLOCK_EXIT([&, oldArrInfoSize = _currentArrayInfo.size()] { + if (_pathsAndCells) + _currentArrayInfo.resize(oldArrInfoSize); + }); + if (_pathsAndCells && !isRoot) + _currentArrayInfo += '{'; + + for (auto [name, elem] : obj) { + // We skip fields in some edge cases such as dots in the field name. This may also throw + // if the field name contains invalid UTF-8 in a way that would break the index. + if (shouldSkipField(name)) + continue; + + ON_BLOCK_EXIT([&, oldPathSize = _currentPath.size()] { // + _currentPath.resize(oldPathSize); + }); + if (!isRoot) + _currentPath += '.'; + _currentPath += name; + + auto& subCell = _paths[_currentPath]; + subCell.nSeen++; + if (_inDoubleNestedArray) + subCell.hasDoubleNestedArrays = true; + handleElem(subCell, elem); + } + } + + void walkArray(RawCellValue& cell, const BSONObj& arr) { + DecimalCounter<unsigned> index; + + for (auto elem : arr) { + ON_BLOCK_EXIT([&, + oldArrInfoSize = _currentArrayInfo.size(), + oldInDoubleNested = _inDoubleNestedArray] { + if (_pathsAndCells) { + _currentArrayInfo.resize(oldArrInfoSize); + _inDoubleNestedArray = oldInDoubleNested; + } + }); + + if (_pathsAndCells) { + _currentArrayInfo += '['; + if (index == 0) { + // Zero is common so make it implied rather than explicit. + } else { + // Theoretically, index should be the same as elem.fieldNameStringData(), + // however, since we don't validate the array indexes, they cannot be trusted. + // The logic to encode array info relies on "sane" array indexes (at the very + // least they must be monotonically increasing), so we create a new index string + // here. + _currentArrayInfo += StringData(index); + } + ++index; + + // [] doesn't start a double nesting since {a:{$eq: []}} matches {a: [[]]} + if (elem.type() == Array && !elem.Obj().isEmpty()) + _inDoubleNestedArray = true; + } + + // Note: always same cell, since array traversal never changes path. + handleElem(cell, elem); + } + } + + void handleElem(RawCellValue& cell, const BSONElement& elem) { + // Only recurse on non-empty objects and arrays. Empty objects and arrays are handled as + // scalars. + if (elem.type() == Object) { + if (auto obj = elem.Obj(); !obj.isEmpty()) + return walkObj(cell, obj); + } else if (elem.type() == Array) { + if (auto obj = elem.Obj(); !obj.isEmpty()) + return walkArray(cell, obj); + } + + if (_pathsAndCells) { + // If we get here, then this walk will not have any children. This means that there is + // at least one path where all children (if any) will be missing structural information, + // so they will need to consult the parent path. + cell.childrenMustBeSparse = true; + + if (_inDoubleNestedArray) + cell.hasDoubleNestedArrays = true; + + cell.vals.push_back(elem); + appendToArrayInfo(cell, '|'); + } + } + + void appendToArrayInfo(RawCellValue& rcd, char finalByte) { + dassert(finalByte == '|' || finalByte == 'o'); + + auto foundDuplicateField = [&] { + rcd.arrayInfoBuf.clear(); + rcd.hasDuplicateFields = true; + }; + + if (rcd.hasDuplicateFields) { + // arrayInfo should be left empty in this case. + invariant(rcd.arrayInfoBuf.empty()); + return; + } + + ON_BLOCK_EXIT([&] { rcd.lastPosition = _currentArrayInfo; }); + + if (rcd.arrayInfoBuf.empty()) { + // The first time we get a position for this path, just record it verbatim. The first + // is special because we are essentially recording the absolute path to this value, + // while every other time we append the path relative to the prior value. + invariant(rcd.lastPosition.empty()); + rcd.arrayInfoBuf.reserve(_currentArrayInfo.size() + 1); + rcd.arrayInfoBuf += _currentArrayInfo; + rcd.arrayInfoBuf += finalByte; + return; + } + + // Make better names for symmetry (and to prevent accidental modifications): + StringData oldPosition = rcd.lastPosition; + StringData newPosition = _currentArrayInfo; + + if (MONGO_unlikely(oldPosition.empty() || newPosition.empty())) { + // This can only happen if there is a duplicate field at the top level. + return foundDuplicateField(); + } + + auto [oldIt, newIt] = std::mismatch(oldPosition.begin(), // + oldPosition.end(), + newPosition.begin(), + newPosition.end()); + if (MONGO_unlikely(newIt == newPosition.end())) { + // This can only happen if there is a duplicate field in an array, because otherwise + // the raw array infos must differ by an index in some array. + return foundDuplicateField(); + } + + // Walk back to start of differing elem. Important to use newIt here because if they are + // in the same array, oldIt may have an implicitly encoded 0 index, while newIt must + // have a higher index. + while (*newIt != '[') { + if (MONGO_unlikely(!(*newIt >= '0' && *newIt <= '9'))) { + // Non-index difference can only happen if there are duplicate fields in an array. + return foundDuplicateField(); + } + invariant(newIt > newPosition.begin()); + dassert(oldIt > oldPosition.begin()); // oldIt and newIt are at same index. + --newIt; + --oldIt; + } + invariant(oldIt < oldPosition.end()); + if (MONGO_unlikely(*oldIt != '[')) { + // This is another type of non-index difference. + return foundDuplicateField(); + } + + // Close out arrays past the first mismatch in LIFO order. + for (auto revOldIt = oldPosition.end() - 1; revOldIt != oldIt; --revOldIt) { + if (*revOldIt == '[') { + rcd.arrayInfoBuf += ']'; + } else { + dassert((*revOldIt >= '0' && *revOldIt <= '9') || *revOldIt == '{'); + } + } + + // Now process the mismatch. It must be a difference in array index (checked above). + dassert(*oldIt == '[' && *newIt == '['); + ++oldIt; + ++newIt; + const auto oldIx = ColumnStore::readArrInfoNumber(&oldIt, oldPosition.end()); + const auto newIx = ColumnStore::readArrInfoNumber(&newIt, newPosition.end()); + + if (MONGO_unlikely(newIx <= oldIx)) { + // If this element is at a same or lower index, we must have hit a duplicate field + // above and restarted the array indexing. + return foundDuplicateField(); + } + const auto delta = newIx - oldIx; + const auto skips = delta - 1; + if (skips == 0) { + // Nothing. Increment by one (skipping zero) is implicit. + } else { + rcd.arrayInfoBuf += '+'; + rcd.arrayInfoBuf += ItoA(skips); + } + + // Now put the rest of the new array info into the output buffer. + rcd.arrayInfoBuf += StringData(newIt, newPosition.end()); + rcd.arrayInfoBuf += finalByte; + } + + static void compressArrayInfo(std::string& arrayInfo) { + // NOTE: all operations in this function either shrink the array info or keep it the same + // size, so they are able to work in-place in the arrayInfo buffer. + + // Logic below assumes arrayInfo is null terminated as a simplification. + dassert(strlen(arrayInfo.data()) == arrayInfo.size()); + + const char* in = arrayInfo.data(); + char* out = arrayInfo.data(); + bool anyArrays = false; + + // Remove all '{' immediately before a '|' or 'o', and check for arrays + char* lastNonBrace = nullptr; + while (auto c = *in++) { + if (c == '[') { + anyArrays = true; + } else if ((c == '|' || c == 'o') && lastNonBrace) { + // Rewind output to just past last non-brace, since the last set of opens are + // encoded implicitly. + dassert(lastNonBrace + 1 <= out); + out = lastNonBrace + 1; + } + + if (c != '{') + lastNonBrace = out; + + *out++ = c; + } + + // If there were no arrays, we don't need any array info. + if (!anyArrays) { + arrayInfo.clear(); + return; + } + + invariant(size_t(out - arrayInfo.data()) <= arrayInfo.size()); + + // Remove final run of '|' since end implies an infinite sequence of them. + while (out - arrayInfo.data() >= 1 && out[-1] == '|') { + out--; + } + arrayInfo.resize(out - arrayInfo.data()); // Reestablishes null termination. + + // Now do a final pass to RLE remaining runs of '|' or 'o'. It may be possible to integrate + // this with the first loop above, but it would be a bit more complicated because we need + // to have removed any implicit runs of '{' prior to each '|' or 'o' before looking for + // runs. + out = arrayInfo.data(); + in = arrayInfo.data(); + while (auto c = *in++) { + *out++ = c; + if (c != '|' && c != 'o') + continue; + + size_t repeats = 0; + while (*in == c) { + repeats++; + in++; + } + + if (repeats) { + auto buf = ItoA(repeats); // Must be on own line so the buffer outlives numStr. + auto numStr = StringData(buf); + + // We know there is room because a number > 0 takes up no more space than the number + // of '|' that it is replacing. repeats == 1 is the worst case because it isn't + // saving any space. However we still replace here since it should make decoding + // slightly faster. + dassert(numStr.size() <= repeats); + for (char c : numStr) { + dassert(out < in); // Note: `in` has already been advanced. + *out++ = c; + } + } + } + + invariant(size_t(out - arrayInfo.data()) <= arrayInfo.size()); + arrayInfo.resize(out - arrayInfo.data()); + } + + static bool shouldSkipField(PathView fieldName) { + // TODO consider skipping when the nesting depths exceed our documented limits of 100 + // documents. This would allow decoding to use fixed-size bitsets without needing to check + // for overflow. + + // WARNING: do not include the field name in error messages because if we throw from this + // function then the field name isn't valid utf-8, and could poison the whole error message! + + // We only care about \xFF at the beginning, even though utf-8 bans it everywhere. + static_assert(ColumnStore::kRowIdPath == "\xFF"_sd); + uassert(6519200, + "Field name contains '\\xFF' which isn't valid in UTF-8", + fieldName[0] != '\xFF'); // We know field names are nul-terminated so this is safe. + + // Fields with dots are not illegal, but we skip them because a) the query optimizer can't + // correctly track dependencies with them, and b) the current storage format uses dots as a + // separator and everything (including the array info encoder) would get confused with + // documents like {a: {b:1}, a.b: 2} because that breaks some of the assumptions. + return fieldName.find('.') != std::string::npos; + } + + static boost::optional<PathView> parentPathOf(PathView path) { + auto sep = path.rfind('.'); + if (sep == std::string::npos) + return {}; + + return path.substr(0, sep); + } + + // This is the same as StringMap<V> but with node_hash_map. We need the reference stability + // guarantee because we have recursive functions that both insert into _paths and hold a + // reference to one of its values. It seems better to use a node-based hash table than to redo + // the lookup every time we want to touch a cell. + template <typename V> + using NodeStringMap = absl::node_hash_map<std::string, V, StringMapHasher, StringMapEq>; + + PathValue _currentPath; + std::string _currentArrayInfo; // Describes a walk from root to current pos. No '|' or 'o'. + NodeStringMap<RawCellValue> _paths; + bool _inDoubleNestedArray = false; + + const PathsAndCells _pathsAndCells = kPathsAndCells; +}; +} // namespace + +void visitCellsForInsert(const BSONObj& obj, + function_ref<void(PathView, const UnencodedCellView&)> cb) { + ColumnShredder(obj).visitCells(cb); +} + +void visitPathsForDelete(const BSONObj& obj, function_ref<void(PathView)> cb) { + ColumnShredder(obj, ColumnShredder::kOnlyPaths).visitPaths(cb); +} + +void visitDiffForUpdate(const BSONObj& oldObj, + const BSONObj& newObj, + function_ref<void(DiffAction, PathView, const UnencodedCellView*)> cb) { + ColumnShredder::visitDiff(oldObj, newObj, cb); +} + +bool operator==(const UnencodedCellView& lhs, const UnencodedCellView& rhs) { + if (lhs.hasDuplicateFields || rhs.hasDuplicateFields) { + // As a special case, if either is true, we only care about comparing this field, since all + // other fields are suspect. This simplifies testing, because we don't need to guess what + // the output will be (and lock it into a correctness test!) for the case with duplicate + // fields. + return lhs.hasDuplicateFields == rhs.hasDuplicateFields; + } + + return identicalBSONElementArrays(lhs.vals, rhs.vals) && // + lhs.arrayInfo == rhs.arrayInfo && // + lhs.hasSubPaths == rhs.hasSubPaths && // + lhs.isSparse == rhs.isSparse && // + lhs.hasDoubleNestedArrays == rhs.hasDoubleNestedArrays; +} + +std::ostream& operator<<(std::ostream& os, const UnencodedCellView& cell) { + if (cell.hasDuplicateFields) { + // As a special case just output this info, since other fields don't matter. + return os << "{duplicateFields: 1}"; + } + + os << "{vals: ["; + for (auto&& elem : cell.vals) { + if (&elem != &cell.vals.front()) + os << ", "; + os << elem.toString(/*includeFieldName*/ false); + } + os << "], arrayInfo: '" << cell.arrayInfo; + os << "', hasSubPaths: " << cell.hasSubPaths; + os << ", isSparse: " << cell.isSparse; + os << ", hasDoubleNestedArrays: " << cell.hasDoubleNestedArrays; + os << '}'; + + return os; +} +std::ostream& operator<<(std::ostream& os, const UnencodedCellView* cell) { + return cell ? (os << *cell) : (os << "(no cell)"); +} +} // namespace mongo::column_keygen diff --git a/src/mongo/db/index/column_key_generator.h b/src/mongo/db/index/column_key_generator.h new file mode 100644 index 00000000000..d88e642ff0d --- /dev/null +++ b/src/mongo/db/index/column_key_generator.h @@ -0,0 +1,111 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ +#include <iosfwd> + +#include "mongo/db/storage/column_store.h" +#include "mongo/util/functional.h" + +namespace mongo::column_keygen { +/** + * This is a representation of the cell prior to flattening it out into a buffer which is passed to + * visitor callbacks. + * + * All memory within the UnencodedCellView should only be assumed valid within the callback. If you + * need it longer, you must copy it yourself. Non-test callers will generally immediately encode + * this to a flat buffer, so this shouldn't be a problem. + */ +struct UnencodedCellView { + const std::vector<BSONElement>& vals; + StringData arrayInfo; + + // If true, this path has multiple values in a single (possibly nested) object with the same + // field name. In this case, arrayInfo will be empty and this cell must not be used to + // reconstruct an object. We should probably not attempt to encode vals in the index either, and + // just put a marker that causes us to either skip the row (because it breaks the rules) or go + // to the row store. + // + // Note that this detection is best-effort and will only detect cases that would result in + // corrupt array info. We have decided that query results do not need to be precise for objects + // with duplicate fields, so it is OK if we don't detect every case, as long as we don't crash + // or cause corruption on the undetected cases. + bool hasDuplicateFields; + + // If true, this cell omits values that are stored in subpaths. + bool hasSubPaths; + + // If true, when reconstructing an object, you will need to visit the parent path in order to + // match current semantics for projections and field-path expressions. + bool isSparse; + + // If true, at least one of the values in vals is inside of a directly-double-nested array, or + // the field name was encountered while already inside of a directly-double-nested array, so + // arrayInfo must be consulted to know which values to skip when matching. If false, simple + // matches can ignore the array info and just match against each value independently. + bool hasDoubleNestedArrays; + + // These are only used in tests and for debugging. + friend bool operator==(const UnencodedCellView&, const UnencodedCellView&); + friend std::ostream& operator<<(std::ostream&, const UnencodedCellView&); + friend std::ostream& operator<<(std::ostream&, const UnencodedCellView*); +}; + +/** + * Visits all paths within obj and provides their cell values. + * Path visit order is unspecified. + */ +void visitCellsForInsert(const BSONObj& obj, + function_ref<void(PathView, const UnencodedCellView&)> cb); + +/** + * Visits all paths within obj. When deleting, you do not need to know about values. + * Path visit order is unspecified. + */ +void visitPathsForDelete(const BSONObj& obj, function_ref<void(PathView)> cb); + +/** + * See visitDiffForUpdate(). + */ +enum DiffAction { kInsert, kUpdate, kDelete }; + +/** + * Computes differences between oldObj and newObj, and invokes cb() with the required actions to + * take to update the columnar index. + * + * For kInsert and kUpdate, the UnencodedCellView will point to the cell data for newObj (you + * don't need to know the value for oldObj). + * + * For kDelete, the UnencodedCellView pointer will be null. + * + * Path visit order is unspecified. + */ +void visitDiffForUpdate(const BSONObj& oldObj, + const BSONObj& newObj, + function_ref<void(DiffAction, PathView, const UnencodedCellView*)> cb); + +} // namespace mongo::column_keygen diff --git a/src/mongo/db/index/column_key_generator_test.cpp b/src/mongo/db/index/column_key_generator_test.cpp new file mode 100644 index 00000000000..e299bb1a68d --- /dev/null +++ b/src/mongo/db/index/column_key_generator_test.cpp @@ -0,0 +1,734 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/platform/basic.h" + +#include "mongo/bson/bson_depth.h" +#include "mongo/bson/json.h" +#include "mongo/db/index/column_key_generator.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::column_keygen { +namespace { +enum Flags_ForTest { + kNoFlags = 0, + kHasDuplicateFields = 1 << 0, + kHasSubPath = 1 << 1, + kIsSparse = 1 << 2, + kHasDoubleNestedArrays = 1 << 3, +}; + +struct UnencodedCellValue_ForTest { + StringData valsJson; // Comma-separated values + std::string arrayInfo; + int flags = kNoFlags; + + UnencodedCellView toView(BSONObj& owner, std::vector<BSONElement>& elems) const { + owner = fromjson(std::string("{x:[") + valsJson + "]}"); + elems = owner.firstElement().Array(); + return { + elems, + arrayInfo, + bool(flags & kHasDuplicateFields), + bool(flags & kHasSubPath), + bool(flags & kIsSparse), + bool(flags & kHasDoubleNestedArrays), + }; + } +}; + +void insertTest(int line, + const BSONObj& doc, + const StringMap<UnencodedCellValue_ForTest>& expected) { + BSONObj owner; + std::vector<BSONElement> elems; + + StringSet seenPaths; + visitCellsForInsert(doc, [&](PathView path, const UnencodedCellView& cell) { + seenPaths.insert(path.toString()); + + auto it = expected.find(path); + if (it == expected.end()) { + FAIL("Unexpected path in insert") << "test:" << line << " path:" << path; + } + auto expectedCell = it->second.toView(owner, elems); + ASSERT_EQ(cell, expectedCell) << "test:" << line << " path:" << path; + }); + + for (auto [path, _] : expected) { + if (seenPaths.contains(path)) + continue; + + FAIL("Expected to see path in insert, but didn't") << "test:" << line << " path:" << path; + } +} + +void deleteTest(int line, + const BSONObj& doc, + const StringMap<UnencodedCellValue_ForTest>& expected) { + StringSet seenPaths; + visitPathsForDelete(doc, [&](PathView path) { + seenPaths.insert(path.toString()); + + auto it = expected.find(path); + if (it == expected.end()) { + FAIL("Unexpected path in delete") << "test:" << line << " path:" << path; + } + }); + + for (auto [path, _] : expected) { + if (seenPaths.contains(path)) + continue; + + FAIL("Expected to see path in delete, but didn't") << "test:" << line << " path:" << path; + } +} + +void updateToEmptyTest(int line, + const BSONObj& doc, + const StringMap<UnencodedCellValue_ForTest>& expected) { + StringSet seenPaths; + visitDiffForUpdate( + doc, BSONObj(), [&](DiffAction action, PathView path, const UnencodedCellView* cell) { + ASSERT_EQ(action, kDelete) << "test:" << line << " path:" << path; + + ASSERT(!seenPaths.contains(path)) << "test:" << line << " path:" << path; + seenPaths.insert(path.toString()); + + auto it = expected.find(path); + if (it == expected.end()) { + FAIL("Unexpected path in updateToEmpty") << "action:" << action << " cell:" << cell + << " test:" << line << " path:" << path; + } + + ASSERT(!cell) << "test:" << line << " path:" << path; + }); + + for (auto [path, _] : expected) { + if (seenPaths.contains(path)) + continue; + + FAIL("Expected to see path in updateToEmpty, but didn't") + << "test:" << line << " path:" << path; + } +} + +void updateFromEmptyTest(int line, + const BSONObj& doc, + const StringMap<UnencodedCellValue_ForTest>& expected) { + BSONObj owner; + std::vector<BSONElement> elems; + + StringSet seenPaths; + visitDiffForUpdate( + BSONObj(), doc, [&](DiffAction action, PathView path, const UnencodedCellView* cell) { + ASSERT_EQ(action, kInsert) << "test:" << line << " path:" << path; + + ASSERT(!seenPaths.contains(path)) << "test:" << line << " path:" << path; + seenPaths.insert(path.toString()); + + auto it = expected.find(path); + if (it == expected.end()) { + FAIL("Unexpected path in updateFromEmpty") + << "action:" << action << " cell:" << cell << " test:" << line + << " path:" << path; + } + + ASSERT(cell) << "test:" << line << " path:" << path; + auto expectedCell = it->second.toView(owner, elems); + ASSERT_EQ(*cell, expectedCell) << "test:" << line << " path:" << path; + }); + + for (auto [path, _] : expected) { + if (seenPaths.contains(path)) + continue; + + FAIL("Expected to see path in updateFromEmpty, but didn't") + << "test:" << line << " path:" << path; + } +} + +void updateWithNoChange(int line, const BSONObj& doc) { + BSONObj owner; + std::vector<BSONElement> elems; + + StringSet seenPaths; + visitDiffForUpdate( + doc, doc, [&](DiffAction action, PathView path, const UnencodedCellView* cell) { + FAIL("Unexpected path in updateNoChange") + << "action:" << action << " cell:" << cell << " test:" << line << " path:" << path; + }); +} + + +void basicTests(int line, std::string json, StringMap<UnencodedCellValue_ForTest> expected) { + const BSONObj doc = fromjson(json); + + // Add in the RowID column. Since it is always the same, tests shouldn't include it. + // We always expect to see it in inserts and deletes, and never in updates. + expected.insert({ColumnStore::kRowIdPath.toString(), {"", "", kHasSubPath}}); + + insertTest(line, doc, expected); + deleteTest(line, doc, expected); + + expected.erase(ColumnStore::kRowIdPath); + updateToEmptyTest(line, doc, expected); + updateFromEmptyTest(line, doc, expected); + updateWithNoChange(line, doc); +} + +TEST(ColKeyGen, BasicTests) { + basicTests(__LINE__, R"({})", {}); + basicTests(__LINE__, + R"({a: 1})", + { + {"a", {"1", ""}}, + }); + basicTests(__LINE__, + R"({a: 1, b:2})", + { + {"a", {"1", ""}}, + {"b", {"2", ""}}, + }); + basicTests(__LINE__, + R"({a: [1, 2]})", + { + {"a", {"1, 2", "["}}, + }); + basicTests(__LINE__, + R"({a: [1, 1]})", // Identical + { + {"a", {"1, 1", "["}}, + }); + basicTests(__LINE__, + R"({a: [1, [2]]})", + { + {"a", {"1, 2", "[|[", kHasDoubleNestedArrays}}, + }); + basicTests(__LINE__, + R"({a: [1, []]})", // Empty array isn't "double nested" (unless it is...) + { + {"a", {"1, []", "["}}, + }); + basicTests(__LINE__, + R"({a: [1, [[]]]})", // ... now it is + { + {"a", {"1, []", "[|[", kHasDoubleNestedArrays}}, + }); + basicTests(__LINE__, + R"({a: [{b:1}, {b:2}]})", + { + {"a", {"", "[o1", kHasSubPath}}, + {"a.b", {"1, 2", "["}}, + }); + basicTests(__LINE__, + R"({a: [{b:1}, {c:2}]})", + { + {"a", {"", "[o1", kHasSubPath}}, + {"a.b", {"1", "[", kIsSparse}}, + {"a.c", {"2", "[1", kIsSparse}}, + }); + basicTests(__LINE__, + R"({a: [{b:1}, {c:[2, 3]}, null]})", + { + {"a", {"null", "[o1", kHasSubPath}}, + {"a.b", {"1", "[", kIsSparse}}, + {"a.c", {"2, 3", "[1{[", kIsSparse}}, + }); + basicTests(__LINE__, + R"({a: [{b:1}, {b:[2, 3]}]})", + { + {"a", {"", "[o1", kHasSubPath}}, + {"a.b", {"1,2,3", "[|{["}}, + }); + basicTests(__LINE__, + R"({a: [{b:1}, {b:[2, 3]}, null]})", + { + {"a", {"null", "[o1", kHasSubPath}}, + {"a.b", {"1,2,3", "[|{[", kIsSparse}}, + }); + basicTests(__LINE__, + R"({a: [{b:1}, [{b:[2, 3]}], null]})", + { + {"a", {"null", "[o[o]", kHasSubPath}}, + {"a.b", {"1,2,3", "[|[{[", kIsSparse | kHasDoubleNestedArrays}}, + }); + basicTests(__LINE__, + R"({a: [[{b: {c: 1}}]]})", + { + {"a", {"", "[[o", kHasSubPath}}, + {"a.b", {"", "[[o", kHasSubPath | kHasDoubleNestedArrays}}, + {"a.b.c", {"1", "[[", kHasDoubleNestedArrays}}, + }); + basicTests(__LINE__, + R"({a: 1, 'dotted.field': 1})", + { + {"a", {"1", ""}}, + }); + basicTests(__LINE__, + R"({a: 1, b: {'dotted.field': 1}})", + { + {"a", {"1", ""}}, + {"b", {"", "", kHasSubPath}}, + }); + basicTests(__LINE__, + R"({a: 1, b: [{'dotted.field': 1}]})", + { + {"a", {"1", ""}}, + {"b", {"", "[o", kHasSubPath}}, + }); + basicTests(__LINE__, + R"({a: 1, b: [{'dotted.field': 1, c: 1}]})", + { + {"a", {"1", ""}}, + {"b", {"", "[o", kHasSubPath}}, + {"b.c", {"1", "["}}, + }); + basicTests(__LINE__, + R"({'': 1})", + { + {"", {"1", ""}}, + }); + basicTests(__LINE__, + R"({'': {'': 1}})", + { + {"", {"", "", kHasSubPath}}, + {".", {"1", ""}}, + }); + basicTests(__LINE__, + R"({'': {'': {'': 1}}})", + { + {"", {"", "", kHasSubPath}}, + {".", {"", "", kHasSubPath}}, + {"..", {"1", ""}}, + }); + basicTests(__LINE__, + R"({'': [{'': [{'': [1]}]}]})", + { + {"", {"", "[o", kHasSubPath}}, + {".", {"", "[{[o", kHasSubPath}}, + {"..", {"1", "[{[{["}}, + }); + basicTests(__LINE__, + R"({'a': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]})", + { + {"a", {"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "["}}, + }); + basicTests(__LINE__, + R"({'a': [[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], [99]]})", + { + {"a", + { + "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,99", + "[[|19][", + kHasDoubleNestedArrays, + }}, + }); + basicTests( + __LINE__, + R"({'a': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20, {b: 1}]})", + { + {"a", {"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "[|19o", kHasSubPath}}, + {"a.b", {"1", "[20", kIsSparse}}, + }); + basicTests( + __LINE__, + R"({'a': [{b:1}, 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20, {b: 1}]})", + { + {"a", {"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "[o|19o", kHasSubPath}}, + {"a.b", {"1,1", "[|+20", kIsSparse}}, + }); + basicTests(__LINE__, + R"({ + 'a': [ + {b:1},{b:2},{b:3},{b:4},{b:5},{b:6},{b:7},{b:8},{b:9},{b:10}, + {b:11},{b:12},{b:13},{b:14},{b:15},{b:16},{b:17},{b:18},{b:19},{b:20} + ] + })", + { + {"a", {"", "[o19", kHasSubPath}}, + {"a.b", {"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "["}}, + }); +} + +TEST(ColKeyGen, DeepObjectTests) { + // Can't use lambdas because they can't be recursive (especially mutually recursive), but I'd + // still like to keep these local to this test + struct funcs { + static void addObjToStr(std::string& str, int depthLeft) { + if (!depthLeft) { + str += '1'; + return; + } + + str += "{x:"; + addObjToStr(str, depthLeft - 1); + str += "}"; + } + + static void addArrToStr(std::string& str, int depthLeft) { + if (!depthLeft) { + str += '1'; + return; + } + + str += "["; + addArrToStr(str, depthLeft - 1); + str += "]"; + } + + static std::string dottedPath(int length) { + invariant(length >= 1); + std::string out = "x"; + for (int i = 1; i < length; i++) { + out += ".x"; + } + return out; + } + + static void addAlternatingObjToStr(std::string& str, int depthLeft) { + if (!depthLeft) { + str += '1'; + return; + } + + str += "{x:"; + addAlternatingArrToStr(str, depthLeft - 1); + str += "}"; + } + + static void addAlternatingArrToStr(std::string& str, int depthLeft) { + if (!depthLeft) { + str += '1'; + return; + } + + str += "["; + addAlternatingObjToStr(str, depthLeft - 1); + str += "]"; + } + + static std::string alternatingArrayInfo(int pathDepth) { + invariant(pathDepth >= 1); + std::string out = "["; + for (int i = 1; i < pathDepth; i++) { + out += "{["; + } + return out; + } + }; + + // The full name is quite the line-full. + constexpr int kDepth = BSONDepth::kBSONDepthParameterCeiling; + + { // Just object nesting + std::string obj; + funcs::addObjToStr(obj, kDepth); + StringMap<UnencodedCellValue_ForTest> expected; + for (int i = 1; i < kDepth; i++) { + expected[funcs::dottedPath(i)] = {"", "", kHasSubPath}; + } + expected[funcs::dottedPath(kDepth)] = {"1", ""}; + basicTests(__LINE__, obj, expected); + } + + { // Just array nesting + std::string arr; + funcs::addArrToStr(arr, kDepth); + basicTests(__LINE__, + "{x: " + arr + "}", + { + {"x", {"1", std::string(kDepth, '['), kHasDoubleNestedArrays}}, + }); + } + + // The next two tests cover a mix of object and array nesting, but differ only in which is the + // innermost. Since the constant was even when this was written, the first tests with an + // innermost array, and the second tests an innermost object. There may need to be slight + // adjustments to the tests in the unlikely event that the constant ever becomes odd. + static_assert(BSONDepth::kBSONDepthParameterCeiling % 2 == 0); // See above if this fails. + + { // Innermost array. + std::string obj; + funcs::addAlternatingObjToStr(obj, kDepth); + StringMap<UnencodedCellValue_ForTest> expected; + constexpr auto kPathLen = kDepth / 2; + for (int i = 1; i < kPathLen; i++) { + expected[funcs::dottedPath(i)] = { + "", funcs::alternatingArrayInfo(i) + 'o', kHasSubPath}; + } + expected[funcs::dottedPath(kPathLen)] = {"1", funcs::alternatingArrayInfo(kPathLen)}; + basicTests(__LINE__, obj, expected); + } + { // Innermost object. + std::string obj; + funcs::addAlternatingObjToStr(obj, kDepth + 1); + StringMap<UnencodedCellValue_ForTest> expected; + constexpr auto kPathLen = kDepth / 2 + 1; + for (int i = 1; i < kPathLen; i++) { + expected[funcs::dottedPath(i)] = { + "", funcs::alternatingArrayInfo(i) + 'o', kHasSubPath}; + } + expected[funcs::dottedPath(kPathLen)] = {"1", funcs::alternatingArrayInfo(kPathLen - 1)}; + basicTests(__LINE__, obj, expected); + } +} + +TEST(ColKeyGen, DuplicateFieldTests) { + basicTests(__LINE__, + R"({a: 1, a: 2})", + { + {"a", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: [1], a: 2})", + { + {"a", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: 1, a: [2]})", + { + {"a", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: [1], a: [2]})", + { + {"a", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: {b:1}, a: {b:2}})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: {b:[1]}, a: {b:[2]}})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: [{b:[1]}], a: [{b:[2]}]})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: {b:[1]}, a: [{b:[2]}]})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: [{b:[1]}], a: {b:[2]}})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: {b:[1]}, a: [null, {b:[2]}]})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: [null, {b:[1]}], a: {b:[2]}})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: [{b:[1, 3]}], a: [{b:[2]}]})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: [{b:[1, 3]}, null], a: [{b:[2]}]})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: [null, {b:[1, 3]}], a: [{b:[2]}]})", // No index in second a.b + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: [null, null, {b:[1, 3]}], a: [null, {b:[2]}]})", // Index in second a.b + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: [null, {b:[{c:1}, {c:3}]}], a: [{b:[{c:2}]}]})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"", "", kHasDuplicateFields}}, + {"a.b.c", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({a: [null, null, {b:[{c:1}, {c:3}]}], a: [null, {b:[{c:2}]}]})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"", "", kHasDuplicateFields}}, + {"a.b.c", {"", "", kHasDuplicateFields}}, + }); + basicTests(__LINE__, + R"({"": 1, "": 2})", + { + {"", {"", "", kHasDuplicateFields}}, + }); + + // This behaves the same as {a:[{b:[1,3]}, {b:2}]} as far as a.b can tell. + basicTests(__LINE__, + R"({a: [{b:[1, 3]}], a: [null, {b:[2]}]})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"1,3,2", "[{[|1]{[", kIsSparse}}, + }); + + // These tests only have one a.b path. + basicTests(__LINE__, + R"({a: [{b:1}], a: 2})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"1", "[", kIsSparse}}, + }); + basicTests(__LINE__, + R"({a: 1, a: [{b:2}]})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"2", "[", kIsSparse}}, + }); + basicTests(__LINE__, + R"({a: [{b:1}], a: [2]})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"1", "[", kIsSparse}}, + }); + basicTests(__LINE__, + R"({a: [1], a: [{b:2}]})", + { + {"a", {"", "", kHasDuplicateFields}}, + {"a.b", {"2", "[", kIsSparse}}, + }); +} + +void updateTest(int line, + std::string jsonOld, + std::string jsonNew, + StringMap<std::pair<DiffAction, UnencodedCellValue_ForTest>> expected) { + BSONObj owner; + std::vector<BSONElement> elems; + + StringSet seenPaths; + visitDiffForUpdate(fromjson(jsonOld), + fromjson(jsonNew), + [&](DiffAction action, PathView path, const UnencodedCellView* cell) { + ASSERT(!seenPaths.contains(path)) << "test:" << line << " path:" << path; + seenPaths.insert(path.toString()); + + auto it = expected.find(path); + if (it == expected.end()) { + FAIL("Unexpected path in updateTest") + << "action:" << action << " cell:" << cell << " test:" << line + << " path:" << path; + } + + auto expectedAction = it->second.first; + ASSERT_EQ(action, expectedAction) << "test:" << line << " path:" << path; + if (action == kDelete) { + ASSERT(!cell) << "test:" << line << " path:" << path; + } else { + ASSERT(cell) << "test:" << line << " path:" << path; + auto expectedCell = it->second.second.toView(owner, elems); + ASSERT_EQ(*cell, expectedCell) + << "test:" << line << " path:" << path; + } + }); + + for (auto [path, _] : expected) { + if (seenPaths.contains(path)) + continue; + + FAIL("Expected to see path in updateTest, but didn't") + << "test:" << line << " path:" << path; + } +} + +TEST(ColKeyGen, UpdateTests) { + + updateTest(__LINE__, + R"({a: [1, {b: 1}]})", + R"({a: [1, {b: 2}]})", + { + {"a.b", {kUpdate, {"2", "[1", kIsSparse}}}, + }); + updateTest(__LINE__, + R"({a: [1, {b: 1}]})", + R"({a: [2, {b: 1}]})", + { + {"a", {kUpdate, {"2", "[|o", kHasSubPath}}}, + + }); + updateTest(__LINE__, + R"({a: [{b: 1}]})", // a.b becomes isSparse + R"({a: [{b: 1}, {c:1}]})", + { + {"a", {kUpdate, {"", "[o1", kHasSubPath}}}, + {"a.b", {kUpdate, {"1", "[", kIsSparse}}}, + {"a.c", {kInsert, {"1", "[1", kIsSparse}}}, + }); + updateTest(__LINE__, + R"({a: [{b: 1}, {c:1}]})", // a.b becomes not isSparse + R"({a: [{b: 1}]})", + { + {"a", {kUpdate, {"", "[o", kHasSubPath}}}, + {"a.b", {kUpdate, {"1", "["}}}, + {"a.c", {kDelete, {}}}, + }); + updateTest(__LINE__, + R"({'': 1})", + R"({'': 2})", + { + {"", {kUpdate, {"2", ""}}}, + }); + updateTest(__LINE__, + R"({'': [1, {'': 1}]})", + R"({'': [1, {'': 2}]})", + { + {".", {kUpdate, {"2", "[1", kIsSparse}}}, + }); +} + +// TODO more tests, of course! In particular, the testing of complex update changes is a bit light. +} // namespace +} // namespace mongo::column_keygen diff --git a/src/mongo/db/storage/column_store.h b/src/mongo/db/storage/column_store.h index 268fc1280d1..aab7a719634 100644 --- a/src/mongo/db/storage/column_store.h +++ b/src/mongo/db/storage/column_store.h @@ -302,6 +302,23 @@ public: return x; } + /** + * If the range [*itPtr, end) begins with a number, returns it and positions *itPtr after the + * last byte of number. If there is no number, returns 0 (which is typically encoded by omitting + * an optional number) and does not reposition *itPtr. + */ + static int readArrInfoNumber(StringData::const_iterator* itInOut, + StringData::const_iterator end) { + auto it = *itInOut; // Use local to allow compiler to assume it doesn't point to itself. + size_t res = 0; + while (it != end && *it >= '0' && *it <= '9') { + res *= 10; // noop first pass. + res += (*it++) - '0'; + } + *itInOut = it; + return res; + } + protected: class Cursor { public: |