5 files changed, 1459 insertions, 0 deletions
diff --git a/src/mongo/db/index/SConscript b/src/mongo/db/index/SConscript
index 410c0a2825c..48ab6f723cc 100644
--- a/src/mongo/db/index/SConscript
+++ b/src/mongo/db/index/SConscript
@@ -23,6 +23,7 @@ env.Library(
     target='key_generator',
     source=[
         'btree_key_generator.cpp',
+        'column_key_generator.cpp',
         'expression_keys_private.cpp',
         'sort_key_generator.cpp',
         'wildcard_key_generator.cpp',
@@ -150,6 +151,7 @@ env.CppUnitTest(
     source=[
         '2d_key_generator_test.cpp',
         'btree_key_generator_test.cpp',
+        'column_key_generator_test.cpp',
         'hash_key_generator_test.cpp',
         's2_key_generator_test.cpp',
         's2_bucket_key_generator_test.cpp',
diff --git a/src/mongo/db/index/column_key_generator.cpp b/src/mongo/db/index/column_key_generator.cpp
new file mode 100644
index 00000000000..6e348179da8
--- /dev/null
+++ b/src/mongo/db/index/column_key_generator.cpp
@@ -0,0 +1,595 @@
+/**
+ *    Copyright (C) 2022-present MongoDB, Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the Server Side Public License, version 1,
+ *    as published by MongoDB, Inc.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    Server Side Public License for more details.
+ *
+ *    You should have received a copy of the Server Side Public License
+ *    along with this program. If not, see
+ *    <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the Server Side Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kIndex
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/index/column_key_generator.h"
+
+#include <iomanip>
+#include <ostream>
+#include <vector>
+
+#include "mongo/db/storage/column_store.h"
+#include "mongo/util/functional.h"
+#include "mongo/util/string_map.h"
+
+namespace mongo::column_keygen {
+namespace {
+
+/**
+ * Special handling for vals because BSONElement doesn't support ==, and we want binaryEqualValues
+ * rather than woCompare equality anyway.
+ */
+bool identicalBSONElementArrays(const std::vector<BSONElement>& lhs,
+                                const std::vector<BSONElement>& rhs) {
+    if (lhs.size() != rhs.size())
+        return false;
+    for (size_t i = 0; i < lhs.size(); i++) {
+        if (!lhs[i].binaryEqualValues(rhs[i]))
+            return false;
+    }
+    return true;
+}
+
+/**
+ * This class handles the logic of key generation for columnar indexes. It produces
+ * UnencodedCellViews, which have all of the data that should be put in the index values, but it is
+ * not responsible for encoding that data into a flat buffer. That is handled by XXX.
+ *
+ * TODO once the code for that that is written (SERVER-64766) update this to replace the XXX.
+ *
+ * "Shredding" is an informal term for taking a single BSON document and splitting it into the data
+ * for each unique path. The data at each path should be sufficient to reconstruct the object in
+ * full fidelity, possibly considering parent paths. When determining the path, array index are not
+ * considered, only field names are.
+ *
+ * This class is private to this file, and only exposed via the free-functions in the header.
+ */
+class ColumnShredder {
+public:
+    /**
+     * Option to constructor to avoid overhead when you only need to know about the paths in an
+     * object, such as for deletes.
+     */
+    enum PathsAndCells : bool { kOnlyPaths = false, kPathsAndCells = true };
+
+    explicit ColumnShredder(const BSONObj& obj, PathsAndCells pathsAndCells = kPathsAndCells)
+        : _pathsAndCells(pathsAndCells) {
+        walkObj(_paths[ColumnStore::kRowIdPath], obj, /*isRoot=*/true);
+    }
+
+    void visitPaths(function_ref<void(PathView)> cb) const {
+        for (auto&& [path, _] : _paths) {
+            cb(path);
+        }
+    }
+
+    void visitCells(function_ref<void(PathView, const UnencodedCellView&)> cb) {
+        // This function requires that the shredder was constructed to record cell values.
+        invariant(_pathsAndCells);
+
+        for (auto&& [path, cell] : _paths) {
+            cb(path, makeCellView(path, cell));
+        }
+    }
+
+    static void visitDiff(const BSONObj& oldObj,
+                          const BSONObj& newObj,
+                          function_ref<void(DiffAction, PathView, const UnencodedCellView*)> cb) {
+        auto oldShredder = ColumnShredder(oldObj);
+        auto newShredder = ColumnShredder(newObj);
+
+        for (auto&& [path, rawCellNew] : newShredder._paths) {
+            auto itOld = oldShredder._paths.find(path);
+            if (itOld == oldShredder._paths.end()) {
+                auto cell = newShredder.makeCellView(path, rawCellNew);
+                cb(kInsert, path, &cell);
+                continue;
+            }
+
+            auto&& [_, rawCellOld] = *itOld;  // For symmetry with rawCellNew.
+
+            // Need to compute sparseness prior to checking cell equality.
+            newShredder.computeIsSparse(path, &rawCellNew);
+            oldShredder.computeIsSparse(path, &rawCellOld);
+            if (rawCellNew == rawCellOld)
+                continue;  // No change for this path.
+
+
+            auto cell = newShredder.makeCellView(path, rawCellNew);
+            cb(kUpdate, path, &cell);
+        }
+
+        for (auto&& [path, _] : oldShredder._paths) {
+            if (!newShredder._paths.contains(path)) {
+                cb(kDelete, path, nullptr);
+            }
+        }
+    }
+
+
+private:
+    enum Sparseness : int8_t { kUnknown, kIsSparse, kNotSparse };
+
+    struct RawCellValue {
+        // This is where we incrementally build up the full arrayInfo for this cell. While each
+        // position is essentially a walk from the root to somewhere in the document, this is a walk
+        // that starts at the root and then passes through each value or subobject at this path
+        // taking (hopefully) the shortest distance between each value. First we build it up in
+        // "uncompressed" format, then we call compressArrayInfo() to remove redundant information.
+        std::string arrayInfoBuf;
+
+        // This is used when building up arrayInfoBuf. It records the absolute position of the last
+        // value or subobject appended to arrayInfoBuf, so that we can easily compute the relative
+        // path to the next value. Equivalently, it is the value of _currentArrayInfo at the time of
+        // the last call to appendToArrayInfo(). It never has any '|' or 'o' bytes.
+        std::string lastPosition;
+
+        std::vector<BSONElement> vals;
+        int nSeen = 0;                     // Number of times this field has been seen.
+        int nNonEmptySubobjects = 0;       // Number of values that are non-empty objects
+        Sparseness sparseness = kUnknown;  // Memoized in computeIsSparse()
+        bool childrenMustBeSparse = false;
+        bool hasDoubleNestedArrays = false;
+        bool hasDuplicateFields = false;
+
+        bool operator==(const RawCellValue& rhs) const {
+            const RawCellValue& lhs = *this;
+
+            // Sparseness must have already been resolved.
+            invariant(lhs.sparseness != kUnknown);
+            invariant(rhs.sparseness != kUnknown);
+
+            // Intentionally not checking nSeen or childrenMustBeSparse, since they are just inputs
+            // to sparseness computation.
+
+            // Ordered to test cheaper stuff before more expensive stuff.
+            return lhs.sparseness == rhs.sparseness &&                     //
+                lhs.hasDoubleNestedArrays == rhs.hasDoubleNestedArrays &&  //
+                lhs.nNonEmptySubobjects == rhs.nNonEmptySubobjects &&      //
+                lhs.hasDuplicateFields == rhs.hasDuplicateFields &&        //
+                lhs.arrayInfoBuf == rhs.arrayInfoBuf &&                    //
+                identicalBSONElementArrays(lhs.vals, rhs.vals);
+        }
+    };
+
+    UnencodedCellView makeCellView(PathView path, RawCellValue& cell) {
+        compressArrayInfo(cell.arrayInfoBuf);
+        return {
+            cell.vals,
+            cell.arrayInfoBuf,
+            cell.hasDuplicateFields,
+            /*hasSubPaths*/ cell.nNonEmptySubobjects != 0,
+            computeIsSparse(path, &cell),
+            cell.hasDoubleNestedArrays,
+        };
+    }
+
+    // Memoized, so amortized O(1) when called on all paths, even though it must check all parent
+    // paths.
+    bool computeIsSparse(PathView path, RawCellValue* cell) {
+        if (cell->sparseness != kUnknown)
+            return cell->sparseness == kIsSparse;
+
+        auto parentPath = parentPathOf(path);
+        if (!parentPath) {
+            // Top level fields are never considered sparse.
+            cell->sparseness = kNotSparse;
+            return false;
+        }
+
+        auto parentIt = _paths.find(*parentPath);
+        invariant(parentIt != _paths.end());
+        auto& parent = parentIt->second;
+        const bool isSparse = parent.childrenMustBeSparse ||
+            cell->nSeen != parent.nNonEmptySubobjects || computeIsSparse(*parentPath, &parent);
+        cell->sparseness = isSparse ? kIsSparse : kNotSparse;
+        return isSparse;
+    }
+
+    void walkObj(RawCellValue& cell, const BSONObj& obj, bool isRoot = false) {
+        if (_pathsAndCells) {
+            cell.nNonEmptySubobjects++;
+            if (!isRoot) {
+                appendToArrayInfo(cell, 'o');
+            }
+        }
+
+        ON_BLOCK_EXIT([&, oldArrInfoSize = _currentArrayInfo.size()] {
+            if (_pathsAndCells)
+                _currentArrayInfo.resize(oldArrInfoSize);
+        });
+        if (_pathsAndCells && !isRoot)
+            _currentArrayInfo += '{';
+
+        for (auto [name, elem] : obj) {
+            // We skip fields in some edge cases such as dots in the field name. This may also throw
+            // if the field name contains invalid UTF-8 in a way that would break the index.
+            if (shouldSkipField(name))
+                continue;
+
+            ON_BLOCK_EXIT([&, oldPathSize = _currentPath.size()] {  //
+                _currentPath.resize(oldPathSize);
+            });
+            if (!isRoot)
+                _currentPath += '.';
+            _currentPath += name;
+
+            auto& subCell = _paths[_currentPath];
+            subCell.nSeen++;
+            if (_inDoubleNestedArray)
+                subCell.hasDoubleNestedArrays = true;
+            handleElem(subCell, elem);
+        }
+    }
+
+    void walkArray(RawCellValue& cell, const BSONObj& arr) {
+        DecimalCounter<unsigned> index;
+
+        for (auto elem : arr) {
+            ON_BLOCK_EXIT([&,
+                           oldArrInfoSize = _currentArrayInfo.size(),
+                           oldInDoubleNested = _inDoubleNestedArray] {
+                if (_pathsAndCells) {
+                    _currentArrayInfo.resize(oldArrInfoSize);
+                    _inDoubleNestedArray = oldInDoubleNested;
+                }
+            });
+
+            if (_pathsAndCells) {
+                _currentArrayInfo += '[';
+                if (index == 0) {
+                    // Zero is common so make it implied rather than explicit.
+                } else {
+                    // Theoretically, index should be the same as elem.fieldNameStringData(),
+                    // however, since we don't validate the array indexes, they cannot be trusted.
+                    // The logic to encode array info relies on "sane" array indexes (at the very
+                    // least they must be monotonically increasing), so we create a new index string
+                    // here.
+                    _currentArrayInfo += StringData(index);
+                }
+                ++index;
+
+                // [] doesn't start a double nesting since {a:{$eq: []}} matches {a: [[]]}
+                if (elem.type() == Array && !elem.Obj().isEmpty())
+                    _inDoubleNestedArray = true;
+            }
+
+            // Note: always same cell, since array traversal never changes path.
+            handleElem(cell, elem);
+        }
+    }
+
+    void handleElem(RawCellValue& cell, const BSONElement& elem) {
+        // Only recurse on non-empty objects and arrays. Empty objects and arrays are handled as
+        // scalars.
+        if (elem.type() == Object) {
+            if (auto obj = elem.Obj(); !obj.isEmpty())
+                return walkObj(cell, obj);
+        } else if (elem.type() == Array) {
+            if (auto obj = elem.Obj(); !obj.isEmpty())
+                return walkArray(cell, obj);
+        }
+
+        if (_pathsAndCells) {
+            // If we get here, then this walk will not have any children. This means that there is
+            // at least one path where all children (if any) will be missing structural information,
+            // so they will need to consult the parent path.
+            cell.childrenMustBeSparse = true;
+
+            if (_inDoubleNestedArray)
+                cell.hasDoubleNestedArrays = true;
+
+            cell.vals.push_back(elem);
+            appendToArrayInfo(cell, '|');
+        }
+    }
+
+    void appendToArrayInfo(RawCellValue& rcd, char finalByte) {
+        dassert(finalByte == '|' || finalByte == 'o');
+
+        auto foundDuplicateField = [&] {
+            rcd.arrayInfoBuf.clear();
+            rcd.hasDuplicateFields = true;
+        };
+
+        if (rcd.hasDuplicateFields) {
+            // arrayInfo should be left empty in this case.
+            invariant(rcd.arrayInfoBuf.empty());
+            return;
+        }
+
+        ON_BLOCK_EXIT([&] { rcd.lastPosition = _currentArrayInfo; });
+
+        if (rcd.arrayInfoBuf.empty()) {
+            // The first time we get a position for this path, just record it verbatim. The first
+            // is special because we are essentially recording the absolute path to this value,
+            // while every other time we append the path relative to the prior value.
+            invariant(rcd.lastPosition.empty());
+            rcd.arrayInfoBuf.reserve(_currentArrayInfo.size() + 1);
+            rcd.arrayInfoBuf += _currentArrayInfo;
+            rcd.arrayInfoBuf += finalByte;
+            return;
+        }
+
+        // Make better names for symmetry (and to prevent accidental modifications):
+        StringData oldPosition = rcd.lastPosition;
+        StringData newPosition = _currentArrayInfo;
+
+        if (MONGO_unlikely(oldPosition.empty() || newPosition.empty())) {
+            // This can only happen if there is a duplicate field at the top level.
+            return foundDuplicateField();
+        }
+
+        auto [oldIt, newIt] = std::mismatch(oldPosition.begin(),  //
+                                            oldPosition.end(),
+                                            newPosition.begin(),
+                                            newPosition.end());
+        if (MONGO_unlikely(newIt == newPosition.end())) {
+            // This can only happen if there is a duplicate field in an array, because otherwise
+            // the raw array infos must differ by an index in some array.
+            return foundDuplicateField();
+        }
+
+        // Walk back to start of differing elem. Important to use newIt here because if they are
+        // in the same array, oldIt may have an implicitly encoded 0 index, while newIt must
+        // have a higher index.
+        while (*newIt != '[') {
+            if (MONGO_unlikely(!(*newIt >= '0' && *newIt <= '9'))) {
+                // Non-index difference can only happen if there are duplicate fields in an array.
+                return foundDuplicateField();
+            }
+            invariant(newIt > newPosition.begin());
+            dassert(oldIt > oldPosition.begin());  // oldIt and newIt are at same index.
+            --newIt;
+            --oldIt;
+        }
+        invariant(oldIt < oldPosition.end());
+        if (MONGO_unlikely(*oldIt != '[')) {
+            // This is another type of non-index difference.
+            return foundDuplicateField();
+        }
+
+        // Close out arrays past the first mismatch in LIFO order.
+        for (auto revOldIt = oldPosition.end() - 1; revOldIt != oldIt; --revOldIt) {
+            if (*revOldIt == '[') {
+                rcd.arrayInfoBuf += ']';
+            } else {
+                dassert((*revOldIt >= '0' && *revOldIt <= '9') || *revOldIt == '{');
+            }
+        }
+
+        // Now process the mismatch. It must be a difference in array index (checked above).
+        dassert(*oldIt == '[' && *newIt == '[');
+        ++oldIt;
+        ++newIt;
+        const auto oldIx = ColumnStore::readArrInfoNumber(&oldIt, oldPosition.end());
+        const auto newIx = ColumnStore::readArrInfoNumber(&newIt, newPosition.end());
+
+        if (MONGO_unlikely(newIx <= oldIx)) {
+            // If this element is at a same or lower index, we must have hit a duplicate field
+            // above and restarted the array indexing.
+            return foundDuplicateField();
+        }
+        const auto delta = newIx - oldIx;
+        const auto skips = delta - 1;
+        if (skips == 0) {
+            // Nothing. Increment by one (skipping zero) is implicit.
+        } else {
+            rcd.arrayInfoBuf += '+';
+            rcd.arrayInfoBuf += ItoA(skips);
+        }
+
+        // Now put the rest of the new array info into the output buffer.
+        rcd.arrayInfoBuf += StringData(newIt, newPosition.end());
+        rcd.arrayInfoBuf += finalByte;
+    }
+
+    static void compressArrayInfo(std::string& arrayInfo) {
+        // NOTE: all operations in this function either shrink the array info or keep it the same
+        // size, so they are able to work in-place in the arrayInfo buffer.
+
+        // Logic below assumes arrayInfo is null terminated as a simplification.
+        dassert(strlen(arrayInfo.data()) == arrayInfo.size());
+
+        const char* in = arrayInfo.data();
+        char* out = arrayInfo.data();
+        bool anyArrays = false;
+
+        // Remove all '{' immediately before a '|' or 'o', and check for arrays
+        char* lastNonBrace = nullptr;
+        while (auto c = *in++) {
+            if (c == '[') {
+                anyArrays = true;
+            } else if ((c == '|' || c == 'o') && lastNonBrace) {
+                // Rewind output to just past last non-brace, since the last set of opens are
+                // encoded implicitly.
+                dassert(lastNonBrace + 1 <= out);
+                out = lastNonBrace + 1;
+            }
+
+            if (c != '{')
+                lastNonBrace = out;
+
+            *out++ = c;
+        }
+
+        // If there were no arrays, we don't need any array info.
+        if (!anyArrays) {
+            arrayInfo.clear();
+            return;
+        }
+
+        invariant(size_t(out - arrayInfo.data()) <= arrayInfo.size());
+
+        // Remove final run of '|' since end implies an infinite sequence of them.
+        while (out - arrayInfo.data() >= 1 && out[-1] == '|') {
+            out--;
+        }
+        arrayInfo.resize(out - arrayInfo.data());  // Reestablishes null termination.
+
+        // Now do a final pass to RLE remaining runs of '|' or 'o'. It may be possible to integrate
+        // this with the first loop above, but it would be a bit more complicated because we need
+        // to have removed any implicit runs of '{' prior to each '|' or 'o' before looking for
+        // runs.
+        out = arrayInfo.data();
+        in = arrayInfo.data();
+        while (auto c = *in++) {
+            *out++ = c;
+            if (c != '|' && c != 'o')
+                continue;
+
+            size_t repeats = 0;
+            while (*in == c) {
+                repeats++;
+                in++;
+            }
+
+            if (repeats) {
+                auto buf = ItoA(repeats);  // Must be on own line so the buffer outlives numStr.
+                auto numStr = StringData(buf);
+
+                // We know there is room because a number > 0 takes up no more space than the number
+                // of '|' that it is replacing. repeats == 1 is the worst case because it isn't
+                // saving any space. However we still replace here since it should make decoding
+                // slightly faster.
+                dassert(numStr.size() <= repeats);
+                for (char c : numStr) {
+                    dassert(out < in);  // Note: `in` has already been advanced.
+                    *out++ = c;
+                }
+            }
+        }
+
+        invariant(size_t(out - arrayInfo.data()) <= arrayInfo.size());
+        arrayInfo.resize(out - arrayInfo.data());
+    }
+
+    static bool shouldSkipField(PathView fieldName) {
+        // TODO consider skipping when the nesting depths exceed our documented limits of 100
+        // documents. This would allow decoding to use fixed-size bitsets without needing to check
+        // for overflow.
+
+        // WARNING: do not include the field name in error messages because if we throw from this
+        // function then the field name isn't valid utf-8, and could poison the whole error message!
+
+        // We only care about \xFF at the beginning, even though utf-8 bans it everywhere.
+        static_assert(ColumnStore::kRowIdPath == "\xFF"_sd);
+        uassert(6519200,
+                "Field name contains '\\xFF' which isn't valid in UTF-8",
+                fieldName[0] != '\xFF');  // We know field names are nul-terminated so this is safe.
+
+        // Fields with dots are not illegal, but we skip them because a) the query optimizer can't
+        // correctly track dependencies with them, and b) the current storage format uses dots as a
+        // separator and everything (including the array info encoder) would get confused with
+        // documents like {a: {b:1}, a.b: 2} because that breaks some of the assumptions.
+        return fieldName.find('.') != std::string::npos;
+    }
+
+    static boost::optional<PathView> parentPathOf(PathView path) {
+        auto sep = path.rfind('.');
+        if (sep == std::string::npos)
+            return {};
+
+        return path.substr(0, sep);
+    }
+
+    // This is the same as StringMap<V> but with node_hash_map. We need the reference stability
+    // guarantee because we have recursive functions that both insert into _paths and hold a
+    // reference to one of its values. It seems better to use a node-based hash table than to redo
+    // the lookup every time we want to touch a cell.
+    template <typename V>
+    using NodeStringMap = absl::node_hash_map<std::string, V, StringMapHasher, StringMapEq>;
+
+    PathValue _currentPath;
+    std::string _currentArrayInfo;  // Describes a walk from root to current pos. No '|' or 'o'.
+    NodeStringMap<RawCellValue> _paths;
+    bool _inDoubleNestedArray = false;
+
+    const PathsAndCells _pathsAndCells = kPathsAndCells;
+};
+}  // namespace
+
+void visitCellsForInsert(const BSONObj& obj,
+                         function_ref<void(PathView, const UnencodedCellView&)> cb) {
+    ColumnShredder(obj).visitCells(cb);
+}
+
+void visitPathsForDelete(const BSONObj& obj, function_ref<void(PathView)> cb) {
+    ColumnShredder(obj, ColumnShredder::kOnlyPaths).visitPaths(cb);
+}
+
+void visitDiffForUpdate(const BSONObj& oldObj,
+                        const BSONObj& newObj,
+                        function_ref<void(DiffAction, PathView, const UnencodedCellView*)> cb) {
+    ColumnShredder::visitDiff(oldObj, newObj, cb);
+}
+
+bool operator==(const UnencodedCellView& lhs, const UnencodedCellView& rhs) {
+    if (lhs.hasDuplicateFields || rhs.hasDuplicateFields) {
+        // As a special case, if either is true, we only care about comparing this field, since all
+        // other fields are suspect. This simplifies testing, because we don't need to guess what
+        // the output will be (and lock it into a correctness test!) for the case with duplicate
+        // fields.
+        return lhs.hasDuplicateFields == rhs.hasDuplicateFields;
+    }
+
+    return identicalBSONElementArrays(lhs.vals, rhs.vals) &&  //
+        lhs.arrayInfo == rhs.arrayInfo &&                     //
+        lhs.hasSubPaths == rhs.hasSubPaths &&                 //
+        lhs.isSparse == rhs.isSparse &&                       //
+        lhs.hasDoubleNestedArrays == rhs.hasDoubleNestedArrays;
+}
+
+std::ostream& operator<<(std::ostream& os, const UnencodedCellView& cell) {
+    if (cell.hasDuplicateFields) {
+        // As a special case just output this info, since other fields don't matter.
+        return os << "{duplicateFields: 1}";
+    }
+
+    os << "{vals: [";
+    for (auto&& elem : cell.vals) {
+        if (&elem != &cell.vals.front())
+            os << ", ";
+        os << elem.toString(/*includeFieldName*/ false);
+    }
+    os << "], arrayInfo: '" << cell.arrayInfo;
+    os << "', hasSubPaths: " << cell.hasSubPaths;
+    os << ", isSparse: " << cell.isSparse;
+    os << ", hasDoubleNestedArrays: " << cell.hasDoubleNestedArrays;
+    os << '}';
+
+    return os;
+}
+std::ostream& operator<<(std::ostream& os, const UnencodedCellView* cell) {
+    return cell ? (os << *cell) : (os << "(no cell)");
+}
+}  // namespace mongo::column_keygen
diff --git a/src/mongo/db/index/column_key_generator.h b/src/mongo/db/index/column_key_generator.h
new file mode 100644
index 00000000000..d88e642ff0d
--- /dev/null
+++ b/src/mongo/db/index/column_key_generator.h
@@ -0,0 +1,111 @@
+/**
+ *    Copyright (C) 2022-present MongoDB, Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the Server Side Public License, version 1,
+ *    as published by MongoDB, Inc.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    Server Side Public License for more details.
+ *
+ *    You should have received a copy of the Server Side Public License
+ *    along with this program. If not, see
+ *    <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the Server Side Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+#include <iosfwd>
+
+#include "mongo/db/storage/column_store.h"
+#include "mongo/util/functional.h"
+
+namespace mongo::column_keygen {
+/**
+ * This is a representation of the cell prior to flattening it out into a buffer which is passed to
+ * visitor callbacks.
+ *
+ * All memory within the UnencodedCellView should only be assumed valid within the callback. If you
+ * need it longer, you must copy it yourself. Non-test callers will generally immediately encode
+ * this to a flat buffer, so this shouldn't be a problem.
+ */
+struct UnencodedCellView {
+    const std::vector<BSONElement>& vals;
+    StringData arrayInfo;
+
+    // If true, this path has multiple values in a single (possibly nested) object with the same
+    // field name. In this case, arrayInfo will be empty and this cell must not be used to
+    // reconstruct an object. We should probably not attempt to encode vals in the index either, and
+    // just put a marker that causes us to either skip the row (because it breaks the rules) or go
+    // to the row store.
+    //
+    // Note that this detection is best-effort and will only detect cases that would result in
+    // corrupt array info. We have decided that query results do not need to be precise for objects
+    // with duplicate fields, so it is OK if we don't detect every case, as long as we don't crash
+    // or cause corruption on the undetected cases.
+    bool hasDuplicateFields;
+
+    // If true, this cell omits values that are stored in subpaths.
+    bool hasSubPaths;
+
+    // If true, when reconstructing an object, you will need to visit the parent path in order to
+    // match current semantics for projections and field-path expressions.
+    bool isSparse;
+
+    // If true, at least one of the values in vals is inside of a directly-double-nested array, or
+    // the field name was encountered while already inside of a directly-double-nested array, so
+    // arrayInfo must be consulted to know which values to skip when matching. If false, simple
+    // matches can ignore the array info and just match against each value independently.
+    bool hasDoubleNestedArrays;
+
+    // These are only used in tests and for debugging.
+    friend bool operator==(const UnencodedCellView&, const UnencodedCellView&);
+    friend std::ostream& operator<<(std::ostream&, const UnencodedCellView&);
+    friend std::ostream& operator<<(std::ostream&, const UnencodedCellView*);
+};
+
+/**
+ * Visits all paths within obj and provides their cell values.
+ * Path visit order is unspecified.
+ */
+void visitCellsForInsert(const BSONObj& obj,
+                         function_ref<void(PathView, const UnencodedCellView&)> cb);
+
+/**
+ * Visits all paths within obj. When deleting, you do not need to know about values.
+ * Path visit order is unspecified.
+ */
+void visitPathsForDelete(const BSONObj& obj, function_ref<void(PathView)> cb);
+
+/**
+ * See visitDiffForUpdate().
+ */
+enum DiffAction { kInsert, kUpdate, kDelete };
+
+/**
+ * Computes differences between oldObj and newObj, and invokes cb() with the required actions to
+ * take to update the columnar index.
+ *
+ * For kInsert and kUpdate, the UnencodedCellView will point to the cell data for newObj (you
+ * don't need to know the value for oldObj).
+ *
+ * For kDelete, the UnencodedCellView pointer will be null.
+ *
+ * Path visit order is unspecified.
+ */
+void visitDiffForUpdate(const BSONObj& oldObj,
+                        const BSONObj& newObj,
+                        function_ref<void(DiffAction, PathView, const UnencodedCellView*)> cb);
+
+}  // namespace mongo::column_keygen
diff --git a/src/mongo/db/index/column_key_generator_test.cpp b/src/mongo/db/index/column_key_generator_test.cpp
new file mode 100644
index 00000000000..e299bb1a68d
--- /dev/null
+++ b/src/mongo/db/index/column_key_generator_test.cpp
@@ -0,0 +1,734 @@
+/**
+ *    Copyright (C) 2022-present MongoDB, Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the Server Side Public License, version 1,
+ *    as published by MongoDB, Inc.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    Server Side Public License for more details.
+ *
+ *    You should have received a copy of the Server Side Public License
+ *    along with this program. If not, see
+ *    <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the Server Side Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/bson/bson_depth.h"
+#include "mongo/bson/json.h"
+#include "mongo/db/index/column_key_generator.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo::column_keygen {
+namespace {
+enum Flags_ForTest {
+    kNoFlags = 0,
+    kHasDuplicateFields = 1 << 0,
+    kHasSubPath = 1 << 1,
+    kIsSparse = 1 << 2,
+    kHasDoubleNestedArrays = 1 << 3,
+};
+
+struct UnencodedCellValue_ForTest {
+    StringData valsJson;  // Comma-separated values
+    std::string arrayInfo;
+    int flags = kNoFlags;
+
+    UnencodedCellView toView(BSONObj& owner, std::vector<BSONElement>& elems) const {
+        owner = fromjson(std::string("{x:[") + valsJson + "]}");
+        elems = owner.firstElement().Array();
+        return {
+            elems,
+            arrayInfo,
+            bool(flags & kHasDuplicateFields),
+            bool(flags & kHasSubPath),
+            bool(flags & kIsSparse),
+            bool(flags & kHasDoubleNestedArrays),
+        };
+    }
+};
+
+void insertTest(int line,
+                const BSONObj& doc,
+                const StringMap<UnencodedCellValue_ForTest>& expected) {
+    BSONObj owner;
+    std::vector<BSONElement> elems;
+
+    StringSet seenPaths;
+    visitCellsForInsert(doc, [&](PathView path, const UnencodedCellView& cell) {
+        seenPaths.insert(path.toString());
+
+        auto it = expected.find(path);
+        if (it == expected.end()) {
+            FAIL("Unexpected path in insert") << "test:" << line << " path:" << path;
+        }
+        auto expectedCell = it->second.toView(owner, elems);
+        ASSERT_EQ(cell, expectedCell) << "test:" << line << " path:" << path;
+    });
+
+    for (auto [path, _] : expected) {
+        if (seenPaths.contains(path))
+            continue;
+
+        FAIL("Expected to see path in insert, but didn't") << "test:" << line << " path:" << path;
+    }
+}
+
+void deleteTest(int line,
+                const BSONObj& doc,
+                const StringMap<UnencodedCellValue_ForTest>& expected) {
+    StringSet seenPaths;
+    visitPathsForDelete(doc, [&](PathView path) {
+        seenPaths.insert(path.toString());
+
+        auto it = expected.find(path);
+        if (it == expected.end()) {
+            FAIL("Unexpected path in delete") << "test:" << line << " path:" << path;
+        }
+    });
+
+    for (auto [path, _] : expected) {
+        if (seenPaths.contains(path))
+            continue;
+
+        FAIL("Expected to see path in delete, but didn't") << "test:" << line << " path:" << path;
+    }
+}
+
+void updateToEmptyTest(int line,
+                       const BSONObj& doc,
+                       const StringMap<UnencodedCellValue_ForTest>& expected) {
+    StringSet seenPaths;
+    visitDiffForUpdate(
+        doc, BSONObj(), [&](DiffAction action, PathView path, const UnencodedCellView* cell) {
+            ASSERT_EQ(action, kDelete) << "test:" << line << " path:" << path;
+
+            ASSERT(!seenPaths.contains(path)) << "test:" << line << " path:" << path;
+            seenPaths.insert(path.toString());
+
+            auto it = expected.find(path);
+            if (it == expected.end()) {
+                FAIL("Unexpected path in updateToEmpty") << "action:" << action << " cell:" << cell
+                                                         << " test:" << line << " path:" << path;
+            }
+
+            ASSERT(!cell) << "test:" << line << " path:" << path;
+        });
+
+    for (auto [path, _] : expected) {
+        if (seenPaths.contains(path))
+            continue;
+
+        FAIL("Expected to see path in updateToEmpty, but didn't")
+            << "test:" << line << " path:" << path;
+    }
+}
+
+void updateFromEmptyTest(int line,
+                         const BSONObj& doc,
+                         const StringMap<UnencodedCellValue_ForTest>& expected) {
+    BSONObj owner;
+    std::vector<BSONElement> elems;
+
+    StringSet seenPaths;
+    visitDiffForUpdate(
+        BSONObj(), doc, [&](DiffAction action, PathView path, const UnencodedCellView* cell) {
+            ASSERT_EQ(action, kInsert) << "test:" << line << " path:" << path;
+
+            ASSERT(!seenPaths.contains(path)) << "test:" << line << " path:" << path;
+            seenPaths.insert(path.toString());
+
+            auto it = expected.find(path);
+            if (it == expected.end()) {
+                FAIL("Unexpected path in updateFromEmpty")
+                    << "action:" << action << " cell:" << cell << " test:" << line
+                    << " path:" << path;
+            }
+
+            ASSERT(cell) << "test:" << line << " path:" << path;
+            auto expectedCell = it->second.toView(owner, elems);
+            ASSERT_EQ(*cell, expectedCell) << "test:" << line << " path:" << path;
+        });
+
+    for (auto [path, _] : expected) {
+        if (seenPaths.contains(path))
+            continue;
+
+        FAIL("Expected to see path in updateFromEmpty, but didn't")
+            << "test:" << line << " path:" << path;
+    }
+}
+
+void updateWithNoChange(int line, const BSONObj& doc) {
+    BSONObj owner;
+    std::vector<BSONElement> elems;
+
+    StringSet seenPaths;
+    visitDiffForUpdate(
+        doc, doc, [&](DiffAction action, PathView path, const UnencodedCellView* cell) {
+            FAIL("Unexpected path in updateNoChange")
+                << "action:" << action << " cell:" << cell << " test:" << line << " path:" << path;
+        });
+}
+
+
+void basicTests(int line, std::string json, StringMap<UnencodedCellValue_ForTest> expected) {
+    const BSONObj doc = fromjson(json);
+
+    // Add in the RowID column. Since it is always the same, tests shouldn't include it.
+    // We always expect to see it in inserts and deletes, and never in updates.
+    expected.insert({ColumnStore::kRowIdPath.toString(), {"", "", kHasSubPath}});
+
+    insertTest(line, doc, expected);
+    deleteTest(line, doc, expected);
+
+    expected.erase(ColumnStore::kRowIdPath);
+    updateToEmptyTest(line, doc, expected);
+    updateFromEmptyTest(line, doc, expected);
+    updateWithNoChange(line, doc);
+}
+
+TEST(ColKeyGen, BasicTests) {
+    basicTests(__LINE__, R"({})", {});
+    basicTests(__LINE__,
+               R"({a: 1})",
+               {
+                   {"a", {"1", ""}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, b:2})",
+               {
+                   {"a", {"1", ""}},
+                   {"b", {"2", ""}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1, 2]})",
+               {
+                   {"a", {"1, 2", "["}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1, 1]})",  // Identical
+               {
+                   {"a", {"1, 1", "["}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1, [2]]})",
+               {
+                   {"a", {"1, 2", "[|[", kHasDoubleNestedArrays}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1, []]})",  // Empty array isn't "double nested" (unless it is...)
+               {
+                   {"a", {"1, []", "["}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1, [[]]]})",  // ... now it is
+               {
+                   {"a", {"1, []", "[|[", kHasDoubleNestedArrays}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}, {b:2}]})",
+               {
+                   {"a", {"", "[o1", kHasSubPath}},
+                   {"a.b", {"1, 2", "["}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}, {c:2}]})",
+               {
+                   {"a", {"", "[o1", kHasSubPath}},
+                   {"a.b", {"1", "[", kIsSparse}},
+                   {"a.c", {"2", "[1", kIsSparse}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}, {c:[2, 3]}, null]})",
+               {
+                   {"a", {"null", "[o1", kHasSubPath}},
+                   {"a.b", {"1", "[", kIsSparse}},
+                   {"a.c", {"2, 3", "[1{[", kIsSparse}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}, {b:[2, 3]}]})",
+               {
+                   {"a", {"", "[o1", kHasSubPath}},
+                   {"a.b", {"1,2,3", "[|{["}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}, {b:[2, 3]}, null]})",
+               {
+                   {"a", {"null", "[o1", kHasSubPath}},
+                   {"a.b", {"1,2,3", "[|{[", kIsSparse}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}, [{b:[2, 3]}], null]})",
+               {
+                   {"a", {"null", "[o[o]", kHasSubPath}},
+                   {"a.b", {"1,2,3", "[|[{[", kIsSparse | kHasDoubleNestedArrays}},
+               });
+    basicTests(__LINE__,
+               R"({a: [[{b: {c: 1}}]]})",
+               {
+                   {"a", {"", "[[o", kHasSubPath}},
+                   {"a.b", {"", "[[o", kHasSubPath | kHasDoubleNestedArrays}},
+                   {"a.b.c", {"1", "[[", kHasDoubleNestedArrays}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, 'dotted.field': 1})",
+               {
+                   {"a", {"1", ""}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, b: {'dotted.field': 1}})",
+               {
+                   {"a", {"1", ""}},
+                   {"b", {"", "", kHasSubPath}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, b: [{'dotted.field': 1}]})",
+               {
+                   {"a", {"1", ""}},
+                   {"b", {"", "[o", kHasSubPath}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, b: [{'dotted.field': 1, c: 1}]})",
+               {
+                   {"a", {"1", ""}},
+                   {"b", {"", "[o", kHasSubPath}},
+                   {"b.c", {"1", "["}},
+               });
+    basicTests(__LINE__,
+               R"({'': 1})",
+               {
+                   {"", {"1", ""}},
+               });
+    basicTests(__LINE__,
+               R"({'': {'': 1}})",
+               {
+                   {"", {"", "", kHasSubPath}},
+                   {".", {"1", ""}},
+               });
+    basicTests(__LINE__,
+               R"({'': {'': {'': 1}}})",
+               {
+                   {"", {"", "", kHasSubPath}},
+                   {".", {"", "", kHasSubPath}},
+                   {"..", {"1", ""}},
+               });
+    basicTests(__LINE__,
+               R"({'': [{'': [{'': [1]}]}]})",
+               {
+                   {"", {"", "[o", kHasSubPath}},
+                   {".", {"", "[{[o", kHasSubPath}},
+                   {"..", {"1", "[{[{["}},
+               });
+    basicTests(__LINE__,
+               R"({'a': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]})",
+               {
+                   {"a", {"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "["}},
+               });
+    basicTests(__LINE__,
+               R"({'a': [[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], [99]]})",
+               {
+                   {"a",
+                    {
+                        "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,99",
+                        "[[|19][",
+                        kHasDoubleNestedArrays,
+                    }},
+               });
+    basicTests(
+        __LINE__,
+        R"({'a': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20, {b: 1}]})",
+        {
+            {"a", {"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "[|19o", kHasSubPath}},
+            {"a.b", {"1", "[20", kIsSparse}},
+        });
+    basicTests(
+        __LINE__,
+        R"({'a': [{b:1}, 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20, {b: 1}]})",
+        {
+            {"a", {"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "[o|19o", kHasSubPath}},
+            {"a.b", {"1,1", "[|+20", kIsSparse}},
+        });
+    basicTests(__LINE__,
+               R"({
+                    'a': [
+                        {b:1},{b:2},{b:3},{b:4},{b:5},{b:6},{b:7},{b:8},{b:9},{b:10},
+                        {b:11},{b:12},{b:13},{b:14},{b:15},{b:16},{b:17},{b:18},{b:19},{b:20}
+                    ]
+                })",
+               {
+                   {"a", {"", "[o19", kHasSubPath}},
+                   {"a.b", {"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "["}},
+               });
+}
+
+TEST(ColKeyGen, DeepObjectTests) {
+    // Can't use lambdas because they can't be recursive (especially mutually recursive), but I'd
+    // still like to keep these local to this test
+    struct funcs {
+        static void addObjToStr(std::string& str, int depthLeft) {
+            if (!depthLeft) {
+                str += '1';
+                return;
+            }
+
+            str += "{x:";
+            addObjToStr(str, depthLeft - 1);
+            str += "}";
+        }
+
+        static void addArrToStr(std::string& str, int depthLeft) {
+            if (!depthLeft) {
+                str += '1';
+                return;
+            }
+
+            str += "[";
+            addArrToStr(str, depthLeft - 1);
+            str += "]";
+        }
+
+        static std::string dottedPath(int length) {
+            invariant(length >= 1);
+            std::string out = "x";
+            for (int i = 1; i < length; i++) {
+                out += ".x";
+            }
+            return out;
+        }
+
+        static void addAlternatingObjToStr(std::string& str, int depthLeft) {
+            if (!depthLeft) {
+                str += '1';
+                return;
+            }
+
+            str += "{x:";
+            addAlternatingArrToStr(str, depthLeft - 1);
+            str += "}";
+        }
+
+        static void addAlternatingArrToStr(std::string& str, int depthLeft) {
+            if (!depthLeft) {
+                str += '1';
+                return;
+            }
+
+            str += "[";
+            addAlternatingObjToStr(str, depthLeft - 1);
+            str += "]";
+        }
+
+        static std::string alternatingArrayInfo(int pathDepth) {
+            invariant(pathDepth >= 1);
+            std::string out = "[";
+            for (int i = 1; i < pathDepth; i++) {
+                out += "{[";
+            }
+            return out;
+        }
+    };
+
+    // The full name is quite the line-full.
+    constexpr int kDepth = BSONDepth::kBSONDepthParameterCeiling;
+
+    {  // Just object nesting
+        std::string obj;
+        funcs::addObjToStr(obj, kDepth);
+        StringMap<UnencodedCellValue_ForTest> expected;
+        for (int i = 1; i < kDepth; i++) {
+            expected[funcs::dottedPath(i)] = {"", "", kHasSubPath};
+        }
+        expected[funcs::dottedPath(kDepth)] = {"1", ""};
+        basicTests(__LINE__, obj, expected);
+    }
+
+    {  // Just array nesting
+        std::string arr;
+        funcs::addArrToStr(arr, kDepth);
+        basicTests(__LINE__,
+                   "{x: " + arr + "}",
+                   {
+                       {"x", {"1", std::string(kDepth, '['), kHasDoubleNestedArrays}},
+                   });
+    }
+
+    // The next two tests cover a mix of object and array nesting, but differ only in which is the
+    // innermost. Since the constant was even when this was written, the first tests with an
+    // innermost array, and the second tests an innermost object. There may need to be slight
+    // adjustments to the tests in the unlikely event that the constant ever becomes odd.
+    static_assert(BSONDepth::kBSONDepthParameterCeiling % 2 == 0);  // See above if this fails.
+
+    {  // Innermost array.
+        std::string obj;
+        funcs::addAlternatingObjToStr(obj, kDepth);
+        StringMap<UnencodedCellValue_ForTest> expected;
+        constexpr auto kPathLen = kDepth / 2;
+        for (int i = 1; i < kPathLen; i++) {
+            expected[funcs::dottedPath(i)] = {
+                "", funcs::alternatingArrayInfo(i) + 'o', kHasSubPath};
+        }
+        expected[funcs::dottedPath(kPathLen)] = {"1", funcs::alternatingArrayInfo(kPathLen)};
+        basicTests(__LINE__, obj, expected);
+    }
+    {  // Innermost object.
+        std::string obj;
+        funcs::addAlternatingObjToStr(obj, kDepth + 1);
+        StringMap<UnencodedCellValue_ForTest> expected;
+        constexpr auto kPathLen = kDepth / 2 + 1;
+        for (int i = 1; i < kPathLen; i++) {
+            expected[funcs::dottedPath(i)] = {
+                "", funcs::alternatingArrayInfo(i) + 'o', kHasSubPath};
+        }
+        expected[funcs::dottedPath(kPathLen)] = {"1", funcs::alternatingArrayInfo(kPathLen - 1)};
+        basicTests(__LINE__, obj, expected);
+    }
+}
+
+TEST(ColKeyGen, DuplicateFieldTests) {
+    basicTests(__LINE__,
+               R"({a: 1, a: 2})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1], a: 2})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, a: [2]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1], a: [2]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: {b:1}, a: {b:2}})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: {b:[1]}, a: {b:[2]}})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:[1]}], a: [{b:[2]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: {b:[1]}, a: [{b:[2]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:[1]}], a: {b:[2]}})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: {b:[1]}, a: [null, {b:[2]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [null, {b:[1]}], a: {b:[2]}})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:[1, 3]}], a: [{b:[2]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:[1, 3]}, null], a: [{b:[2]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [null, {b:[1, 3]}], a: [{b:[2]}]})",  // No index in second a.b
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [null, null, {b:[1, 3]}], a: [null, {b:[2]}]})",  // Index in second a.b
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [null, {b:[{c:1}, {c:3}]}], a: [{b:[{c:2}]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+                   {"a.b.c", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({a: [null, null, {b:[{c:1}, {c:3}]}], a: [null, {b:[{c:2}]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"", "", kHasDuplicateFields}},
+                   {"a.b.c", {"", "", kHasDuplicateFields}},
+               });
+    basicTests(__LINE__,
+               R"({"": 1, "": 2})",
+               {
+                   {"", {"", "", kHasDuplicateFields}},
+               });
+
+    // This behaves the same as {a:[{b:[1,3]}, {b:2}]} as far as a.b can tell.
+    basicTests(__LINE__,
+               R"({a: [{b:[1, 3]}], a: [null, {b:[2]}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"1,3,2", "[{[|1]{[", kIsSparse}},
+               });
+
+    // These tests only have one a.b path.
+    basicTests(__LINE__,
+               R"({a: [{b:1}], a: 2})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"1", "[", kIsSparse}},
+               });
+    basicTests(__LINE__,
+               R"({a: 1, a: [{b:2}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"2", "[", kIsSparse}},
+               });
+    basicTests(__LINE__,
+               R"({a: [{b:1}], a: [2]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"1", "[", kIsSparse}},
+               });
+    basicTests(__LINE__,
+               R"({a: [1], a: [{b:2}]})",
+               {
+                   {"a", {"", "", kHasDuplicateFields}},
+                   {"a.b", {"2", "[", kIsSparse}},
+               });
+}
+
+void updateTest(int line,
+                std::string jsonOld,
+                std::string jsonNew,
+                StringMap<std::pair<DiffAction, UnencodedCellValue_ForTest>> expected) {
+    BSONObj owner;
+    std::vector<BSONElement> elems;
+
+    StringSet seenPaths;
+    visitDiffForUpdate(fromjson(jsonOld),
+                       fromjson(jsonNew),
+                       [&](DiffAction action, PathView path, const UnencodedCellView* cell) {
+                           ASSERT(!seenPaths.contains(path)) << "test:" << line << " path:" << path;
+                           seenPaths.insert(path.toString());
+
+                           auto it = expected.find(path);
+                           if (it == expected.end()) {
+                               FAIL("Unexpected path in updateTest")
+                                   << "action:" << action << " cell:" << cell << " test:" << line
+                                   << " path:" << path;
+                           }
+
+                           auto expectedAction = it->second.first;
+                           ASSERT_EQ(action, expectedAction) << "test:" << line << " path:" << path;
+                           if (action == kDelete) {
+                               ASSERT(!cell) << "test:" << line << " path:" << path;
+                           } else {
+                               ASSERT(cell) << "test:" << line << " path:" << path;
+                               auto expectedCell = it->second.second.toView(owner, elems);
+                               ASSERT_EQ(*cell, expectedCell)
+                                   << "test:" << line << " path:" << path;
+                           }
+                       });
+
+    for (auto [path, _] : expected) {
+        if (seenPaths.contains(path))
+            continue;
+
+        FAIL("Expected to see path in updateTest, but didn't")
+            << "test:" << line << " path:" << path;
+    }
+}
+
+TEST(ColKeyGen, UpdateTests) {
+
+    updateTest(__LINE__,
+               R"({a: [1, {b: 1}]})",
+               R"({a: [1, {b: 2}]})",
+               {
+                   {"a.b", {kUpdate, {"2", "[1", kIsSparse}}},
+               });
+    updateTest(__LINE__,
+               R"({a: [1, {b: 1}]})",
+               R"({a: [2, {b: 1}]})",
+               {
+                   {"a", {kUpdate, {"2", "[|o", kHasSubPath}}},
+
+               });
+    updateTest(__LINE__,
+               R"({a: [{b: 1}]})",  // a.b becomes isSparse
+               R"({a: [{b: 1}, {c:1}]})",
+               {
+                   {"a", {kUpdate, {"", "[o1", kHasSubPath}}},
+                   {"a.b", {kUpdate, {"1", "[", kIsSparse}}},
+                   {"a.c", {kInsert, {"1", "[1", kIsSparse}}},
+               });
+    updateTest(__LINE__,
+               R"({a: [{b: 1}, {c:1}]})",  // a.b becomes not isSparse
+               R"({a: [{b: 1}]})",
+               {
+                   {"a", {kUpdate, {"", "[o", kHasSubPath}}},
+                   {"a.b", {kUpdate, {"1", "["}}},
+                   {"a.c", {kDelete, {}}},
+               });
+    updateTest(__LINE__,
+               R"({'': 1})",
+               R"({'': 2})",
+               {
+                   {"", {kUpdate, {"2", ""}}},
+               });
+    updateTest(__LINE__,
+               R"({'': [1, {'': 1}]})",
+               R"({'': [1, {'': 2}]})",
+               {
+                   {".", {kUpdate, {"2", "[1", kIsSparse}}},
+               });
+}
+
+// TODO more tests, of course! In particular, the testing of complex update changes is a bit light.
+}  // namespace
+}  // namespace mongo::column_keygen
diff --git a/src/mongo/db/storage/column_store.h b/src/mongo/db/storage/column_store.h
index 268fc1280d1..aab7a719634 100644
--- a/src/mongo/db/storage/column_store.h
+++ b/src/mongo/db/storage/column_store.h
@@ -302,6 +302,23 @@ public:
         return x;
     }
 
+    /**
+     * If the range [*itPtr, end) begins with a number, returns it and positions *itPtr after the
+     * last byte of number. If there is no number, returns 0 (which is typically encoded by omitting
+     * an optional number) and does not reposition *itPtr.
+     */
+    static int readArrInfoNumber(StringData::const_iterator* itInOut,
+                                 StringData::const_iterator end) {
+        auto it = *itInOut;  // Use local to allow compiler to assume it doesn't point to itself.
+        size_t res = 0;
+        while (it != end && *it >= '0' && *it <= '9') {
+            res *= 10;  // noop first pass.
+            res += (*it++) - '0';
+        }
+        *itInOut = it;
+        return res;
+    }
+
 protected:
     class Cursor {
     public: