summaryrefslogtreecommitdiff
path: root/src/mongo/db/index/column_store_sorter.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/mongo/db/index/column_store_sorter.h')
-rw-r--r--src/mongo/db/index/column_store_sorter.h134
1 files changed, 134 insertions, 0 deletions
diff --git a/src/mongo/db/index/column_store_sorter.h b/src/mongo/db/index/column_store_sorter.h
new file mode 100644
index 00000000000..833213b3646
--- /dev/null
+++ b/src/mongo/db/index/column_store_sorter.h
@@ -0,0 +1,134 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/record_id.h"
+#include "mongo/db/sorter/sorter.h"
+#include "mongo/db/storage/column_store.h"
+
+namespace mongo {
+/**
+ * Performs the organizing and sorting steps of a column store index bulk build, presenting an
+ * interface similar to the 'Sorter' interface. The client can add cells with the 'add()' method
+ * until none remain and then call 'done()' to get an iterator that returns the cells in sorted
+ * order.
+ *
+ * This class assumes that inputs are _already sorted_ by RecordId. Adding out-of-orders cells will
+ * result in undefined behavior.
+ *
+ * Internally, this class maintains a hash table that maps each path to a sorted list of
+ * (RecordId, CellValue) pairs. Because we use a hash table and not a sorted data structure (like
+ * std::map), we need to sort the list of paths when finalizing the output or when writing a spill
+ * file. The total number of cells inserted into this mapping is potentially very large, making it
+ * preferable to defer the cost of sorting to the end in order to avoid the cost of a binary tree
+ * traversal for each inserted cell.
+ */
+class ColumnStoreSorter {
+public:
+ ColumnStoreSorter(size_t maxMemoryUsageBytes, StringData dbName, SorterFileStats* stats);
+
+ void add(PathView path, RecordId recordId, CellView cellContents);
+
+ size_t numSpills() const {
+ return _numSpills;
+ }
+
+ struct Key {
+ PathView path;
+ RecordId recordId;
+
+ struct SorterDeserializeSettings {};
+
+ bool operator<(const Key& other) const;
+ void serializeForSorter(BufBuilder& buf) const;
+
+ // Assumes that the source buffer will remain valid for the lifetime of the returned
+ // ColumnStoreSorter::Key object.
+ static Key deserializeForSorter(BufReader& buf, SorterDeserializeSettings);
+
+ size_t memUsageForSorter() const {
+ return sizeof(path) + path.size() + recordId.memUsage();
+ }
+
+ Key getOwned() const {
+ MONGO_UNREACHABLE;
+ }
+ };
+
+ struct Value {
+ CellView cell;
+
+ struct SorterDeserializeSettings {};
+
+ void serializeForSorter(BufBuilder& buf) const;
+ static Value deserializeForSorter(BufReader& buf, SorterDeserializeSettings);
+
+ size_t memUsageForSorter() const {
+ return sizeof(cell) + cell.size();
+ }
+
+ Value getOwned() const {
+ MONGO_UNREACHABLE;
+ }
+ };
+
+ using Iterator = SortIteratorInterface<Key, Value>;
+
+ Iterator* done();
+
+private:
+ class InMemoryIterator;
+
+ static SortOptions makeSortOptions(const std::string& dbName, SorterFileStats* stats);
+ static std::string pathForNewSpillFile();
+
+ void spill();
+
+ Iterator* inMemoryIterator() const;
+
+ const std::string _dbName;
+ SorterFileStats* _stats; // Unowned
+
+ const size_t _maxMemoryUsageBytes;
+ size_t _memUsed = 0;
+ size_t _numSpills = 0;
+
+ /**
+ * Mapping from path name to the sorted list of (RecordId, Cell) pairs.
+ */
+ using CellVector = std::vector<std::pair<RecordId, CellValue>>;
+ StringMap<CellVector> _dataByPath;
+
+ std::shared_ptr<Sorter<Key, Value>::File> _spillFile;
+ std::vector<std::shared_ptr<Iterator>> _spilledFileIterators;
+
+ bool _done = false;
+};
+} // namespace mongo