// record_store.h /** * Copyright (C) 2013 10gen Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . * * As a special exception, the copyright holders give permission to link the * code of portions of this program with the OpenSSL library under certain * conditions as described in each individual source file and distribute * linked combinations including the program with the OpenSSL library. You * must comply with the GNU Affero General Public License in all respects for * all of the code used other than as permitted herein. If you modify file(s) * with this exception, you may extend this exception to your version of the * file(s), but you are not obligated to do so. If you do not wish to do so, * delete this exception statement from your version. If you delete this * exception statement from all source files in the program, then also delete * it in the license file. */ #pragma once #include #include "mongo/base/owned_pointer_vector.h" #include "mongo/bson/mutable/damage_vector.h" #include "mongo/db/exec/collection_scan_common.h" #include "mongo/db/record_id.h" #include "mongo/db/storage/record_data.h" #include "mongo/db/storage/record_fetcher.h" namespace mongo { class CappedDocumentDeleteCallback; class Collection; struct CompactOptions; struct CompactStats; class DocWriter; class MAdvise; class NamespaceDetails; class OperationContext; class RecordFetcher; class RecordStoreCompactAdaptor; class RecordStore; struct ValidateResults; class ValidateAdaptor; /** * Allows inserting a Record "in-place" without creating a copy ahead of time. */ class DocWriter { public: virtual ~DocWriter() {} virtual void writeDocument( char* buf ) const = 0; virtual size_t documentSize() const = 0; virtual bool addPadding() const { return true; } }; /** * @see RecordStore::updateRecord */ class UpdateNotifier { public: virtual ~UpdateNotifier(){} virtual Status recordStoreGoingToMove( OperationContext* txn, const RecordId& oldLocation, const char* oldBuffer, size_t oldSize ) = 0; virtual Status recordStoreGoingToUpdateInPlace( OperationContext* txn, const RecordId& loc ) = 0; }; /** * The data items stored in a RecordStore. */ struct Record { RecordId id; RecordData data; }; /** * Retrieves Records from a RecordStore. * * A cursor is constructed with a direction flag with the following effects: * - The direction that next() moves. * - If a restore cannot return to the saved position, cursors will be positioned on the * closest position *after* the query in the direction of the scan. * * A cursor is tied to a transaction, such as the OperationContext or a WriteUnitOfWork * inside that context. Any cursor acquired inside a transaction is invalid outside * of that transaction, instead use the save and restore methods to reestablish the cursor. * * Any method other than invalidate and the save methods may throw WriteConflict exception. If * that happens, the cursor may not be used again until it has been saved and successfully * restored. If next() or restore() throw a WCE the cursor's position will be the same as before * the call (strong exception guarantee). All other methods leave the cursor in a valid state * but with an unspecified position (basic exception guarantee). If any exception other than * WCE is thrown, the cursor must be destroyed, which is guaranteed not to leak any resources. * * Any returned unowned BSON is only valid until the next call to any method on this * interface. * * Implementations may override any default implementation if they can provide a more * efficient implementation. */ class RecordCursor { public: virtual ~RecordCursor() = default; /** * Moves forward and returns the new data or boost::none if there is no more data. * Continues returning boost::none once it reaches EOF. */ virtual boost::optional next() = 0; // // Seeking // // Warning: MMAPv1 cannot detect if RecordIds are valid. Therefore callers should only pass // potentially deleted RecordIds to seek methods if they know that MMAPv1 is not the current // storage engine. All new storage engines must support detecting the existence of Records. // /** * Seeks to a Record with the provided id. * * If an exact match can't be found, boost::none will be returned and the resulting position * of the cursor is unspecified. */ virtual boost::optional seekExact(const RecordId& id) = 0; // // Saving and restoring state // /** * Prepares for state changes in underlying data in a way that allows the cursor's * current position to be restored. * * It is safe to call savePositioned multiple times in a row. * No other method (excluding destructor) may be called until successfully restored. */ virtual void savePositioned() = 0; /** * Prepares for state changes in underlying data without necessarily saving the current * state. * * The cursor's position when restored is unspecified. Caller is expected to seek rather * than call next() following the restore. * * It is safe to call saveUnpositioned multiple times in a row. * No other method (excluding destructor) may be called until successfully restored. */ virtual void saveUnpositioned() { savePositioned(); } /** * Recovers from potential state changes in underlying data. * * Returns false if it is invalid to continue using this iterator. This usually means that * capped deletes have caught up to the position of this iterator and continuing could * result in missed data. * * If the former position no longer exists, but it is safe to continue iterating, the * following call to next() will return the next closest position in the direction of the * scan, if any. * * This handles restoring after either savePositioned() or saveUnpositioned(). */ virtual bool restore(OperationContext* txn) = 0; /** * Inform the cursor that this id is being invalidated. * Must be called between save and restore. * * WARNING: Storage engines other than MMAPv1 should not depend on this being called. */ virtual void invalidate(const RecordId& id) {}; // // RecordFetchers // // Storage engines which do not support document-level locking hold locks at collection or // database granularity. As an optimization, these locks can be yielded when a record needs // to be fetched from secondary storage. If this method returns non-NULL, then it indicates // that the query system layer should yield its locks, following the protocol defined by the // RecordFetcher class, so that a potential page fault is triggered out of the lock. // // Storage engines which support document-level locking need not implement this. // // TODO see if these can be replaced by WriteConflictException. // /** * Returns a RecordFetcher if needed for a call to next() or none if unneeded. */ virtual std::unique_ptr fetcherForNext() const { return {}; } /** * Returns a RecordFetcher if needed to fetch the provided Record or none if unneeded. */ virtual std::unique_ptr fetcherForId(const RecordId& id) const { return {}; } }; /** * A RecordStore provides an abstraction used for storing documents in a collection, * or entries in an index. In storage engines implementing the KVEngine, record stores * are also used for implementing catalogs. * * Many methods take an OperationContext parameter. This contains the RecoveryUnit, with * all RecordStore specific transaction information, as well as the LockState. Methods that take * an OperationContext may throw a WriteConflictException. */ class RecordStore { MONGO_DISALLOW_COPYING(RecordStore); public: RecordStore( StringData ns ) : _ns(ns.toString()) { } virtual ~RecordStore() { } // META // name of the RecordStore implementation virtual const char* name() const = 0; virtual const std::string& ns() const { return _ns; } /** * The dataSize is an approximation of the sum of the sizes (in bytes) of the * documents or entries in the recordStore. */ virtual long long dataSize(OperationContext* txn) const = 0; /** * Total number of record in the RecordStore. You may need to cache it, so this call * takes constant time, as it is called often. */ virtual long long numRecords(OperationContext* txn) const = 0; virtual bool isCapped() const = 0; virtual void setCappedDeleteCallback(CappedDocumentDeleteCallback*) {invariant( false );} /** * @param extraInfo - optional more debug info * @param level - optional, level of debug info to put in (higher is more) * @return total estimate size (in bytes) on stable storage */ virtual int64_t storageSize( OperationContext* txn, BSONObjBuilder* extraInfo = NULL, int infoLevel = 0 ) const = 0; // CRUD related /** * Get the RecordData at loc, which must exist. * * If unowned data is returned, it is valid until the next modification of this Record or * the lock on this collection is released. * * In general, prefer findRecord or RecordCursor::seekExact since they can tell you if a * record has been removed. */ virtual RecordData dataFor(OperationContext* txn, const RecordId& loc) const { RecordData data; invariant(findRecord(txn, loc, &data)); return data; } /** * @param out - If the record exists, the contents of this are set. * @return true iff there is a Record for loc * * If unowned data is returned, it is valid until the next modification of this Record or * the lock on this collection is released. * * In general prefer RecordCursor::seekExact since it can avoid copying data in more * storageEngines. * * Warning: MMAPv1 cannot detect if RecordIds are valid. Therefore callers should only pass * potentially deleted RecordIds to seek methods if they know that MMAPv1 is not the current * storage engine. All new storage engines must support detecting the existence of Records. */ virtual bool findRecord(OperationContext* txn, const RecordId& loc, RecordData* out) const { auto cursor = getCursor(txn); auto record = cursor->seekExact(loc); if (!record) return false; record->data.makeOwned(); // Unowned data expires when cursor goes out of scope. *out = std::move(record->data); return true; } virtual void deleteRecord( OperationContext* txn, const RecordId& dl ) = 0; virtual StatusWith insertRecord( OperationContext* txn, const char* data, int len, bool enforceQuota ) = 0; virtual StatusWith insertRecord( OperationContext* txn, const DocWriter* doc, bool enforceQuota ) = 0; /** * @param notifier - Only used by record stores which do not support doc-locking. * In the case of a document move, this is called after the document * has been written to the new location, but before it is deleted from * the old location. * In the case of an in-place update, this is called just before the * in-place write occurs. * @return Status or RecordId, RecordId might be different */ virtual StatusWith updateRecord( OperationContext* txn, const RecordId& oldLocation, const char* data, int len, bool enforceQuota, UpdateNotifier* notifier ) = 0; /** * @return Returns 'false' if this record store does not implement * 'updatewithDamages'. If this method returns false, 'updateWithDamages' must not be * called, and all updates must be routed through 'updateRecord' above. This allows the * update framework to avoid doing the work of damage tracking if the underlying record * store cannot utilize that information. */ virtual bool updateWithDamagesSupported() const = 0; virtual Status updateWithDamages( OperationContext* txn, const RecordId& loc, const RecordData& oldRec, const char* damageSource, const mutablebson::DamageVector& damages ) = 0; /** * Returns a new cursor over this record store. * * The cursor is logically positioned before the first (or last if !forward) Record in the * collection so that Record will be returned on the first call to next(). Implementations * are allowed to lazily seek to the first Record when next() is called rather than doing * it on construction. */ virtual std::unique_ptr getCursor(OperationContext* txn, bool forward = true) const = 0; /** * Constructs a cursor over a potentially corrupted store, which can be used to salvage * damaged records. The iterator might return every record in the store if all of them * are reachable and not corrupted. Returns NULL if not supported. * * Repair cursors are only required to support forward scanning, so it is illegal to call * seekExact() on the returned cursor. */ virtual std::unique_ptr getCursorForRepair( OperationContext* txn ) const { return {}; } /** * Returns many RecordCursors that partition the RecordStore into many disjoint sets. * Iterating all returned RecordCursors is equivalent to iterating the full store. * * Partition cursors are only required to support forward scanning, so it is illegal to call * seekExact() on any of the returned cursors. * * WARNING: the first call to restore() on each cursor may (but is not guaranteed to) be on * a different RecoveryUnit than the initial save. This will be made more sane as part of * SERVER-17364. */ virtual std::vector> getManyCursors( OperationContext* txn) const { std::vector> out(1); out[0] = getCursor(txn); return out; } // higher level /** * removes all Records */ virtual Status truncate( OperationContext* txn ) = 0; /** * Truncate documents newer than the document at 'end' from the capped * collection. The collection cannot be completely emptied using this * function. An assertion will be thrown if that is attempted. * @param inclusive - Truncate 'end' as well iff true * XXX: this will go away soon, just needed to move for now */ virtual void temp_cappedTruncateAfter(OperationContext* txn, RecordId end, bool inclusive) = 0; /** * does this RecordStore support the compact operation? * * If you return true, you must provide implementations of all compact methods. */ virtual bool compactSupported() const { return false; } /** * Does compact() leave RecordIds alone or can they change. * * Only called if compactSupported() returns true. */ virtual bool compactsInPlace() const { invariant(false); } /** * Attempt to reduce the storage space used by this RecordStore. * * Only called if compactSupported() returns true. * No RecordStoreCompactAdaptor will be passed if compactsInPlace() returns true. */ virtual Status compact( OperationContext* txn, RecordStoreCompactAdaptor* adaptor, const CompactOptions* options, CompactStats* stats ) { invariant(false); } /** * @param full - does more checks * @param scanData - scans each document * @return OK if the validate run successfully * OK will be returned even if corruption is found * deatils will be in result */ virtual Status validate( OperationContext* txn, bool full, bool scanData, ValidateAdaptor* adaptor, ValidateResults* results, BSONObjBuilder* output ) = 0; /** * @param scaleSize - amount by which to scale size metrics * appends any custom stats from the RecordStore or other unique stats */ virtual void appendCustomStats( OperationContext* txn, BSONObjBuilder* result, double scale ) const = 0; /** * Load all data into cache. * What cache depends on implementation. * * If the underlying storage engine does not support the operation, * returns ErrorCodes::CommandNotSupported * * @param output (optional) - where to put detailed stats */ virtual Status touch( OperationContext* txn, BSONObjBuilder* output ) const { return Status(ErrorCodes::CommandNotSupported, "this storage engine does not support touch"); } /** * Return the RecordId of an oplog entry as close to startingPosition as possible without * being higher. If there are no entries <= startingPosition, return RecordId(). * * If you don't implement the oplogStartHack, just use the default implementation which * returns boost::none. */ virtual boost::optional oplogStartHack(OperationContext* txn, const RecordId& startingPosition) const { return boost::none; } /** * When we write to an oplog, we call this so that if the storage engine * supports doc locking, it can manage the visibility of oplog entries to ensure * they are ordered. */ virtual Status oplogDiskLocRegister( OperationContext* txn, const Timestamp& opTime ) { return Status::OK(); } /** * Called after a repair operation is run with the recomputed numRecords and dataSize. */ virtual void updateStatsAfterRepair(OperationContext* txn, long long numRecords, long long dataSize) = 0; protected: std::string _ns; }; class RecordStoreCompactAdaptor { public: virtual ~RecordStoreCompactAdaptor(){} virtual bool isDataValid( const RecordData& recData ) = 0; virtual size_t dataSize( const RecordData& recData ) = 0; virtual void inserted( const RecordData& recData, const RecordId& newLocation ) = 0; }; struct ValidateResults { ValidateResults() { valid = true; } bool valid; std::vector errors; }; /** * This is so when a RecordStore is validating all records * it can call back to someone to check if a record is valid. * The actual data contained in a Record is totally opaque to the implementation. */ class ValidateAdaptor { public: virtual ~ValidateAdaptor(){} virtual Status validate( const RecordData& recordData, size_t* dataSize ) = 0; }; }