// record_store.h
/**
* Copyright (C) 2013 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#pragma once
#include <boost/optional.hpp>
#include <memory>
#include <string>
#include <vector>

#include "mongo/base/owned_pointer_vector.h"
#include "mongo/bson/mutable/damage_vector.h"
#include "mongo/db/exec/collection_scan_common.h"
#include "mongo/db/record_id.h"
#include "mongo/db/storage/record_data.h"
#include "mongo/db/storage/record_fetcher.h"
namespace mongo {
// Forward declarations, so that the declarations below can name these types by pointer or
// reference without needing their full definitions. DocWriter, RecordStore,
// RecordStoreCompactAdaptor, ValidateResults and ValidateAdaptor are defined later in this
// header; the remaining types are only named here.
class CappedDocumentDeleteCallback;
class Collection;
struct CompactOptions;
struct CompactStats;
class DocWriter;
class MAdvise;
class NamespaceDetails;
class OperationContext;
class RecordFetcher;
class RecordStoreCompactAdaptor;
class RecordStore;
struct ValidateResults;
class ValidateAdaptor;
/**
* Allows inserting a Record "in-place" without creating a copy ahead of time.
*/
class DocWriter {
public:
    virtual ~DocWriter() = default;

    /**
     * Writes the document into the caller-supplied buffer 'buf'.
     *
     * NOTE(review): presumably 'buf' must have room for at least documentSize() bytes --
     * confirm against the RecordStore implementations that call this.
     */
    virtual void writeDocument(char* buf) const = 0;

    /**
     * Size of the document this writer produces (presumably in bytes -- confirm).
     */
    virtual size_t documentSize() const = 0;

    /**
     * Defaults to true; implementations may override to return false.
     *
     * NOTE(review): presumably tells the store whether it may pad the inserted record --
     * confirm against callers.
     */
    virtual bool addPadding() const {
        return true;
    }
};
/**
* @see RecordStore::updateRecord
*/
class UpdateNotifier {
public:
    virtual ~UpdateNotifier() = default;

    /**
     * Notification hook for an update that relocates a record.
     *
     * Receives the record's old location together with a pointer to, and the size of, its old
     * contents. NOTE(review): per the 'notifier' documentation on RecordStore::updateRecord,
     * the move-case callback runs after the document has been written to its new location but
     * before it is deleted from the old one -- presumably that is this method; confirm.
     */
    virtual Status recordStoreGoingToMove(OperationContext* txn,
                                          const RecordId& oldLocation,
                                          const char* oldBuffer,
                                          size_t oldSize) = 0;

    /**
     * Notification hook for an update that rewrites the record at 'loc' without moving it.
     *
     * NOTE(review): per the 'notifier' documentation on RecordStore::updateRecord, the
     * in-place callback runs just before the in-place write occurs -- presumably that is this
     * method; confirm.
     */
    virtual Status recordStoreGoingToUpdateInPlace(OperationContext* txn,
                                                   const RecordId& loc) = 0;
};
/**
* The data items stored in a RecordStore.
*/
struct Record {
// Identifier of this record; this is the value RecordCursor::seekExact() looks records up by.
RecordId id;
// The record's contents. May be unowned: RecordStore::findRecord() calls makeOwned() on it
// before letting the cursor that produced it go out of scope.
RecordData data;
};
/**
* Retrieves Records from a RecordStore.
*
* A cursor is constructed with a direction flag with the following effects:
* - The direction that next() moves.
* - If a restore cannot return to the saved position, cursors will be positioned on the
* closest position *after* the query in the direction of the scan.
*
* A cursor is tied to a transaction, such as the OperationContext or a WriteUnitOfWork
* inside that context. Any cursor acquired inside a transaction is invalid outside
* of that transaction, instead use the save and restore methods to reestablish the cursor.
*
* Any method other than invalidate and the save methods may throw WriteConflict exception. If
* that happens, the cursor may not be used again until it has been saved and successfully
* restored. If next() or restore() throw a WCE the cursor's position will be the same as before
* the call (strong exception guarantee). All other methods leave the cursor in a valid state
* but with an unspecified position (basic exception guarantee). If any exception other than
* WCE is thrown, the cursor must be destroyed, which is guaranteed not to leak any resources.
*
* Any returned unowned BSON is only valid until the next call to any method on this
* interface.
*
* Implementations may override any default implementation if they can provide a more
* efficient implementation.
*/
class RecordCursor {
public:
virtual ~RecordCursor() = default;
/**
* Moves forward and returns the new data or boost::none if there is no more data.
* Continues returning boost::none once it reaches EOF.
*/
virtual boost::optional next() = 0;
//
// Seeking
//
// Warning: MMAPv1 cannot detect if RecordIds are valid. Therefore callers should only pass
// potentially deleted RecordIds to seek methods if they know that MMAPv1 is not the current
// storage engine. All new storage engines must support detecting the existence of Records.
//
/**
* Seeks to a Record with the provided id.
*
* If an exact match can't be found, boost::none will be returned and the resulting position
* of the cursor is unspecified.
*/
virtual boost::optional seekExact(const RecordId& id) = 0;
//
// Saving and restoring state
//
/**
* Prepares for state changes in underlying data in a way that allows the cursor's
* current position to be restored.
*
* It is safe to call savePositioned multiple times in a row.
* No other method (excluding destructor) may be called until successfully restored.
*/
virtual void savePositioned() = 0;
/**
* Prepares for state changes in underlying data without necessarily saving the current
* state.
*
* The cursor's position when restored is unspecified. Caller is expected to seek rather
* than call next() following the restore.
*
* It is safe to call saveUnpositioned multiple times in a row.
* No other method (excluding destructor) may be called until successfully restored.
*/
virtual void saveUnpositioned() { savePositioned(); }
/**
* Recovers from potential state changes in underlying data.
*
* Returns false if it is invalid to continue using this iterator. This usually means that
* capped deletes have caught up to the position of this iterator and continuing could
* result in missed data.
*
* If the former position no longer exists, but it is safe to continue iterating, the
* following call to next() will return the next closest position in the direction of the
* scan, if any.
*
* This handles restoring after either savePositioned() or saveUnpositioned().
*/
virtual bool restore(OperationContext* txn) = 0;
/**
* Inform the cursor that this id is being invalidated.
* Must be called between save and restore.
*
* WARNING: Storage engines other than MMAPv1 should not depend on this being called.
*/
virtual void invalidate(const RecordId& id) {};
//
// RecordFetchers
//
// Storage engines which do not support document-level locking hold locks at collection or
// database granularity. As an optimization, these locks can be yielded when a record needs
// to be fetched from secondary storage. If this method returns non-NULL, then it indicates
// that the query system layer should yield its locks, following the protocol defined by the
// RecordFetcher class, so that a potential page fault is triggered out of the lock.
//
// Storage engines which support document-level locking need not implement this.
//
// TODO see if these can be replaced by WriteConflictException.
//
/**
* Returns a RecordFetcher if needed for a call to next() or none if unneeded.
*/
virtual std::unique_ptr fetcherForNext() const { return {}; }
/**
* Returns a RecordFetcher if needed to fetch the provided Record or none if unneeded.
*/
virtual std::unique_ptr fetcherForId(const RecordId& id) const { return {}; }
};
/**
* A RecordStore provides an abstraction used for storing documents in a collection,
* or entries in an index. In storage engines implementing the KVEngine, record stores
* are also used for implementing catalogs.
*
* Many methods take an OperationContext parameter. This contains the RecoveryUnit, with
* all RecordStore specific transaction information, as well as the LockState. Methods that take
* an OperationContext may throw a WriteConflictException.
*/
class RecordStore {
MONGO_DISALLOW_COPYING(RecordStore);
public:
RecordStore( StringData ns ) : _ns(ns.toString()) { }
virtual ~RecordStore() { }
// META
// name of the RecordStore implementation
virtual const char* name() const = 0;
virtual const std::string& ns() const { return _ns; }
/**
* The dataSize is an approximation of the sum of the sizes (in bytes) of the
* documents or entries in the recordStore.
*/
virtual long long dataSize(OperationContext* txn) const = 0;
/**
* Total number of record in the RecordStore. You may need to cache it, so this call
* takes constant time, as it is called often.
*/
virtual long long numRecords(OperationContext* txn) const = 0;
virtual bool isCapped() const = 0;
virtual void setCappedDeleteCallback(CappedDocumentDeleteCallback*) {invariant( false );}
/**
* @param extraInfo - optional more debug info
* @param level - optional, level of debug info to put in (higher is more)
* @return total estimate size (in bytes) on stable storage
*/
virtual int64_t storageSize( OperationContext* txn,
BSONObjBuilder* extraInfo = NULL,
int infoLevel = 0 ) const = 0;
// CRUD related
/**
* Get the RecordData at loc, which must exist.
*
* If unowned data is returned, it is valid until the next modification of this Record or
* the lock on this collection is released.
*
* In general, prefer findRecord or RecordCursor::seekExact since they can tell you if a
* record has been removed.
*/
virtual RecordData dataFor(OperationContext* txn, const RecordId& loc) const {
RecordData data;
invariant(findRecord(txn, loc, &data));
return data;
}
/**
* @param out - If the record exists, the contents of this are set.
* @return true iff there is a Record for loc
*
* If unowned data is returned, it is valid until the next modification of this Record or
* the lock on this collection is released.
*
* In general prefer RecordCursor::seekExact since it can avoid copying data in more
* storageEngines.
*
* Warning: MMAPv1 cannot detect if RecordIds are valid. Therefore callers should only pass
* potentially deleted RecordIds to seek methods if they know that MMAPv1 is not the current
* storage engine. All new storage engines must support detecting the existence of Records.
*/
virtual bool findRecord(OperationContext* txn,
const RecordId& loc,
RecordData* out) const {
auto cursor = getCursor(txn);
auto record = cursor->seekExact(loc);
if (!record) return false;
record->data.makeOwned(); // Unowned data expires when cursor goes out of scope.
*out = std::move(record->data);
return true;
}
virtual void deleteRecord( OperationContext* txn, const RecordId& dl ) = 0;
virtual StatusWith insertRecord( OperationContext* txn,
const char* data,
int len,
bool enforceQuota ) = 0;
virtual StatusWith insertRecord( OperationContext* txn,
const DocWriter* doc,
bool enforceQuota ) = 0;
/**
* @param notifier - Only used by record stores which do not support doc-locking.
* In the case of a document move, this is called after the document
* has been written to the new location, but before it is deleted from
* the old location.
* In the case of an in-place update, this is called just before the
* in-place write occurs.
* @return Status or RecordId, RecordId might be different
*/
virtual StatusWith updateRecord( OperationContext* txn,
const RecordId& oldLocation,
const char* data,
int len,
bool enforceQuota,
UpdateNotifier* notifier ) = 0;
/**
* @return Returns 'false' if this record store does not implement
* 'updatewithDamages'. If this method returns false, 'updateWithDamages' must not be
* called, and all updates must be routed through 'updateRecord' above. This allows the
* update framework to avoid doing the work of damage tracking if the underlying record
* store cannot utilize that information.
*/
virtual bool updateWithDamagesSupported() const = 0;
virtual Status updateWithDamages( OperationContext* txn,
const RecordId& loc,
const RecordData& oldRec,
const char* damageSource,
const mutablebson::DamageVector& damages ) = 0;
/**
* Returns a new cursor over this record store.
*
* The cursor is logically positioned before the first (or last if !forward) Record in the
* collection so that Record will be returned on the first call to next(). Implementations
* are allowed to lazily seek to the first Record when next() is called rather than doing
* it on construction.
*/
virtual std::unique_ptr getCursor(OperationContext* txn,
bool forward = true) const = 0;
/**
* Constructs a cursor over a potentially corrupted store, which can be used to salvage
* damaged records. The iterator might return every record in the store if all of them
* are reachable and not corrupted. Returns NULL if not supported.
*
* Repair cursors are only required to support forward scanning, so it is illegal to call
* seekExact() on the returned cursor.
*/
virtual std::unique_ptr getCursorForRepair( OperationContext* txn ) const {
return {};
}
/**
* Returns many RecordCursors that partition the RecordStore into many disjoint sets.
* Iterating all returned RecordCursors is equivalent to iterating the full store.
*
* Partition cursors are only required to support forward scanning, so it is illegal to call
* seekExact() on any of the returned cursors.
*
* WARNING: the first call to restore() on each cursor may (but is not guaranteed to) be on
* a different RecoveryUnit than the initial save. This will be made more sane as part of
* SERVER-17364.
*/
virtual std::vector> getManyCursors(
OperationContext* txn) const {
std::vector> out(1);
out[0] = getCursor(txn);
return out;
}
// higher level
/**
* removes all Records
*/
virtual Status truncate( OperationContext* txn ) = 0;
/**
* Truncate documents newer than the document at 'end' from the capped
* collection. The collection cannot be completely emptied using this
* function. An assertion will be thrown if that is attempted.
* @param inclusive - Truncate 'end' as well iff true
* XXX: this will go away soon, just needed to move for now
*/
virtual void temp_cappedTruncateAfter(OperationContext* txn,
RecordId end,
bool inclusive) = 0;
/**
* does this RecordStore support the compact operation?
*
* If you return true, you must provide implementations of all compact methods.
*/
virtual bool compactSupported() const { return false; }
/**
* Does compact() leave RecordIds alone or can they change.
*
* Only called if compactSupported() returns true.
*/
virtual bool compactsInPlace() const { invariant(false); }
/**
* Attempt to reduce the storage space used by this RecordStore.
*
* Only called if compactSupported() returns true.
* No RecordStoreCompactAdaptor will be passed if compactsInPlace() returns true.
*/
virtual Status compact( OperationContext* txn,
RecordStoreCompactAdaptor* adaptor,
const CompactOptions* options,
CompactStats* stats ) {
invariant(false);
}
/**
* @param full - does more checks
* @param scanData - scans each document
* @return OK if the validate run successfully
* OK will be returned even if corruption is found
* deatils will be in result
*/
virtual Status validate( OperationContext* txn,
bool full, bool scanData,
ValidateAdaptor* adaptor,
ValidateResults* results, BSONObjBuilder* output ) = 0;
/**
* @param scaleSize - amount by which to scale size metrics
* appends any custom stats from the RecordStore or other unique stats
*/
virtual void appendCustomStats( OperationContext* txn,
BSONObjBuilder* result,
double scale ) const = 0;
/**
* Load all data into cache.
* What cache depends on implementation.
*
* If the underlying storage engine does not support the operation,
* returns ErrorCodes::CommandNotSupported
*
* @param output (optional) - where to put detailed stats
*/
virtual Status touch( OperationContext* txn, BSONObjBuilder* output ) const {
return Status(ErrorCodes::CommandNotSupported,
"this storage engine does not support touch");
}
/**
* Return the RecordId of an oplog entry as close to startingPosition as possible without
* being higher. If there are no entries <= startingPosition, return RecordId().
*
* If you don't implement the oplogStartHack, just use the default implementation which
* returns boost::none.
*/
virtual boost::optional oplogStartHack(OperationContext* txn,
const RecordId& startingPosition) const {
return boost::none;
}
/**
* When we write to an oplog, we call this so that if the storage engine
* supports doc locking, it can manage the visibility of oplog entries to ensure
* they are ordered.
*/
virtual Status oplogDiskLocRegister( OperationContext* txn,
const Timestamp& opTime ) {
return Status::OK();
}
/**
* Called after a repair operation is run with the recomputed numRecords and dataSize.
*/
virtual void updateStatsAfterRepair(OperationContext* txn,
long long numRecords,
long long dataSize) = 0;
protected:
std::string _ns;
};
/**
 * Callback interface handed to RecordStore::compact(). All operations take the RecordData of
 * a single record.
 */
class RecordStoreCompactAdaptor {
public:
    virtual ~RecordStoreCompactAdaptor() = default;

    // Reports whether the contents of 'recData' are valid.
    virtual bool isDataValid(const RecordData& recData) = 0;

    // Reports the data size of 'recData' (presumably in bytes -- confirm with implementers).
    virtual size_t dataSize(const RecordData& recData) = 0;

    // Notification that 'recData' has been inserted at 'newLocation'.
    virtual void inserted(const RecordData& recData, const RecordId& newLocation) = 0;
};
/**
 * Outcome of a RecordStore::validate() run.
 */
struct ValidateResults {
    // A freshly constructed result is valid and carries no errors.
    ValidateResults() : valid(true) {}

    // True until something marks the result invalid.
    bool valid;

    // Error messages collected so far; empty on construction.
    std::vector<std::string> errors;
};
/**
* This is so when a RecordStore is validating all records
* it can call back to someone to check if a record is valid.
* The actual data contained in a Record is totally opaque to the implementation.
*/
class ValidateAdaptor {
public:
    virtual ~ValidateAdaptor() = default;

    /**
     * Checks a single record's contents on behalf of a RecordStore that is validating.
     *
     * @param recordData - the record contents to check
     * @param dataSize - out-parameter (presumably receives the size of the validated data --
     *                   confirm with implementers)
     * @return Status describing the outcome of the check
     */
    virtual Status validate(const RecordData& recordData, size_t* dataSize) = 0;
};
}