summaryrefslogtreecommitdiff
path: root/src/mongo/db/storage/snapshot_helper.cpp
blob: 2fecee62b269e01f763a979e7a9e1986351afc87 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
/**
 *    Copyright (C) 2020-present MongoDB, Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the Server Side Public License, version 1,
 *    as published by MongoDB, Inc.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    Server Side Public License for more details.
 *
 *    You should have received a copy of the Server Side Public License
 *    along with this program. If not, see
 *    <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the Server Side Public License in all respects for
 *    all of the code used other than as permitted herein. If you modify file(s)
 *    with this exception, you may extend this exception to your version of the
 *    file(s), but you are not obligated to do so. If you do not wish to do so,
 *    delete this exception statement from your version. If you delete this
 *    exception statement from all source files in the program, then also delete
 *    it in the license file.
 */


#include "mongo/platform/basic.h"

#include "mongo/db/storage/snapshot_helper.h"

#include "mongo/db/repl/read_concern_args.h"
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/logv2/log.h"

#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kStorage


namespace mongo {
namespace {
/**
 * Reports whether this operation is allowed to have its ReadSource overridden so that it reads at
 * lastApplied.
 *
 * Two conditions must both hold:
 *  - The client is either a user connection or is running inside DBDirectClient. Internal
 *    operations use DBDirectClient as a loopback for local operations and expect the same
 *    consistency guarantees as user operations, so DBDirectClient may change the owning
 *    operation's ReadSource in order to serve consistent data.
 *  - The read concern level is 'local' or 'available'. These are the only levels whose ReadSource
 *    may be overridden. They read without a timestamp by default; reading at lastApplied instead
 *    keeps user secondary reads from conflicting with oplog batch application by reading at a
 *    consistent point in time.
 */
bool canReadAtLastApplied(OperationContext* opCtx) {
    auto* const client = opCtx->getClient();
    if (!client->isFromUserConnection() && !client->isInDirectClient()) {
        return false;
    }

    const auto level = repl::ReadConcernArgs::get(opCtx).getLevel();
    return level == repl::ReadConcernLevel::kLocalReadConcern ||
        level == repl::ReadConcernLevel::kAvailableReadConcern;
}

/**
 * Decides whether a read on 'nss' should be performed at the lastApplied timestamp.
 *
 * Returns true only if none of the rejection conditions below apply. When the answer is false and
 * 'reason' is non-null, '*reason' is set to a short description of the condition that rejected
 * reading at lastApplied. '*reason' is left untouched when the function returns true.
 */
bool shouldReadAtLastApplied(OperationContext* opCtx,
                             const NamespaceString& nss,
                             std::string* reason) {
    // Stores the rejection explanation for the caller (if one was requested) and produces the
    // 'false' result, so every rejection below can be written as a single return statement.
    const auto reject = [reason](const char* why) {
        if (reason) {
            *reason = why;
        }
        return false;
    };

    auto* const locker = opCtx->lockState();

    // An operation that conflicts with secondary batch application has opted in to the PBWM lock,
    // which implies it cannot change its ReadSource. Note that this flag can be false while the
    // PBWM lock is nevertheless held; that situation is handled by the next check.
    if (locker->shouldConflictWithSecondaryBatchApplication()) {
        return reject("conflicts with batch application");
    }

    // Do not change ReadSource while the PBWM lock is already held. Snapshots acquired by an
    // operation after a yield/restore must see all writes in the pre-yield snapshot, so once a
    // snapshot is reading without a timestamp we keep acquiring snapshots without a timestamp,
    // rather than trying to determine a timestamp far enough in the future to be guaranteed to
    // observe all previous writes. This can happen when multiple collection locks are held
    // concurrently, which is often the case when DBDirectClient is used.
    if (locker->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) {
        LOGV2_DEBUG(20577, 1, "not reading at lastApplied because the PBWM lock is held");
        return reject("PBWM lock is held");
    }

    auto* const replCoord = repl::ReplicationCoordinator::get(opCtx);

    // A node that can accept writes (i.e. a primary) has no conflicting replication batches being
    // applied, so the default snapshot is fine. In replication states where writes are not
    // accepted (such as secondary or primary catch-up) we should read at lastApplied.
    if (replCoord->canAcceptWritesForDatabase(opCtx, "admin")) {
        return reject("primary");
    }

    // Outside of the primary and secondary states, lastApplied may not be available or valid.
    // Operations reading in such states must be internal, so give them the benefit of the doubt
    // instead of attempting to read at a lastApplied timestamp that is not valid.
    if (!replCoord->isInPrimaryOrSecondaryState(opCtx)) {
        return reject("not primary or secondary");
    }

    // Non-replicated collections are not written by the replication system and therefore do not
    // need to read at lastApplied. The oplog is the exception: it *is* written by the replication
    // system.
    if (!(nss.isReplicated() || nss.isOplog())) {
        return reject("unreplicated collection");
    }

    // Linearizable reads must always read from latest and are only allowed on primaries, so they
    // never read at lastApplied.
    const auto level = repl::ReadConcernArgs::get(opCtx).getLevel();
    if (level == repl::ReadConcernLevel::kLinearizableReadConcern) {
        return reject("linearizable read concern");
    }

    return true;
}
}  // namespace

namespace SnapshotHelper {

/**
 * Determines whether the operation's ReadSource should be switched before reading 'nss'.
 *
 * The result pairs an optional new ReadSource with a flag saying whether the operation should be
 * reading at lastApplied:
 *  - {boost::none, <flag>} when the operation may not have its ReadSource overridden, or when the
 *    current ReadSource already matches what is wanted.
 *  - {boost::none, false} when the ReadSource is pinned.
 *  - {kLastApplied, true} when currently at kNoTimestamp but lastApplied should be used.
 *  - {kNoTimestamp, false} when currently at kLastApplied but lastApplied can no longer be used.
 */
ReadSourceChange shouldChangeReadSource(OperationContext* opCtx, const NamespaceString& nss) {
    std::string whyNotLastApplied;
    const bool readAtLastApplied = shouldReadAtLastApplied(opCtx, nss, &whyNotLastApplied);

    // Operations that may not be overridden keep their ReadSource; only the preference is reported.
    if (!canReadAtLastApplied(opCtx)) {
        return {boost::none, readAtLastApplied};
    }

    auto* const ru = opCtx->recoveryUnit();
    const auto currentSource = ru->getTimestampReadSource();

    // A pinned ReadSource is never changed; log which source would otherwise have been requested.
    if (ru->isReadSourcePinned()) {
        const auto rejectedSource = readAtLastApplied ? RecoveryUnit::ReadSource::kLastApplied
                                                      : RecoveryUnit::ReadSource::kNoTimestamp;
        LOGV2_DEBUG(5863601,
                    2,
                    "Not changing readSource as it is pinned",
                    "current"_attr = RecoveryUnit::toString(currentSource),
                    "rejected"_attr = RecoveryUnit::toString(rejectedSource));
        return {boost::none, false};
    }

    // kNoTimestamp -> kLastApplied.
    // Moving from an untimestamped read to a timestamped one can be dangerous because writes will
    // appear to vanish. This transition is intended for new reads on secondaries and for query
    // yield recovery after a state transition from primary to secondary: if a query recovers from
    // a yield and the node is no longer primary, it must start reading at the lastApplied point
    // because reading without a timestamp is not safe.
    if (currentSource == RecoveryUnit::ReadSource::kNoTimestamp && readAtLastApplied) {
        LOGV2_DEBUG(4452901, 2, "Changing ReadSource to kLastApplied", logAttrs(nss));
        return {RecoveryUnit::ReadSource::kLastApplied, readAtLastApplied};
    }

    // kLastApplied -> kNoTimestamp.
    // For some reason we can no longer read at lastApplied. An operation that yields a timestamped
    // snapshot must restore a snapshot with at least as large a timestamp or, with proper
    // consideration of rollback scenarios, no timestamp. Because readers do not survive rollbacks,
    // going from a timestamped read to an untimestamped one is okay; more writes become visible.
    //
    // This shift assumes that callers will not make future attempts to manipulate their
    // ReadSources after performing reads at an un-timestamped snapshot. The only exception is
    // callers of this function that may need to change from kNoTimestamp to kLastApplied in the
    // event of a catalog conflict or query yield.
    if (currentSource == RecoveryUnit::ReadSource::kLastApplied && !readAtLastApplied) {
        LOGV2_DEBUG(4452902,
                    2,
                    "Changing ReadSource to kNoTimestamp",
                    logAttrs(nss),
                    "reason"_attr = whyNotLastApplied);
        return {RecoveryUnit::ReadSource::kNoTimestamp, readAtLastApplied};
    }

    // Either the current ReadSource already agrees with the preference, or it is a source this
    // function does not manage.
    return {boost::none, readAtLastApplied};
}

/**
 * Returns true when a read at 'readTimestamp' would be inconsistent with a collection whose
 * minimum valid timestamp is 'collectionMin', i.e. when the collection's minimum timestamp is
 * strictly newer than the read timestamp.
 *
 * Returns false when 'collectionMin' is unset, or when there is no point in time to compare
 * against ('readTimestamp' is unset or null).
 */
bool collectionChangesConflictWithRead(boost::optional<Timestamp> collectionMin,
                                       boost::optional<Timestamp> readTimestamp) {
    // Without both a collection minimum and a usable read point there is nothing to conflict with.
    if (!collectionMin || !readTimestamp || readTimestamp->isNull()) {
        return false;
    }

    // If the last change to the collection happened at or before the read timestamp, the storage
    // snapshot matches the collection's in-memory state. The only inconsistency is an older storage
    // snapshot paired with a collection whose min timestamp is newer.
    return *readTimestamp < *collectionMin;
}
}  // namespace SnapshotHelper
}  // namespace mongo