'use strict';

/**
 * Performs refineCollectionShardKey concurrently with chunk migrations (moveChunk). Each
 * successful refine counts down a latch so that subsequent operations target a new
 * latch-suffixed collection, and the router's config cache is occasionally flushed.
 *
 * @tags: [
 *  requires_sharding,
 *  assumes_balancer_off,
 *  requires_non_retryable_writes,
 *  uses_transactions,
 * ]
 */
load('jstests/concurrency/fsm_libs/extend_workload.js');
load('jstests/concurrency/fsm_workloads/random_moveChunk_base.js');

var $config = extendWorkload($config, function($config, $super) {
    $config.threadCount = 5;
    $config.iterations = 50;

    // Number of documents per partition. Note that there is one chunk per partition and one
    // partition per thread.
    $config.data.partitionSize = 100;

    $config.data.defaultShardKeyField = 'a';
    $config.data.defaultShardKey = {a: 1};

    // The variables used by the random_moveChunk_base config to move chunks.
    $config.data.shardKey = {a: 1};
    $config.data.newShardKey = {a: 1, b: 1};
    $config.data.newShardKeyFields = ["a", "b"];

    // Use a CountDownLatch as if it were a std::atomic<long long> shared between all of the
    // threads. The collection name is suffixed with the current this.latch.getCount() value
    // when concurrent CRUD operations are run against it. With every refineCollectionShardKey,
    // call this.latch.countDown() and run CRUD operations against the new collection suffixed
    // with this.latch.getCount(). This avoids having to drop and reshard the current collection
    // after every refineCollectionShardKey, which cannot be done atomically under the FSM
    // infrastructure (concurrent CRUD operations would otherwise fail).
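    //
    // Example: with latchCount = 50, operations initially target "<collName>_50"; after the
    // first successful refineCollectionShardKey the latch counts down to 49 and subsequent
    // operations target "<collName>_49", and so on.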
    $config.data.latchCount = $config.iterations;
    $config.data.latch = new CountDownLatch($config.data.latchCount);

    $config.data.getCurrentLatchCollName = function(collName) {
        return collName + '_' + this.latch.getCount().toString();
    };

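    // Returns either the current latch collection name or, half the time, the previous one
    // (suffix getCount() + 1, clamped to latchCount). The previous collection has typically
    // already had its shard key refined, so moveChunk is exercised against both pre- and
    // post-refine collections.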
    $config.data.getCurrentOrPreviousLatchCollName = function(collName) {
        const latchNumber = (Random.rand() < 0.5)
            ? this.latch.getCount()
            : Math.min(this.latch.getCount() + 1, this.latchCount);

        return collName + '_' + latchNumber.toString();
    };

    // Because updates don't have a shard filter stage, a migration may fail if a broadcast
    // update is operating on orphans from a previous migration in the range being migrated
    // back in. The specific error code is replaced with a more generic one, so this case is
    // identified by the failed migration's error message.
    $config.data.isMoveChunkErrorAcceptable = (err) => {
        const codes = [
            // TODO SERVER-68551: Remove the LockBusy error once the balancer no longer acquires
            // the DDL lock for migrations.
            ErrorCodes.LockBusy,
            ErrorCodes.ShardKeyNotFound,
            ErrorCodes.LockTimeout,
            // The refineCollectionShardKey coordinator interrupts all ongoing migrations by
            // setting `allowMigrations` to false.
            ErrorCodes.Interrupted,
        ];
        return (err.code && codes.includes(err.code)) ||
            (err.message &&
             (err.message.includes("CommandFailed") ||
              err.message.includes("Documents in target range may still be in use") ||
              // This error will occur as a result of trying to move a chunk with a pre-refine
              // collection epoch.
              err.message.includes("collection may have been dropped") ||
              // This error will occur if a moveChunk command has been sent with chunk boundaries
              // that represent the pre-refine chunks, but the collection has already been changed
              // to possess the post-refine chunk boundaries.
              (err.message.includes("shard key bounds") &&
               err.message.includes("are not valid for shard key pattern"))));
    };

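    // Refines the current latch collection's shard key from {a: 1} to {a: 1, b: 1}; on success,
    // records the new shard key fields for the collection and counts down the latch so that
    // subsequent operations target the next latch-suffixed collection.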
    $config.states.refineCollectionShardKey = function refineCollectionShardKey(
        db, collName, connCache) {
        const latchCollName = this.getCurrentLatchCollName(collName);

        try {
            assertAlways.commandWorked(db.adminCommand({
                refineCollectionShardKey: db.getCollection(latchCollName).getFullName(),
                key: this.newShardKey
            }));
        } catch (e) {
            // There is a race that could occur where two threads run refineCollectionShardKey
            // concurrently on the same collection. Since the epoch of the collection changes,
            // the later thread may receive a StaleEpoch error, which is an acceptable error.
            //
            // It is also possible to receive a LockBusy error if refineCollectionShardKey is unable
            // to acquire the distlock before timing out due to ongoing migrations acquiring the
            // distlock first.
            // TODO SERVER-68551: Remove the LockBusy error once the balancer no longer acquires
            // the DDL lock for migrations.
            if (e.code == ErrorCodes.StaleEpoch || e.code == ErrorCodes.LockBusy) {
                print("Ignoring acceptable refineCollectionShardKey error: " + tojson(e));
                return;
            }
            throw e;
        }

        this.shardKeyField[latchCollName] = this.newShardKeyFields;
        this.latch.countDown();
    };

    $config.states.moveChunk = function moveChunk(db, collName, connCache) {
        $super.states.moveChunk.apply(
            this, [db, this.getCurrentOrPreviousLatchCollName(collName), connCache]);
    };

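    // Initialize every latch-suffixed collection so each is populated before any
    // refineCollectionShardKey or moveChunk runs against it.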
    $config.states.init = function init(db, collName, connCache) {
        for (let i = this.latchCount; i >= 0; --i) {
            const latchCollName = collName + '_' + i;
            $super.states.init.apply(this, [db, latchCollName, connCache]);
        }
    };

    // Occasionally flush the router's cached metadata to verify the metadata for the refined
    // collections can be successfully loaded.
    $config.states.flushRouterConfig = function flushRouterConfig(db, collName, connCache) {
        assert.commandWorked(db.adminCommand({flushRouterConfig: db.getName()}));
    };

    $config.transitions = {
        init: {moveChunk: 0.4, refineCollectionShardKey: 0.4, flushRouterConfig: 0.2},
        moveChunk: {moveChunk: 0.4, refineCollectionShardKey: 0.4, flushRouterConfig: 0.2},
        refineCollectionShardKey:
            {moveChunk: 0.4, refineCollectionShardKey: 0.4, flushRouterConfig: 0.2},
        flushRouterConfig: {moveChunk: 0.5, refineCollectionShardKey: 0.5},
    };

    $config.setup = function setup(db, collName, cluster) {
        // Proactively create and shard all possible collections suffixed with this.latch.getCount()
        // that could receive CRUD operations over the course of the FSM workload. This prevents the
        // race that could occur between sharding a collection and creating an index on the new
        // shard key (if this step were done after every refineCollectionShardKey).
        for (let i = this.latchCount; i >= 0; --i) {
            const latchCollName = collName + '_' + i;
            let coll = db.getCollection(latchCollName);
            assertAlways.commandWorked(
                db.adminCommand({shardCollection: coll.getFullName(), key: this.defaultShardKey}));
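            // refineCollectionShardKey requires the new shard key to be supported by an
            // existing index, so create it on every collection up front.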
            assertAlways.commandWorked(coll.createIndex(this.newShardKey));
            $super.setup.apply(this, [db, latchCollName, cluster]);
        }
    };

    return $config;
});