diff options
author | Pierlauro Sciarelli <pierlauro.sciarelli@mongodb.com> | 2021-10-22 16:55:43 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-10-22 17:08:07 +0000 |
commit | 6d6a3e814c3a36b180d696eef3a5ae4914c1f306 (patch) | |
tree | 0f75f4aff8464e07897b82c19115f6f7da596413 | |
parent | 7ea91003d2dcd8d02599dc56a157ffd3f79c9fb6 (diff) | |
download | mongo-6d6a3e814c3a36b180d696eef3a5ae4914c1f306.tar.gz |
SERVER-60652 Backport the autoSplitVector logic to v4.4
24 files changed, 1204 insertions, 161 deletions
diff --git a/jstests/core/views/views_all_commands.js b/jstests/core/views/views_all_commands.js index 1fdaf68dca1..260eaef9b39 100644 --- a/jstests/core/views/views_all_commands.js +++ b/jstests/core/views/views_all_commands.js @@ -136,6 +136,14 @@ let viewsCommandTests = { skipSharded: true, }, authenticate: {skip: isUnrelated}, + autoSplitVector: { + command: { + splitVector: "test.view", + keyPattern: {x: 1}, + maxChunkSize: 1, + }, + expectFailure: true, + }, availableQueryOptions: {skip: isAnInternalCommand}, balancerCollectionStatus: { command: {balancerCollectionStatus: "test.view"}, diff --git a/jstests/replsets/db_reads_while_recovering_all_commands.js b/jstests/replsets/db_reads_while_recovering_all_commands.js index 5bb5d5f6a00..7bcd62f3de3 100644 --- a/jstests/replsets/db_reads_while_recovering_all_commands.js +++ b/jstests/replsets/db_reads_while_recovering_all_commands.js @@ -76,6 +76,7 @@ const allCommands = { appendOplogNote: {skip: isPrimaryOnly}, applyOps: {skip: isPrimaryOnly}, authenticate: {skip: isNotAUserDataRead}, + autoSplitVector: {skip: isNotAUserDataRead}, availableQueryOptions: {skip: isNotAUserDataRead}, buildInfo: {skip: isNotAUserDataRead}, captrunc: {skip: isPrimaryOnly}, diff --git a/jstests/sharding/autosplit_low_cardinality.js b/jstests/sharding/autosplit_low_cardinality.js index f9c4d8bf2ae..479c20361a9 100644 --- a/jstests/sharding/autosplit_low_cardinality.js +++ b/jstests/sharding/autosplit_low_cardinality.js @@ -42,13 +42,40 @@ insertBigDocsWithKey(-10, 4); insertBigDocsWithKey(10, 4); waitForOngoingChunkSplits(st); -// At least one split should have been performed -assert.gte(numChunks(), 2, "Number of chunks is less then 2, no split have been perfomed"); +let expectedNumChunks = 2; +try { + // At least one split should have been performed + assert.gte(numChunks(), + expectedNumChunks, + "Number of chunks is less than 2, no split have been perfomed"); +} catch (e) { + // (SERVER-59882) split may not have happened due to commit delay of the inserted documents + print("Retrying performing one insert after catching exception " + e); + insertBigDocsWithKey(10, 1); + waitForOngoingChunkSplits(st); + assert.gte( + numChunks(), + expectedNumChunks, + "Number of chunks is less than " + expectedNumChunks + ", no split has been perfomed"); +} + +expectedNumChunks++; insertBigDocsWithKey(20, 4); waitForOngoingChunkSplits(st); // An additional split should have been performed -assert.gte(numChunks(), 3, "Number of chunks must be at least 3"); +try { + assert.gte(numChunks(), expectedNumChunks, "Number of chunks must be at least 3"); +} catch (e) { + // (SERVER-59882) split may not have happened due to commit delay of the inserted documents + print("Retrying performing one insert after catching exception " + e); + insertBigDocsWithKey(20, 1); + waitForOngoingChunkSplits(st); + assert.gte( + numChunks(), + expectedNumChunks, + "Number of chunks is less than " + 3 + ", not all expected splits have been perfomed"); +} st.stop(); })(); diff --git a/jstests/sharding/presplit.js b/jstests/sharding/presplit.js index 87d4e81a315..9fbec19f0ec 100644 --- a/jstests/sharding/presplit.js +++ b/jstests/sharding/presplit.js @@ -1,3 +1,9 @@ +/* + * @tags: [ + * requires_fcv_44, # autoSplitVector not present in older v4.2 binaries + * ] + */ + (function() { var s = new ShardingTest({name: "presplit", shards: 2, mongos: 1, other: {chunkSize: 1}}); @@ -31,7 +37,7 @@ s.adminCommand({shardcollection: "test.foo", key: {_id: 1}}); // Make sure the collection's original chunk got split s.printChunks(); -assert.lt(20, s.config.chunks.count({"ns": "test.foo"}), "many chunks assertion"); +assert.lte(20, s.config.chunks.count({"ns": "test.foo"}), "many chunks assertion"); assert.eq(num, primary.foo.count()); s.printChangeLog(); diff --git a/jstests/sharding/read_write_concern_defaults_application.js b/jstests/sharding/read_write_concern_defaults_application.js index 43e3abbf3e6..b1afbbadd8b 100644 --- a/jstests/sharding/read_write_concern_defaults_application.js +++ b/jstests/sharding/read_write_concern_defaults_application.js @@ -163,6 +163,7 @@ let testCases = { }, applyOps: {skip: "internal command"}, authenticate: {skip: "does not accept read or write concern"}, + autoSplitVector: {skip: "internal command"}, availableQueryOptions: {skip: "internal command"}, balancerCollectionStatus: {skip: "does not accept read or write concern"}, balancerStart: {skip: "does not accept read or write concern"}, diff --git a/jstests/sharding/shard_existing.js b/jstests/sharding/shard_existing.js index 6e3242647f8..8fb475367e3 100644 --- a/jstests/sharding/shard_existing.js +++ b/jstests/sharding/shard_existing.js @@ -1,3 +1,9 @@ +/* + * @tags: [ + * requires_fcv_44, # autoSplitVector not present in older v4.2 binaries + * ] + */ + (function() { 'use strict'; @@ -28,10 +34,10 @@ s.ensurePrimaryShard('test', s.shard1.shardName); var res = s.adminCommand({shardcollection: "test.data", key: {_id: 1}}); printjson(res); -// number of chunks should be approx equal to the total data size / half the chunk size +// number of chunks should be approx equal to the total data size / chunk size var numChunks = s.config.chunks.find({ns: 'test.data'}).itcount(); -var guess = Math.ceil(dataSize / (512 * 1024 + avgObjSize)); -assert(Math.abs(numChunks - guess) < 2, "not right number of chunks"); +var guess = Math.ceil(dataSize / (1024 * 1024 + avgObjSize)); +assert.lte(Math.abs(numChunks - guess), 2, "not right number of chunks"); s.stop(); })(); diff --git a/jstests/sharding/shard_existing_coll_chunk_count.js b/jstests/sharding/shard_existing_coll_chunk_count.js index 7ee54444976..51fe2239b9f 100644 --- a/jstests/sharding/shard_existing_coll_chunk_count.js +++ b/jstests/sharding/shard_existing_coll_chunk_count.js @@ -2,7 +2,10 @@ * This test confirms that after sharding a collection with some pre-existing data, * the resulting chunks aren't auto-split too aggressively. * - * @tags: [requires_persistence] + * @tags: [ + * requires_fcv_44, # autoSplitVector not present in older v4.2 binaries + * requires_persistence + * ] */ (function() { 'use strict'; @@ -82,7 +85,7 @@ var runCase = function(opts) { // Confirm number of chunks for this stage. var numChunks = getNumberChunks(coll.getFullName()); - assert.gte(numChunks, + assert.lte(numChunks, stage.expectedNumChunks, 'in ' + coll.getFullName() + ' expected ' + stage.expectedNumChunks + ' chunks for stage ' + stageNum + ', but found ' + numChunks + '\nopts: ' + @@ -157,7 +160,7 @@ runCase({ docSize: 510 * 1024, stages: [ {numDocsToInsert: 10, expectedNumChunks: 6}, - {numDocsToInsert: 10, expectedNumChunks: 10}, + {numDocsToInsert: 10, expectedNumChunks: 12}, ], }); @@ -166,7 +169,7 @@ runCase({ docSize: 514 * 1024, stages: [ {numDocsToInsert: 10, expectedNumChunks: 10}, - {numDocsToInsert: 10, expectedNumChunks: 18}, + {numDocsToInsert: 10, expectedNumChunks: 20}, ], }); diff --git a/jstests/sharding/sharding_balance1.js b/jstests/sharding/sharding_balance1.js index d6e0384d7f7..821f6faf58d 100644 --- a/jstests/sharding/sharding_balance1.js +++ b/jstests/sharding/sharding_balance1.js @@ -1,3 +1,9 @@ +/* + * @tags: [ + * requires_fcv_44, # autoSplitVector not present in older v4.2 binaries + * ] + */ + (function() { 'use strict'; @@ -23,7 +29,7 @@ while (inserted < (20 * 1024 * 1024)) { assert.commandWorked(bulk.execute()); assert.commandWorked(s.s0.adminCommand({shardcollection: "test.foo", key: {_id: 1}})); -assert.lt(20, s.config.chunks.count({"ns": "test.foo"}), "setup2"); +assert.lte(20, s.config.chunks.count({"ns": "test.foo"}), "setup2"); function diff1() { var x = s.chunkCounts("foo"); @@ -37,7 +43,7 @@ function sum() { return x[s.shard0.shardName] + x[s.shard1.shardName]; } -assert.lt(20, diff1(), "big differential here"); +assert.lte(20, diff1(), "big differential here"); print(diff1()); assert.soon(function() { diff --git a/jstests/sharding/sharding_rs1.js b/jstests/sharding/sharding_rs1.js index d15f97bafd6..5536505ef07 100644 --- a/jstests/sharding/sharding_rs1.js +++ b/jstests/sharding/sharding_rs1.js @@ -4,7 +4,7 @@ (function() { 'use strict'; -var s = new ShardingTest({shards: 3, other: {rs: true, chunkSize: 1, enableBalancer: true}}); +var s = new ShardingTest({shards: 3, other: {rs: true, chunkSize: 2, enableBalancer: true}}); s.adminCommand({enablesharding: "test"}); s.ensurePrimaryShard('test', s.shard0.shardName); @@ -12,14 +12,14 @@ s.config.settings.update({_id: "balancer"}, {$set: {_waitForDelete: true}}, true var db = s.getDB("test"); -var bigString = "X".repeat(256 * 1024); +var bigString = "X".repeat(256 * 1024); // 250 KB var insertedBytes = 0; var num = 0; -// Insert 10 MB of data to result in 10+ chunks +// Insert 20 MB of data to result in 20 chunks var bulk = db.foo.initializeUnorderedBulkOp(); -while (insertedBytes < (10 * 1024 * 1024)) { +while (insertedBytes < (20 * 1024 * 1024)) { bulk.insert({_id: num++, s: bigString, x: Math.random()}); insertedBytes += bigString.length; } diff --git a/jstests/sharding/write_cmd_auto_split.js b/jstests/sharding/write_cmd_auto_split.js index 197d29ccc90..55796aa1c30 100644 --- a/jstests/sharding/write_cmd_auto_split.js +++ b/jstests/sharding/write_cmd_auto_split.js @@ -14,148 +14,186 @@ assert.commandWorked(configDB.adminCommand({shardCollection: 'test.insert', key: var doc1k = (new Array(1024)).join('x'); var testDB = st.s.getDB('test'); -jsTest.log('Test single batch insert should auto-split'); +function testSingleBatchInsertShouldAutoSplit() { + jsTest.log('Test single batch insert should auto-split'); + + assert.commandWorked(configDB.adminCommand({enableSharding: 'test'})); + assert.commandWorked(configDB.adminCommand({shardCollection: 'test.insert', key: {x: 1}})); + + assert.eq(1, configDB.chunks.find({"ns": "test.insert"}).itcount()); + + // This should result in a little over 3MB inserted into the chunk, so with + // a max chunk size of 1MB we'd expect the autosplitter to split this into + // at least 3 chunks + for (var x = 0; x < 3100; x++) { + assert.commandWorked(testDB.runCommand({ + insert: 'insert', + documents: [{x: x, v: doc1k}], + ordered: false, + writeConcern: {w: 1} + })); + } -assert.eq(1, configDB.chunks.find({"ns": "test.insert"}).itcount()); + waitForOngoingChunkSplits(st); -// This should result in a little over 3MB inserted into the chunk, so with -// a max chunk size of 1MB we'd expect the autosplitter to split this into -// at least 3 chunks -for (var x = 0; x < 3100; x++) { - assert.commandWorked(testDB.runCommand( - {insert: 'insert', documents: [{x: x, v: doc1k}], ordered: false, writeConcern: {w: 1}})); -} + // Inserted batch is a multiple of the chunkSize, expect the chunks to split into + // more than 2. + assert.gt(configDB.chunks.find({"ns": "test.insert"}).itcount(), 2); + testDB.dropDatabase(); -waitForOngoingChunkSplits(st); + jsTest.log('Test single batch update should auto-split'); -// Inserted batch is a multiple of the chunkSize, expect the chunks to split into -// more than 2. -assert.gt(configDB.chunks.find({"ns": "test.insert"}).itcount(), 2); -testDB.dropDatabase(); + assert.commandWorked(configDB.adminCommand({enableSharding: 'test'})); + assert.commandWorked(configDB.adminCommand({shardCollection: 'test.update', key: {x: 1}})); -jsTest.log('Test single batch update should auto-split'); + assert.eq(1, configDB.chunks.find({"ns": "test.update"}).itcount()); -assert.commandWorked(configDB.adminCommand({enableSharding: 'test'})); -assert.commandWorked(configDB.adminCommand({shardCollection: 'test.update', key: {x: 1}})); + for (var x = 0; x < 2100; x++) { + assert.commandWorked(testDB.runCommand({ + update: 'update', + updates: [{q: {x: x}, u: {x: x, v: doc1k}, upsert: true}], + ordered: false, + writeConcern: {w: 1} + })); + } -assert.eq(1, configDB.chunks.find({"ns": "test.update"}).itcount()); + waitForOngoingChunkSplits(st); -for (var x = 0; x < 2100; x++) { - assert.commandWorked(testDB.runCommand({ - update: 'update', - updates: [{q: {x: x}, u: {x: x, v: doc1k}, upsert: true}], - ordered: false, - writeConcern: {w: 1} - })); + assert.gt(configDB.chunks.find({"ns": "test.update"}).itcount(), 1); + testDB.dropDatabase(); } -waitForOngoingChunkSplits(st); +function testSingleDeleteShouldNotAutoSplit() { + jsTest.log('Test single delete should not auto-split'); -assert.gt(configDB.chunks.find({"ns": "test.update"}).itcount(), 1); -testDB.dropDatabase(); + assert.commandWorked(configDB.adminCommand({enableSharding: 'test'})); + assert.commandWorked(configDB.adminCommand({shardCollection: 'test.delete', key: {x: 1}})); -jsTest.log('Test single delete should not auto-split'); + assert.eq(1, configDB.chunks.find({"ns": "test.delete"}).itcount()); -assert.commandWorked(configDB.adminCommand({enableSharding: 'test'})); -assert.commandWorked(configDB.adminCommand({shardCollection: 'test.delete', key: {x: 1}})); + for (var x = 0; x < 1100; x++) { + assert.commandWorked(testDB.runCommand({ + delete: 'delete', + deletes: [{q: {x: x, v: doc1k}, limit: NumberInt(0)}], + ordered: false, + writeConcern: {w: 1} + })); + } -assert.eq(1, configDB.chunks.find({"ns": "test.delete"}).itcount()); + // If we are autosplitting (which we shouldn't be), we want to wait until + // it's finished, otherwise we could falsely think no autosplitting was + // done when really it was just in progress. + waitForOngoingChunkSplits(st); -for (var x = 0; x < 1100; x++) { - assert.commandWorked(testDB.runCommand({ - delete: 'delete', - deletes: [{q: {x: x, v: doc1k}, limit: NumberInt(0)}], - ordered: false, - writeConcern: {w: 1} - })); + assert.eq(1, configDB.chunks.find({"ns": "test.delete"}).itcount()); + testDB.dropDatabase(); } -// If we are autosplitting (which we shouldn't be), we want to wait until -// it's finished, otherwise we could falsely think no autosplitting was -// done when really it was just in progress. -waitForOngoingChunkSplits(st); +function testBatchedInsertShouldAutoSplit() { + jsTest.log('Test batched insert should auto-split'); -assert.eq(1, configDB.chunks.find({"ns": "test.delete"}).itcount()); -testDB.dropDatabase(); + assert.commandWorked(configDB.adminCommand({enableSharding: 'test'})); + assert.commandWorked(configDB.adminCommand({shardCollection: 'test.insert', key: {x: 1}})); -jsTest.log('Test batched insert should auto-split'); + assert.eq(1, configDB.chunks.find({"ns": "test.insert"}).itcount()); -assert.commandWorked(configDB.adminCommand({enableSharding: 'test'})); -assert.commandWorked(configDB.adminCommand({shardCollection: 'test.insert', key: {x: 1}})); + // Note: Estimated 'chunk size' tracked by mongos is initialized with a random value so + // we are going to be conservative. + for (var x = 0; x < 2100; x += 400) { + var docs = []; -assert.eq(1, configDB.chunks.find({"ns": "test.insert"}).itcount()); + for (var y = 0; y < 400; y++) { + docs.push({x: (x + y), v: doc1k}); + } -// Note: Estimated 'chunk size' tracked by mongos is initialized with a random value so -// we are going to be conservative. -for (var x = 0; x < 2100; x += 400) { - var docs = []; - - for (var y = 0; y < 400; y++) { - docs.push({x: (x + y), v: doc1k}); + assert.commandWorked(testDB.runCommand( + {insert: 'insert', documents: docs, ordered: false, writeConcern: {w: 1}})); } - assert.commandWorked(testDB.runCommand( - {insert: 'insert', documents: docs, ordered: false, writeConcern: {w: 1}})); -} + waitForOngoingChunkSplits(st); -waitForOngoingChunkSplits(st); + assert.gt(configDB.chunks.find({"ns": "test.insert"}).itcount(), 1); + testDB.dropDatabase(); +} -assert.gt(configDB.chunks.find({"ns": "test.insert"}).itcount(), 1); -testDB.dropDatabase(); +function testBatchedUpdateShouldAutoSplit() { + jsTest.log('Test batched update should auto-split'); -jsTest.log('Test batched update should auto-split'); + assert.commandWorked(configDB.adminCommand({enableSharding: 'test'})); + assert.commandWorked(configDB.adminCommand({shardCollection: 'test.update', key: {x: 1}})); -assert.commandWorked(configDB.adminCommand({enableSharding: 'test'})); -assert.commandWorked(configDB.adminCommand({shardCollection: 'test.update', key: {x: 1}})); + assert.eq(1, configDB.chunks.find({"ns": "test.update"}).itcount()); -assert.eq(1, configDB.chunks.find({"ns": "test.update"}).itcount()); + for (var x = 0; x < 2100; x += 400) { + var docs = []; -for (var x = 0; x < 2100; x += 400) { - var docs = []; + for (var y = 0; y < 400; y++) { + var id = x + y; + docs.push({q: {x: id}, u: {x: id, v: doc1k}, upsert: true}); + } - for (var y = 0; y < 400; y++) { - var id = x + y; - docs.push({q: {x: id}, u: {x: id, v: doc1k}, upsert: true}); + assert.commandWorked(testDB.runCommand( + {update: 'update', updates: docs, ordered: false, writeConcern: {w: 1}})); } - assert.commandWorked( - testDB.runCommand({update: 'update', updates: docs, ordered: false, writeConcern: {w: 1}})); -} + waitForOngoingChunkSplits(st); -waitForOngoingChunkSplits(st); + assert.gt(configDB.chunks.find({"ns": "test.update"}).itcount(), 1); + testDB.dropDatabase(); +} -assert.gt(configDB.chunks.find({"ns": "test.update"}).itcount(), 1); -testDB.dropDatabase(); +function testBatchedDeleteShouldNotAutoSplit() { + jsTest.log('Test batched delete should not auto-split'); -jsTest.log('Test batched delete should not auto-split'); + assert.commandWorked(configDB.adminCommand({enableSharding: 'test'})); + assert.commandWorked(configDB.adminCommand({shardCollection: 'test.delete', key: {x: 1}})); -assert.commandWorked(configDB.adminCommand({enableSharding: 'test'})); -assert.commandWorked(configDB.adminCommand({shardCollection: 'test.delete', key: {x: 1}})); + assert.eq(1, configDB.chunks.find({"ns": "test.delete"}).itcount()); -assert.eq(1, configDB.chunks.find({"ns": "test.delete"}).itcount()); + for (var x = 0; x < 2100; x += 400) { + var docs = []; -for (var x = 0; x < 2100; x += 400) { - var docs = []; + for (var y = 0; y < 400; y++) { + var id = x + y; + docs.push({q: {x: id, v: doc1k}, top: 0}); + } - for (var y = 0; y < 400; y++) { - var id = x + y; - docs.push({q: {x: id, v: doc1k}, top: 0}); + assert.commandWorked(testDB.runCommand({ + delete: 'delete', + deletes: [{q: {x: x, v: doc1k}, limit: NumberInt(0)}], + ordered: false, + writeConcern: {w: 1} + })); } - assert.commandWorked(testDB.runCommand({ - delete: 'delete', - deletes: [{q: {x: x, v: doc1k}, limit: NumberInt(0)}], - ordered: false, - writeConcern: {w: 1} - })); -} + // If we are autosplitting (which we shouldn't be), we want to wait until + // it's finished, otherwise we could falsely think no autosplitting was + // done when really it was just in progress. + waitForOngoingChunkSplits(st); -// If we are autosplitting (which we shouldn't be), we want to wait until -// it's finished, otherwise we could falsely think no autosplitting was -// done when really it was just in progress. -waitForOngoingChunkSplits(st); + assert.eq(1, configDB.chunks.find({"ns": "test.delete"}).itcount()); +} -assert.eq(1, configDB.chunks.find({"ns": "test.delete"}).itcount()); +var testCases = [ + testSingleBatchInsertShouldAutoSplit, + testSingleDeleteShouldNotAutoSplit, + testBatchedInsertShouldAutoSplit, + testBatchedUpdateShouldAutoSplit, + testBatchedDeleteShouldNotAutoSplit +]; + +for (let testCase of testCases) { + try { + testDB.dropDatabase(); + testCase(); + } catch (e) { + print("Retrying test case failed due to " + e); + // (SERVER-59882) The split may not have happened due to write-unit-of-work commit delay + // Give it another best-effort try, given the low probability it would happen again + testDB.dropDatabase(); + testCase(); + } +} st.stop(); })(); diff --git a/src/mongo/db/s/SConscript b/src/mongo/db/s/SConscript index 14741dcafa7..6c55a3c8e02 100644 --- a/src/mongo/db/s/SConscript +++ b/src/mongo/db/s/SConscript @@ -46,6 +46,7 @@ env.Library( 'active_migrations_registry.cpp', 'active_move_primaries_registry.cpp', 'active_shard_collection_registry.cpp', + 'auto_split_vector.cpp', 'chunk_move_write_concern_options.cpp', 'chunk_splitter.cpp', 'collection_sharding_runtime.cpp', @@ -263,6 +264,7 @@ env.Library( target='sharding_commands_d', source=[ 'add_shard_cmd.cpp', + 'auto_split_vector_command.cpp', 'check_sharding_index_command.cpp', 'cleanup_orphaned_cmd.cpp', 'clone_catalog_data_command.cpp', @@ -405,6 +407,7 @@ env.CppUnitTest( 'active_migrations_registry_test.cpp', 'active_move_primaries_registry_test.cpp', 'active_shard_collection_registry_test.cpp', + 'auto_split_vector_test.cpp', 'catalog_cache_loader_mock.cpp', 'implicit_create_collection_test.cpp', 'migration_chunk_cloner_source_legacy_test.cpp', diff --git a/src/mongo/db/s/auto_split_vector.cpp b/src/mongo/db/s/auto_split_vector.cpp new file mode 100644 index 00000000000..7569142fde0 --- /dev/null +++ b/src/mongo/db/s/auto_split_vector.cpp @@ -0,0 +1,368 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding + +#include "mongo/platform/basic.h" + +#include "mongo/db/s/auto_split_vector.h" + +#include "mongo/base/status_with.h" +#include "mongo/db/bson/dotted_path_support.h" +#include "mongo/db/catalog/index_catalog.h" +#include "mongo/db/catalog_raii.h" +#include "mongo/db/dbhelpers.h" +#include "mongo/db/exec/working_set_common.h" +#include "mongo/db/index/index_descriptor.h" +#include "mongo/db/keypattern.h" +#include "mongo/db/namespace_string.h" +#include "mongo/db/query/internal_plans.h" +#include "mongo/db/query/plan_executor.h" +#include "mongo/logv2/log.h" + +namespace mongo { +namespace { + +constexpr int estimatedAdditionalBytesPerItemInBSONArray{2}; + +constexpr int kMaxSplitPointsToReposition{3}; + +BSONObj prettyKey(const BSONObj& keyPattern, const BSONObj& key) { + return key.replaceFieldNames(keyPattern).clientReadable(); +} + +/* + * Takes the given min/max BSON objects that are a prefix of the shardKey and return two new BSON + * object extended to cover the entire shardKey. See KeyPattern::extendRangeBound documentation for + * some examples. + */ +const std::tuple<BSONObj, BSONObj> getMinMaxExtendedBounds(const IndexDescriptor* shardKeyIdx, + const BSONObj& min, + const BSONObj& max) { + KeyPattern kp(shardKeyIdx->keyPattern()); + + // Extend min to get (min, MinKey, MinKey, ....) + BSONObj minKey = Helpers::toKeyFormat(kp.extendRangeBound(min, false /* upperInclusive */)); + BSONObj maxKey; + if (max.isEmpty()) { + // if max not specified, make it (MaxKey, Maxkey, MaxKey...) + maxKey = Helpers::toKeyFormat(kp.extendRangeBound(max, true /* upperInclusive */)); + } else { + // otherwise make it (max,MinKey,MinKey...) so that bound is non-inclusive + maxKey = Helpers::toKeyFormat(kp.extendRangeBound(max, false /* upperInclusive*/)); + } + + return {minKey, maxKey}; +} + +/* + * Returns true if the final key in the range is the same as the first key, false otherwise. + */ +bool maxKeyEqualToMinKey(OperationContext* opCtx, + const Collection* collection, + const IndexDescriptor* shardKeyIdx, + const BSONObj& minBound, + const BSONObj& maxBound, + const BSONObj& minKeyInChunk) { + BSONObj maxKeyInChunk; + { + auto backwardIdxScanner = InternalPlanner::indexScan(opCtx, + collection, + shardKeyIdx, + maxBound, + minBound, + BoundInclusion::kIncludeEndKeyOnly, + PlanExecutor::YIELD_AUTO, + InternalPlanner::BACKWARD); + + PlanExecutor::ExecState state = backwardIdxScanner->getNext(&maxKeyInChunk, nullptr); + uassert(ErrorCodes::OperationFailed, + "can't open a cursor to find final key in range (desired range is possibly empty)", + state == PlanExecutor::ADVANCED); + } + + if (minKeyInChunk.woCompare(maxKeyInChunk) == 0) { + // Range contains only documents with a single key value. So we cannot possibly find a + // split point, and there is no need to scan any further. + LOGV2_WARNING( + 5865001, + "Possible low cardinality key detected in range. Range contains only a single key.", + "namespace"_attr = collection->ns(), + "minKey"_attr = redact(prettyKey(shardKeyIdx->keyPattern(), minBound)), + "maxKey"_attr = redact(prettyKey(shardKeyIdx->keyPattern(), maxBound)), + "key"_attr = redact(prettyKey(shardKeyIdx->keyPattern(), minKeyInChunk))); + return true; + } + + return false; +} + +/* + * Reshuffle fields according to the shard key pattern. + */ +auto orderShardKeyFields(const BSONObj& keyPattern, BSONObj& key) { + return dotted_path_support::extractElementsBasedOnTemplate( + prettyKey(keyPattern, key.getOwned()), keyPattern); +} + +} // namespace + +std::vector<BSONObj> autoSplitVector(OperationContext* opCtx, + const NamespaceString& nss, + const BSONObj& keyPattern, + const BSONObj& min, + const BSONObj& max, + long long maxChunkSizeBytes) { + std::vector<BSONObj> splitKeys; + + int elapsedMillisToFindSplitPoints; + + // Contains each key appearing multiple times and estimated to be able to fill-in a chunk alone + auto tooFrequentKeys = SimpleBSONObjComparator::kInstance.makeBSONObjSet(); + + { + AutoGetCollection autoGetColl(opCtx, nss, MODE_IS); + auto* collection = autoGetColl.getCollection(); + + uassert(ErrorCodes::NamespaceNotFound, "ns not found", collection); + + // Get the size estimate for this namespace + const long long totalLocalCollDocuments = collection->numRecords(opCtx); + const long long dataSize = collection->dataSize(opCtx); + + // Return empty vector if current estimated data size is less than max chunk size + if (dataSize < maxChunkSizeBytes || totalLocalCollDocuments == 0) { + return {}; + } + + // Allow multiKey based on the invariant that shard keys must be single-valued. Therefore, + // any multi-key index prefixed by shard key cannot be multikey over the shard key fields. + auto catalog = collection->getIndexCatalog(); + auto shardKeyIdx = + catalog->findShardKeyPrefixedIndex(opCtx, keyPattern, /*requireSingleKey=*/false); + uassert(ErrorCodes::IndexNotFound, + str::stream() << "couldn't find index over splitting key " + << keyPattern.clientReadable().toString(), + shardKeyIdx); + + const auto [minKey, maxKey] = getMinMaxExtendedBounds(shardKeyIdx, min, max); + + // Setup the index scanner that will be used to find the split points + auto forwardIdxScanner = InternalPlanner::indexScan(opCtx, + collection, + shardKeyIdx, + minKey, + maxKey, + BoundInclusion::kIncludeStartKeyOnly, + PlanExecutor::YIELD_AUTO, + InternalPlanner::FORWARD); + + // Get minimum key belonging to the chunk + BSONObj minKeyInOriginalChunk; + { + PlanExecutor::ExecState state = + forwardIdxScanner->getNext(&minKeyInOriginalChunk, nullptr); + uassert(ErrorCodes::OperationFailed, + "can't open a cursor to scan the range (desired range is possibly empty)", + state == PlanExecutor::ADVANCED); + } + + // Return empty vector if chunk's min and max keys are the same. + if (maxKeyEqualToMinKey( + opCtx, collection, shardKeyIdx, minKey, maxKey, minKeyInOriginalChunk)) { + return {}; + } + + LOGV2(5865000, + "Requested split points lookup for chunk", + "namespace"_attr = nss, + "minKey"_attr = redact(prettyKey(keyPattern, minKey)), + "maxKey"_attr = redact(prettyKey(keyPattern, maxKey))); + + // Use the average document size and number of documents to find the approximate number of + // keys each chunk should contain + const long long avgDocSize = dataSize / totalLocalCollDocuments; + + // Split at max chunk size + long long maxDocsPerChunk = maxChunkSizeBytes / avgDocSize; + + BSONObj currentKey; // Last key seen during the index scan + long long numScannedKeys = 1; // minKeyInOriginalChunk has already been scanned + std::size_t resultArraySize = 0; // Approximate size in bytes of the split points array + bool reachedMaxBSONSize = false; // True if the split points vector becomes too big + + // Lambda to check whether the split points vector would exceed BSONObjMaxUserSize in case + // of additional split key of the specified size. + auto checkMaxBSONSize = [&resultArraySize](const int additionalKeySize) { + return resultArraySize + additionalKeySize > BSONObjMaxUserSize; + }; + + // Reference to last split point that needs to be checked in order to avoid adding duplicate + // split points. Initialized to the min of the first chunk being split. + auto minKeyElement = orderShardKeyFields(keyPattern, minKeyInOriginalChunk); + auto lastSplitPoint = minKeyElement; + + Timer timer; // To measure time elapsed while searching split points + + // Traverse the index and add the maxDocsPerChunk-th key to the result vector + while (forwardIdxScanner->getNext(¤tKey, nullptr) == PlanExecutor::ADVANCED) { + if (++numScannedKeys >= maxDocsPerChunk) { + currentKey = orderShardKeyFields(keyPattern, currentKey); + + if (currentKey.woCompare(lastSplitPoint) == 0) { + // Do not add again the same split point in case of frequent shard key. + tooFrequentKeys.insert(currentKey.getOwned()); + continue; + } + + const auto additionalKeySize = + currentKey.objsize() + estimatedAdditionalBytesPerItemInBSONArray; + if (checkMaxBSONSize(additionalKeySize)) { + if (splitKeys.empty()) { + // Keep trying until finding at least one split point that isn't above + // the max object user size. Very improbable corner case: the shard key + // size for the chosen split point is exactly 16MB. + continue; + } + reachedMaxBSONSize = true; + break; + } + + resultArraySize += additionalKeySize; + splitKeys.push_back(currentKey.getOwned()); + lastSplitPoint = splitKeys.back(); + numScannedKeys = 0; + + LOGV2_DEBUG(5865003, 4, "Picked a split key", "key"_attr = redact(currentKey)); + } + } + + // Avoid creating small chunks by fairly recalculating the last split points if the last + // chunk would be too small (containing less than `80% maxDocsPerChunk` documents). + bool lastChunk80PercentFull = numScannedKeys >= maxDocsPerChunk * 0.8; + if (!lastChunk80PercentFull && !splitKeys.empty() && !reachedMaxBSONSize) { + // Eventually recalculate the last split points (at most `kMaxSplitPointsToReposition`). + int nSplitPointsToReposition = splitKeys.size() > kMaxSplitPointsToReposition + ? kMaxSplitPointsToReposition + : splitKeys.size(); + + // Equivalent to: (nSplitPointsToReposition * maxDocsPerChunk + numScannedKeys) divided + // by the number of reshuffled chunks (nSplitPointsToReposition + 1). + const auto maxDocsPerNewChunk = maxDocsPerChunk - + ((maxDocsPerChunk - numScannedKeys) / (nSplitPointsToReposition + 1)); + + if (numScannedKeys < maxDocsPerChunk - maxDocsPerNewChunk) { + // If the surplus is not too much, simply keep a bigger last chunk. + // The surplus is considered enough if repositioning the split points would imply + // generating chunks with a number of documents lower than `67% maxDocsPerChunk`. + splitKeys.pop_back(); + } else { + // Fairly recalculate the last `nSplitPointsToReposition` split points. + splitKeys.erase(splitKeys.end() - nSplitPointsToReposition, splitKeys.end()); + + auto forwardIdxScanner = + InternalPlanner::indexScan(opCtx, + collection, + shardKeyIdx, + splitKeys.empty() ? minKeyElement : splitKeys.back(), + maxKey, + BoundInclusion::kIncludeStartKeyOnly, + PlanExecutor::YIELD_AUTO, + InternalPlanner::FORWARD); + + numScannedKeys = 0; + + auto previousSplitPoint = splitKeys.empty() ? minKeyElement : splitKeys.back(); + while (forwardIdxScanner->getNext(¤tKey, nullptr) == PlanExecutor::ADVANCED) { + if (++numScannedKeys >= maxDocsPerNewChunk) { + currentKey = orderShardKeyFields(keyPattern, currentKey); + + const auto compareWithPreviousSplitPoint = + currentKey.woCompare(previousSplitPoint); + if (compareWithPreviousSplitPoint > 0) { + const auto additionalKeySize = + currentKey.objsize() + estimatedAdditionalBytesPerItemInBSONArray; + if (checkMaxBSONSize(additionalKeySize)) { + reachedMaxBSONSize = true; + break; + } + + splitKeys.push_back(currentKey.getOwned()); + previousSplitPoint = splitKeys.back(); + numScannedKeys = 0; + + if (--nSplitPointsToReposition == 0) { + break; + } + } else if (compareWithPreviousSplitPoint == 0) { + // Don't add again the same split point in case of frequent shard key. + tooFrequentKeys.insert(currentKey.getOwned()); + } + } + } + } + } + + elapsedMillisToFindSplitPoints = timer.millis(); + + if (reachedMaxBSONSize) { + LOGV2(5865002, + "Max BSON response size reached for split vector before the end of chunk", + "namespace"_attr = nss, + "minKey"_attr = redact(prettyKey(shardKeyIdx->keyPattern(), minKey)), + "maxKey"_attr = redact(prettyKey(shardKeyIdx->keyPattern(), maxKey))); + } + } + + // Emit a warning for each frequent key + for (const auto& frequentKey : tooFrequentKeys) { + LOGV2_WARNING(5865004, + "Possible low cardinality key detected", + "namespace"_attr = nss, + "key"_attr = redact(prettyKey(keyPattern, frequentKey))); + } + + if (elapsedMillisToFindSplitPoints > serverGlobalParams.slowMS) { + LOGV2_WARNING(5865005, + "Finding the auto split vector completed", + "namespace"_attr = nss, + "keyPattern"_attr = redact(keyPattern), + "numSplits"_attr = splitKeys.size(), + "duration"_attr = Milliseconds(elapsedMillisToFindSplitPoints)); + } + + // TODO SERVER-58750: investigate if it is really needed to sort the vector + // Make sure splitKeys is in ascending order + std::sort( + splitKeys.begin(), splitKeys.end(), SimpleBSONObjComparator::kInstance.makeLessThan()); + + return splitKeys; +} + +} // namespace mongo diff --git a/src/mongo/db/s/auto_split_vector.h b/src/mongo/db/s/auto_split_vector.h new file mode 100644 index 00000000000..559c1f814d6 --- /dev/null +++ b/src/mongo/db/s/auto_split_vector.h @@ -0,0 +1,134 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/namespace_string.h" +#include "mongo/db/operation_context.h" + +namespace mongo { + +/** + * Given a chunk, determines whether it satisfies the requisites to be auto-splitted and - if so - + * returns the split points (shard keys representing the lower bounds of the new chunks to create). + * + * The logic implemented can be summarized as follows: given a `maxChunkSize` of `x` MB, the + * algorithm aims to choose the split points so that the resulting chunks' size would be around + * `maxChunkSize`. As it is too expensive to precisely determine the dimension of a chunk, it is + * assumed a uniform distribution of document sizes, hence the aim is to balance the number of + * documents per chunk. + * + * ======= ALGORITHM DESCRIPTION ======= + * + * The split points for a chunk `C` belonging to a collection `coll` are calculated as follows: + * - `averageDocumentSize` = `totalCollSizeOnShard / numberOfCollDocs` + * - `maxNumberOfDocsPerChunk` = `maxChunkSize / averageDocumentSize` + * - Scan forward the shard key index entries for `coll` that are belonging to chunk `C`: + * - (1) Choose a split point every `maxNumberOfDocsPerChunk` scanned keys. + * - (2) As it needs to be avoided the creation of small chunks, consider the number of documents + * `S` that the right-most chunk would contain given the calculated split points: + * --- (2.1) IF `S >= 80% maxNumberOfDocsPerChunk`, return the list of calculated split points. + * --- (2.2) ELSE IF `S` documents could be fairly redistributed in the last chunks so that their + * size would be at least `67% maxNumberOfDocsPerChunk`: recalculate the last split points (max 3). + * --- (2.3) ELSE simply remove the last split point and keep a bigger last chunk. + * + * + * ============== EXAMPLES ============= + * + * ========= EXAMPLE (CASE 2.1) ======== + * `maxChunkSize` = 100MB + * `averageDocumentSize` = 1MB + * `maxNumberOfDocsPerChunk` = 100 + * + * Shard key type: integer + * Chunk `C` bounds: `[0, maxKey)` . Chunk `C` contains 190 documents with shard keys [0...189]. + * + * (1) Initially calculated split points: [99]. + * (2) The last chunk would contain the interval `[99-189]` so `S = 90` + * (2.1) `S >= 80% maxChunkSize`, so keep the current split points. + * + * Returned split points: [99]. + * + * ========= EXAMPLE (CASE 2.2) ======== + * `maxChunkSize` = 100MB + * `averageDocumentSize` = 1MB + * `maxNumberOfDocsPerChunk` = 100 + * + * Shard key type: integer + * Chunk `C` bounds: `[0, maxKey)` . Chunk `C` contains 140 documents with shard keys [0...139]. + * + * (1) Initially calculated split points: [99]. + * (2) The last chunk would contain the interval `[99-139]` so `S = 40` + * (2.2) `S` documents can be redistributed on the last split point by generating chunks of size >= + * 67% maxChunkSize. Recalculate. + * + * Returned split points: [69]. + * + * ========= EXAMPLE (CASE 2.3) ======== + * `maxChunkSize` = 100MB + * `averageDocumentSize` = 1MB + * `maxNumberOfDocsPerChunk` = 100 + * + * Shard key type: integer + * Chunk `C` bounds: `[0, maxKey)` . Chunk `C` contains 120 documents with shard keys [0...119]. + * + * (1) Initially calculated split points: [99]. + * (2) The last chunk would contain the interval `[99-119]` so `S = 20` + * (2.3) `S` documents can't be redistributed on the last split point by generating chunks of size + * >= 67% maxChunkSize. So remove the last split point. + * + * Returned split points: []. + */ +std::vector<BSONObj> autoSplitVector(OperationContext* opCtx, + const NamespaceString& nss, + const BSONObj& keyPattern, + const BSONObj& min, + const BSONObj& max, + long long maxChunkSizeBytes); + +/* + * Utility function for deserializing autoSplitVector/splitVector responses. + */ +static std::vector<BSONObj> parseSplitKeys(const BSONElement& splitKeysArray) { + uassert(ErrorCodes::TypeMismatch, + "The split keys vector must be represented as a BSON array", + !splitKeysArray.eoo() && splitKeysArray.type() == BSONType::Array); + + std::vector<BSONObj> splitKeys; + for (const auto& elem : splitKeysArray.Obj()) { + uassert(ErrorCodes::TypeMismatch, + "Each element of the split keys array must be an object", + elem.type() == BSONType::Object); + splitKeys.push_back(elem.embeddedObject().getOwned()); + } + + return splitKeys; +} + +} // namespace mongo diff --git a/src/mongo/db/s/auto_split_vector_command.cpp b/src/mongo/db/s/auto_split_vector_command.cpp new file mode 100644 index 00000000000..32275403eff --- /dev/null +++ b/src/mongo/db/s/auto_split_vector_command.cpp @@ -0,0 +1,100 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding + +#include "mongo/db/auth/authorization_session.h" +#include "mongo/db/commands.h" +#include "mongo/db/s/auto_split_vector.h" +#include "mongo/db/s/sharding_state.h" +#include "mongo/logv2/log.h" +#include "mongo/s/request_types/auto_split_vector_gen.h" + +namespace mongo { +namespace { + +class AutoSplitVectorCommand final : public TypedCommand<AutoSplitVectorCommand> { +public: + AllowedOnSecondary secondaryAllowed(ServiceContext*) const override { + return Command::AllowedOnSecondary::kNever; + } + + std::string help() const override { + return "Internal command returning the split points for a chunk, given the maximum chunk " + "size."; + } + + using Request = AutoSplitVectorRequest; + using Response = AutoSplitVectorResponse; + + class Invocation final : public InvocationBase { + public: + using InvocationBase::InvocationBase; + + Response typedRun(OperationContext* opCtx) { + uassert(ErrorCodes::IllegalOperation, + "The autoSplitVector command can only be invoked on shards (no CSRS).", + serverGlobalParams.clusterRole == ClusterRole::ShardServer); + + uassertStatusOK(ShardingState::get(opCtx)->canAcceptShardedCommands()); + opCtx->setAlwaysInterruptAtStepDownOrUp(); + + const auto& req = request(); + + auto splitKeys = autoSplitVector(opCtx, + ns(), + req.getKeyPattern(), + req.getMin(), + req.getMax(), + req.getMaxChunkSizeBytes()); + + return splitKeys; + } + + private: + NamespaceString ns() const override { + return request().getNamespace(); + } + + bool supportsWriteConcern() const override { + return false; + } + + void doCheckAuthorization(OperationContext* opCtx) const override { + uassert(ErrorCodes::Unauthorized, + "Unauthorized", + AuthorizationSession::get(opCtx->getClient()) + ->isAuthorizedForActionsOnResource(ResourcePattern::forClusterResource(), + ActionType::splitVector)); + } + }; +} autoSplitVectorCommand; + +} // namespace +} // namespace mongo diff --git a/src/mongo/db/s/auto_split_vector_test.cpp b/src/mongo/db/s/auto_split_vector_test.cpp new file mode 100644 index 00000000000..18c2cceab38 --- /dev/null +++ b/src/mongo/db/s/auto_split_vector_test.cpp @@ -0,0 +1,252 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kTest + +#include "mongo/db/catalog/create_collection.h" +#include "mongo/db/db_raii.h" +#include "mongo/db/dbdirectclient.h" +#include "mongo/db/s/auto_split_vector.h" +#include "mongo/db/s/collection_sharding_runtime.h" +#include "mongo/db/s/operation_sharding_state.h" +#include "mongo/db/s/split_vector.h" +#include "mongo/logv2/log.h" +#include "mongo/platform/random.h" +#include "mongo/s/shard_server_test_fixture.h" + +namespace mongo { +namespace { + +const NamespaceString kNss = NamespaceString("autosplitDB", "coll"); +const std::string kPattern = "_id"; + +/* + * Call the autoSplitVector function of the test collection on a chunk with bounds [0, 100) and with + * the specified `maxChunkSizeMB`. + */ +std::vector<BSONObj> autoSplit(OperationContext* opCtx, int maxChunkSizeMB) { + return autoSplitVector(opCtx, + kNss, + BSON(kPattern << 1) /* shard key pattern */, + BSON(kPattern << 0) /* min */, + BSON(kPattern << 1000) /* max */, + maxChunkSizeMB * 1024 * 1024 /* max chunk size in bytes*/); +} + +class AutoSplitVectorTest : public ShardServerTestFixture { +public: + /* + * Before each test case: + * - Creates a sharded collection with shard key `_id` + */ + void setUp() { + ShardServerTestFixture::setUp(); + + auto opCtx = operationContext(); + + uassertStatusOK(createCollection( + operationContext(), kNss.db().toString(), BSON("create" << kNss.coll()))); + + DBDirectClient client(opCtx); + client.createIndex(kNss.ns(), BSON(kPattern << 1)); + } + + /* + * Insert the specified number of documents in the test collection, with incremental shard key + * `_id` starting from `_nextShardKey`. + */ + void insertNDocsOf1MB(OperationContext* opCtx, int nDocs) { + DBDirectClient client(opCtx); + + std::string s(1024 * 1024 - 24, 'a'); // To get a 1MB document + for (int i = 0; i < nDocs; i++) { + BSONObjBuilder builder; + builder.append(kPattern, _nextShardKey++); + builder.append("str", s); + BSONObj obj = builder.obj(); + ASSERT(obj.objsize() == 1024 * 1024); // 1 MB document + client.insert(kNss.toString(), obj); + } + } + + /* + * Get the number of documents inserted until now. + */ + int getInsertedSize() { + return _nextShardKey; + } + +private: + int _nextShardKey = 0; +}; + +class AutoSplitVectorTest10MB : public AutoSplitVectorTest { + /* + * Before each test case: + * - Creates a sharded collection with shard key `_id` + * - Inserts `10` documents of ~1MB size (shard keys [0...9]) + */ + void setUp() { + AutoSplitVectorTest::setUp(); + + auto opCtx = operationContext(); + + DBDirectClient client(opCtx); + client.createIndex(kNss.ns(), BSON(kPattern << 1)); + + insertNDocsOf1MB(opCtx, 10 /* nDocs */); + ASSERT_EQUALS(10, client.count(kNss)); + } +}; + +// Throw exception upon calling autoSplitVector on dropped/unexisting collection +TEST_F(AutoSplitVectorTest10MB, NoCollection) { + ASSERT_THROWS_CODE(autoSplitVector(operationContext(), + NamespaceString("dummy", "collection"), + BSON(kPattern << 1) /* shard key pattern */, + BSON(kPattern << 0) /* min */, + BSON(kPattern << 100) /* max */, + 1 * 1024 * 1024 /* max chunk size in bytes*/), + DBException, + ErrorCodes::NamespaceNotFound); +} + +// No split points if estimated `data size < max chunk size` +TEST_F(AutoSplitVectorTest10MB, NoSplitIfDataLessThanMaxChunkSize) { + std::vector<BSONObj> splitKeys = autoSplit(operationContext(), 11 /* maxChunkSizeMB */); + ASSERT_EQ(splitKeys.size(), 0); +} + +// Do not split in case of `chunk size == maxChunkSize` +TEST_F(AutoSplitVectorTest10MB, NoSplitIfDataEqualMaxChunkSize) { + std::vector<BSONObj> splitKeys = autoSplit(operationContext(), 10 /* maxChunkSizeMB */); + ASSERT_EQ(splitKeys.size(), 0); +} + +// No split points if `chunk size > max chunk size` but threshold not reached +TEST_F(AutoSplitVectorTest10MB, NoSplitIfDataLessThanThreshold) { + const auto surplus = 2; + { + // Increase collection size so that the auto splitter can actually be triggered. Use a + // different range to don't interfere with the chunk getting splitted. + insertNDocsOf1MB(operationContext(), surplus /* nDocs */); + } + std::vector<BSONObj> splitKeys = autoSplit(operationContext(), 10 /* maxChunkSizeMB */); + ASSERT_EQ(splitKeys.size(), 0); +} + +// One split point if `chunk size > max chunk size` and threshold reached +TEST_F(AutoSplitVectorTest10MB, SplitIfDataSlightlyMoreThanThreshold) { + const auto surplus = 4; + insertNDocsOf1MB(operationContext(), surplus /* nDocs */); + std::vector<BSONObj> splitKeys = autoSplit(operationContext(), 10 /* maxChunkSizeMB */); + ASSERT_EQ(splitKeys.size(), 1); + ASSERT_EQ(6, splitKeys.front().getIntField(kPattern)); +} + +// Split points if `data size > max chunk size * 2` and threshold reached +TEST_F(AutoSplitVectorTest10MB, SplitIfDataMoreThanThreshold) { + const auto surplus = 14; + insertNDocsOf1MB(operationContext(), surplus /* nDocs */); + std::vector<BSONObj> splitKeys = autoSplit(operationContext(), 10 /* maxChunkSizeMB */); + ASSERT_EQ(splitKeys.size(), 2); + ASSERT_EQ(7, splitKeys.front().getIntField(kPattern)); + ASSERT_EQ(15, splitKeys.back().getIntField(kPattern)); +} + +// Split points are not recalculated if the right-most chunk is at least `80% maxChunkSize` +TEST_F(AutoSplitVectorTest10MB, NoRecalculateIfBigLastChunk) { + const auto surplus = 8; + insertNDocsOf1MB(operationContext(), surplus /* nDocs */); + std::vector<BSONObj> splitKeys = autoSplit(operationContext(), 10 /* maxChunkSizeMB */); + ASSERT_EQ(splitKeys.size(), 1); + ASSERT_EQ(9, splitKeys.front().getIntField(kPattern)); +} + +class RepositionLastSplitPointsTest : public AutoSplitVectorTest { +public: + /* + * Tests that last split points are properly repositioned in case the surplus allows so or not + * repositioned otherwise. + */ + void checkRepositioning(int maxDocsPerChunk, int surplus, int nSplitPoints) { + ASSERT(surplus >= 0 && surplus < maxDocsPerChunk); + + const auto maxDocsPerNewChunk = + maxDocsPerChunk - ((maxDocsPerChunk - surplus) / (nSplitPoints + 1)); + bool mustReposition = + surplus >= maxDocsPerChunk - maxDocsPerNewChunk && surplus < maxDocsPerChunk * 0.8; + + int toInsert = (maxDocsPerChunk * nSplitPoints) - getInsertedSize() + surplus; + insertNDocsOf1MB(operationContext(), toInsert); + + int expectedChunkSize = + mustReposition ? getInsertedSize() / (nSplitPoints + 1) : maxDocsPerChunk; + std::vector<BSONObj> splitKeys = + autoSplit(operationContext(), maxDocsPerChunk /* maxChunkSizeMB */); + + int approximateNextMin = expectedChunkSize; + for (const auto& splitKey : splitKeys) { + int _id = splitKey.getIntField(kPattern); + // Expect an approximate match due to integers rounding in the split points algorithm. + ASSERT(_id >= approximateNextMin - 2 && _id <= approximateNextMin + 2) << BSON( + "approximateNextMin" + << approximateNextMin << "splitKeys" << splitKeys << "maxDocsPerChunk" + << maxDocsPerChunk << "surplus" << surplus << "nSplitPoints" << nSplitPoints + << "maxDocsPerNewChunk" << maxDocsPerNewChunk << "mustReposition" << mustReposition + << "toInsert" << toInsert << "expectedChunkSize" << expectedChunkSize); + approximateNextMin = _id + expectedChunkSize; + } + } +}; + + +// Test that last split points are recalculated fairly (if the surplus allows so) +TEST_F(RepositionLastSplitPointsTest, RandomRepositioningTest) { + PseudoRandom random(SecureRandom().nextInt64()); + // Avoid small sizes already checked in other test cases. + // Random maxDocsPerChunk in interval: [10, 110). + int maxDocsPerChunk = random.nextInt32(100) + 10; + // Random surplus in interval: [0, maxDocsPerChunk). + int surplus = random.nextInt32(maxDocsPerChunk); + + LOGV2(6000900, + "RandomRepositioningTest parameters", + "maxDocsPerChunk"_attr = maxDocsPerChunk, + "surplus"_attr = surplus); + + for (int nSplitPointsToReposition = 1; nSplitPointsToReposition < 4; + nSplitPointsToReposition++) { + checkRepositioning(maxDocsPerChunk, surplus, nSplitPointsToReposition); + } +} + + +} // namespace +} // namespace mongo diff --git a/src/mongo/db/s/chunk_splitter.cpp b/src/mongo/db/s/chunk_splitter.cpp index 298c6181568..8bab73b7047 100644 --- a/src/mongo/db/s/chunk_splitter.cpp +++ b/src/mongo/db/s/chunk_splitter.cpp @@ -38,11 +38,11 @@ #include "mongo/db/client.h" #include "mongo/db/dbdirectclient.h" #include "mongo/db/namespace_string.h" +#include "mongo/db/s/auto_split_vector.h" #include "mongo/db/s/chunk_split_state_driver.h" #include "mongo/db/s/shard_filtering_metadata_refresh.h" #include "mongo/db/s/sharding_state.h" #include "mongo/db/s/split_chunk.h" -#include "mongo/db/s/split_vector.h" #include "mongo/db/service_context.h" #include "mongo/logv2/log.h" #include "mongo/s/balancer_configuration.h" @@ -338,15 +338,13 @@ void ChunkSplitter::_runAutosplit(std::shared_ptr<ChunkSplitStateDriver> chunkSp "maxChunkSizeBytes"_attr = maxChunkSizeBytes); chunkSplitStateDriver->prepareSplit(); - auto splitPoints = uassertStatusOK(splitVector(opCtx.get(), - nss, - shardKeyPattern.toBSON(), - chunk.getMin(), - chunk.getMax(), - false, - boost::none, - boost::none, - maxChunkSizeBytes)); + + auto splitPoints = autoSplitVector(opCtx.get(), + nss, + shardKeyPattern.toBSON(), + chunk.getMin(), + chunk.getMax(), + maxChunkSizeBytes); if (splitPoints.empty()) { LOGV2_DEBUG(21907, @@ -356,8 +354,8 @@ void ChunkSplitter::_runAutosplit(std::shared_ptr<ChunkSplitStateDriver> chunkSp "ChunkSplitter attempted split but not enough split points were found for " "chunk", "chunk"_attr = redact(chunk.toString())); - // Reset our size estimate that we had prior to splitVector to 0, while still counting - // the bytes that have been written in parallel to this split task + // Reset our size estimate that we had prior to autoSplitVector to 0, while still + // counting the bytes that have been written in parallel to this split task chunkSplitStateDriver->abandonPrepare(); return; } diff --git a/src/mongo/db/s/config/initial_split_policy.cpp b/src/mongo/db/s/config/initial_split_policy.cpp index 1af9ceb57fd..3633e2f741e 100644 --- a/src/mongo/db/s/config/initial_split_policy.cpp +++ b/src/mongo/db/s/config/initial_split_policy.cpp @@ -314,7 +314,7 @@ InitialSplitPolicy::ShardCollectionConfig UnoptimizedSplitPolicy::createFirstChu std::vector<ShardId> shardIds{params.primaryShardId}; // Refresh the balancer settings to ensure the chunk size setting, which is sent as part of - // the splitVector command and affects the number of chunks returned, has been loaded. + // the autoSplitVector command and affects the number of chunks returned, has been loaded. const auto balancerConfig = Grid::get(opCtx)->getBalancerConfiguration(); uassertStatusOK(balancerConfig->refreshAndCheck(opCtx)); const auto shardSelectedSplitPoints = uassertStatusOK( diff --git a/src/mongo/db/s/config/initial_split_policy.h b/src/mongo/db/s/config/initial_split_policy.h index 3b5a5d9ea12..ade39dcf179 100644 --- a/src/mongo/db/s/config/initial_split_policy.h +++ b/src/mongo/db/s/config/initial_split_policy.h @@ -152,7 +152,7 @@ public: /** * Split point building strategy to be used when no optimizations are available. We send a - * splitVector command to the primary shard in order to calculate the appropriate split points. + * autoSplitVector command to the primary shard in order to calculate the appropriate split points. */ class UnoptimizedSplitPolicy : public InitialSplitPolicy { public: diff --git a/src/mongo/db/s/config/sharding_catalog_manager_shard_collection_test.cpp b/src/mongo/db/s/config/sharding_catalog_manager_shard_collection_test.cpp index 6a6bc2c7e66..11cbd8ce065 100644 --- a/src/mongo/db/s/config/sharding_catalog_manager_shard_collection_test.cpp +++ b/src/mongo/db/s/config/sharding_catalog_manager_shard_collection_test.cpp @@ -66,13 +66,15 @@ class ShardCollectionTestBase : public ConfigServerTestFixture { protected: void expectSplitVector(const HostAndPort& shardHost, const ShardKeyPattern& keyPattern, - const BSONObj& splitPoints) { + const BSONArray& splitPoints) { onCommand([&](const RemoteCommandRequest& request) { ASSERT_EQUALS(shardHost, request.target); string cmdName = request.cmdObj.firstElement().fieldName(); - ASSERT_EQUALS("splitVector", cmdName); - ASSERT_EQUALS(kNamespace.ns(), - request.cmdObj["splitVector"].String()); // splitVector uses full ns + ASSERT_EQUALS("autoSplitVector", cmdName); + // autoSplitVector concatenates the collection name to the command's db + const auto receivedNs = + request.dbname + '.' + request.cmdObj["autoSplitVector"].String(); + ASSERT_EQUALS(kNamespace.ns(), receivedNs); ASSERT_BSONOBJ_EQ(keyPattern.toBSON(), request.cmdObj["keyPattern"].Obj()); ASSERT_BSONOBJ_EQ(keyPattern.getKeyPattern().globalMin(), request.cmdObj["min"].Obj()); diff --git a/src/mongo/db/s/migration_chunk_cloner_source_legacy.cpp b/src/mongo/db/s/migration_chunk_cloner_source_legacy.cpp index d1fb2ca5fcd..d04c4fcf584 100644 --- a/src/mongo/db/s/migration_chunk_cloner_source_legacy.cpp +++ b/src/mongo/db/s/migration_chunk_cloner_source_legacy.cpp @@ -961,7 +961,7 @@ Status MigrationChunkClonerSourceLegacy::_storeCurrentLocs(OperationContext* opC avgRecSize = BSONObj::kMinBSONLength; } maxRecsWhenFull = _args.getMaxChunkSizeBytes() / avgRecSize; - maxRecsWhenFull = 130 * maxRecsWhenFull / 100; // pad some slack + maxRecsWhenFull = 2 * maxRecsWhenFull; // pad some slack } else { avgRecSize = 0; maxRecsWhenFull = kMaxObjectPerChunk + 1; diff --git a/src/mongo/s/SConscript b/src/mongo/s/SConscript index f54aad9499b..8b448c1736f 100644 --- a/src/mongo/s/SConscript +++ b/src/mongo/s/SConscript @@ -161,6 +161,7 @@ env.Library( 'catalog/type_tags.cpp', 'request_types/add_shard_request_type.cpp', 'request_types/add_shard_to_zone_request_type.cpp', + 'request_types/auto_split_vector.idl', 'request_types/balance_chunk_request_type.cpp', 'request_types/commit_chunk_migration_request_type.cpp', 'request_types/merge_chunk_request_type.cpp', diff --git a/src/mongo/s/request_types/auto_split_vector.idl b/src/mongo/s/request_types/auto_split_vector.idl new file mode 100644 index 00000000000..d8eb082d087 --- /dev/null +++ b/src/mongo/s/request_types/auto_split_vector.idl @@ -0,0 +1,71 @@ +# Copyright(C) 2021 - present MongoDB, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the Server Side Public License, version 1, +# as published by MongoDB, Inc. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Server Side Public License for more details. +# +# You should have received a copy of the Server Side Public License +# along with this program. If not, see +# <http://www.mongodb.com/licensing/server-side-public-license>. +# +# As a special exception, the copyright holders give permission to link the +# code of portions of this program with the OpenSSL library under certain +# conditions as described in each individual source file and distribute +# linked combinations including the program with the OpenSSL library. You +# must comply with the Server Side Public License in all respects for +# all of the code used other than as permitted herein. If you modify file(s) +# with this exception, you may extend this exception to your version of the +# file(s), but you are not obligated to do so. If you do not wish to do so, +# delete this exception statement from your version. If you delete this +# exception statement from all source files in the program, then also delete +# it in the license file. +# + +# This IDL file describes the BSON format for the autoSplitVector command. + +global: + cpp_namespace: "mongo" + cpp_includes: + - "mongo/db/s/auto_split_vector.h" + +imports: + - "mongo/idl/basic_types.idl" + +types: + bson_vector: + bson_serialization_type: any + description: "An array of objects representing the split keys." + cpp_type: "std::vector<mongo::BSONObj>" + deserializer: ::mongo::parseSplitKeys + +structs: + AutoSplitVectorResponse: + description: "The reply of an autoSplitVector command." + strict: false + fields: + splitKeys: bson_vector + +commands: + autoSplitVector: + cpp_name: AutoSplitVectorRequest + description: "Internal autoSplitVector command" + strict: false + namespace: concatenate_with_db + fields: + keyPattern: + type: object_owned + description: "Shard key pattern of the collection" + min: + type: object_owned + description: "Min key of the chunk" + max: + type: object_owned + description: "Max key of the chunk" + maxChunkSizeBytes: + type: safeInt64 + description: "Max chunk size of the collection expressed in bytes" diff --git a/src/mongo/s/shard_util.cpp b/src/mongo/s/shard_util.cpp index 9419cfdd687..869750458f1 100644 --- a/src/mongo/s/shard_util.cpp +++ b/src/mongo/s/shard_util.cpp @@ -41,6 +41,7 @@ #include "mongo/logv2/log.h" #include "mongo/s/client/shard_registry.h" #include "mongo/s/grid.h" +#include "mongo/s/request_types/auto_split_vector_gen.h" #include "mongo/s/shard_key_pattern.h" #include "mongo/util/str.h" @@ -95,43 +96,60 @@ StatusWith<std::vector<BSONObj>> selectChunkSplitPoints(OperationContext* opCtx, const ChunkRange& chunkRange, long long chunkSizeBytes, boost::optional<int> maxObjs) { - BSONObjBuilder cmd; - cmd.append("splitVector", nss.ns()); - cmd.append("keyPattern", shardKeyPattern.toBSON()); - chunkRange.append(&cmd); - cmd.append("maxChunkSizeBytes", chunkSizeBytes); - if (maxObjs) { - cmd.append("maxChunkObjects", *maxObjs); - } - auto shardStatus = Grid::get(opCtx)->shardRegistry()->getShard(opCtx, shardId); if (!shardStatus.isOK()) { return shardStatus.getStatus(); } - auto cmdStatus = shardStatus.getValue()->runCommandWithFixedRetryAttempts( - opCtx, - ReadPreferenceSetting{ReadPreference::PrimaryPreferred}, - "admin", - cmd.obj(), - Shard::RetryPolicy::kIdempotent); - if (!cmdStatus.isOK()) { - return std::move(cmdStatus.getStatus()); - } - if (!cmdStatus.getValue().commandStatus.isOK()) { - return std::move(cmdStatus.getValue().commandStatus); + auto invokeSplitCommand = [&](const BSONObj& command, const StringData db) { + return shardStatus.getValue()->runCommandWithFixedRetryAttempts( + opCtx, + ReadPreferenceSetting{ReadPreference::PrimaryPreferred}, + db.toString(), + command, + Shard::RetryPolicy::kIdempotent); + }; + + const AutoSplitVectorRequest req( + nss, shardKeyPattern.toBSON(), chunkRange.getMin(), chunkRange.getMax(), chunkSizeBytes); + + auto cmdStatus = invokeSplitCommand(req.toBSON({}), nss.db()); + + // Fallback to splitVector command in case of mixed binaries not supporting autoSplitVector + bool fallback = [&]() { + auto status = Shard::CommandResponse::getEffectiveStatus(cmdStatus); + return !status.isOK() && status.code() == ErrorCodes::CommandNotFound; + }(); + + if (fallback) { + BSONObjBuilder cmd; + cmd.append("splitVector", nss.ns()); + cmd.append("keyPattern", shardKeyPattern.toBSON()); + chunkRange.append(&cmd); + cmd.append("maxChunkSizeBytes", chunkSizeBytes); + cmdStatus = invokeSplitCommand(cmd.obj(), NamespaceString::kAdminDb); } - const auto response = std::move(cmdStatus.getValue().response); - std::vector<BSONObj> splitPoints; + auto status = Shard::CommandResponse::getEffectiveStatus(cmdStatus); + if (!status.isOK()) { + return status; + } + + if (fallback) { + const auto response = std::move(cmdStatus.getValue().response); + std::vector<BSONObj> splitPoints; - BSONObjIterator it(response.getObjectField("splitKeys")); - while (it.more()) { - splitPoints.push_back(it.next().Obj().getOwned()); + BSONObjIterator it(response.getObjectField("splitKeys")); + while (it.more()) { + splitPoints.push_back(it.next().Obj().getOwned()); + } + return std::move(splitPoints); } - return std::move(splitPoints); + const auto response = AutoSplitVectorResponse::parse( + IDLParserErrorContext("AutoSplitVectorResponse"), std::move(cmdStatus.getValue().response)); + return response.getSplitKeys(); } StatusWith<boost::optional<ChunkRange>> splitChunkAtMultiplePoints( diff --git a/src/mongo/s/sharding_mongod_test_fixture.cpp b/src/mongo/s/sharding_mongod_test_fixture.cpp index dbd20638bee..04fc5084396 100644 --- a/src/mongo/s/sharding_mongod_test_fixture.cpp +++ b/src/mongo/s/sharding_mongod_test_fixture.cpp @@ -106,7 +106,7 @@ void ShardingMongodTestFixture::setUp() { // Set up this node as part of a replica set. repl::ReplSettings replSettings; - replSettings.setOplogSizeBytes(512'000); + replSettings.setOplogSizeBytes(1024 * 1024 * 2); // 2MB replSettings.setReplSetString(ConnectionString::forReplicaSet(_setName, _servers).toString()); auto replCoordPtr = makeReplicationCoordinator(replSettings); _replCoord = replCoordPtr.get(); |