author     Steve Tarzia <steve.tarzia@mongodb.com>           2022-11-10 23:53:55 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2022-11-11 00:50:23 +0000
commit     ca82aabc3e86d85148e129fd39c00d504234882f (patch)
tree       73bc719ae9037f69f90b34e58aed7ce6064809d1 /jstests
parent     2ef0ee31539036523136230b93c8e7cbf62a0ddd (diff)
download   mongo-ca82aabc3e86d85148e129fd39c00d504234882f.tar.gz
SERVER-71241 Fix long getMores in allow_partial_results_with_maxTimeMS.js
Diffstat (limited to 'jstests')
-rw-r--r--  jstests/serial_run/allow_partial_results_with_maxTimeMS.js             98
-rw-r--r--  jstests/sharding/allow_partial_results_with_maxTimeMS_failpoints.js   100
2 files changed, 130 insertions, 68 deletions
diff --git a/jstests/serial_run/allow_partial_results_with_maxTimeMS.js b/jstests/serial_run/allow_partial_results_with_maxTimeMS.js
index 65a10a82f0e..402039c2ee8 100644
--- a/jstests/serial_run/allow_partial_results_with_maxTimeMS.js
+++ b/jstests/serial_run/allow_partial_results_with_maxTimeMS.js
@@ -28,8 +28,6 @@ function isError(res) {
     return !res.hasOwnProperty('ok') || !res['ok'];
 }

-Random.setRandomSeed();
-
 const dbName = "test-SERVER-57469";
 const collName = "test-SERVER-57469-coll";

@@ -41,11 +39,9 @@ assert.commandWorked(st.s.adminCommand({enableSharding: dbName}));
 st.ensurePrimaryShard(dbName, st.shard0.name);

 // Insert some data.
-function initDb(numSamples) {
+function initDb(numSamples, splitPoint) {
     coll.drop();

-    // Use ranged sharding with 90% of the value range on the second shard.
-    const splitPoint = Math.max(1, numSamples / 10);
     st.shardColl(
         coll,
         {_id: 1},  // shard key
@@ -61,7 +57,9 @@ function initDb(numSamples) {
 }

 let nDocs = 1000;
-initDb(nDocs);
+// Use ranged sharding with 90% of the value range on the second shard.
+let splitPoint = Math.max(1, nDocs / 10);
+initDb(nDocs, splitPoint);

 /**
  * @param {Object} cmdRes coll.runCommand() result
@@ -101,18 +99,18 @@ function runBigBatchQuery(timeoutMs) {
 let fullQueryTimeoutMS = runtimeMillis(() => assert.eq("full", runBigBatchQuery(9999999)));
 print("ran in " + fullQueryTimeoutMS + " ms");
 const targetTimeoutMS =
-    50;  // We want the query to run for at least this long, to allow for timeout.
+    1000;  // We want the query to run for at least this long, to allow for timeout.
 if (fullQueryTimeoutMS < targetTimeoutMS) {
     // Assume linear scaling of runtime with the number of docs.
     nDocs *= Math.ceil(targetTimeoutMS / fullQueryTimeoutMS);
     // Limit size to prevent long runtime due to bad first sample.
-    nDocs = Math.min(nDocs, 100000);
+    nDocs = Math.min(nDocs, 1000000);
     if (nDocs % 2 == 1) {  // make sure it's even so the math for half size is easier
         nDocs += 1;
     }
+    splitPoint = Math.max(1, nDocs / 10);
     print("adjusting size to " + nDocs);
-    fullQueryTimeoutMS = 100;
-    initDb(nDocs);
+    initDb(nDocs, splitPoint);

     // Re-time the full query after resizing, with unlimited time allowed.
     fullQueryTimeoutMS = runtimeMillis(() => assert.eq("full", runBigBatchQuery(9999999)));
@@ -126,40 +124,32 @@ if (fullQueryTimeoutMS < targetTimeoutMS) {
  * never seen.
  */
 function searchForAndAssertPartialResults(initialTimeoutMS, queryFunc) {
-    // Try this test twice because it's very sensitive to timing and resource contention.
-    for (let i = 1; i <= 2; i++) {
-        let timeoutMS = initialTimeoutMS;
-        const attempts = 20;
-        for (let j = 1; j <= attempts; j++) {
-            print("try query with maxTimeMS: " + timeoutMS);
-            let res = queryFunc(timeoutMS);
-            if (res == "partial") {
-                // Got partial results!
-                return timeoutMS;
-            } else if (res == "full") {
-                // Timeout was so long that we got complete results. Make it shorter and try again
-                if (timeoutMS > 1) {  // 1 ms is the min timeout allowed.
-                    timeoutMS = Math.floor(0.8 * timeoutMS);
-                }
-            } else {
-                assert.eq("error", res);
-                // Timeout was so short that we go no results. Increase maxTimeMS and try again
-                timeoutMS = Math.ceil(1.1 * timeoutMS);
-                // Don't let the timeout explode upward without bound.
-                if (timeoutMS > 100 * initialTimeoutMS) {
-                    break;
-                }
+    let timeoutMS = initialTimeoutMS;
+    const attempts = 1000;
+    for (let j = 1; j <= attempts; j++) {
+        print("try query with maxTimeMS: " + timeoutMS);
+        // The longer we are searching, the more fine-grained our changes to the timeout become.
+        const changeFactor = 0.2 - ((0.2 * j) / attempts);
+        let res = queryFunc(timeoutMS);
+        if (res == "partial") {
+            // Got partial results!
+            return timeoutMS;
+        } else if (res == "full") {
+            // Timeout was so long that we got complete results. Make it shorter and try again
+            if (timeoutMS > 1) {  // 1 ms is the min timeout allowed.
+                timeoutMS = Math.floor((1 - changeFactor) * timeoutMS);
+            }
+        } else {
+            assert.eq("error", res);
+            // Timeout was so short that we got no results. Increase maxTimeMS and try again
+            timeoutMS = Math.ceil((1 + changeFactor) * timeoutMS);
+            // Don't let the timeout explode upward without bound.
+            if (timeoutMS > 100 * initialTimeoutMS) {
+                break;
+            }
         }
     }
-        // Pause for one minute then try once again. We don't expect to ever reach this except
-        // in rare cases when the test infrastructure is behaving inconsistently. We are trying
-        // the test again after a long delay instead of failing the test.
-        sleep(60 * 1000);
-    }

     // Failed to ever see partial results :-(
-    if (fullQueryTimeoutMS < 10) {
-        jsTest.log("!!!: This error is likely due to the nDocs constant being set too small.");
-    }
     assert(false, "Did not find partial results after max number of attempts");
 }

@@ -167,26 +157,28 @@ function searchForAndAssertPartialResults(initialTimeoutMS, queryFunc) {

 // This first case will try to get all the results in one big batch.
 // Start with half of the full runtime of the query.
-// fetch one big batch of results
+// Fetch one big batch of results.
 searchForAndAssertPartialResults(Math.round(fullQueryTimeoutMS), runBigBatchQuery);

-// Try to get partial results in a getMore, while fetching the second half of data.
+// Try to get partial results in a getMore.
 searchForAndAssertPartialResults(Math.round(0.5 * fullQueryTimeoutMS), function(timeout) {
-    // Find a small first batch.
-    const smallBatchSize = 1;
+    // Find the first batch.
+    // First batch size must be chosen carefully. We want it to be small enough that we don't get
+    // all the docs from the small shard in the first batch. We want it to be large enough that
+    // the repeated getMores on the remotes for the remaining data does not overwhelm the exec time.
+    const firstBatchSize = Math.round(splitPoint / 2);  // Half the size of the small shard.
     let findRes = coll.runCommand(
-        {find: collName, allowPartialResults: true, batchSize: smallBatchSize, maxTimeMS: timeout});
-    if (isError(findRes)) {
-        // We don't expect this first small-batch find to timeout, but it can if we're unlucky.
-        assert.eq(ErrorCodes.MaxTimeMSExpired, findRes.code);  // timeout
-        return "error";
+        {find: collName, allowPartialResults: true, batchSize: firstBatchSize, maxTimeMS: timeout});
+    // We don't expect this first batch find to timeout, but it can if we're unlucky.
+    const findResStatus = interpretCommandResult(findRes, firstBatchSize);
+    if (findResStatus == "error" || findResStatus == "partial") {
+        return findResStatus;
     }
-    // Partial results can be either size zero or smallBatchSize.
-    assert.lte(findRes.cursor.firstBatch.length, smallBatchSize);
-    assert.eq(undefined, findRes.cursor.partialResultsReturned);

     // Try to get partial results with a getMore.
-    const secondBatchSize = nDocs - smallBatchSize;
+    // TODO SERVER-71248: Note that the getMore below uses the original firstBatchSize, not
+    // secondBatchSize in the getMores sent to the shards.
+    const secondBatchSize = nDocs - firstBatchSize;
     return interpretCommandResult(
         coll.runCommand(
             {getMore: findRes.cursor.id, collection: collName, batchSize: secondBatchSize}),
diff --git a/jstests/sharding/allow_partial_results_with_maxTimeMS_failpoints.js b/jstests/sharding/allow_partial_results_with_maxTimeMS_failpoints.js
index 3bcc281aebd..fdf85caea0a 100644
--- a/jstests/sharding/allow_partial_results_with_maxTimeMS_failpoints.js
+++ b/jstests/sharding/allow_partial_results_with_maxTimeMS_failpoints.js
@@ -38,11 +38,10 @@ const collName = "test-SERVER-57469-coll";

 const coll = st.s0.getDB(dbName)[collName];

-function initDb(numSamples) {
+function initDb(numSamples, splitPoint) {
     coll.drop();

     // Use ranged sharding with 50% of the data on the second shard.
-    const splitPoint = Math.max(1, numSamples / 2);
     st.shardColl(
         coll,
         {_id: 1},  // shard key
@@ -59,11 +58,17 @@ function initDb(numSamples) {

 // Insert some data.
 const size = 1000;
-initDb(size);
+const splitPoint = Math.max(1, size / 2);
+initDb(size, splitPoint);
+
+// We will sometimes use $where expressions to inject delays in processing documents on some shards.
+// Maps from shard to a snippet of JS code. This is modified by FindWhereSleepController
+let whereExpressions = {};

 function runQueryWithTimeout(doAllowPartialResults, timeout) {
     return coll.runCommand({
         find: collName,
+        filter: {$where: Object.values(whereExpressions).join("") + "return 1;"},
         allowPartialResults: doAllowPartialResults,
         batchSize: size,
         maxTimeMS: timeout
@@ -98,6 +103,7 @@ function getMoreMongosTimeout(allowPartialResults) {
     // Get the first batch.
     const res = assert.commandWorked(coll.runCommand({
         find: collName,
+        filter: {$where: Object.values(whereExpressions).join("") + "return 1;"},
         allowPartialResults: allowPartialResults,
         batchSize: batchSizeForGetMore,
         maxTimeMS: ampleTimeMS
@@ -124,6 +130,7 @@ function getMoreMongosTimeout(allowPartialResults) {
         print(numReturned + " docs returned so far");
         assert.neq(numReturned, size, "Got full results even through mongos had MaxTimeMSExpired.");
         if (res2.cursor.partialResultsReturned) {
+            assert(allowPartialResults);
             assert.lt(numReturned, size);
             break;
         }
@@ -189,6 +196,27 @@ class MultiFailureController {
     }
 }

+class FindWhereSleepController {
+    constructor(shard) {
+        this.shard = shard;
+    }
+
+    enable() {
+        // Add a $where expression to find command that sleeps when processing a document on the
+        // shard of interest.
+        let slowDocId = (this.shard == st.shard0) ? 0 : splitPoint;
+        // Offset the slowDocId by batchSizeForGetMore so that when testing getMore, we quickly
+        // return enough documents to serve the first batch without timing out.
+        slowDocId += batchSizeForGetMore;
+        const sleepTimeMS = 2 * ampleTimeMS;
+        whereExpressions[this.shard] = `if (this._id == ${slowDocId}) {sleep(${sleepTimeMS})};`;
+    }
+
+    disable() {
+        delete whereExpressions[this.shard];
+    }
+}
+
 const shard0Failpoint = new MaxTimeMSFailpointFailureController(st.shard0);
 const shard1Failpoint = new MaxTimeMSFailpointFailureController(st.shard1);
 const allShardsFailpoint = new MultiFailureController([shard0Failpoint, shard1Failpoint]);
@@ -198,14 +226,18 @@ const shard1NetworkFailure = new NetworkFailureController(st.rs1);
 const allshardsNetworkFailure =
     new MultiFailureController([shard0NetworkFailure, shard1NetworkFailure]);

+const shard0SleepFailure = new FindWhereSleepController(st.shard0);
+const shard1SleepFailure = new FindWhereSleepController(st.shard1);
+const allShardsSleepFailure = new MultiFailureController([shard0SleepFailure, shard1SleepFailure]);
+
 const allshardsMixedFailures = new MultiFailureController([shard0NetworkFailure, shard1Failpoint]);

-// With 'allowPartialResults: true', if a shard times out on getMore then return partial results.
-function partialResultsTrueGetMoreTimeout(failureController) {
+function getMoreShardTimeout(allowPartialResults, failureController) {
     // Get the first batch.
     const res = assert.commandWorked(coll.runCommand({
         find: collName,
-        allowPartialResults: true,
+        filter: {$where: Object.values(whereExpressions).join("") + "return 1;"},
+        allowPartialResults: allowPartialResults,
         batchSize: batchSizeForGetMore,
         maxTimeMS: ampleTimeMS
     }));
@@ -217,23 +249,55 @@ function partialResultsTrueGetMoreTimeout(failureController) {
     print(numReturned + " docs returned in the first batch");
     while (true) {
         // Run getmores repeatedly until we exhaust the cache on mongos.
-        // Eventually we should get partial results because a shard is down.
-        const res2 = assert.commandWorked(coll.runCommand(
-            {getMore: res.cursor.id, collection: collName, batchSize: batchSizeForGetMore}));
+        // Eventually we should get partial results or an error because a shard is down.
+        const res2 = coll.runCommand(
+            {getMore: res.cursor.id, collection: collName, batchSize: batchSizeForGetMore});
+        if (allowPartialResults) {
+            assert.commandWorked(res2);
+        } else {
+            if (isError(res2)) {
+                assert.commandFailedWithCode(
+                    res2, ErrorCodes.MaxTimeMSExpired, "failure should be due to MaxTimeMSExpired");
+                break;
+            }
+        }
         numReturned += res2.cursor.nextBatch.length;
         print(numReturned + " docs returned so far");
         assert.neq(numReturned, size, "Entire collection seemed to be cached by the first find!");
         if (res2.cursor.partialResultsReturned) {
-            assert.lt(numReturned, size);
-            break;
+            if (allowPartialResults) {
+                assert.lt(numReturned, size);
+                break;
+            } else {
+                assert(false, "Partial results should not have been allowed.");
+            }
         }
     }
     failureController.disable();
 }
-partialResultsTrueGetMoreTimeout(shard0Failpoint);
-partialResultsTrueGetMoreTimeout(shard1Failpoint);
-partialResultsTrueGetMoreTimeout(shard0NetworkFailure);
-partialResultsTrueGetMoreTimeout(shard1NetworkFailure);
+// getMore timeout with allowPartialResults=true.
+getMoreShardTimeout(true, shard0Failpoint);
+getMoreShardTimeout(true, shard1Failpoint);
+getMoreShardTimeout(true, shard0NetworkFailure);
+getMoreShardTimeout(true, shard1NetworkFailure);
+// The FindWhereSleepFailureController must be set before the first "find" because that's when the
+// $where clause is set.
+shard0SleepFailure.enable();
+getMoreShardTimeout(true, shard0SleepFailure);
+shard1SleepFailure.enable();
+getMoreShardTimeout(true, shard1SleepFailure);
+
+// getMore timeout with allowPartialResults=false.
+getMoreShardTimeout(false, shard0Failpoint);
+getMoreShardTimeout(false, shard1Failpoint);
+getMoreShardTimeout(false, shard0NetworkFailure);
+getMoreShardTimeout(false, shard1NetworkFailure);
+// The FindWhereSleepFailureController must be set before the first "find" because that's when the
+// $where clause is set.
+shard0SleepFailure.enable();
+getMoreShardTimeout(false, shard0SleepFailure);
+shard1SleepFailure.enable();
+getMoreShardTimeout(false, shard1SleepFailure);

 // With 'allowPartialResults: true', if a shard times out on the first batch then return
 // partial results.
@@ -249,6 +313,8 @@ partialResultsTrueFirstBatch(shard0Failpoint);
 partialResultsTrueFirstBatch(shard1Failpoint);
 partialResultsTrueFirstBatch(shard0NetworkFailure);
 partialResultsTrueFirstBatch(shard1NetworkFailure);
+partialResultsTrueFirstBatch(shard0SleepFailure);
+partialResultsTrueFirstBatch(shard1SleepFailure);

 // With 'allowPartialResults: false', if one shard times out then return a timeout error.
 function partialResultsFalseOneFailure(failureController) {
@@ -260,6 +326,8 @@ partialResultsFalseOneFailure(shard0Failpoint);
 partialResultsFalseOneFailure(shard1Failpoint);
 partialResultsFalseOneFailure(shard0NetworkFailure);
 partialResultsFalseOneFailure(shard1NetworkFailure);
+partialResultsFalseOneFailure(shard0SleepFailure);
+partialResultsFalseOneFailure(shard1SleepFailure);

 // With 'allowPartialResults: false', if both shards time out then return a timeout error.
 function allowPartialResultsFalseAllFailed(failureController) {
@@ -270,6 +338,7 @@ function allowPartialResultsFalseAllFailed(failureController) {
 allowPartialResultsFalseAllFailed(allShardsFailpoint);
 allowPartialResultsFalseAllFailed(allshardsNetworkFailure);
 allowPartialResultsFalseAllFailed(allshardsMixedFailures);
+allowPartialResultsFalseAllFailed(allShardsSleepFailure);

 // With 'allowPartialResults: true', if both shards time out then return empty "partial" results.
 function allowPartialResultsTrueAllFailed(failureController) {
@@ -283,6 +352,7 @@ allowPartialResultsTrueAllFailed(allShardsFailpoint);
 allowPartialResultsTrueAllFailed(allshardsNetworkFailure);
 allowPartialResultsTrueAllFailed(allshardsMixedFailures);
+allowPartialResultsTrueAllFailed(allShardsSleepFailure);

 st.stop();
 }());