summary refs log tree commit diff
path: root/jstests
diff options
context:
space:
mode:
author	Steve Tarzia <steve.tarzia@mongodb.com>	2022-11-10 23:53:55 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2022-11-11 00:50:23 +0000
commit	ca82aabc3e86d85148e129fd39c00d504234882f (patch)
tree	73bc719ae9037f69f90b34e58aed7ce6064809d1 /jstests
parent	2ef0ee31539036523136230b93c8e7cbf62a0ddd (diff)
download	mongo-ca82aabc3e86d85148e129fd39c00d504234882f.tar.gz
SERVER-71241 Fix long getMores in allow_partial_results_with_maxTimeMS.js
Diffstat (limited to 'jstests')
-rw-r--r--	jstests/serial_run/allow_partial_results_with_maxTimeMS.js	98
-rw-r--r--	jstests/sharding/allow_partial_results_with_maxTimeMS_failpoints.js	100
2 files changed, 130 insertions, 68 deletions
diff --git a/jstests/serial_run/allow_partial_results_with_maxTimeMS.js b/jstests/serial_run/allow_partial_results_with_maxTimeMS.js
index 65a10a82f0e..402039c2ee8 100644
--- a/jstests/serial_run/allow_partial_results_with_maxTimeMS.js
+++ b/jstests/serial_run/allow_partial_results_with_maxTimeMS.js
@@ -28,8 +28,6 @@ function isError(res) {
return !res.hasOwnProperty('ok') || !res['ok'];
}
-Random.setRandomSeed();
-
const dbName = "test-SERVER-57469";
const collName = "test-SERVER-57469-coll";
@@ -41,11 +39,9 @@ assert.commandWorked(st.s.adminCommand({enableSharding: dbName}));
st.ensurePrimaryShard(dbName, st.shard0.name);
// Insert some data.
-function initDb(numSamples) {
+function initDb(numSamples, splitPoint) {
coll.drop();
- // Use ranged sharding with 90% of the value range on the second shard.
- const splitPoint = Math.max(1, numSamples / 10);
st.shardColl(
coll,
{_id: 1}, // shard key
@@ -61,7 +57,9 @@ function initDb(numSamples) {
}
let nDocs = 1000;
-initDb(nDocs);
+// Use ranged sharding with 90% of the value range on the second shard.
+let splitPoint = Math.max(1, nDocs / 10);
+initDb(nDocs, splitPoint);
/**
* @param {Object} cmdRes coll.runCommand() result
@@ -101,18 +99,18 @@ function runBigBatchQuery(timeoutMs) {
let fullQueryTimeoutMS = runtimeMillis(() => assert.eq("full", runBigBatchQuery(9999999)));
print("ran in " + fullQueryTimeoutMS + " ms");
const targetTimeoutMS =
- 50; // We want the query to run for at least this long, to allow for timeout.
+ 1000; // We want the query to run for at least this long, to allow for timeout.
if (fullQueryTimeoutMS < targetTimeoutMS) {
// Assume linear scaling of runtime with the number of docs.
nDocs *= Math.ceil(targetTimeoutMS / fullQueryTimeoutMS);
// Limit size to prevent long runtime due to bad first sample.
- nDocs = Math.min(nDocs, 100000);
+ nDocs = Math.min(nDocs, 1000000);
if (nDocs % 2 == 1) { // make sure it's even so the math for half size is easier
nDocs += 1;
}
+ splitPoint = Math.max(1, nDocs / 10);
print("adjusting size to " + nDocs);
- fullQueryTimeoutMS = 100;
- initDb(nDocs);
+ initDb(nDocs, splitPoint);
// Re-time the full query after resizing, with unlimited time allowed.
fullQueryTimeoutMS = runtimeMillis(() => assert.eq("full", runBigBatchQuery(9999999)));
@@ -126,40 +124,32 @@ if (fullQueryTimeoutMS < targetTimeoutMS) {
* never seen.
*/
function searchForAndAssertPartialResults(initialTimeoutMS, queryFunc) {
- // Try this test twice because it's very sensitive to timing and resource contention.
- for (let i = 1; i <= 2; i++) {
- let timeoutMS = initialTimeoutMS;
- const attempts = 20;
- for (let j = 1; j <= attempts; j++) {
- print("try query with maxTimeMS: " + timeoutMS);
- let res = queryFunc(timeoutMS);
- if (res == "partial") {
- // Got partial results!
- return timeoutMS;
- } else if (res == "full") {
- // Timeout was so long that we got complete results. Make it shorter and try again
- if (timeoutMS > 1) { // 1 ms is the min timeout allowed.
- timeoutMS = Math.floor(0.8 * timeoutMS);
- }
- } else {
- assert.eq("error", res);
- // Timeout was so short that we go no results. Increase maxTimeMS and try again
- timeoutMS = Math.ceil(1.1 * timeoutMS);
- // Don't let the timeout explode upward without bound.
- if (timeoutMS > 100 * initialTimeoutMS) {
- break;
- }
+ let timeoutMS = initialTimeoutMS;
+ const attempts = 1000;
+ for (let j = 1; j <= attempts; j++) {
+ print("try query with maxTimeMS: " + timeoutMS);
+ // The longer we are searching, the more fine-grained our changes to the timeout become.
+ const changeFactor = 0.2 - ((0.2 * j) / attempts);
+ let res = queryFunc(timeoutMS);
+ if (res == "partial") {
+ // Got partial results!
+ return timeoutMS;
+ } else if (res == "full") {
+ // Timeout was so long that we got complete results. Make it shorter and try again
+ if (timeoutMS > 1) { // 1 ms is the min timeout allowed.
+ timeoutMS = Math.floor((1 - changeFactor) * timeoutMS);
+ }
+ } else {
+ assert.eq("error", res);
+ // Timeout was so short that we got no results. Increase maxTimeMS and try again
+ timeoutMS = Math.ceil((1 + changeFactor) * timeoutMS);
+ // Don't let the timeout explode upward without bound.
+ if (timeoutMS > 100 * initialTimeoutMS) {
+ break;
}
}
- // Pause for one minute then try once again. We don't expect to ever reach this except
- // in rare cases when the test infrastructure is behaving inconsistently. We are trying
- // the test again after a long delay instead of failing the test.
- sleep(60 * 1000);
}
// Failed to ever see partial results :-(
- if (fullQueryTimeoutMS < 10) {
- lsTest.log("!!!: This error is likely due to the nDocs constant being set too small.");
- }
assert(false, "Did not find partial results after max number of attempts");
}
@@ -167,26 +157,28 @@ function searchForAndAssertPartialResults(initialTimeoutMS, queryFunc) {
// This first case will try to get all the results in one big batch.
// Start with half of the full runtime of the query.
-// fetch one big batch of results
+// Fetch one big batch of results.
searchForAndAssertPartialResults(Math.round(fullQueryTimeoutMS), runBigBatchQuery);
-// Try to get partial results in a getMore, while fetching the second half of data.
+// Try to get partial results in a getMore.
searchForAndAssertPartialResults(Math.round(0.5 * fullQueryTimeoutMS), function(timeout) {
- // Find a small first batch.
- const smallBatchSize = 1;
+ // Find the first batch.
+ // First batch size must be chosen carefully. We want it to be small enough that we don't get
+ // all the docs from the small shard in the first batch. We want it to be large enough that
+ // the repeated getMores on the remotes for the remaining data does not overwhelm the exec time.
+ const firstBatchSize = Math.round(splitPoint / 2); // Half the size of the small shard.
let findRes = coll.runCommand(
- {find: collName, allowPartialResults: true, batchSize: smallBatchSize, maxTimeMS: timeout});
- if (isError(findRes)) {
- // We don't expect this first small-batch find to timeout, but it can if we're unlucky.
- assert.eq(ErrorCodes.MaxTimeMSExpired, findRes.code); // timeout
- return "error";
+ {find: collName, allowPartialResults: true, batchSize: firstBatchSize, maxTimeMS: timeout});
+ // We don't expect this first batch find to timeout, but it can if we're unlucky.
+ const findResStatus = interpretCommandResult(findRes, firstBatchSize);
+ if (findResStatus == "error" || findResStatus == "partial") {
+ return findResStatus;
}
- // Partial results can be either size zero or smallBatchSize.
- assert.lte(findRes.cursor.firstBatch.length, smallBatchSize);
- assert.eq(undefined, findRes.cursor.partialResultsReturned);
// Try to get partial results with a getMore.
- const secondBatchSize = nDocs - smallBatchSize;
+ // TODO SERVER-71248: Note that the getMore below uses the original firstBatchSize, not
+ // secondBatchSize in the getMores sent to the shards.
+ const secondBatchSize = nDocs - firstBatchSize;
return interpretCommandResult(
coll.runCommand(
{getMore: findRes.cursor.id, collection: collName, batchSize: secondBatchSize}),
diff --git a/jstests/sharding/allow_partial_results_with_maxTimeMS_failpoints.js b/jstests/sharding/allow_partial_results_with_maxTimeMS_failpoints.js
index 3bcc281aebd..fdf85caea0a 100644
--- a/jstests/sharding/allow_partial_results_with_maxTimeMS_failpoints.js
+++ b/jstests/sharding/allow_partial_results_with_maxTimeMS_failpoints.js
@@ -38,11 +38,10 @@ const collName = "test-SERVER-57469-coll";
const coll = st.s0.getDB(dbName)[collName];
-function initDb(numSamples) {
+function initDb(numSamples, splitPoint) {
coll.drop();
// Use ranged sharding with 50% of the data on the second shard.
- const splitPoint = Math.max(1, numSamples / 2);
st.shardColl(
coll,
{_id: 1}, // shard key
@@ -59,11 +58,17 @@ function initDb(numSamples) {
// Insert some data.
const size = 1000;
-initDb(size);
+const splitPoint = Math.max(1, size / 2);
+initDb(size, splitPoint);
+
+// We will sometimes use $where expressions to inject delays in processing documents on some shards.
+// Maps from shard to a snippet of JS code. This is modified by FindWhereSleepController
+let whereExpressions = {};
function runQueryWithTimeout(doAllowPartialResults, timeout) {
return coll.runCommand({
find: collName,
+ filter: {$where: Object.values(whereExpressions).join("") + "return 1;"},
allowPartialResults: doAllowPartialResults,
batchSize: size,
maxTimeMS: timeout
@@ -98,6 +103,7 @@ function getMoreMongosTimeout(allowPartialResults) {
// Get the first batch.
const res = assert.commandWorked(coll.runCommand({
find: collName,
+ filter: {$where: Object.values(whereExpressions).join("") + "return 1;"},
allowPartialResults: allowPartialResults,
batchSize: batchSizeForGetMore,
maxTimeMS: ampleTimeMS
@@ -124,6 +130,7 @@ function getMoreMongosTimeout(allowPartialResults) {
print(numReturned + " docs returned so far");
assert.neq(numReturned, size, "Got full results even through mongos had MaxTimeMSExpired.");
if (res2.cursor.partialResultsReturned) {
+ assert(allowPartialResults);
assert.lt(numReturned, size);
break;
}
@@ -189,6 +196,27 @@ class MultiFailureController {
}
}
+class FindWhereSleepController {
+ constructor(shard) {
+ this.shard = shard;
+ }
+
+ enable() {
+ // Add a $where expression to find command that sleeps when processing a document on the
+ // shard of interest.
+ let slowDocId = (this.shard == st.shard0) ? 0 : splitPoint;
+ // Offset the slowDocId by batchSizeForGetMore so that when testing getMore, we quickly
+ // return enough documents to serve the first batch without timing out.
+ slowDocId += batchSizeForGetMore;
+ const sleepTimeMS = 2 * ampleTimeMS;
+ whereExpressions[this.shard] = `if (this._id == ${slowDocId}) {sleep(${sleepTimeMS})};`;
+ }
+
+ disable() {
+ delete whereExpressions[this.shard];
+ }
+}
+
const shard0Failpoint = new MaxTimeMSFailpointFailureController(st.shard0);
const shard1Failpoint = new MaxTimeMSFailpointFailureController(st.shard1);
const allShardsFailpoint = new MultiFailureController([shard0Failpoint, shard1Failpoint]);
@@ -198,14 +226,18 @@ const shard1NetworkFailure = new NetworkFailureController(st.rs1);
const allshardsNetworkFailure =
new MultiFailureController([shard0NetworkFailure, shard1NetworkFailure]);
+const shard0SleepFailure = new FindWhereSleepController(st.shard0);
+const shard1SleepFailure = new FindWhereSleepController(st.shard1);
+const allShardsSleepFailure = new MultiFailureController([shard0SleepFailure, shard1SleepFailure]);
+
const allshardsMixedFailures = new MultiFailureController([shard0NetworkFailure, shard1Failpoint]);
-// With 'allowPartialResults: true', if a shard times out on getMore then return partial results.
-function partialResultsTrueGetMoreTimeout(failureController) {
+function getMoreShardTimeout(allowPartialResults, failureController) {
// Get the first batch.
const res = assert.commandWorked(coll.runCommand({
find: collName,
- allowPartialResults: true,
+ filter: {$where: Object.values(whereExpressions).join("") + "return 1;"},
+ allowPartialResults: allowPartialResults,
batchSize: batchSizeForGetMore,
maxTimeMS: ampleTimeMS
}));
@@ -217,23 +249,55 @@ function partialResultsTrueGetMoreTimeout(failureController) {
print(numReturned + " docs returned in the first batch");
while (true) {
// Run getmores repeatedly until we exhaust the cache on mongos.
- // Eventually we should get partial results because a shard is down.
- const res2 = assert.commandWorked(coll.runCommand(
- {getMore: res.cursor.id, collection: collName, batchSize: batchSizeForGetMore}));
+ // Eventually we should get partial results or an error because a shard is down.
+ const res2 = coll.runCommand(
+ {getMore: res.cursor.id, collection: collName, batchSize: batchSizeForGetMore});
+ if (allowPartialResults) {
+ assert.commandWorked(res2);
+ } else {
+ if (isError(res2)) {
+ assert.commandFailedWithCode(
+ res2, ErrorCodes.MaxTimeMSExpired, "failure should be due to MaxTimeMSExpired");
+ break;
+ }
+ }
numReturned += res2.cursor.nextBatch.length;
print(numReturned + " docs returned so far");
assert.neq(numReturned, size, "Entire collection seemed to be cached by the first find!");
if (res2.cursor.partialResultsReturned) {
- assert.lt(numReturned, size);
- break;
+ if (allowPartialResults) {
+ assert.lt(numReturned, size);
+ break;
+ } else {
+ assert(false, "Partial results should not have been allowed.");
+ }
}
}
failureController.disable();
}
-partialResultsTrueGetMoreTimeout(shard0Failpoint);
-partialResultsTrueGetMoreTimeout(shard1Failpoint);
-partialResultsTrueGetMoreTimeout(shard0NetworkFailure);
-partialResultsTrueGetMoreTimeout(shard1NetworkFailure);
+// getMore timeout with allowPartialResults=true.
+getMoreShardTimeout(true, shard0Failpoint);
+getMoreShardTimeout(true, shard1Failpoint);
+getMoreShardTimeout(true, shard0NetworkFailure);
+getMoreShardTimeout(true, shard1NetworkFailure);
+// The FindWhereSleepFailureController must be set before the first "find" because that's when the
+// $where clause is set.
+shard0SleepFailure.enable();
+getMoreShardTimeout(true, shard0SleepFailure);
+shard1SleepFailure.enable();
+getMoreShardTimeout(true, shard1SleepFailure);
+
+// getMore timeout with allowPartialResults=false.
+getMoreShardTimeout(false, shard0Failpoint);
+getMoreShardTimeout(false, shard1Failpoint);
+getMoreShardTimeout(false, shard0NetworkFailure);
+getMoreShardTimeout(false, shard1NetworkFailure);
+// The FindWhereSleepFailureController must be set before the first "find" because that's when the
+// $where clause is set.
+shard0SleepFailure.enable();
+getMoreShardTimeout(false, shard0SleepFailure);
+shard1SleepFailure.enable();
+getMoreShardTimeout(false, shard1SleepFailure);
// With 'allowPartialResults: true', if a shard times out on the first batch then return
// partial results.
@@ -249,6 +313,8 @@ partialResultsTrueFirstBatch(shard0Failpoint);
partialResultsTrueFirstBatch(shard1Failpoint);
partialResultsTrueFirstBatch(shard0NetworkFailure);
partialResultsTrueFirstBatch(shard1NetworkFailure);
+partialResultsTrueFirstBatch(shard0SleepFailure);
+partialResultsTrueFirstBatch(shard1SleepFailure);
// With 'allowPartialResults: false', if one shard times out then return a timeout error.
function partialResultsFalseOneFailure(failureController) {
@@ -260,6 +326,8 @@ partialResultsFalseOneFailure(shard0Failpoint);
partialResultsFalseOneFailure(shard1Failpoint);
partialResultsFalseOneFailure(shard0NetworkFailure);
partialResultsFalseOneFailure(shard1NetworkFailure);
+partialResultsFalseOneFailure(shard0SleepFailure);
+partialResultsFalseOneFailure(shard1SleepFailure);
// With 'allowPartialResults: false', if both shards time out then return a timeout error.
function allowPartialResultsFalseAllFailed(failureController) {
@@ -270,6 +338,7 @@ function allowPartialResultsFalseAllFailed(failureController) {
allowPartialResultsFalseAllFailed(allShardsFailpoint);
allowPartialResultsFalseAllFailed(allshardsNetworkFailure);
allowPartialResultsFalseAllFailed(allshardsMixedFailures);
+allowPartialResultsFalseAllFailed(allShardsSleepFailure);
// With 'allowPartialResults: true', if both shards time out then return empty "partial" results.
function allowPartialResultsTrueAllFailed(failureController) {
@@ -283,6 +352,7 @@ function allowPartialResultsTrueAllFailed(failureController) {
allowPartialResultsTrueAllFailed(allShardsFailpoint);
allowPartialResultsTrueAllFailed(allshardsNetworkFailure);
allowPartialResultsTrueAllFailed(allshardsMixedFailures);
+allowPartialResultsTrueAllFailed(allShardsSleepFailure);
st.stop();
}());