diff options
author | Charlie Swanson <charlie.swanson@mongodb.com> | 2023-04-11 13:37:35 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-04-11 14:46:01 +0000 |
commit | c0e7ce05c657ffef5da25aa0399737311f3d48dc (patch) | |
tree | 5f7648843679d17726c7b540e55734a5063e5425 | |
parent | a2722330e5a975b9692e69f744a9769006ff94ea (diff) | |
download | mongo-c0e7ce05c657ffef5da25aa0399737311f3d48dc.tar.gz |
SERVER-75596 Invoke $telemetry in the background of telemetry suite.
11 files changed, 232 insertions, 64 deletions
diff --git a/buildscripts/resmokeconfig/suites/telemetry_passthrough.yml b/buildscripts/resmokeconfig/suites/telemetry_passthrough.yml index 1aa2a490a5f..9e471accf67 100644 --- a/buildscripts/resmokeconfig/suites/telemetry_passthrough.yml +++ b/buildscripts/resmokeconfig/suites/telemetry_passthrough.yml @@ -22,6 +22,7 @@ executor: - class: ValidateCollections - class: CleanEveryN n: 20 + - class: RunTelemetry fixture: class: MongoDFixture mongod_options: diff --git a/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py b/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py index d392c6ae0b8..b49030ff0f4 100644 --- a/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py +++ b/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py @@ -6,68 +6,17 @@ internally sleep for 1 second between runs. import os.path -from buildscripts.resmokelib import errors -from buildscripts.resmokelib.testing.hooks import jsfile -from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, _ContinuousDynamicJSTestCase +from buildscripts.resmokelib.testing.hooks.background_job import BackgroundRepeatedJsHook -class AggregateResourceConsumptionMetricsInBackground(jsfile.JSHook): +class AggregateResourceConsumptionMetricsInBackground(BackgroundRepeatedJsHook): """A hook to run $operationMetrics stage in the background.""" - IS_BACKGROUND = True - def __init__(self, hook_logger, fixture, shell_options=None): """Initialize AggregateResourceConsumptionMetricsInBackground.""" + description = "Run background $operationMetrics on all mongods while a test is running" js_filename = os.path.join("jstests", "hooks", "run_aggregate_metrics_background.js") - jsfile.JSHook.__init__(self, hook_logger, fixture, js_filename, description, - shell_options=shell_options) - self._background_job = None - - def before_suite(self, test_report): - """Start the background thread.""" - self._background_job = _BackgroundJob("AggregateResourceConsumptionMetricsInBackground") - self.logger.info("Starting the background aggregate metrics thread.") - self._background_job.start() - - def after_suite(self, test_report, teardown_flag=None): - """Signal the background aggregate metrics thread to exit, and wait until it does.""" - if self._background_job is None: - return - - self.logger.info("Stopping the background aggregate metrics thread.") - self._background_job.stop() - - def before_test(self, test, test_report): - """Instruct the background aggregate metrics thread to run while 'test' is also running.""" - if self._background_job is None: - return - - hook_test_case = _ContinuousDynamicJSTestCase.create_before_test( - test.logger, test, self, self._js_filename, self._shell_options) - hook_test_case.configure(self.fixture) - - self.logger.info("Resuming the background aggregate metrics thread.") - self._background_job.resume(hook_test_case, test_report) - - def after_test(self, test, test_report): # noqa: D205,D400 - """Instruct the background aggregate metrics thread to stop running now that 'test' has - finished running. - """ - if self._background_job is None: - return - - self.logger.info("Pausing the background aggregate metrics thread.") - self._background_job.pause() - - if self._background_job.exc_info is not None: - if isinstance(self._background_job.exc_info[1], errors.TestFailure): - # If the mongo shell process running the JavaScript file exited with a non-zero - # return code, then we raise an errors.ServerFailure exception to cause resmoke.py's - # test execution to stop. - raise errors.ServerFailure(self._background_job.exc_info[1].args[0]) - else: - self.logger.error( - "Encountered an error inside the background aggregate metrics thread.", - exc_info=self._background_job.exc_info) - raise self._background_job.exc_info[1] + super().__init__(hook_logger, fixture, js_filename, description, + "AggregateResourceConsumptionMetricsInBackground", + shell_options=shell_options) diff --git a/buildscripts/resmokelib/testing/hooks/background_job.py b/buildscripts/resmokelib/testing/hooks/background_job.py index b01196ba462..d239df4f6eb 100644 --- a/buildscripts/resmokelib/testing/hooks/background_job.py +++ b/buildscripts/resmokelib/testing/hooks/background_job.py @@ -3,6 +3,7 @@ import sys import threading +from buildscripts.resmokelib import errors from buildscripts.resmokelib.testing.hooks import jsfile @@ -98,3 +99,63 @@ class _ContinuousDynamicJSTestCase(jsfile.DynamicJSTestCase): last one. This method returns without waiting for the current execution to finish. """ self._should_stop.set() + + +class BackgroundRepeatedJsHook(jsfile.JSHook): + """A hook to run a js file on repeat in the background.""" + + IS_BACKGROUND = True + + def __init__(self, hook_logger, fixture, js_filename, description, thread_name, + shell_options=None): + """Initialize BackgroundRepeatedJsHook.""" + super().__init__(hook_logger, fixture, js_filename, description, + shell_options=shell_options) + self._background_job = None + self._thread_name = thread_name + + def before_suite(self, test_report): + """Start the background thread.""" + self._background_job = _BackgroundJob(self._thread_name) + self.logger.info("Starting background thread: {}.".format(self._thread_name)) + self._background_job.start() + + def after_suite(self, test_report, teardown_flag=None): + """Signal the background thread to exit, and wait until it does.""" + if self._background_job is None: + return + + self.logger.info("Stopping background thread: {}.".format(self._thread_name)) + self._background_job.stop() + + def before_test(self, test, test_report): + """Instruct the background thread to run while 'test' is also running.""" + if self._background_job is None: + return + + hook_test_case = _ContinuousDynamicJSTestCase.create_before_test( + test.logger, test, self, self._js_filename, self._shell_options) + hook_test_case.configure(self.fixture) + + self.logger.info("Resuming background thread: {}.".format(self._thread_name)) + self._background_job.resume(hook_test_case, test_report) + + def after_test(self, test, test_report): # noqa: D205,D400 + """Instruct the background thread to stop running now that 'test' has finished running.""" + if self._background_job is None: + return + + self.logger.info("Pausing background thread: {}.".format(self._thread_name)) + self._background_job.pause() + + if self._background_job.exc_info is not None: + if isinstance(self._background_job.exc_info[1], errors.TestFailure): + # If the mongo shell process running the JavaScript file exited with a non-zero + # return code, then we raise an errors.ServerFailure exception to cause resmoke.py's + # test execution to stop. + raise errors.ServerFailure(self._background_job.exc_info[1].args[0]) + else: + self.logger.error( + "Encountered an error inside background thread {}.".format(self._thread_name), + exc_info=self._background_job.exc_info) + raise self._background_job.exc_info[1] diff --git a/buildscripts/resmokelib/testing/hooks/jsfile.py b/buildscripts/resmokelib/testing/hooks/jsfile.py index 7489968d834..9a27f5298c1 100644 --- a/buildscripts/resmokelib/testing/hooks/jsfile.py +++ b/buildscripts/resmokelib/testing/hooks/jsfile.py @@ -1,8 +1,8 @@ """Interface for customizing the behavior of a test fixture by executing a JavaScript file.""" from buildscripts.resmokelib import errors -from buildscripts.resmokelib.testing.hooks import interface from buildscripts.resmokelib.testing.fixtures.interface import MultiClusterFixture +from buildscripts.resmokelib.testing.hooks import interface from buildscripts.resmokelib.testing.testcases import jstest from buildscripts.resmokelib.utils import registry diff --git a/buildscripts/resmokelib/testing/hooks/run_telemetry.py b/buildscripts/resmokelib/testing/hooks/run_telemetry.py new file mode 100644 index 00000000000..ac008180339 --- /dev/null +++ b/buildscripts/resmokelib/testing/hooks/run_telemetry.py @@ -0,0 +1,21 @@ +""" +Test hook for verifying $telemetry collects expected metrics and can redact query shapes. + +This runs in the background as other tests are ongoing. +""" +import os.path + +from buildscripts.resmokelib.testing.hooks.background_job import BackgroundRepeatedJsHook + + +# Running in the background will better stress concurrency control with other operations. +class RunTelemetry(BackgroundRepeatedJsHook): + """Periodically runs $telemetry.""" + + def __init__(self, hook_logger, fixture, shell_options=None): + """Initialize RunTelemetry.""" + + description = "Read telemetry data concurrently with ongoing tests" + js_filename = os.path.join("jstests", "hooks", "run_telemetry.js") + super().__init__(hook_logger, fixture, js_filename, description, "Telemetry", + shell_options=shell_options) diff --git a/jstests/hooks/run_aggregate_metrics_background.js b/jstests/hooks/run_aggregate_metrics_background.js index 32335852fcd..6ceb04c55f5 100644 --- a/jstests/hooks/run_aggregate_metrics_background.js +++ b/jstests/hooks/run_aggregate_metrics_background.js @@ -88,6 +88,7 @@ const aggregateMetricsBackground = function(host) { // which is why we sleep for 1 second here. This sleep is here rather than in // aggregate_metrics_background.py because the background job that file uses is designed to be run // continuously so it is easier and cleaner to just sleep here. +// TODO SERVER-75983 move this logic into the test harness. sleep(1000); if (topology.type === Topology.kStandalone) { try { diff --git a/jstests/hooks/run_telemetry.js b/jstests/hooks/run_telemetry.js new file mode 100644 index 00000000000..c79b39de134 --- /dev/null +++ b/jstests/hooks/run_telemetry.js @@ -0,0 +1,117 @@ +/** + * Runs the $telemetry stage and ensures that all the expected fields are present. + */ + +'use strict'; + +(function() { +load('jstests/libs/discover_topology.js'); // For Topology and DiscoverTopology. + +if (typeof db === 'undefined') { + throw new Error( + "Expected mongo shell to be connected a server, but global 'db' object isn't defined"); +} + +// Disable implicit sessions so tests that kill random sessions won't interrupt the operations in +// this test that aren't resilient to interruptions. +TestData.disableImplicitSessions = true; + +const topology = DiscoverTopology.findConnectedNodes(db.getMongo()); + +function runTelemetry(host) { + function verifyFields(doc) { + const kOtherTopLevelFields = [ + "key", + "metrics", + "asOf", + ]; + const kFacetedMetricFields = [ + "docsReturned", + "queryExecMicros", + ]; + const kSubMetricFields = [ + "sum", + "min", + "max", + "sumOfSquares", + ]; + const kSingletonMetricFields = [ + "execCount", + "firstSeenTimestamp", + "lastExecutionMicros", + ]; + for (let key of kOtherTopLevelFields) { + assert(doc.hasOwnProperty(key), + `The telemetry output is missing the "${key}" property: ${tojson(doc)}`); + } + const metricsDoc = doc["metrics"]; + for (let key of kSingletonMetricFields) { + assert(metricsDoc.hasOwnProperty(key), + `The telemetry 'metrics' output is missing the "${key}" property: ${ + tojson(metricsDoc)}`); + } + for (let facetedMetric of kFacetedMetricFields) { + assert(metricsDoc.hasOwnProperty(facetedMetric), + `The telemetry 'metrics' output is missing the "${facetedMetric}" property: ${ + tojson(metricsDoc)}`); + const subMetricsDoc = metricsDoc[facetedMetric]; + for (let subMetric of kSubMetricFields) { + assert( + subMetricsDoc.hasOwnProperty(subMetric), + `The telemetry 'metrics' output is missing the ${subMetric} property from the ${ + facetedMetric} subdoc: ${tojson(subMetricsDoc)}`); + } + } + } + + let conn = new Mongo(host); + conn.setSecondaryOk(); + + assert.neq( + null, conn, "Failed to connect to host '" + host + "' for background metrics collection"); + + let db = conn.getDB("admin"); + function verifyResults(telemetryCursor) { + while (telemetryCursor.hasNext()) { + let doc = telemetryCursor.next(); + try { + verifyFields(doc); + } catch (e) { + print( + "caught exception while verifying that all expected fields are in the metrics " + + "output: " + tojson(doc)); + throw (e); + } + } + } + verifyResults(db.aggregate([{$telemetry: {redactIdentifiers: false}}])); + verifyResults(db.aggregate([{$telemetry: {redactIdentifiers: true}}])); +} + +// This file is run continuously and is very fast so we want to impose some kind of rate limiting +// which is why we sleep for 1 second here. This sleep is here rather than in run_telemetry.py +// because the background job that file uses is designed to be run continuously so it is easier and +// cleaner to just sleep here. +// TODO SERVER-75983 move this logic into the test harness. +sleep(1000); +if (topology.type === Topology.kStandalone) { + try { + runTelemetry(topology.mongod); + } catch (e) { + print("background aggregate metrics against the standalone failed"); + throw e; + } +} else if (topology.type === Topology.kReplicaSet) { + for (let replicaMember of topology.nodes) { + try { + runTelemetry(replicaMember); + } catch (e) { + print("background aggregate metrics was not successful against all replica set " + + "members"); + throw e; + } + } +} else { + throw new Error("Unsupported topology configuration: " + tojson(topology)); +} +})(); diff --git a/src/mongo/db/matcher/schema/expression_internal_schema_allowed_properties.cpp b/src/mongo/db/matcher/schema/expression_internal_schema_allowed_properties.cpp index c9a76a5d61f..30d42699589 100644 --- a/src/mongo/db/matcher/schema/expression_internal_schema_allowed_properties.cpp +++ b/src/mongo/db/matcher/schema/expression_internal_schema_allowed_properties.cpp @@ -140,7 +140,7 @@ void InternalSchemaAllowedPropertiesMatchExpression::serialize(BSONObjBuilder* b for (auto&& item : _patternProperties) { BSONObjBuilder itemBuilder(patternPropertiesBuilder.subobjStart()); if (opts.replacementForLiteralArgs) { - itemBuilder.appendRegex("regex", opts.replacementForLiteralArgs.get()); + itemBuilder.append("regex", opts.replacementForLiteralArgs.get()); } else { itemBuilder.appendRegex("regex", item.first.rawRegex); } diff --git a/src/mongo/db/pipeline/document_source_telemetry.cpp b/src/mongo/db/pipeline/document_source_telemetry.cpp index 4d81065855f..db8ee94cd21 100644 --- a/src/mongo/db/pipeline/document_source_telemetry.cpp +++ b/src/mongo/db/pipeline/document_source_telemetry.cpp @@ -144,14 +144,26 @@ DocumentSource::GetNextResult DocumentSourceTelemetry::doGetNext() { const auto partitionReadTime = Timestamp{Timestamp(Date_t::now().toMillisSinceEpoch() / 1000, 0)}; for (auto&& [key, metrics] : *partition) { - auto swKey = metrics->redactKey(key, _redactIdentifiers, pExpCtx->opCtx); + StatusWith<BSONObj> swKey = Status{ErrorCodes::InternalError, "Uninitialized"}; + try { + swKey = metrics->redactKey(key, _redactIdentifiers, pExpCtx->opCtx); + } catch (const DBException& ex) { + swKey = ex.toStatus(); + } if (!swKey.isOK()) { + // We would like to print the whole key unredacted to debug, but of course the point + // of redacting it is to not leak PII, so we will use this raw/naive redaction + // algorithm which may cut things short. This will only do anything if log redaction + // is enabled. LOGV2_DEBUG(7349403, - 3, + 1, "Error encountered when redacting query shape, will not publish " "telemetry for this entry.", - "status"_attr = swKey.getStatus()); + "status"_attr = swKey.getStatus(), + "rawKey"_attr = redact(key)); if (kDebugBuild) { + LOGV2_DEBUG( + 7559600, 0, "Failed to redact telemetry", "rawKey"_attr = redact(key)); tasserted(7349401, "Was not able to re-parse telemetry key when reading telemetry."); } diff --git a/src/mongo/db/pipeline/field_path.cpp b/src/mongo/db/pipeline/field_path.cpp index 9a0a53f61fc..525b81a311a 100644 --- a/src/mongo/db/pipeline/field_path.cpp +++ b/src/mongo/db/pipeline/field_path.cpp @@ -109,8 +109,8 @@ void FieldPath::uassertValidFieldName(StringData fieldName) { if (fieldName[0] == '$' && !kAllowedDollarPrefixedFields.count(fieldName)) { uasserted(16410, - str::stream() << "FieldPath field names may not start with '$'." - << dotsAndDollarsHint); + str::stream() << "FieldPath field names may not start with '$'. Got '" + << fieldName << "'. " << dotsAndDollarsHint); } uassert( diff --git a/src/mongo/db/query/telemetry.cpp b/src/mongo/db/query/telemetry.cpp index a338942d4d8..05a205f8910 100644 --- a/src/mongo/db/query/telemetry.cpp +++ b/src/mongo/db/query/telemetry.cpp @@ -160,6 +160,12 @@ BSONObj redactHintComponent(BSONObj obj, const SerializationOptions& opts, bool continue; } + // $natural doesn't need to be redacted. + if (elem.fieldNameStringData().compare(query_request_helper::kNaturalSortField) == 0) { + bob.append(elem); + continue; + } + if (opts.replacementForLiteralArgs && redactValues) { bob.append(opts.serializeFieldPathFromString(elem.fieldName()), opts.replacementForLiteralArgs.get()); |