SERVER-71220 Apply additional filters when calibrating Cost Model

author: Alexander Ignatyev <alexander.ignatyev@mongodb.com> 2022-11-11 11:38:41 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-11-11 12:20:20 +0000
commit: ea088bcf555050361ef313edf402025dd37788c2 (patch)
tree: 2f24431f747ee5edc5bf75c3a5fd01386ef69174 /buildscripts/cost_model/start.py
parent: dd6c21b4f7c6d32161a0c262dbd63a8c86b2a44d (diff)
download: mongo-ea088bcf555050361ef313edf402025dd37788c2.tar.gz
1 files changed, 42 insertions, 75 deletions
diff --git a/buildscripts/cost_model/start.py b/buildscripts/cost_model/start.py
index f95e010ad7f..6dbe4ef2cce 100644
--- a/buildscripts/cost_model/start.py
+++ b/buildscripts/cost_model/start.py
@@ -63,16 +63,41 @@ def save_to_csv(parameters: Mapping[str, Sequence[CostModelParameters]], filepat
                 writer.writerow(fields)
 
 
-async def execute_general(database: DatabaseInstance, collections: Sequence[CollectionInfo]):
+async def execute_index_scan_queries(database: DatabaseInstance,
+                                     collections: Sequence[CollectionInfo]):
+    collection = [ci for ci in collections if ci.name.startswith('index_scan')][0]
+    fields = [f for f in collection.fields if f.name == 'choice']
+
     requests = []
-    for val in distributions['string_choice'].get_values()[::3]:
-        keys_length = len(val) + 2
-        for i in range(1, 5):
+
+    for field in fields:
+        for val in field.distribution.get_values():
+            if val.startswith('_'):
+                continue
+            keys_length = len(val) + 2
             requests.append(
-                Query(pipeline=[{'$match': {f'choice{i}': val}}], keys_length_in_bytes=keys_length))
+                Query(pipeline=[{'$match': {field.name: val}}], keys_length_in_bytes=keys_length,
+                      note='IndexScan'))
 
-    await workload_execution.execute(database, main_config.workload_execution,
-                                     [ci for ci in collections if ci.name.startswith('c_str')],
+    await workload_execution.execute(database, main_config.workload_execution, [collection],
+                                     requests)
+
+
+async def execute_physical_scan_queries(database: DatabaseInstance,
+                                        collections: Sequence[CollectionInfo]):
+    collections = [ci for ci in collections if ci.name.startswith('physical_scan')]
+    fields = [f for f in collections[0].fields if f.name == 'choice']
+    requests = []
+    for field in fields:
+        for val in field.distribution.get_values()[::3]:
+            if val.startswith('_'):
+                continue
+            keys_length = len(val) + 2
+            requests.append(
+                Query(pipeline=[{'$match': {field.name: val}}], keys_length_in_bytes=keys_length,
+                      note='PhysicalScan'))
+
+    await workload_execution.execute(database, main_config.workload_execution, collections,
                                      requests)
 
 
@@ -151,62 +176,6 @@ async def execute_unwind(database: DatabaseInstance, collections: Sequence[Colle
                                      requests)
 
 
-async def execute_small_queries(database: DatabaseInstance, collections: Sequence[CollectionInfo]):
-    # strings
-    requests = []
-    for val in distributions['string_choice_small'].get_values():
-        keys_length = len(val) + 2
-        for i in range(1, 3):
-            requests.append(
-                Query(pipeline=[{'$match': {f'choice{i}': val}}], keys_length_in_bytes=keys_length))
-
-    await workload_execution.execute(database, main_config.workload_execution,
-                                     [ci for ci in collections if ci.name.startswith('c_str_02')],
-                                     requests)
-
-    # index intersection
-    colls = [ci for ci in collections if ci.name.startswith('c_int_05')]
-    requests = []
-
-    for val in distributions['int_choice'].get_values():
-        for val2 in distributions['int_choice'].get_values():
-            requests.append(
-                Query(pipeline=[{'$match': {'in1': val, 'in2': val2}}], keys_length_in_bytes=1))
-
-        requests.append(
-            Query(pipeline=[{'$match': {'in1': val, 'in2': {'$gt': 500}}}], keys_length_in_bytes=1))
-
-        requests.append(
-            Query(pipeline=[{'$match': {'in1': {'$lte': 500}, 'in2': val}}],
-                  keys_length_in_bytes=1))
-
-    await execute_index_intersections_with_requests(database, colls, requests)
-
-    # Evaluation
-    colls = [ci for ci in collections if ci.name.startswith('c_int_05')]
-    requests = []
-
-    for val in distributions['int_choice'].get_values():
-        requests.append(
-            Query(pipeline=[{"$match": {'in1': val}}, {'$project': {'proj1': 1}}],
-                  keys_length_in_bytes=1, number_of_fields=1))
-
-    await workload_execution.execute(database, main_config.workload_execution, colls, requests)
-
-    # Unwind
-    colls = [ci for ci in collections if ci.name.startswith('c_arr_01')]
-    requests = []
-    # average size of arrays in the collection
-    average_size_of_arrays = 10
-
-    for val in distributions['int_choice'].get_values():
-        requests.append(
-            Query(pipeline=[{"$match": {'in1': val}}, {"$unwind": "$as"}],
-                  number_of_fields=average_size_of_arrays))
-
-    await workload_execution.execute(database, main_config.workload_execution, colls, requests)
-
-
 async def main():
     """Entry point function."""
     script_directory = os.path.abspath(os.path.dirname(__file__))
@@ -222,18 +191,16 @@ async def main():
 
         # 3. Collecting data for calibration (optional).
         # It runs the pipelines and stores explains to the database.
-
-        # Run this execute function only to collect calibration data in the "smaller" experiment.
-        # await execute_small_queries(database, generator.collection_infos);
-
-        await execute_general(database, generator.collection_infos)
-        main_config.workload_execution.write_mode = WriteMode.APPEND
-
-        await execute_index_intersections(database, generator.collection_infos)
-
-        await execute_evaluation(database, generator.collection_infos)
-
-        await execute_unwind(database, generator.collection_infos)
+        execution_query_functions = [
+            execute_index_scan_queries,
+            execute_physical_scan_queries,
+            execute_index_intersections,
+            execute_evaluation,
+            execute_unwind,
+        ]
+        for execute_query in execution_query_functions:
+            await execute_query(database, generator.collection_infos)
+            main_config.workload_execution.write_mode = WriteMode.APPEND
 
         # Calibration phase (optional).
         # Reads the explains stored on the previous step (this run and/or previous runs),
author	Alexander Ignatyev <alexander.ignatyev@mongodb.com>	2022-11-11 11:38:41 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2022-11-11 12:20:20 +0000
commit	ea088bcf555050361ef313edf402025dd37788c2 (patch)
tree	2f24431f747ee5edc5bf75c3a5fd01386ef69174 /buildscripts/cost_model/start.py
parent	dd6c21b4f7c6d32161a0c262dbd63a8c86b2a44d (diff)
download	mongo-ea088bcf555050361ef313edf402025dd37788c2.tar.gz