summaryrefslogtreecommitdiff
path: root/buildscripts
diff options
context:
space:
mode:
authorRuoxin Xu <ruoxin.xu@mongodb.com>2022-11-08 20:06:19 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-11-08 21:09:45 +0000
commit32545f21f5721a820acebc0dea64e3b4e4d9d38b (patch)
tree4a2f2873f4a4ac83400df66bc19267b52d010d09 /buildscripts
parent573609e856f0e9cb13f3ee70ab4d008315f96714 (diff)
downloadmongo-32545f21f5721a820acebc0dea64e3b4e4d9d38b.tar.gz
SERVER-70500 Calibrate ABT nodes on smaller queries
Diffstat (limited to 'buildscripts')
-rw-r--r--buildscripts/cost_model/README.md20
-rw-r--r--buildscripts/cost_model/abt_calibrator.py2
-rw-r--r--buildscripts/cost_model/calibration_settings.py95
-rw-r--r--buildscripts/cost_model/config.py2
-rw-r--r--buildscripts/cost_model/data_generator.py9
-rw-r--r--buildscripts/cost_model/database_instance.py3
-rw-r--r--buildscripts/cost_model/experiment.py2
-rw-r--r--buildscripts/cost_model/random_generator.py2
-rw-r--r--buildscripts/cost_model/start.py105
9 files changed, 209 insertions, 31 deletions
diff --git a/buildscripts/cost_model/README.md b/buildscripts/cost_model/README.md
new file mode 100644
index 00000000000..398b16373a9
--- /dev/null
+++ b/buildscripts/cost_model/README.md
@@ -0,0 +1,20 @@
+# The smaller experiment - Calibration with queries that return a smaller number of documents. (smaller "n_processed")
+
+This "smaller" experiment differs from the previous experiments, which query collections across a sequence of cardinalities. For calibrating each type of ABT node, we query only one collection with a sufficiently large cardinality (e.g. 100k), within which a small set of documents is targeted for collecting calibration data.
+
+There are some differences to conduct this experiment from conducting previous experiments.
+
+## Data generation phase
+We need to populate the collections twice: 1) first populate the small set of documents that the queries will target, and then 2) populate a larger set of documents that will not be queried during the calibration of most ABT node types. The larger set ensures the collections are big enough for the queries.
+
+There are more detailed instructions in the "data_generator" settings in "calibration_settings.py".
+
+## Calibration data collection phase
+In this experiment, we will run different queries; as mentioned above, these queries will mostly target the smaller set of documents in order to obtain a smaller value of "n_processed".
+
+More specifically, to collect data for calibration we only need to run "execute_small_queries()" in "start.py" and make sure to avoid running any other queries outside this function.
+
+## Calibration phase
+This phase should be the same as calibration in other experiments.
+
+Please note that this type of experiment may not work with certain ABT nodes. For example, for the PhysicalScan node, all the data collected may come with the same "n_processed" - the collection cardinality - which does not make any sense for calibration.
diff --git a/buildscripts/cost_model/abt_calibrator.py b/buildscripts/cost_model/abt_calibrator.py
index 520bb024ade..89d7c812b59 100644
--- a/buildscripts/cost_model/abt_calibrator.py
+++ b/buildscripts/cost_model/abt_calibrator.py
@@ -29,8 +29,8 @@
from __future__ import annotations
import pandas as pd
-import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
+import statsmodels.api as sm
from config import AbtCalibratorConfig, AbtNodeCalibrationConfig
from database_instance import DatabaseInstance
from cost_estimator import estimate
diff --git a/buildscripts/cost_model/calibration_settings.py b/buildscripts/cost_model/calibration_settings.py
index d7fccdd8386..ee814d8d337 100644
--- a/buildscripts/cost_model/calibration_settings.py
+++ b/buildscripts/cost_model/calibration_settings.py
@@ -27,6 +27,7 @@
#
"""Calibration configuration."""
+import random
import config
from random_generator import RangeGenerator, DataType, RandomDistribution, ArrayRandomDistribution
@@ -55,6 +56,33 @@ string_choice_weights = [10, 20, 5, 17, 30, 7, 9, 15, 40, 2, 12, 1]
distributions['string_choice'] = RandomDistribution.choice(string_choice_values,
string_choice_weights)
+small_query_weights = [i for i in range(10, 201, 10)]
+small_query_cardinality = sum(small_query_weights)
+
+int_choice_values = [i for i in range(1, 1000, 50)]
+random.shuffle(int_choice_values)
+distributions['int_choice'] = RandomDistribution.choice(int_choice_values, small_query_weights)
+
+distributions['random_string'] = ArrayRandomDistribution(
+ RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 5, 10, 2)),
+ RandomDistribution.uniform(RangeGenerator(DataType.STRING, "a", "z")))
+
+
+def generate_random_str(num: int):
+ strs = distributions['random_string'].generate(num)
+ str_list = []
+ for char_array in strs:
+ str_res = "".join(char_array)
+ str_list.append(str_res)
+
+ return str_list
+
+
+small_string_choice = generate_random_str(20)
+
+distributions['string_choice_small'] = RandomDistribution.choice(small_string_choice,
+ small_query_weights)
+
string_range_4 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "abca", "abc_"))
string_range_5 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "abcda", "abcd_"))
string_range_7 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "hello_a", "hello__"))
@@ -75,16 +103,56 @@ distributions['array_small'] = ArrayRandomDistribution(lengths_distr, distributi
# Database settings
database = config.DatabaseConfig(connection_string='mongodb://localhost',
- database_name='abt_calibration', dump_path='~/data/dump',
+ database_name='abt_calibration_small', dump_path='~/data/dump',
restore_from_dump=config.RestoreMode.NEVER, dump_on_exit=False)
# Collection template settings
+
+# templates for small queries.
+c_int_05_small = config.CollectionTemplate(
+ name="c_int_05", fields=[
+ config.FieldTemplate(name="in1", data_type=config.DataType.INTEGER,
+ distribution=distributions["int_choice"], indexed=True),
+ config.FieldTemplate(name="mixed1", data_type=config.DataType.STRING,
+ distribution=distributions["string_mixed"], indexed=False),
+ config.FieldTemplate(name="uniform1", data_type=config.DataType.STRING,
+ distribution=distributions["string_uniform"], indexed=False),
+ config.FieldTemplate(name="in2", data_type=config.DataType.INTEGER,
+ distribution=distributions["int_choice"], indexed=True),
+ config.FieldTemplate(name="mixed2", data_type=config.DataType.STRING,
+ distribution=distributions["string_mixed"], indexed=False),
+ ], compound_indexes=[])
+
+c_str_02_small = config.CollectionTemplate(
+ name="c_str_02", fields=[
+ config.FieldTemplate(name="choice1", data_type=config.DataType.STRING,
+ distribution=distributions['string_choice_small'], indexed=True),
+ config.FieldTemplate(name="choice2", data_type=config.DataType.STRING,
+ distribution=distributions['string_choice_small'], indexed=False)
+ ], compound_indexes=[])
+
+c_arr_01_small = config.CollectionTemplate(
+ name="c_arr_01", fields=[
+ config.FieldTemplate(name="as", data_type=config.DataType.INTEGER,
+ distribution=distributions["array_small"], indexed=False),
+ config.FieldTemplate(name="in1", data_type=config.DataType.INTEGER,
+ distribution=distributions["int_choice"], indexed=True),
+ ], compound_indexes=[])
+
c_str_01 = config.CollectionTemplate(
name="c_str_01", fields=[
config.FieldTemplate(name="choice1", data_type=config.DataType.STRING,
distribution=distributions['string_choice'], indexed=True)
], compound_indexes=[])
+c_str_02 = config.CollectionTemplate(
+ name="c_str_02", fields=[
+ config.FieldTemplate(name="choice1", data_type=config.DataType.STRING,
+ distribution=distributions['string_choice'], indexed=True),
+ config.FieldTemplate(name="choice2", data_type=config.DataType.STRING,
+ distribution=distributions['string_choice'], indexed=False)
+ ], compound_indexes=[])
+
c_str_05 = config.CollectionTemplate(
name="c_str_05", fields=[
config.FieldTemplate(name="choice1", data_type=config.DataType.STRING,
@@ -121,12 +189,31 @@ c_arr_01 = config.CollectionTemplate(
# Data Generator settings
data_generator = config.DataGeneratorConfig(
- enabled=True, collection_cardinalities=list(range(10000, 50001, 2500)), batch_size=10000,
- collection_templates=[c_str_01, c_str_05, c_int_05, c_arr_01])
+ enabled=True, collection_cardinalities=list(range(10000, 50001,
+ 2500)), collection_name_with_card=True,
+ batch_size=10000, collection_templates=[c_str_01, c_str_05, c_int_05,
+ c_arr_01], write_mode=config.WriteMode.REPLACE)
+"""Data generation settings for Calibration with queries that return smaller number of documents.
+# First round of data population generates a small set of documents that will be queried.
+
+data_generator = config.DataGeneratorConfig(
+ enabled=True, collection_cardinalities=[small_query_cardinality], collection_name_with_card=False, batch_size=10000,
+ collection_templates=[c_str_02_small, c_int_05_small, c_arr_01_small], write_mode=config.WriteMode.APPEND)
+
+# Second round of data population which generates 100,000 documents to the collection in order to
+# make sure the collection is big enough for queries.
+
+data_generator = config.DataGeneratorConfig(
+ enabled=True, collection_cardinalities=[100000], collection_name_with_card=False, batch_size=10000,
+ collection_templates=[c_str_02, c_int_05, c_arr_01], write_mode=config.WriteMode.APPEND)
+
+# Please note that the 'WriteMode' should be 'APPEND' type and the collection name should remain
+# unchanged. This can be controlled by setting "collection_name_with_card=False".
+"""
# Workload Execution settings
workload_execution = config.WorkloadExecutionConfig(
- enabled=True, output_collection_name='calibrationData', write_mode=config.WriteMode.REPLACE,
+ enabled=True, output_collection_name='calibrationDataSmall', write_mode=config.WriteMode.APPEND,
warmup_runs=3, runs=10)
abt_nodes = [
diff --git a/buildscripts/cost_model/config.py b/buildscripts/cost_model/config.py
index ffaf48d5e4a..8080a4eaba5 100644
--- a/buildscripts/cost_model/config.py
+++ b/buildscripts/cost_model/config.py
@@ -74,7 +74,9 @@ class DataGeneratorConfig:
enabled: bool
collection_cardinalities: list[int]
+ collection_name_with_card: bool
collection_templates: list[CollectionTemplate]
+ write_mode: WriteMode
batch_size: int
diff --git a/buildscripts/cost_model/data_generator.py b/buildscripts/cost_model/data_generator.py
index 4899a728087..e10847a7546 100644
--- a/buildscripts/cost_model/data_generator.py
+++ b/buildscripts/cost_model/data_generator.py
@@ -36,7 +36,7 @@ import pymongo
from pymongo import IndexModel
from motor.motor_asyncio import AsyncIOMotorCollection
from random_generator import RandomDistribution
-from config import DataGeneratorConfig, DataType
+from config import DataGeneratorConfig, DataType, WriteMode
from database_instance import DatabaseInstance
__all__ = ['DataGenerator']
@@ -92,7 +92,8 @@ class DataGenerator:
tasks = []
for coll_info in self.collection_infos:
coll = self.database.database[coll_info.name]
- await coll.drop()
+ if self.config.write_mode == WriteMode.REPLACE:
+ await coll.drop()
tasks.append(asyncio.create_task(self._populate_collection(coll, coll_info)))
tasks.append(asyncio.create_task(create_single_field_indexes(coll, coll_info.fields)))
tasks.append(asyncio.create_task(create_compound_indexes(coll, coll_info)))
@@ -110,7 +111,9 @@ class DataGenerator:
indexed=ft.indexed) for ft in coll_template.fields
]
for doc_count in self.config.collection_cardinalities:
- name = f'{coll_template.name}_{doc_count}'
+ name = f'{coll_template.name}'
+ if self.config.collection_name_with_card is True:
+ name = f'{coll_template.name}_{doc_count}'
yield CollectionInfo(name=name, fields=fields, documents_count=doc_count,
compound_indexes=coll_template.compound_indexes)
diff --git a/buildscripts/cost_model/database_instance.py b/buildscripts/cost_model/database_instance.py
index 518bb1c776e..58df9f382dd 100644
--- a/buildscripts/cost_model/database_instance.py
+++ b/buildscripts/cost_model/database_instance.py
@@ -134,7 +134,8 @@ class DatabaseInstance:
async def insert_many(self, collection_name: str, docs: Sequence[Mapping[str, any]]) -> None:
"""Insert documents into the collection with the given name."""
- await self.database[collection_name].insert_many(docs, ordered=False)
+ if len(docs) > 0:
+ await self.database[collection_name].insert_many(docs, ordered=False)
async def get_all_documents(self, collection_name: str):
"""Get all documents from the collection with the given name."""
diff --git a/buildscripts/cost_model/experiment.py b/buildscripts/cost_model/experiment.py
index 86d3e2c32de..2b2699db401 100644
--- a/buildscripts/cost_model/experiment.py
+++ b/buildscripts/cost_model/experiment.py
@@ -98,9 +98,9 @@ import dataclasses
import bson.json_util as json
import seaborn as sns
import pandas as pd
-import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
+import statsmodels.api as sm
from database_instance import DatabaseInstance
import execution_tree as sbe
import physical_tree as abt
diff --git a/buildscripts/cost_model/random_generator.py b/buildscripts/cost_model/random_generator.py
index c4bc2000b0c..1dcc2bb004c 100644
--- a/buildscripts/cost_model/random_generator.py
+++ b/buildscripts/cost_model/random_generator.py
@@ -230,7 +230,7 @@ class RandomDistribution:
def _choice(size: int, values: Sequence[TVar], probs: Sequence[float]):
if probs is None:
raise ValueError("props must be specified for choice distribution")
- return list(_rng.choice(a=values, size=size, p=probs))
+ return [val.item() for val in _rng.choice(a=values, size=size, p=probs)]
@staticmethod
def _normal(size: int, values: Sequence[TVar], _: Sequence[float]):
diff --git a/buildscripts/cost_model/start.py b/buildscripts/cost_model/start.py
index 7331405191c..f95e010ad7f 100644
--- a/buildscripts/cost_model/start.py
+++ b/buildscripts/cost_model/start.py
@@ -76,26 +76,9 @@ async def execute_general(database: DatabaseInstance, collections: Sequence[Coll
requests)
-async def execute_index_intersections(database: DatabaseInstance,
- collections: Sequence[CollectionInfo]):
- collections = [ci for ci in collections if ci.name.startswith('c_int')]
-
- requests = []
-
- for i in range(0, 1000, 100):
- requests.append(Query(pipeline=[{'$match': {'in1': i, 'in2': i}}], keys_length_in_bytes=1))
-
- requests.append(
- Query(pipeline=[{'$match': {'in1': i, 'in2': 1000 - i}}], keys_length_in_bytes=1))
-
- requests.append(
- Query(pipeline=[{'$match': {'in1': {'$lte': i}, 'in2': 1000 - i}}],
- keys_length_in_bytes=1))
-
- requests.append(
- Query(pipeline=[{'$match': {'in1': i, 'in2': {'$gt': 1000 - i}}}],
- keys_length_in_bytes=1))
-
+async def execute_index_intersections_with_requests(database: DatabaseInstance,
+ collections: Sequence[CollectionInfo],
+ requests: Sequence[Query]):
try:
await database.set_parameter('internalCostModelCoefficients',
'{"filterIncrementalCost": 10000.0}')
@@ -118,6 +101,29 @@ async def execute_index_intersections(database: DatabaseInstance,
await database.set_parameter('internalCostModelCoefficients', '')
+async def execute_index_intersections(database: DatabaseInstance,
+ collections: Sequence[CollectionInfo]):
+ collections = [ci for ci in collections if ci.name.startswith('c_int')]
+
+ requests = []
+
+ for i in range(0, 1000, 100):
+ requests.append(Query(pipeline=[{'$match': {'in1': i, 'in2': i}}], keys_length_in_bytes=1))
+
+ requests.append(
+ Query(pipeline=[{'$match': {'in1': i, 'in2': 1000 - i}}], keys_length_in_bytes=1))
+
+ requests.append(
+ Query(pipeline=[{'$match': {'in1': {'$lte': i}, 'in2': 1000 - i}}],
+ keys_length_in_bytes=1))
+
+ requests.append(
+ Query(pipeline=[{'$match': {'in1': i, 'in2': {'$gt': 1000 - i}}}],
+ keys_length_in_bytes=1))
+
+ await execute_index_intersections_with_requests(database, collections, requests)
+
+
async def execute_evaluation(database: DatabaseInstance, collections: Sequence[CollectionInfo]):
collections = [ci for ci in collections if ci.name.startswith('c_int')]
requests = []
@@ -145,6 +151,62 @@ async def execute_unwind(database: DatabaseInstance, collections: Sequence[Colle
requests)
+async def execute_small_queries(database: DatabaseInstance, collections: Sequence[CollectionInfo]):
+ # strings
+ requests = []
+ for val in distributions['string_choice_small'].get_values():
+ keys_length = len(val) + 2
+ for i in range(1, 3):
+ requests.append(
+ Query(pipeline=[{'$match': {f'choice{i}': val}}], keys_length_in_bytes=keys_length))
+
+ await workload_execution.execute(database, main_config.workload_execution,
+ [ci for ci in collections if ci.name.startswith('c_str_02')],
+ requests)
+
+ # index intersection
+ colls = [ci for ci in collections if ci.name.startswith('c_int_05')]
+ requests = []
+
+ for val in distributions['int_choice'].get_values():
+ for val2 in distributions['int_choice'].get_values():
+ requests.append(
+ Query(pipeline=[{'$match': {'in1': val, 'in2': val2}}], keys_length_in_bytes=1))
+
+ requests.append(
+ Query(pipeline=[{'$match': {'in1': val, 'in2': {'$gt': 500}}}], keys_length_in_bytes=1))
+
+ requests.append(
+ Query(pipeline=[{'$match': {'in1': {'$lte': 500}, 'in2': val}}],
+ keys_length_in_bytes=1))
+
+ await execute_index_intersections_with_requests(database, colls, requests)
+
+ # Evaluation
+ colls = [ci for ci in collections if ci.name.startswith('c_int_05')]
+ requests = []
+
+ for val in distributions['int_choice'].get_values():
+ requests.append(
+ Query(pipeline=[{"$match": {'in1': val}}, {'$project': {'proj1': 1}}],
+ keys_length_in_bytes=1, number_of_fields=1))
+
+ await workload_execution.execute(database, main_config.workload_execution, colls, requests)
+
+ # Unwind
+ colls = [ci for ci in collections if ci.name.startswith('c_arr_01')]
+ requests = []
+ # average size of arrays in the collection
+ average_size_of_arrays = 10
+
+ for val in distributions['int_choice'].get_values():
+ requests.append(
+ Query(pipeline=[{"$match": {'in1': val}}, {"$unwind": "$as"}],
+ number_of_fields=average_size_of_arrays))
+
+ await workload_execution.execute(database, main_config.workload_execution, colls, requests)
+
+
async def main():
"""Entry point function."""
script_directory = os.path.abspath(os.path.dirname(__file__))
@@ -161,6 +223,9 @@ async def main():
# 3. Collecting data for calibration (optional).
# It runs the pipelines and stores explains to the database.
+ # Run this execute function only to collect calibration data in the "smaller" experiment.
+ # await execute_small_queries(database, generator.collection_infos);
+
await execute_general(database, generator.collection_infos)
main_config.workload_execution.write_mode = WriteMode.APPEND