summaryrefslogtreecommitdiff
path: root/buildscripts
diff options
context:
space:
mode:
authorRuoxin Xu <ruoxin.xu@mongodb.com>2022-11-08 20:06:19 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-11-08 21:09:45 +0000
commit32545f21f5721a820acebc0dea64e3b4e4d9d38b (patch)
tree4a2f2873f4a4ac83400df66bc19267b52d010d09 /buildscripts
parent573609e856f0e9cb13f3ee70ab4d008315f96714 (diff)
downloadmongo-32545f21f5721a820acebc0dea64e3b4e4d9d38b.tar.gz
SERVER-70500 Calibrate ABT nodes on smaller queries
Diffstat (limited to 'buildscripts')
-rw-r--r--buildscripts/cost_model/README.md20
-rw-r--r--buildscripts/cost_model/abt_calibrator.py2
-rw-r--r--buildscripts/cost_model/calibration_settings.py95
-rw-r--r--buildscripts/cost_model/config.py2
-rw-r--r--buildscripts/cost_model/data_generator.py9
-rw-r--r--buildscripts/cost_model/database_instance.py3
-rw-r--r--buildscripts/cost_model/experiment.py2
-rw-r--r--buildscripts/cost_model/random_generator.py2
-rw-r--r--buildscripts/cost_model/start.py105
9 files changed, 209 insertions, 31 deletions
diff --git a/buildscripts/cost_model/README.md b/buildscripts/cost_model/README.md
new file mode 100644
index 00000000000..398b16373a9
--- /dev/null
+++ b/buildscripts/cost_model/README.md
@@ -0,0 +1,20 @@
+# The smaller experiment - Calibration with queries that return a smaller number of documents. (smaller "n_processed")
+
+This "smaller" experiment differs from the previous experiments, which query collections across a sequence of cardinalities. For calibrating each type of ABT node, we query only one collection with a sufficiently large cardinality (e.g. 100k), within which a small set of documents is targeted for collecting calibration data.
+
+There are some differences to conduct this experiment from conducting previous experiments.
+
+## Data generation phase
+We need to populate the collections twice: 1) first populate the small set of documents that the queries will target, and then 2) populate a larger set of documents that will not be queried during the calibration of most ABT node types. The larger set ensures the collections are big enough for the queries.
+
+There are more detailed instructions in the "data_generator" settings in "calibration_settings.py".
+
+## Calibration data collection phase
+In this experiment, we will run different queries; as mentioned above, these queries will mostly target the smaller set of documents in order to obtain a smaller value of "n_processed".
+
+More specifically, to collect data for calibration we only need to run "execute_small_queries()" in "start.py" and make sure to avoid running any other queries outside this function.
+
+## Calibration phase
+This phase should be the same as calibration in other experiments.
+
+Please note that this type of experiment may not work with certain ABT nodes. For example, for the PhysicalScan node, all the data collected may come with the same "n_processed" - the collection cardinality - which does not make any sense for calibration.
diff --git a/buildscripts/cost_model/abt_calibrator.py b/buildscripts/cost_model/abt_calibrator.py
index 520bb024ade..89d7c812b59 100644
--- a/buildscripts/cost_model/abt_calibrator.py
+++ b/buildscripts/cost_model/abt_calibrator.py
@@ -29,8 +29,8 @@
from __future__ import annotations
import pandas as pd
-import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
+import statsmodels.api as sm
from config import AbtCalibratorConfig, AbtNodeCalibrationConfig
from database_instance import DatabaseInstance
from cost_estimator import estimate
diff --git a/buildscripts/cost_model/calibration_settings.py b/buildscripts/cost_model/calibration_settings.py
index d7fccdd8386..ee814d8d337 100644
--- a/buildscripts/cost_model/calibration_settings.py
+++ b/buildscripts/cost_model/calibration_settings.py
@@ -27,6 +27,7 @@
#
"""Calibration configuration."""
+import random
import config
from random_generator import RangeGenerator, DataType, RandomDistribution, ArrayRandomDistribution
@@ -55,6 +56,33 @@ string_choice_weights = [10, 20, 5, 17, 30, 7, 9, 15, 40, 2, 12, 1]
distributions['string_choice'] = RandomDistribution.choice(string_choice_values,
string_choice_weights)
+small_query_weights = [i for i in range(10, 201, 10)]
+small_query_cardinality = sum(small_query_weights)
+
+int_choice_values = [i for i in range(1, 1000, 50)]
+random.shuffle(int_choice_values)
+distributions['int_choice'] = RandomDistribution.choice(int_choice_values, small_query_weights)
+
+distributions['random_string'] = ArrayRandomDistribution(
+ RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 5, 10, 2)),
+ RandomDistribution.uniform(RangeGenerator(DataType.STRING, "a", "z")))
+
+
+def generate_random_str(num: int):
+ strs = distributions['random_string'].generate(num)
+ str_list = []
+ for char_array in strs:
+ str_res = "".join(char_array)
+ str_list.append(str_res)
+
+ return str_list
+
+
+small_string_choice = generate_random_str(20)
+
+distributions['string_choice_small'] = RandomDistribution.choice(small_string_choice,
+ small_query_weights)
+
string_range_4 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "abca", "abc_"))
string_range_5 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "abcda", "abcd_"))
string_range_7 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "hello_a", "hello__"))
@@ -75,16 +103,56 @@ distributions['array_small'] = ArrayRandomDistribution(lengths_distr, distributi
# Database settings
database = config.DatabaseConfig(connection_string='mongodb://localhost',
- database_name='abt_calibration', dump_path='~/data/dump',
+ database_name='abt_calibration_small', dump_path='~/data/dump',
restore_from_dump=config.RestoreMode.NEVER, dump_on_exit=False)
# Collection template settings
+
+# templates for small queries.
+c_int_05_small = config.CollectionTemplate(
+ name="c_int_05", fields=[
+ config.FieldTemplate(name="in1", data_type=config.DataType.INTEGER,
+ distribution=distributions["int_choice"], indexed=True),
+ config.FieldTemplate(name="mixed1", data_type=config.DataType.STRING,
+ distribution=distributions["string_mixed"], indexed=False),
+ config.FieldTemplate(name="uniform1", data_type=config.DataType.STRING,
+ distribution=distributions["string_uniform"], indexed=False),
+ config.FieldTemplate(name="in2", data_type=config.DataType.INTEGER,
+ distribution=distributions["int_choice"], indexed=True),
+ config.FieldTemplate(name="mixed2", data_type=config.DataType.STRING,
+ distribution=distributions["string_mixed"], indexed=False),
+ ], compound_indexes=[])
+
+c_str_02_small = config.CollectionTemplate(
+ name="c_str_02", fields=[
+ config.FieldTemplate(name="choice1", data_type=config.DataType.STRING,
+ distribution=distributions['string_choice_small'], indexed=True),
+ config.FieldTemplate(name="choice2", data_type=config.DataType.STRING,
+ distribution=distributions['string_choice_small'], indexed=False)
+ ], compound_indexes=[])
+
+c_arr_01_small = config.CollectionTemplate(
+ name="c_arr_01", fields=[
+ config.FieldTemplate(name="as", data_type=config.DataType.INTEGER,
+ distribution=distributions["array_small"], indexed=False),
+ config.FieldTemplate(name="in1", data_type=config.DataType.INTEGER,
+ distribution=distributions["int_choice"], indexed=True),
+ ], compound_indexes=[])
+
c_str_01 = config.CollectionTemplate(
name="c_str_01", fields=[
config.FieldTemplate(name="choice1", data_type=config.DataType.STRING,
distribution=distributions['string_choice'], indexed=True)
], compound_indexes=[])
+c_str_02 = config.CollectionTemplate(
+ name="c_str_02", fields=[
+ config.FieldTemplate(name="choice1", data_type=config.DataType.STRING,
+ distribution=distributions['string_choice'], indexed=True),
+ config.FieldTemplate(name="choice2", data_type=config.DataType.STRING,
+ distribution=distributions['string_choice'], indexed=False)
+ ], compound_indexes=[])
+
c_str_05 = config.CollectionTemplate(
name="c_str_05", fields=[
config.FieldTemplate(name="choice1", data_type=config.DataType.STRING,
@@ -121,12 +189,31 @@ c_arr_01 = config.CollectionTemplate(
# Data Generator settings
data_generator = config.DataGeneratorConfig(
- enabled=True, collection_cardinalities=list(range(10000, 50001, 2500)), batch_size=10000,
- collection_templates=[c_str_01, c_str_05, c_int_05, c_arr_01])
+ enabled=True, collection_cardinalities=list(range(10000, 50001,
+ 2500)), collection_name_with_card=True,
+ batch_size=10000, collection_templates=[c_str_01, c_str_05, c_int_05,
+ c_arr_01], write_mode=config.WriteMode.REPLACE)
+"""Data generation settings for Calibration with queries that return smaller number of documents.
+# First round of data population generates a small set of documents that will be queried.
+
+data_generator = config.DataGeneratorConfig(
+ enabled=True, collection_cardinalities=[small_query_cardinality], collection_name_with_card=False, batch_size=10000,
+ collection_templates=[c_str_02_small, c_int_05_small, c_arr_01_small], write_mode=config.WriteMode.APPEND)
+
+# Second round of data population which generates 100,000 documents to the collection in order to
+# make sure the collection is big enough for queries.
+
+data_generator = config.DataGeneratorConfig(
+ enabled=True, collection_cardinalities=[100000], collection_name_with_card=False, batch_size=10000,
+ collection_templates=[c_str_02, c_int_05, c_arr_01], write_mode=config.WriteMode.APPEND)
+
+# Please note that the 'WriteMode' should be 'APPEND' type and the collection name should remain
+# unchanged. This can be controlled by setting "collection_name_with_card=False".
+"""
# Workload Execution settings
workload_execution = config.WorkloadExecutionConfig(
- enabled=True, output_collection_name='calibrationData', write_mode=config.WriteMode.REPLACE,
+ enabled=True, output_collection_name='calibrationDataSmall', write_mode=config.WriteMode.APPEND,
warmup_runs=3, runs=10)
abt_nodes = [
diff --git a/buildscripts/cost_model/config.py b/buildscripts/cost_model/config.py
index ffaf48d5e4a..8080a4eaba5 100644
--- a/buildscripts/cost_model/config.py
+++ b/buildscripts/cost_model/config.py
@@ -74,7 +74,9 @@ class DataGeneratorConfig:
enabled: bool
collection_cardinalities: list[int]
+ collection_name_with_card: bool
collection_templates: list[CollectionTemplate]
+ write_mode: WriteMode
batch_size: int
diff --git a/buildscripts/cost_model/data_generator.py b/buildscripts/cost_model/data_generator.py
index 4899a728087..e10847a7546 100644
--- a/buildscripts/cost_model/data_generator.py
+++ b/buildscripts/cost_model/data_generator.py
@@ -36,7 +36,7 @@ import pymongo
from pymongo import IndexModel
from motor.motor_asyncio import AsyncIOMotorCollection
from random_generator import RandomDistribution
-from config import DataGeneratorConfig, DataType
+from config import DataGeneratorConfig, DataType, WriteMode
from database_instance import DatabaseInstance
__all__ = ['DataGenerator']
@@ -92,7 +92,8 @@ class DataGenerator:
tasks = []
for coll_info in self.collection_infos:
coll = self.database.database[coll_info.name]
- await coll.drop()
+ if self.config.write_mode == WriteMode.REPLACE:
+ await coll.drop()
tasks.append(asyncio.create_task(self._populate_collection(coll, coll_info)))
tasks.append(asyncio.create_task(create_single_field_indexes(coll, coll_info.fields)))
tasks.append(asyncio.create_task(create_compound_indexes(coll, coll_info)))
@@ -110,7 +111,9 @@ class DataGenerator:
indexed=ft.indexed) for ft in coll_template.fields
]
for doc_count in self.config.collection_cardinalities:
- name = f'{coll_template.name}_{doc_count}'
+ name = f'{coll_template.name}'
+ if self.config.collection_name_with_card is True:
+ name = f'{coll_template.name}_{doc_count}'
yield CollectionInfo(name=name, fields=fields, documents_count=doc_count,
compound_indexes=coll_template.compound_indexes)
diff --git a/buildscripts/cost_model/database_instance.py b/buildscripts/cost_model/database_instance.py
index 518bb1c776e..58df9f382dd 100644
--- a/buildscripts/cost_model/database_instance.py
+++ b/buildscripts/cost_model/database_instance.py
@@ -134,7 +134,8 @@ class DatabaseInstance:
async def insert_many(self, collection_name: str, docs: Sequence[Mapping[str, any]]) -> None:
"""Insert documents into the collection with the given name."""
- await self.database[collection_name].insert_many(docs, ordered=False)
+ if len(docs) > 0:
+ await self.database[collection_name].insert_many(docs, ordered=False)
async def get_all_documents(self, collection_name: str):
"""Get all documents from the collection with the given name."""
diff --git a/buildscripts/cost_model/experiment.py b/buildscripts/cost_model/experiment.py
index 86d3e2c32de..2b2699db401 100644
--- a/buildscripts/cost_model/experiment.py
+++ b/buildscripts/cost_model/experiment.py
@@ -98,9 +98,9 @@ import dataclasses
import bson.json_util as json
import seaborn as sns
import pandas as pd
-import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
+import statsmodels.api as sm
from database_instance import DatabaseInstance
import execution_tree as sbe
import physical_tree as abt
diff --git a/buildscripts/cost_model/random_generator.py b/buildscripts/cost_model/random_generator.py
index c4bc2000b0c..1dcc2bb004c 100644
--- a/buildscripts/cost_model/random_generator.py
+++ b/buildscripts/cost_model/random_generator.py
@@ -230,7 +230,7 @@ class RandomDistribution:
def _choice(size: int, values: Sequence[TVar], probs: Sequence[float]):
if probs is None:
raise ValueError("props must be specified for choice distribution")
- return list(_rng.choice(a=values, size=size, p=probs))
+ return [val.item() for val in _rng.choice(a=values, size=size, p=probs)]
@staticmethod
def _normal(size: int, values: Sequence[TVar], _: Sequence[float]):
diff --git a/buildscripts/cost_model/start.py b/buildscripts/cost_model/start.py
index 7331405191c..f95e010ad7f 100644
--- a/buildscripts/cost_model/start.py
+++ b/buildscripts/cost_model/start.py
@@ -76,26 +76,9 @@ async def execute_general(database: DatabaseInstance, collections: Sequence[Coll
requests)
-async def execute_index_intersections(database: DatabaseInstance,
- collections: Sequence[CollectionInfo]):
- collections = [ci for ci in collections if ci.name.startswith('c_int')]
-
- requests = []
-
- for i in range(0, 1000, 100):
- requests.append(Query(pipeline=[{'$match': {'in1': i, 'in2': i}}], keys_length_in_bytes=1))
-
- requests.append(
- Query(pipeline=[{'$match': {'in1': i, 'in2': 1000 - i}}], keys_length_in_bytes=1))
-
- requests.append(
- Query(pipeline=[{'$match': {'in1': {'$lte': i}, 'in2': 1000 - i}}],
- keys_length_in_bytes=1))
-
- requests.append(
- Query(pipeline=[{'$match': {'in1': i, 'in2': {'$gt': 1000 - i}}}],
- keys_length_in_bytes=1))
-
+async def execute_index_intersections_with_requests(database: DatabaseInstance,
+ collections: Sequence[CollectionInfo],
+ requests: Sequence[Query]):
try:
await database.set_parameter('internalCostModelCoefficients',
'{"filterIncrementalCost": 10000.0}')
@@ -118,6 +101,29 @@ async def execute_index_intersections(database: DatabaseInstance,
await database.set_parameter('internalCostModelCoefficients', '')
+async def execute_index_intersections(database: DatabaseInstance,
+ collections: Sequence[CollectionInfo]):
+ collections = [ci for ci in collections if ci.name.startswith('c_int')]
+
+ requests = []
+
+ for i in range(0, 1000, 100):
+ requests.append(Query(pipeline=[{'$match': {'in1': i, 'in2': i}}], keys_length_in_bytes=1))
+
+ requests.append(
+ Query(pipeline=[{'$match': {'in1': i, 'in2': 1000 - i}}], keys_length_in_bytes=1))
+
+ requests.append(
+ Query(pipeline=[{'$match': {'in1': {'$lte': i}, 'in2': 1000 - i}}],
+ keys_length_in_bytes=1))
+
+ requests.append(
+ Query(pipeline=[{'$match': {'in1': i, 'in2': {'$gt': 1000 - i}}}],
+ keys_length_in_bytes=1))
+
+ await execute_index_intersections_with_requests(database, collections, requests)
+
+
async def execute_evaluation(database: DatabaseInstance, collections: Sequence[CollectionInfo]):
collections = [ci for ci in collections if ci.name.startswith('c_int')]
requests = []
@@ -145,6 +151,62 @@ async def execute_unwind(database: DatabaseInstance, collections: Sequence[Colle
requests)
+async def execute_small_queries(database: DatabaseInstance, collections: Sequence[CollectionInfo]):
+ # strings
+ requests = []
+ for val in distributions['string_choice_small'].get_values():
+ keys_length = len(val) + 2
+ for i in range(1, 3):
+ requests.append(
+ Query(pipeline=[{'$match': {f'choice{i}': val}}], keys_length_in_bytes=keys_length))
+
+ await workload_execution.execute(database, main_config.workload_execution,
+ [ci for ci in collections if ci.name.startswith('c_str_02')],
+ requests)
+
+ # index intersection
+ colls = [ci for ci in collections if ci.name.startswith('c_int_05')]
+ requests = []
+
+ for val in distributions['int_choice'].get_values():
+ for val2 in distributions['int_choice'].get_values():
+ requests.append(
+ Query(pipeline=[{'$match': {'in1': val, 'in2': val2}}], keys_length_in_bytes=1))
+
+ requests.append(
+ Query(pipeline=[{'$match': {'in1': val, 'in2': {'$gt': 500}}}], keys_length_in_bytes=1))
+
+ requests.append(
+ Query(pipeline=[{'$match': {'in1': {'$lte': 500}, 'in2': val}}],
+ keys_length_in_bytes=1))
+
+ await execute_index_intersections_with_requests(database, colls, requests)
+
+ # Evaluation
+ colls = [ci for ci in collections if ci.name.startswith('c_int_05')]
+ requests = []
+
+ for val in distributions['int_choice'].get_values():
+ requests.append(
+ Query(pipeline=[{"$match": {'in1': val}}, {'$project': {'proj1': 1}}],
+ keys_length_in_bytes=1, number_of_fields=1))
+
+ await workload_execution.execute(database, main_config.workload_execution, colls, requests)
+
+ # Unwind
+ colls = [ci for ci in collections if ci.name.startswith('c_arr_01')]
+ requests = []
+ # average size of arrays in the collection
+ average_size_of_arrays = 10
+
+ for val in distributions['int_choice'].get_values():
+ requests.append(
+ Query(pipeline=[{"$match": {'in1': val}}, {"$unwind": "$as"}],
+ number_of_fields=average_size_of_arrays))
+
+ await workload_execution.execute(database, main_config.workload_execution, colls, requests)
+
+
async def main():
"""Entry point function."""
script_directory = os.path.abspath(os.path.dirname(__file__))
@@ -161,6 +223,9 @@ async def main():
# 3. Collecting data for calibration (optional).
# It runs the pipelines and stores explains to the database.
+ # Run this execute function only to collect calibration data in the "smaller" experiment.
+ # await execute_small_queries(database, generator.collection_infos);
+
await execute_general(database, generator.collection_infos)
main_config.workload_execution.write_mode = WriteMode.APPEND