author     Alexander Ignatyev <alexander.ignatyev@mongodb.com>   2022-09-01 08:57:24 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>      2022-09-01 09:50:08 +0000
commit     5f11658224585f0399d57afa2fd8886165f854bf (patch)
tree       9979d9e64143d59abd94cfa40adc0177f06ac7f7
parent     bd8a8d4d880577302c777ff961f359b03435126a (diff)
SERVER-69031 Move JSON configuration to python file
-rw-r--r--  buildscripts/cost_model/calibration_settings.py (renamed from buildscripts/cost_model/random_generator_config.py)  55
-rw-r--r--  buildscripts/cost_model/config.json  143
-rw-r--r--  buildscripts/cost_model/config.py  160
-rw-r--r--  buildscripts/cost_model/data_generator.py  8
-rw-r--r--  buildscripts/cost_model/start.py  19
5 files changed, 60 insertions, 325 deletions
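Note on usage: after this commit the calibration configuration is defined in calibration_settings.py as plain Python objects built from the dataclasses in config.py, and start.py imports that module instead of parsing config.json. A minimal sketch of the resulting flow, assuming the module layout shown in the diff below:

    # Sketch of the post-change configuration flow (mirrors the start.py hunk below).
    import asyncio
    from calibration_settings import main_config
    from database_instance import DatabaseInstance
    from data_generator import DataGenerator

    async def run():
        # main_config is an ordinary config.Config instance defined in calibration_settings.py.
        with DatabaseInstance(main_config.database) as database:
            generator = DataGenerator(database, main_config.data_generator)
            await generator.populate_collections()

    asyncio.run(run())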
diff --git a/buildscripts/cost_model/random_generator_config.py b/buildscripts/cost_model/calibration_settings.py
index d437758ac8a..1fe7684089a 100644
--- a/buildscripts/cost_model/random_generator_config.py
+++ b/buildscripts/cost_model/calibration_settings.py
@@ -25,17 +25,14 @@
# exception statement from all source files in the program, then also delete
# it in the license file.
#
-"""
-Configuration of distributions used to generate collections from templates.
+"""Calibration configuration."""
-They used in collection templates defined in json.config.
-"""
-
-from importlib.metadata import distributions
+import config
from random_generator import RangeGenerator, DataType, RandomDistribution
-__ALL__ = ['distributions']
+__all__ = ['main_config', 'distributions']
+# Data distributions settings.
distributions = {}
string_choice_values = [
@@ -69,3 +66,47 @@ distributions['string_mixed'] = RandomDistribution.mixed(
distributions['string_uniform'] = RandomDistribution.uniform(
RangeGenerator(DataType.STRING, "helloworldaa", "helloworldd_"))
+
+# Database settings
+database = config.DatabaseConfig(connection_string='mongodb://localhost',
+ database_name='abt_calibration', dump_path='~/data/dump',
+ restore_from_dump=config.RestoreMode.NEVER, dump_on_exit=False)
+
+# Collection template settings
+c_str_01 = config.CollectionTemplate(
+ name="c_str_01", fields=[
+ config.FieldTemplate(name="choice1", data_type=config.DataType.STRING,
+ distribution=distributions['string_choice'], indexed=True)
+ ], compound_indexes=[])
+
+c_str_05 = config.CollectionTemplate(
+ name="c_str_05", fields=[
+ config.FieldTemplate(name="choice1", data_type=config.DataType.STRING,
+ distribution=distributions["string_choice"], indexed=True),
+ config.FieldTemplate(name="mixed1", data_type=config.DataType.STRING,
+ distribution=distributions["string_mixed"], indexed=True),
+ config.FieldTemplate(name="uniform1", data_type=config.DataType.STRING,
+ distribution=distributions["string_uniform"], indexed=True),
+ config.FieldTemplate(name="choice2", data_type=config.DataType.STRING,
+ distribution=distributions["string_choice"], indexed=True),
+ config.FieldTemplate(name="mixed2", data_type=config.DataType.STRING,
+ distribution=distributions["string_mixed"], indexed=True),
+ ], compound_indexes=[["choice1", "mixed1"]])
+
+# Data Generator settings
+data_generator = config.DataGeneratorConfig(enabled=True, collection_cardinalities=[100, 200, 500],
+ batch_size=10000,
+ collection_templates=[c_str_01, c_str_05])
+
+# Workload Execution settings
+workload_execution = config.WorkloadExecutionConfig(
+ enabled=True, output_collection_name='calibrationData', write_mode=config.WriteMode.REPLACE,
+ warmup_runs=1, runs=5)
+
+# Calibrator settings
+abt_calibrator = config.AbtCalibratorConfig(
+ enabled=True, test_size=0.2, input_collection_name=workload_execution.output_collection_name,
+ trace=False)
+
+main_config = config.Config(database=database, data_generator=data_generator,
+ abt_calibrator=abt_calibrator, workload_execution=workload_execution)
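Collection templates are now ordinary Python objects, so adding one means constructing the dataclasses from config.py rather than editing JSON. A hypothetical sketch in the style of the settings above; the template name c_int_01 and the 'int_uniform' distribution key are illustrative only and not part of this change:

    # Hypothetical extra template; an 'int_uniform' entry would first have to be
    # added to the distributions dict above (e.g. via RangeGenerator/RandomDistribution).
    c_int_01 = config.CollectionTemplate(
        name="c_int_01", fields=[
            config.FieldTemplate(name="uniform1", data_type=config.DataType.INTEGER,
                                 distribution=distributions["int_uniform"], indexed=False)
        ], compound_indexes=[])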
diff --git a/buildscripts/cost_model/config.json b/buildscripts/cost_model/config.json
deleted file mode 100644
index c4e3c51f49e..00000000000
--- a/buildscripts/cost_model/config.json
+++ /dev/null
@@ -1,143 +0,0 @@
-{
- "database": {
- "connectionString": "mongodb://localhost",
- "databaseName": "abt_calibration",
- "dumpPath": "~/data/dump",
- "restoreFromDump": "never",
- "dumpOnExit": false
- },
- "dataGenerator": {
- "enabled": true,
- "collectionCardinalities": [
- 100,
- 200,
- 500
- ],
- "batchSize": 10000,
- "collectionTemplates": [
- {
- "name": "c_str",
- "fields": [
- {
- "name": "choice1",
- "type": "str",
- "distribution": "string_choice"
- }
- ]
- },
- {
- "name": "c_str_5",
- "fields": [
- {
- "name": "choice1",
- "type": "str",
- "distribution": "string_choice",
- "indexed": true
- },
- {
- "name": "mixed1",
- "type": "str",
- "distribution": "string_mixed",
- "indexed": true
- },
- {
- "name": "uniform1",
- "type": "str",
- "distribution": "string_uniform"
- },
- {
- "name": "choice2",
- "type": "str",
- "distribution": "string_choice"
- },
- {
- "name": "mixed2",
- "type": "str",
- "distribution": "string_mixed"
- }
- ],
- "compoundIndexes": [
- [
- "choice1",
- "mixed1"
- ],
- [
- "uniform1",
- "mixed2"
- ]
- ]
- },
- {
- "name": "c_str_10",
- "fields": [
- {
- "name": "choice1",
- "type": "str",
- "distribution": "string_choice",
- "indexed": true
- },
- {
- "name": "mixed1",
- "type": "str",
- "distribution": "string_mixed"
- },
- {
- "name": "uniform1",
- "type": "str",
- "distribution": "string_uniform"
- },
- {
- "name": "choice2",
- "type": "str",
- "distribution": "string_choice",
- "indexed": true
- },
- {
- "name": "mixed2",
- "type": "str",
- "distribution": "string_mixed"
- },
- {
- "name": "uniform2",
- "type": "str",
- "distribution": "string_uniform"
- },
- {
- "name": "choice3",
- "type": "str",
- "distribution": "string_choice",
- "indexed": true
- },
- {
- "name": "mixed3",
- "type": "str",
- "distribution": "string_mixed"
- },
- {
- "name": "uniform3",
- "type": "str",
- "distribution": "string_uniform"
- },
- {
- "name": "choice4",
- "type": "str",
- "distribution": "string_choice",
- "indexed": true
- }
- ]
- }
- ]
- },
- "abtCalibrator": {
- "enabled": true,
- "test_size": 0.2,
- "inputCollectionName": "calibrationData"
- },
- "workloadExecution": {
- "enabled": true,
- "outputCollectionName": "calibrationData",
- "writeMode": "replace",
- "warmupRuns": 1,
- "runs": 5
- }
-}
diff --git a/buildscripts/cost_model/config.py b/buildscripts/cost_model/config.py
index b56d38d233e..883ab025875 100644
--- a/buildscripts/cost_model/config.py
+++ b/buildscripts/cost_model/config.py
@@ -30,8 +30,8 @@
from __future__ import annotations
from dataclasses import dataclass
from enum import Enum
-import os
-from typing import Mapping, Sequence
+from typing import Sequence
+from random_generator import RandomDistribution
@dataclass
@@ -43,18 +43,6 @@ class Config:
abt_calibrator: AbtCalibratorConfig
workload_execution: WorkloadExecutionConfig
- @staticmethod
- def create(json_config: dict[str, any]) -> Config:
- """Create new configuration object from JSON."""
-
- database = DatabaseConfig.create(json_config.get('database'))
- data_generator = DataGeneratorConfig.create(json_config.get('dataGenerator'))
- abt_calibrator = AbtCalibratorConfig.create(json_config.get('abtCalibrator'))
- workload_execution = WorkloadExecutionConfig.create(json_config.get("workloadExecution"))
-
- return Config(database=database, data_generator=data_generator,
- abt_calibrator=abt_calibrator, workload_execution=workload_execution)
-
@dataclass
class DatabaseConfig:
@@ -66,35 +54,6 @@ class DatabaseConfig:
restore_from_dump: RestoreMode
dump_on_exit: bool
- @staticmethod
- def create(json_config: dict[str, any]) -> DatabaseConfig:
- """Create new configuration object from JSON."""
-
- default = DatabaseConfig(connection_string='mongodb://localhost', database_name='test',
- dump_path='/data/dump', restore_from_dump=RestoreMode.NEVER,
- dump_on_exit=False)
- if json_config is None:
- return default
-
- connection_string = json_config.get('connectionString', default.connection_string)
- database_name = json_config.get('databaseName', default.database_name)
- dump_path = process_path(json_config.get('dumpPath', default.dump_path))
-
- restore_from_dump_str = json_config.get('restoreFromDump', str(default.dump_path)).lower()
- if restore_from_dump_str == "never":
- restore_from_dump = RestoreMode.NEVER
- elif restore_from_dump_str == "onlynew":
- restore_from_dump = RestoreMode.ONLY_NEW
- elif restore_from_dump_str == "always":
- restore_from_dump = RestoreMode.ALWAYS
- else:
- raise ValueError("restoreFromDump must be equal to 'never', 'onlyNew', or 'aways'")
-
- dump_on_exit = json_config.get('dumpOnExit', default.dump_path)
- return DatabaseConfig(connection_string=connection_string, database_name=database_name,
- dump_path=dump_path, restore_from_dump=restore_from_dump,
- dump_on_exit=dump_on_exit)
-
class RestoreMode(Enum):
"""Restore database from dump mode."""
@@ -115,35 +74,9 @@ class DataGeneratorConfig:
enabled: bool
collection_cardinalities: list[int]
- data_types: list[DataType]
collection_templates: list[CollectionTemplate]
batch_size: int
- @staticmethod
- def create(json_config: dict[str, any]) -> DataGeneratorConfig:
- """Create new configuration object from JSON."""
-
- default = DataGeneratorConfig(enabled=False, collection_cardinalities=[], data_types=[],
- collection_templates=[], batch_size=10000)
- if json_config is None:
- return default
-
- enabled = json_config.get('enabled', default.enabled)
- collection_cardinalities = json_config.get('collectionCardinalities',
- default.collection_cardinalities)
- data_types_str = json_config.get('dataTypes', default.data_types)
- data_types = [DataType.parse(dt, 'dataTypes') for dt in data_types_str]
-
- collection_templates = [
- CollectionTemplate.create(jc)
- for jc in json_config.get("collectionTemplates", default.collection_templates)
- ]
-
- batch_size = json_config.get('batchSize', default.batch_size)
- return DataGeneratorConfig(
- enabled=enabled, collection_cardinalities=collection_cardinalities,
- data_types=data_types, collection_templates=collection_templates, batch_size=batch_size)
-
@dataclass
class CollectionTemplate:
@@ -153,14 +86,6 @@ class CollectionTemplate:
fields: Sequence[FieldTemplate]
compound_indexes: Sequence[Sequence[str]]
- @staticmethod
- def create(json_config: dict[str, any]) -> CollectionTemplate:
- """Create new template object from JSON."""
- name = json_config['name']
- fields = [FieldTemplate.create(jc) for jc in json_config['fields']]
- compound_indexes = json_config.get('compoundIndexes', [])
- return CollectionTemplate(name=name, fields=fields, compound_indexes=compound_indexes)
-
@dataclass
class FieldTemplate:
@@ -168,19 +93,9 @@ class FieldTemplate:
name: str
data_type: DataType
- distribution: str
+ distribution: RandomDistribution
indexed: bool
- @staticmethod
- def create(json_config: dict[str, any]) -> FieldTemplate:
- """Create new template object from JSON."""
- name = json_config['name']
- data_type = DataType.parse(json_config['type'], 'type')
- distribution = json_config['distribution']
- indexed = json_config.get('indexed', False)
- return FieldTemplate(name=name, data_type=data_type, distribution=distribution,
- indexed=indexed)
-
class DataType(Enum):
"""Data types."""
@@ -192,13 +107,6 @@ class DataType(Enum):
def __str__(self):
return self.name.lower()[:3]
- @staticmethod
- def parse(type_str: str, field_name: str) -> DataType:
- """Parse DataType."""
- str_to_type = {'int': DataType.INTEGER, 'str': DataType.STRING, 'arr': DataType.ARRAY}
-
- return parse_multi_value(str_to_type, type_str, field_name)
-
@dataclass
class AbtCalibratorConfig:
@@ -210,26 +118,6 @@ class AbtCalibratorConfig:
input_collection_name: str
trace: bool
- @staticmethod
- def create(json_config: dict[str, any]) -> AbtCalibratorConfig:
- """Create new configuration object from JSON."""
-
- default = AbtCalibratorConfig(enabled=False, test_size=0.2,
- input_collection_name='explains', trace=False)
- if json_config is None:
- return default
-
- enabled = json_config.get('enabled', default.enabled)
- test_size = json_config.get("testSize", default.test_size)
- if test_size <= 0.0 or test_size >= 1.0:
- raise ValueError('testSize must be greater than 0 and less than 1')
- input_collection_name = json_config.get('inputCollectionName',
- default.input_collection_name)
- trace = json_config.get('trace', default.trace)
-
- return AbtCalibratorConfig(enabled=enabled, test_size=test_size,
- input_collection_name=input_collection_name, trace=trace)
-
class WriteMode(Enum):
"""Write mode enum."""
@@ -247,45 +135,3 @@ class WorkloadExecutionConfig:
write_mode: WriteMode
warmup_runs: int
runs: int
-
- @staticmethod
- def create(json_config: dict[str, any] | None) -> WorkloadExecutionConfig:
- """Create new configuration object from JSON."""
-
- default = WorkloadExecutionConfig(enabled=False, output_collection_name='explains',
- write_mode=WriteMode.APPEND, warmup_runs=1, runs=1)
- if json_config is None:
- return default
-
- enabled = json_config.get('enabled', default.enabled)
- output_collection_name = json_config.get('outputCollectionName',
- default.output_collection_name)
- write_mode_str = json_config.get('writeMode', 'append').lower()
- if write_mode_str == 'append':
- write_mode = WriteMode.APPEND
- elif write_mode_str == 'replace':
- write_mode = WriteMode.REPLACE
- else:
- raise ValueError("writeMode must be equal to 'append' or 'replace'")
-
- runs = json_config.get('runs', default.runs)
- warmup_runs = json_config.get('warmupRuns', default.warmup_runs)
-
- return WorkloadExecutionConfig(enabled=enabled,
- output_collection_name=output_collection_name,
- write_mode=write_mode, warmup_runs=warmup_runs, runs=runs)
-
-
-def process_path(path):
- """Expand user's home folder and convert to absolute path."""
- return os.path.abspath(os.path.expanduser(path))
-
-
-def parse_multi_value(from_str_dict: Mapping[str, any], value_str: str, field_name: str) -> any:
- """Parse a string which may contain one of the predefined in from_str_dict values."""
- value = from_str_dict.get(value_str)
- if value is None:
- raise ValueError(
- f"{field_name} got {value_str} but must be equal to one of: {', '.join(from_str_dict.keys())}"
- )
- return value
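With the create() helpers and their string parsing removed, config.py is reduced to dataclasses and enums; settings such as write and restore modes are referenced as enum members at the call site instead of being parsed from JSON strings. A small sketch, assuming the same fields as the workload execution block in calibration_settings.py above:

    # Enum members are used directly rather than parsed from strings like 'replace'.
    from config import WorkloadExecutionConfig, WriteMode

    workload_execution = WorkloadExecutionConfig(
        enabled=True, output_collection_name='calibrationData',
        write_mode=WriteMode.REPLACE, warmup_runs=1, runs=5)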
diff --git a/buildscripts/cost_model/data_generator.py b/buildscripts/cost_model/data_generator.py
index 83dc7cb929e..edbc8a3d4a6 100644
--- a/buildscripts/cost_model/data_generator.py
+++ b/buildscripts/cost_model/data_generator.py
@@ -29,9 +29,7 @@
from __future__ import annotations
from dataclasses import dataclass
-from importlib.metadata import distribution
import time
-import random
from typing import Sequence
import asyncio
import pymongo
@@ -41,7 +39,6 @@ from motor.motor_asyncio import AsyncIOMotorDatabase
from random_generator import RandomDistribution
from config import DataGeneratorConfig, DataType
from database_instance import DatabaseInstance
-from random_generator_config import distributions
__all__ = ['DataGenerator']
@@ -110,9 +107,8 @@ class DataGenerator:
def _generate_collection_infos(self):
for coll_template in self.config.collection_templates:
fields = [
- FieldInfo(name=ft.name, type=ft.data_type,
- distribution=distributions[ft.distribution], indexed=ft.indexed)
- for ft in coll_template.fields
+ FieldInfo(name=ft.name, type=ft.data_type, distribution=ft.distribution,
+ indexed=ft.indexed) for ft in coll_template.fields
]
for doc_count in self.config.collection_cardinalities:
name = f'{coll_template.name}_{doc_count}'
diff --git a/buildscripts/cost_model/start.py b/buildscripts/cost_model/start.py
index 254e7bcce55..f4fb3454ccf 100644
--- a/buildscripts/cost_model/start.py
+++ b/buildscripts/cost_model/start.py
@@ -30,18 +30,16 @@
import dataclasses
import os
import csv
-import json
import asyncio
from typing import Mapping, Sequence
from cost_estimator import ExecutionStats, ModelParameters
from data_generator import DataGenerator
from database_instance import DatabaseInstance
-from config import Config
import abt_calibrator
import workload_execution
from workload_execution import Query, QueryParameters
import parameters_extractor
-from random_generator_config import distributions
+from calibration_settings import distributions, main_config
__all__ = []
@@ -69,15 +67,12 @@ async def main():
script_directory = os.path.abspath(os.path.dirname(__file__))
os.chdir(script_directory)
- with open("config.json") as config_file:
- config = Config.create(json.load(config_file))
-
# 1. Database Instance provides connectivity to a MongoDB instance, it loads data optionally
# from the dump on creating and stores data optionally to the dump on closing.
- with DatabaseInstance(config.database) as database:
+ with DatabaseInstance(main_config.database) as database:
# 2. Data generation (optional), generates random data and populates collections with it.
- generator = DataGenerator(database, config.data_generator)
+ generator = DataGenerator(database, main_config.data_generator)
await generator.populate_collections()
# 3. Collecting data for calibration (optional).
@@ -90,21 +85,21 @@ async def main():
Query(pipeline=[{'$match': {f'choice{i}': val}}],
keys_length_in_bytes=keys_length))
- await workload_execution.execute(database, config.workload_execution,
+ await workload_execution.execute(database, main_config.workload_execution,
generator.collection_infos, requests)
# Calibration phase (optional).
# Reads the explains stored on the previous step (this run and/or previous runs),
# parses the explains, and calibrates the cost model for the ABT nodes.
models = await abt_calibrator.calibrate(
- config.abt_calibrator, database,
+ main_config.abt_calibrator, database,
['IndexScan', 'Seek', 'PhysicalScan', 'ValueScan', 'CoScan', 'Scan'])
for abt, model in models.items():
print(abt)
print(model)
- parameters = await parameters_extractor.extract_parameters(config.abt_calibrator, database,
- [])
+ parameters = await parameters_extractor.extract_parameters(main_config.abt_calibrator,
+ database, [])
save_to_csv(parameters, 'parameters.csv')
print("DONE!")