From 5f11658224585f0399d57afa2fd8886165f854bf Mon Sep 17 00:00:00 2001 From: Alexander Ignatyev Date: Thu, 1 Sep 2022 08:57:24 +0000 Subject: SERVER-69031 Move JSON configuration to python file --- buildscripts/cost_model/calibration_settings.py | 112 +++++++++++++++ buildscripts/cost_model/config.json | 143 ------------------ buildscripts/cost_model/config.py | 160 +-------------------- buildscripts/cost_model/data_generator.py | 8 +- buildscripts/cost_model/random_generator_config.py | 71 --------- buildscripts/cost_model/start.py | 19 +-- 6 files changed, 124 insertions(+), 389 deletions(-) create mode 100644 buildscripts/cost_model/calibration_settings.py delete mode 100644 buildscripts/cost_model/config.json delete mode 100644 buildscripts/cost_model/random_generator_config.py diff --git a/buildscripts/cost_model/calibration_settings.py b/buildscripts/cost_model/calibration_settings.py new file mode 100644 index 00000000000..1fe7684089a --- /dev/null +++ b/buildscripts/cost_model/calibration_settings.py @@ -0,0 +1,112 @@ +# Copyright (C) 2022-present MongoDB, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the Server Side Public License, version 1, +# as published by MongoDB, Inc. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Server Side Public License for more details. +# +# You should have received a copy of the Server Side Public License +# along with this program. If not, see +# . +# +# As a special exception, the copyright holders give permission to link the +# code of portions of this program with the OpenSSL library under certain +# conditions as described in each individual source file and distribute +# linked combinations including the program with the OpenSSL library. 
You +# must comply with the Server Side Public License in all respects for +# all of the code used other than as permitted herein. If you modify file(s) +# with this exception, you may extend this exception to your version of the +# file(s), but you are not obligated to do so. If you do not wish to do so, +# delete this exception statement from your version. If you delete this +# exception statement from all source files in the program, then also delete +# it in the license file. +# +"""Calibration configuration.""" + +import config +from random_generator import RangeGenerator, DataType, RandomDistribution + +__all__ = ['main_config', 'distributions'] + +# Data distributions settings. +distributions = {} + +string_choice_values = [ + 'h', + 'hi', + 'hi!', + 'hola', + 'hello', + 'square', + 'squared', + 'gaussian', + 'chisquare', + 'chisquared', + 'hello world', + 'distribution', +] + +string_choice_weights = [10, 20, 5, 17, 30, 7, 9, 15, 40, 2, 12, 1] + +distributions['string_choice'] = RandomDistribution.choice(string_choice_values, + string_choice_weights) + +string_range_4 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "abca", "abc_")) +string_range_5 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "abcda", "abcd_")) +string_range_7 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "hello_a", "hello__")) +string_range_12 = RandomDistribution.normal( + RangeGenerator(DataType.STRING, "helloworldaa", "helloworldd_")) + +distributions['string_mixed'] = RandomDistribution.mixed( + [string_range_4, string_range_5, string_range_7, string_range_12], [0.1, 0.15, 0.25, 0.5]) + +distributions['string_uniform'] = RandomDistribution.uniform( + RangeGenerator(DataType.STRING, "helloworldaa", "helloworldd_")) + +# Database settings +database = config.DatabaseConfig(connection_string='mongodb://localhost', + database_name='abt_calibration', dump_path='~/data/dump', + restore_from_dump=config.RestoreMode.NEVER, dump_on_exit=False) + +# 
Collection template settings +c_str_01 = config.CollectionTemplate( + name="c_str_01", fields=[ + config.FieldTemplate(name="choice1", data_type=config.DataType.STRING, + distribution=distributions['string_choice'], indexed=True) + ], compound_indexes=[]) + +c_str_05 = config.CollectionTemplate( + name="c_str_05", fields=[ + config.FieldTemplate(name="choice1", data_type=config.DataType.STRING, + distribution=distributions["string_choice"], indexed=True), + config.FieldTemplate(name="mixed1", data_type=config.DataType.STRING, + distribution=distributions["string_mixed"], indexed=True), + config.FieldTemplate(name="uniform1", data_type=config.DataType.STRING, + distribution=distributions["string_uniform"], indexed=True), + config.FieldTemplate(name="choice2", data_type=config.DataType.STRING, + distribution=distributions["string_choice"], indexed=True), + config.FieldTemplate(name="mixed2", data_type=config.DataType.STRING, + distribution=distributions["string_mixed"], indexed=True), + ], compound_indexes=[["choice1", "mixed1"]]) + +# Data Generator settings +data_generator = config.DataGeneratorConfig(enabled=True, collection_cardinalities=[100, 200, 500], + batch_size=10000, + collection_templates=[c_str_01, c_str_05]) + +# Workload Execution settings +workload_execution = config.WorkloadExecutionConfig( + enabled=True, output_collection_name='calibrationData', write_mode=config.WriteMode.REPLACE, + warmup_runs=1, runs=5) + +# Calibrator settings +abt_calibrator = config.AbtCalibratorConfig( + enabled=True, test_size=0.2, input_collection_name=workload_execution.output_collection_name, + trace=False) + +main_config = config.Config(database=database, data_generator=data_generator, + abt_calibrator=abt_calibrator, workload_execution=workload_execution) diff --git a/buildscripts/cost_model/config.json b/buildscripts/cost_model/config.json deleted file mode 100644 index c4e3c51f49e..00000000000 --- a/buildscripts/cost_model/config.json +++ /dev/null @@ -1,143 +0,0 @@ 
-{ - "database": { - "connectionString": "mongodb://localhost", - "databaseName": "abt_calibration", - "dumpPath": "~/data/dump", - "restoreFromDump": "never", - "dumpOnExit": false - }, - "dataGenerator": { - "enabled": true, - "collectionCardinalities": [ - 100, - 200, - 500 - ], - "batchSize": 10000, - "collectionTemplates": [ - { - "name": "c_str", - "fields": [ - { - "name": "choice1", - "type": "str", - "distribution": "string_choice" - } - ] - }, - { - "name": "c_str_5", - "fields": [ - { - "name": "choice1", - "type": "str", - "distribution": "string_choice", - "indexed": true - }, - { - "name": "mixed1", - "type": "str", - "distribution": "string_mixed", - "indexed": true - }, - { - "name": "uniform1", - "type": "str", - "distribution": "string_uniform" - }, - { - "name": "choice2", - "type": "str", - "distribution": "string_choice" - }, - { - "name": "mixed2", - "type": "str", - "distribution": "string_mixed" - } - ], - "compoundIndexes": [ - [ - "choice1", - "mixed1" - ], - [ - "uniform1", - "mixed2" - ] - ] - }, - { - "name": "c_str_10", - "fields": [ - { - "name": "choice1", - "type": "str", - "distribution": "string_choice", - "indexed": true - }, - { - "name": "mixed1", - "type": "str", - "distribution": "string_mixed" - }, - { - "name": "uniform1", - "type": "str", - "distribution": "string_uniform" - }, - { - "name": "choice2", - "type": "str", - "distribution": "string_choice", - "indexed": true - }, - { - "name": "mixed2", - "type": "str", - "distribution": "string_mixed" - }, - { - "name": "uniform2", - "type": "str", - "distribution": "string_uniform" - }, - { - "name": "choice3", - "type": "str", - "distribution": "string_choice", - "indexed": true - }, - { - "name": "mixed3", - "type": "str", - "distribution": "string_mixed" - }, - { - "name": "uniform3", - "type": "str", - "distribution": "string_uniform" - }, - { - "name": "choice4", - "type": "str", - "distribution": "string_choice", - "indexed": true - } - ] - } - ] - }, - 
"abtCalibrator": { - "enabled": true, - "test_size": 0.2, - "inputCollectionName": "calibrationData" - }, - "workloadExecution": { - "enabled": true, - "outputCollectionName": "calibrationData", - "writeMode": "replace", - "warmupRuns": 1, - "runs": 5 - } -} diff --git a/buildscripts/cost_model/config.py b/buildscripts/cost_model/config.py index b56d38d233e..883ab025875 100644 --- a/buildscripts/cost_model/config.py +++ b/buildscripts/cost_model/config.py @@ -30,8 +30,8 @@ from __future__ import annotations from dataclasses import dataclass from enum import Enum -import os -from typing import Mapping, Sequence +from typing import Sequence +from random_generator import RandomDistribution @dataclass @@ -43,18 +43,6 @@ class Config: abt_calibrator: AbtCalibratorConfig workload_execution: WorkloadExecutionConfig - @staticmethod - def create(json_config: dict[str, any]) -> Config: - """Create new configuration object from JSON.""" - - database = DatabaseConfig.create(json_config.get('database')) - data_generator = DataGeneratorConfig.create(json_config.get('dataGenerator')) - abt_calibrator = AbtCalibratorConfig.create(json_config.get('abtCalibrator')) - workload_execution = WorkloadExecutionConfig.create(json_config.get("workloadExecution")) - - return Config(database=database, data_generator=data_generator, - abt_calibrator=abt_calibrator, workload_execution=workload_execution) - @dataclass class DatabaseConfig: @@ -66,35 +54,6 @@ class DatabaseConfig: restore_from_dump: RestoreMode dump_on_exit: bool - @staticmethod - def create(json_config: dict[str, any]) -> DatabaseConfig: - """Create new configuration object from JSON.""" - - default = DatabaseConfig(connection_string='mongodb://localhost', database_name='test', - dump_path='/data/dump', restore_from_dump=RestoreMode.NEVER, - dump_on_exit=False) - if json_config is None: - return default - - connection_string = json_config.get('connectionString', default.connection_string) - database_name = 
json_config.get('databaseName', default.database_name) - dump_path = process_path(json_config.get('dumpPath', default.dump_path)) - - restore_from_dump_str = json_config.get('restoreFromDump', str(default.dump_path)).lower() - if restore_from_dump_str == "never": - restore_from_dump = RestoreMode.NEVER - elif restore_from_dump_str == "onlynew": - restore_from_dump = RestoreMode.ONLY_NEW - elif restore_from_dump_str == "always": - restore_from_dump = RestoreMode.ALWAYS - else: - raise ValueError("restoreFromDump must be equal to 'never', 'onlyNew', or 'aways'") - - dump_on_exit = json_config.get('dumpOnExit', default.dump_path) - return DatabaseConfig(connection_string=connection_string, database_name=database_name, - dump_path=dump_path, restore_from_dump=restore_from_dump, - dump_on_exit=dump_on_exit) - class RestoreMode(Enum): """Restore database from dump mode.""" @@ -115,35 +74,9 @@ class DataGeneratorConfig: enabled: bool collection_cardinalities: list[int] - data_types: list[DataType] collection_templates: list[CollectionTemplate] batch_size: int - @staticmethod - def create(json_config: dict[str, any]) -> DataGeneratorConfig: - """Create new configuration object from JSON.""" - - default = DataGeneratorConfig(enabled=False, collection_cardinalities=[], data_types=[], - collection_templates=[], batch_size=10000) - if json_config is None: - return default - - enabled = json_config.get('enabled', default.enabled) - collection_cardinalities = json_config.get('collectionCardinalities', - default.collection_cardinalities) - data_types_str = json_config.get('dataTypes', default.data_types) - data_types = [DataType.parse(dt, 'dataTypes') for dt in data_types_str] - - collection_templates = [ - CollectionTemplate.create(jc) - for jc in json_config.get("collectionTemplates", default.collection_templates) - ] - - batch_size = json_config.get('batchSize', default.batch_size) - return DataGeneratorConfig( - enabled=enabled, 
collection_cardinalities=collection_cardinalities, - data_types=data_types, collection_templates=collection_templates, batch_size=batch_size) - @dataclass class CollectionTemplate: @@ -153,14 +86,6 @@ class CollectionTemplate: fields: Sequence[FieldTemplate] compound_indexes: Sequence[Sequence[str]] - @staticmethod - def create(json_config: dict[str, any]) -> CollectionTemplate: - """Create new template object from JSON.""" - name = json_config['name'] - fields = [FieldTemplate.create(jc) for jc in json_config['fields']] - compound_indexes = json_config.get('compoundIndexes', []) - return CollectionTemplate(name=name, fields=fields, compound_indexes=compound_indexes) - @dataclass class FieldTemplate: @@ -168,19 +93,9 @@ class FieldTemplate: name: str data_type: DataType - distribution: str + distribution: RandomDistribution indexed: bool - @staticmethod - def create(json_config: dict[str, any]) -> FieldTemplate: - """Create new template object from JSON.""" - name = json_config['name'] - data_type = DataType.parse(json_config['type'], 'type') - distribution = json_config['distribution'] - indexed = json_config.get('indexed', False) - return FieldTemplate(name=name, data_type=data_type, distribution=distribution, - indexed=indexed) - class DataType(Enum): """Data types.""" @@ -192,13 +107,6 @@ class DataType(Enum): def __str__(self): return self.name.lower()[:3] - @staticmethod - def parse(type_str: str, field_name: str) -> DataType: - """Parse DataType.""" - str_to_type = {'int': DataType.INTEGER, 'str': DataType.STRING, 'arr': DataType.ARRAY} - - return parse_multi_value(str_to_type, type_str, field_name) - @dataclass class AbtCalibratorConfig: @@ -210,26 +118,6 @@ class AbtCalibratorConfig: input_collection_name: str trace: bool - @staticmethod - def create(json_config: dict[str, any]) -> AbtCalibratorConfig: - """Create new configuration object from JSON.""" - - default = AbtCalibratorConfig(enabled=False, test_size=0.2, - input_collection_name='explains', 
trace=False) - if json_config is None: - return default - - enabled = json_config.get('enabled', default.enabled) - test_size = json_config.get("testSize", default.test_size) - if test_size <= 0.0 or test_size >= 1.0: - raise ValueError('testSize must be greater than 0 and less than 1') - input_collection_name = json_config.get('inputCollectionName', - default.input_collection_name) - trace = json_config.get('trace', default.trace) - - return AbtCalibratorConfig(enabled=enabled, test_size=test_size, - input_collection_name=input_collection_name, trace=trace) - class WriteMode(Enum): """Write mode enum.""" @@ -247,45 +135,3 @@ class WorkloadExecutionConfig: write_mode: WriteMode warmup_runs: int runs: int - - @staticmethod - def create(json_config: dict[str, any] | None) -> WorkloadExecutionConfig: - """Create new configuration object from JSON.""" - - default = WorkloadExecutionConfig(enabled=False, output_collection_name='explains', - write_mode=WriteMode.APPEND, warmup_runs=1, runs=1) - if json_config is None: - return default - - enabled = json_config.get('enabled', default.enabled) - output_collection_name = json_config.get('outputCollectionName', - default.output_collection_name) - write_mode_str = json_config.get('writeMode', 'append').lower() - if write_mode_str == 'append': - write_mode = WriteMode.APPEND - elif write_mode_str == 'replace': - write_mode = WriteMode.REPLACE - else: - raise ValueError("writeMode must be equal to 'append' or 'replace'") - - runs = json_config.get('runs', default.runs) - warmup_runs = json_config.get('warmupRuns', default.warmup_runs) - - return WorkloadExecutionConfig(enabled=enabled, - output_collection_name=output_collection_name, - write_mode=write_mode, warmup_runs=warmup_runs, runs=runs) - - -def process_path(path): - """Expand user's home folder and convert to absolute path.""" - return os.path.abspath(os.path.expanduser(path)) - - -def parse_multi_value(from_str_dict: Mapping[str, any], value_str: str, field_name: str) 
-> any: - """Parse a string which may contain one of the predefined in from_str_dict values.""" - value = from_str_dict.get(value_str) - if value is None: - raise ValueError( - f"{field_name} got {value_str} but must be equal to one of: {', '.join(from_str_dict.keys())}" - ) - return value diff --git a/buildscripts/cost_model/data_generator.py b/buildscripts/cost_model/data_generator.py index 83dc7cb929e..edbc8a3d4a6 100644 --- a/buildscripts/cost_model/data_generator.py +++ b/buildscripts/cost_model/data_generator.py @@ -29,9 +29,7 @@ from __future__ import annotations from dataclasses import dataclass -from importlib.metadata import distribution import time -import random from typing import Sequence import asyncio import pymongo @@ -41,7 +39,6 @@ from motor.motor_asyncio import AsyncIOMotorDatabase from random_generator import RandomDistribution from config import DataGeneratorConfig, DataType from database_instance import DatabaseInstance -from random_generator_config import distributions __all__ = ['DataGenerator'] @@ -110,9 +107,8 @@ class DataGenerator: def _generate_collection_infos(self): for coll_template in self.config.collection_templates: fields = [ - FieldInfo(name=ft.name, type=ft.data_type, - distribution=distributions[ft.distribution], indexed=ft.indexed) - for ft in coll_template.fields + FieldInfo(name=ft.name, type=ft.data_type, distribution=ft.distribution, + indexed=ft.indexed) for ft in coll_template.fields ] for doc_count in self.config.collection_cardinalities: name = f'{coll_template.name}_{doc_count}' diff --git a/buildscripts/cost_model/random_generator_config.py b/buildscripts/cost_model/random_generator_config.py deleted file mode 100644 index d437758ac8a..00000000000 --- a/buildscripts/cost_model/random_generator_config.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (C) 2022-present MongoDB, Inc. 
-# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the Server Side Public License, version 1, -# as published by MongoDB, Inc. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Server Side Public License for more details. -# -# You should have received a copy of the Server Side Public License -# along with this program. If not, see -# . -# -# As a special exception, the copyright holders give permission to link the -# code of portions of this program with the OpenSSL library under certain -# conditions as described in each individual source file and distribute -# linked combinations including the program with the OpenSSL library. You -# must comply with the Server Side Public License in all respects for -# all of the code used other than as permitted herein. If you modify file(s) -# with this exception, you may extend this exception to your version of the -# file(s), but you are not obligated to do so. If you do not wish to do so, -# delete this exception statement from your version. If you delete this -# exception statement from all source files in the program, then also delete -# it in the license file. -# -""" -Configuration of distributions used to generate collections from templates. - -They used in collection templates defined in json.config. 
-""" - -from importlib.metadata import distributions -from random_generator import RangeGenerator, DataType, RandomDistribution - -__ALL__ = ['distributions'] - -distributions = {} - -string_choice_values = [ - 'h', - 'hi', - 'hi!', - 'hola', - 'hello', - 'square', - 'squared', - 'gaussian', - 'chisquare', - 'chisquared', - 'hello world', - 'distribution', -] - -string_choice_weights = [10, 20, 5, 17, 30, 7, 9, 15, 40, 2, 12, 1] - -distributions['string_choice'] = RandomDistribution.choice(string_choice_values, - string_choice_weights) - -string_range_4 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "abca", "abc_")) -string_range_5 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "abcda", "abcd_")) -string_range_7 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "hello_a", "hello__")) -string_range_12 = RandomDistribution.normal( - RangeGenerator(DataType.STRING, "helloworldaa", "helloworldd_")) - -distributions['string_mixed'] = RandomDistribution.mixed( - [string_range_4, string_range_5, string_range_7, string_range_12], [0.1, 0.15, 0.25, 0.5]) - -distributions['string_uniform'] = RandomDistribution.uniform( - RangeGenerator(DataType.STRING, "helloworldaa", "helloworldd_")) diff --git a/buildscripts/cost_model/start.py b/buildscripts/cost_model/start.py index 254e7bcce55..f4fb3454ccf 100644 --- a/buildscripts/cost_model/start.py +++ b/buildscripts/cost_model/start.py @@ -30,18 +30,16 @@ import dataclasses import os import csv -import json import asyncio from typing import Mapping, Sequence from cost_estimator import ExecutionStats, ModelParameters from data_generator import DataGenerator from database_instance import DatabaseInstance -from config import Config import abt_calibrator import workload_execution from workload_execution import Query, QueryParameters import parameters_extractor -from random_generator_config import distributions +from calibration_settings import distributions, main_config __all__ = [] @@ -69,15 +67,12 
@@ async def main(): script_directory = os.path.abspath(os.path.dirname(__file__)) os.chdir(script_directory) - with open("config.json") as config_file: - config = Config.create(json.load(config_file)) - # 1. Database Instance provides connectivity to a MongoDB instance, it loads data optionally # from the dump on creating and stores data optionally to the dump on closing. - with DatabaseInstance(config.database) as database: + with DatabaseInstance(main_config.database) as database: # 2. Data generation (optional), generates random data and populates collections with it. - generator = DataGenerator(database, config.data_generator) + generator = DataGenerator(database, main_config.data_generator) await generator.populate_collections() # 3. Collecting data for calibration (optional). @@ -90,21 +85,21 @@ async def main(): Query(pipeline=[{'$match': {f'choice{i}': val}}], keys_length_in_bytes=keys_length)) - await workload_execution.execute(database, config.workload_execution, + await workload_execution.execute(database, main_config.workload_execution, generator.collection_infos, requests) # Calibration phase (optional). # Reads the explains stored on the previous step (this run and/or previous runs), # aparses the explains, nd calibrates the cost model for the ABT nodes. models = await abt_calibrator.calibrate( - config.abt_calibrator, database, + main_config.abt_calibrator, database, ['IndexScan', 'Seek', 'PhysicalScan', 'ValueScan', 'CoScan', 'Scan']) for abt, model in models.items(): print(abt) print(model) - parameters = await parameters_extractor.extract_parameters(config.abt_calibrator, database, - []) + parameters = await parameters_extractor.extract_parameters(main_config.abt_calibrator, + database, []) save_to_csv(parameters, 'parameters.csv') print("DONE!") -- cgit v1.2.1