diff options
author | Timour Katchaounov <timour.katchaounov@mongodb.com> | 2023-01-17 13:13:27 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-01-17 13:48:13 +0000 |
commit | d692121e12a5cf62edcd8a08a7714c2f009c6d12 (patch) | |
tree | d9c26a746e4d66a1ce724b6e2a693a7a17fbf37e /buildscripts | |
parent | ab05cf2179d112a92467de3e454c53f75eee81ba (diff) | |
download | mongo-d692121e12a5cf62edcd8a08a7714c2f009c6d12.tar.gz |
SERVER-72663 Visualize distribution of generated data
* Visualize generated data via histograms stored as PNG files in jstests/query_golden/libs/data.
* Added a couple more mixed distributions.
Diffstat (limited to 'buildscripts')
-rw-r--r-- | buildscripts/cost_model/ce_data_settings.py | 24 | ||||
-rw-r--r-- | buildscripts/cost_model/ce_generate_data.py | 16 |
2 files changed, 36 insertions, 4 deletions
diff --git a/buildscripts/cost_model/ce_data_settings.py b/buildscripts/cost_model/ce_data_settings.py index 851d6324f47..f0203930880 100644 --- a/buildscripts/cost_model/ce_data_settings.py +++ b/buildscripts/cost_model/ce_data_settings.py @@ -70,22 +70,38 @@ for int_range in int_ranges: # Mixes of distributions with different NDV and value distances -uniform_int_mix_1 = [ - int_distributions['uniform_int_1000_1'], int_distributions['uniform_int_100000_100'], +unf_int_mix_1 = [ + int_distributions['uniform_int_10000_10'], int_distributions['uniform_int_100000_100'], int_distributions['uniform_int_10000000_1000'] ] -int_distributions['mixed_int_uniform_1'] = RandomDistribution.mixed(children=uniform_int_mix_1, +int_distributions['mixed_int_uniform_1'] = RandomDistribution.mixed(children=unf_int_mix_1, weight=[1, 1, 1]) +unf_norm_int_mix_1 = [ + int_distributions['uniform_int_1000_1'], int_distributions['normal_int_100000_100'], + int_distributions['normal_int_10000000_1000'] +] +int_distributions['mixed_int_unf_norm_1'] = RandomDistribution.mixed(children=unf_norm_int_mix_1, + weight=[1, 1, 1]) + unf_norm_chi_int_mix_1 = [ int_distributions['uniform_int_10000_10'], int_distributions['uniform_int_1000000_100'], int_distributions['normal_int_10000_10'], int_distributions['normal_int_1000000_100'], int_distributions['chi2_int_10000_10'], int_distributions['chi2_int_10000000_1000'] ] - int_distributions['mixed_int_unf_norm_chi_1'] = RandomDistribution.mixed( children=unf_norm_chi_int_mix_1, weight=[1, 1, 1, 1, 1, 1]) +unf_norm_chi_int_mix_2 = [ + int_distributions['uniform_int_10000_10'], + int_distributions['normal_int_10000_10'], + int_distributions['uniform_int_1000000_100'], + int_distributions['normal_int_1000000_100'], + int_distributions['chi2_int_1000000_100'], +] +int_distributions['mixed_int_unf_norm_chi_2'] = RandomDistribution.mixed( + children=unf_norm_chi_int_mix_2, weight=[1, 1, 1, 1, 1]) + 
################################################################################ # Collection templates ################################################################################ diff --git a/buildscripts/cost_model/ce_generate_data.py b/buildscripts/cost_model/ce_generate_data.py index 1883aab5c73..6e057d74e06 100644 --- a/buildscripts/cost_model/ce_generate_data.py +++ b/buildscripts/cost_model/ce_generate_data.py @@ -30,10 +30,13 @@ import asyncio import dataclasses import json +import math import os import subprocess from pathlib import Path +import seaborn as sns import bson +import matplotlib.pyplot as plt from config import CollectionTemplate, FieldTemplate, DataType from data_generator import CollectionInfo, DataGenerator from database_instance import DatabaseInstance @@ -132,6 +135,19 @@ async def main(): collections.append( dict(collectionName=name, fields=coll_template.fields, compound_indexes=coll_template.compound_indexes, cardinality=card)) + # Generate one histogram per each collection field + coll = database_instance.database[name] + doc_count = await coll.count_documents({}) + for field in coll_template.fields: + field_data = [] + async for doc in coll.find({field.name: {"$exists": True}}, + {"_id": 0, field.name: 1}): + field_data.append(doc[field.name]) + hist = sns.displot(data=field_data, kind="hist", bins=round( + math.sqrt(doc_count))).figure + hist.savefig(f'{database_config.dump_path}/{name}_{field.name}.png') + plt.close(hist) + json_metadata = json.dumps(collections, indent=4, cls=CollectionTemplateEncoder) metadata_file.write("// This is a generated file.\nconst dbMetadata = ") metadata_file.write(json_metadata) |