summaryrefslogtreecommitdiff
path: root/buildscripts/cost_model
diff options
context:
space:
mode:
authorTimour Katchaounov <timour.katchaounov@mongodb.com>2023-01-17 13:13:27 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2023-01-17 13:48:13 +0000
commitd692121e12a5cf62edcd8a08a7714c2f009c6d12 (patch)
treed9c26a746e4d66a1ce724b6e2a693a7a17fbf37e /buildscripts/cost_model
parentab05cf2179d112a92467de3e454c53f75eee81ba (diff)
downloadmongo-d692121e12a5cf62edcd8a08a7714c2f009c6d12.tar.gz
SERVER-72663 Visualize distribution of generated data
* Visualize generated data via histograms stored as PNG files in jstests/query_golden/libs/data.
* Added a couple more mixed distributions.
Diffstat (limited to 'buildscripts/cost_model')
-rw-r--r--buildscripts/cost_model/ce_data_settings.py24
-rw-r--r--buildscripts/cost_model/ce_generate_data.py16
2 files changed, 36 insertions, 4 deletions
diff --git a/buildscripts/cost_model/ce_data_settings.py b/buildscripts/cost_model/ce_data_settings.py
index 851d6324f47..f0203930880 100644
--- a/buildscripts/cost_model/ce_data_settings.py
+++ b/buildscripts/cost_model/ce_data_settings.py
@@ -70,22 +70,38 @@ for int_range in int_ranges:
# Mixes of distributions with different NDV and value distances
-uniform_int_mix_1 = [
- int_distributions['uniform_int_1000_1'], int_distributions['uniform_int_100000_100'],
+unf_int_mix_1 = [
+ int_distributions['uniform_int_10000_10'], int_distributions['uniform_int_100000_100'],
int_distributions['uniform_int_10000000_1000']
]
-int_distributions['mixed_int_uniform_1'] = RandomDistribution.mixed(children=uniform_int_mix_1,
+int_distributions['mixed_int_uniform_1'] = RandomDistribution.mixed(children=unf_int_mix_1,
weight=[1, 1, 1])
+unf_norm_int_mix_1 = [
+ int_distributions['uniform_int_1000_1'], int_distributions['normal_int_100000_100'],
+ int_distributions['normal_int_10000000_1000']
+]
+int_distributions['mixed_int_unf_norm_1'] = RandomDistribution.mixed(children=unf_norm_int_mix_1,
+ weight=[1, 1, 1])
+
unf_norm_chi_int_mix_1 = [
int_distributions['uniform_int_10000_10'], int_distributions['uniform_int_1000000_100'],
int_distributions['normal_int_10000_10'], int_distributions['normal_int_1000000_100'],
int_distributions['chi2_int_10000_10'], int_distributions['chi2_int_10000000_1000']
]
-
int_distributions['mixed_int_unf_norm_chi_1'] = RandomDistribution.mixed(
children=unf_norm_chi_int_mix_1, weight=[1, 1, 1, 1, 1, 1])
+unf_norm_chi_int_mix_2 = [
+ int_distributions['uniform_int_10000_10'],
+ int_distributions['normal_int_10000_10'],
+ int_distributions['uniform_int_1000000_100'],
+ int_distributions['normal_int_1000000_100'],
+ int_distributions['chi2_int_1000000_100'],
+]
+int_distributions['mixed_int_unf_norm_chi_2'] = RandomDistribution.mixed(
+ children=unf_norm_chi_int_mix_2, weight=[1, 1, 1, 1, 1])
+
################################################################################
# Collection templates
################################################################################
diff --git a/buildscripts/cost_model/ce_generate_data.py b/buildscripts/cost_model/ce_generate_data.py
index 1883aab5c73..6e057d74e06 100644
--- a/buildscripts/cost_model/ce_generate_data.py
+++ b/buildscripts/cost_model/ce_generate_data.py
@@ -30,10 +30,13 @@
import asyncio
import dataclasses
import json
+import math
import os
import subprocess
from pathlib import Path
+import seaborn as sns
import bson
+import matplotlib.pyplot as plt
from config import CollectionTemplate, FieldTemplate, DataType
from data_generator import CollectionInfo, DataGenerator
from database_instance import DatabaseInstance
@@ -132,6 +135,19 @@ async def main():
collections.append(
dict(collectionName=name, fields=coll_template.fields,
compound_indexes=coll_template.compound_indexes, cardinality=card))
+ # Generate one histogram for each collection field
+ coll = database_instance.database[name]
+ doc_count = await coll.count_documents({})
+ for field in coll_template.fields:
+ field_data = []
+ async for doc in coll.find({field.name: {"$exists": True}},
+ {"_id": 0, field.name: 1}):
+ field_data.append(doc[field.name])
+ hist = sns.displot(data=field_data, kind="hist", bins=round(
+ math.sqrt(doc_count))).figure
+ hist.savefig(f'{database_config.dump_path}/{name}_{field.name}.png')
+ plt.close(hist)
+
json_metadata = json.dumps(collections, indent=4, cls=CollectionTemplateEncoder)
metadata_file.write("// This is a generated file.\nconst dbMetadata = ")
metadata_file.write(json_metadata)