summaryrefslogtreecommitdiff
path: root/buildscripts
diff options
context:
space:
mode:
authorTimour Katchaounov <timour.katchaounov@mongodb.com>2023-02-15 15:01:33 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2023-02-15 16:32:12 +0000
commit5af5e4a1e8bc6930f837776cb533333d053a32f1 (patch)
treece130cdfab062ee2c47856e1050df62cf57ab747 /buildscripts
parent9962320d74254ec0517ba4f478386f7f0bcb1f9e (diff)
downloadmongo-5af5e4a1e8bc6930f837776cb533333d053a32f1.tar.gz
SERVER-73031 Generate random data with mixed data types
* Added generation of random data with mixed data types * Generation of random dates and doubles * Some refactoring of the python generation framework wrt types
Diffstat (limited to 'buildscripts')
-rw-r--r--buildscripts/cost_model/calibration_settings.py2
-rw-r--r--buildscripts/cost_model/ce_data_settings.py389
-rw-r--r--buildscripts/cost_model/ce_generate_data.py8
-rw-r--r--buildscripts/cost_model/config.py13
-rw-r--r--buildscripts/cost_model/data_generator.py4
-rw-r--r--buildscripts/cost_model/random_generator.py200
6 files changed, 442 insertions, 174 deletions
diff --git a/buildscripts/cost_model/calibration_settings.py b/buildscripts/cost_model/calibration_settings.py
index 12e43b82b80..024b25220f0 100644
--- a/buildscripts/cost_model/calibration_settings.py
+++ b/buildscripts/cost_model/calibration_settings.py
@@ -29,7 +29,7 @@
import random
import config
-from random_generator import RangeGenerator, DataType, RandomDistribution, ArrayRandomDistribution
+from random_generator import RangeGenerator, RandomDistribution, ArrayRandomDistribution, DataType
__all__ = ['main_config', 'distributions']
diff --git a/buildscripts/cost_model/ce_data_settings.py b/buildscripts/cost_model/ce_data_settings.py
index 4bd60f79c5d..f9f1e1c049b 100644
--- a/buildscripts/cost_model/ce_data_settings.py
+++ b/buildscripts/cost_model/ce_data_settings.py
@@ -28,10 +28,11 @@
"""Configuration of data generation for CE accuracy testing."""
from pathlib import Path
+from datetime import datetime
import random
from typing import Sequence
import config
-from random_generator import RangeGenerator, DataType, RandomDistribution, ArrayRandomDistribution
+from random_generator import RangeGenerator, RandomDistribution, ArrayRandomDistribution, DataType, DistributionType
__all__ = ['database_config', 'data_generator_config']
@@ -39,70 +40,157 @@ __all__ = ['database_config', 'data_generator_config']
# Data distributions
################################################################################
+
+def add_distribution(distr_set: Sequence[RandomDistribution], distr_type: DistributionType,
+                     rg: RangeGenerator):
+    """Build a distribution of the requested type over a range and append it to a collection.
+
+    Args:
+        distr_set: collection that receives the new distribution (mutated via ``append``).
+        distr_type: kind of distribution to build; UNIFORM, NORMAL and CHI2 are supported.
+        rg: range generator passed to the distribution factory.
+
+    Raises:
+        ValueError: if ``distr_type`` is not one of the supported kinds.
+    """
+    # Pick the factory first, then build and store the distribution in one place.
+    if distr_type == DistributionType.UNIFORM:
+        make_distr = RandomDistribution.uniform
+    elif distr_type == DistributionType.NORMAL:
+        make_distr = RandomDistribution.normal
+    elif distr_type == DistributionType.CHI2:
+        make_distr = RandomDistribution.noncentral_chisquare
+    else:
+        raise ValueError("Unknown distribution")
+    distr_set.append(make_distr(rg))
+
+
# Ranges
+# Integer ranges starting at 0; each entry is (type, begin, end, step).
+int_ranges_1 = [
+ # 1K unique integers with different distances
+ RangeGenerator(DataType.INTEGER, 0, 1000, 1),
+ RangeGenerator(DataType.INTEGER, 0, 10000, 10),
+ RangeGenerator(DataType.INTEGER, 0, 100000, 100),
+ # 10K unique integers with different distances
+ # NOTE(review): the last two ranges below cover 1,000,000/10 and 10,000,000/100 = 100K steps,
+ # not 10K as the heading says (the matching int_ranges_2 entries cover 10K) - confirm intent.
+ RangeGenerator(DataType.INTEGER, 0, 10000, 1),
+ RangeGenerator(DataType.INTEGER, 0, 1000000, 10),
+ RangeGenerator(DataType.INTEGER, 0, 10000000, 100),
+]
-# 1K unique numbers with different distances
-range_int_1000_1 = RangeGenerator(DataType.INTEGER, 0, 1000, 1)
-range_int_1000_10 = RangeGenerator(DataType.INTEGER, 0, 10000, 10)
-range_int_1000_100 = RangeGenerator(DataType.INTEGER, 0, 100000, 100)
-range_int_1000_1000 = RangeGenerator(DataType.INTEGER, 0, 1000000, 1000)
-# 10K unique numbers with different distances
-range_int_10000_1 = RangeGenerator(DataType.INTEGER, 0, 10000, 1)
-range_int_10000_10 = RangeGenerator(DataType.INTEGER, 0, 100000, 10)
-range_int_10000_100 = RangeGenerator(DataType.INTEGER, 0, 1000000, 100)
-range_int_10000_1000 = RangeGenerator(DataType.INTEGER, 0, 10000000, 1000)
-int_ranges = [
- range_int_1000_1, range_int_1000_10, range_int_1000_100, range_int_1000_1000, range_int_10000_1,
- range_int_10000_10, range_int_10000_100, range_int_10000_1000
+int_ranges_2 = [
+ # 1K unique integers with different distances
+ RangeGenerator(DataType.INTEGER, 7000, 8000, 1),
+ RangeGenerator(DataType.INTEGER, 70000, 80000, 10),
+ RangeGenerator(DataType.INTEGER, 700000, 800000, 100),
+ # 10K unique integers with different distances
+ RangeGenerator(DataType.INTEGER, 70000, 80000, 1),
+ RangeGenerator(DataType.INTEGER, 700000, 800000, 10),
+ RangeGenerator(DataType.INTEGER, 7000000, 8000000, 100),
]
+#######################
# Integer distributions
-int_distributions = {}
-for int_range in int_ranges:
- int_distributions[
- f'uniform_int_{int_range.interval_end - int_range.interval_begin}_{int_range.step}'] = RandomDistribution.uniform(
- int_range)
- int_distributions[
- f'normal_int_{int_range.interval_end - int_range.interval_begin}_{int_range.step}'] = RandomDistribution.normal(
- int_range)
- int_distributions[
- f'chi2_int_{int_range.interval_end - int_range.interval_begin}_{int_range.step}'] = RandomDistribution.noncentral_chisquare(
- int_range)
-# Mixes of distributions with different NDV and value distances
+int_distributions = []
-unf_int_mix_1 = [
- int_distributions['uniform_int_10000_10'], int_distributions['uniform_int_100000_100'],
- int_distributions['uniform_int_10000000_1000']
-]
-int_distributions['mixed_int_uniform_1'] = RandomDistribution.mixed(children=unf_int_mix_1,
- weight=[1, 1, 1])
+for range_gen in int_ranges_1:
+ add_distribution(int_distributions, DistributionType.UNIFORM, range_gen)
+ add_distribution(int_distributions, DistributionType.NORMAL, range_gen)
+ add_distribution(int_distributions, DistributionType.CHI2, range_gen)
-unf_norm_int_mix_1 = [
- int_distributions['uniform_int_1000_1'], int_distributions['normal_int_100000_100'],
- int_distributions['normal_int_10000000_1000']
-]
-int_distributions['mixed_int_unf_norm_1'] = RandomDistribution.mixed(children=unf_norm_int_mix_1,
- weight=[1, 1, 1])
+# Distributions to be used only in other mixed distributions
+int_distributions_offset = []
+for range_gen in int_ranges_2:
+ add_distribution(int_distributions_offset, DistributionType.UNIFORM, range_gen)
+ add_distribution(int_distributions_offset, DistributionType.NORMAL, range_gen)
+ add_distribution(int_distributions_offset, DistributionType.CHI2, range_gen)
-unf_norm_chi_int_mix_1 = [
- int_distributions['uniform_int_10000_10'], int_distributions['uniform_int_1000000_100'],
- int_distributions['normal_int_10000_10'], int_distributions['normal_int_1000000_100'],
- int_distributions['chi2_int_10000_10'], int_distributions['chi2_int_10000000_1000']
-]
-int_distributions['mixed_int_unf_norm_chi_1'] = RandomDistribution.mixed(
- children=unf_norm_chi_int_mix_1, weight=[1, 1, 1, 1, 1, 1])
-
-unf_norm_chi_int_mix_2 = [
- int_distributions['uniform_int_10000_10'],
- int_distributions['normal_int_10000_10'],
- int_distributions['uniform_int_1000000_100'],
- int_distributions['normal_int_1000000_100'],
- int_distributions['chi2_int_1000000_100'],
+# Mixes of distributions with different NDV and value distances
+int_distributions.append(
+ RandomDistribution.mixed(
+ children=[int_distributions[0], int_distributions_offset[0], int_distributions[4]],
+ weight=[1, 1, 1]))
+
+int_distributions.append(
+ RandomDistribution.mixed(
+ children=[int_distributions[1], int_distributions[4], int_distributions[7]],
+ weight=[1, 1, 1]))
+
+int_distributions.append(
+ RandomDistribution.mixed(
+ children=[
+ int_distributions[1], int_distributions_offset[1], int_distributions[3],
+ int_distributions[2], int_distributions_offset[2]
+ ], weight=[1, 1, 1, 1, 1]))
+
+int_distributions.append(
+ RandomDistribution.mixed(
+ children=[
+ int_distributions[2], int_distributions[3], int_distributions[6],
+ int_distributions_offset[1], int_distributions_offset[2], int_distributions_offset[5]
+ ], weight=[1, 1, 1, 1, 1, 1]))
+
+#############################
+# Double number distributions
+
+dbl_ranges = [
+ # 1K unique doubles with different distances
+ RangeGenerator(DataType.DOUBLE, 0.0, 100.0, 0.1),
+ RangeGenerator(DataType.DOUBLE, 0.0, 10000.0, 10),
+ RangeGenerator(DataType.DOUBLE, 0.0, 1000000.0, 1000),
+ # 10K unique doubles with different distances
+ RangeGenerator(DataType.DOUBLE, 0.0, 1000.0, 0.1),
+ RangeGenerator(DataType.DOUBLE, 0.0, 100000.0, 10),
+ RangeGenerator(DataType.DOUBLE, 0.0, 10000000.0, 1000)
]
-int_distributions['mixed_int_unf_norm_chi_2'] = RandomDistribution.mixed(
- children=unf_norm_chi_int_mix_2, weight=[1, 1, 1, 1, 1])
+dbl_distributions = []
+
+for range_gen in dbl_ranges:
+ add_distribution(dbl_distributions, DistributionType.UNIFORM, range_gen)
+ add_distribution(dbl_distributions, DistributionType.NORMAL, range_gen)
+
+dbl_distributions.append(
+ RandomDistribution.mixed(
+ children=[dbl_distributions[0], dbl_distributions[3], dbl_distributions[10]],
+ weight=[1, 1, 1]))
+
+dbl_distributions.append(
+ RandomDistribution.mixed(
+ children=[
+ dbl_distributions[0],
+ dbl_distributions[4],
+ RandomDistribution.normal(RangeGenerator(DataType.DOUBLE, 500.0, 600.0, 0.1)),
+ RandomDistribution.normal(RangeGenerator(DataType.DOUBLE, 3000200.0, 5000100.0, 3030)),
+ ], weight=[1, 1, 1, 1]))
+
+#############################
+# Date distributions
+
+# Step sizes for date ranges, in seconds (date intervals are measured via POSIX timestamps).
+MINUTE = 60
+HOUR = MINUTE * 60
+DAY = HOUR * 24
+# A month is approximated as 30 days.
+MONTH = DAY * 30
+
+# Date ranges: (type, begin, end, step in seconds).
+# NOTE(review): the range_dtt_1m_* names suggest one-month ranges, but each one spans 13 months
+# (e.g. 2007-02-01 to 2008-03-01) - confirm whether the end year should be 2007.
+range_dtt_1y = RangeGenerator(DataType.DATE, datetime(2007, 1, 1), datetime(2008, 1, 1), HOUR)
+range_dtt_1m_1 = RangeGenerator(DataType.DATE, datetime(2007, 2, 1), datetime(2008, 3, 1), HOUR)
+range_dtt_1m_2 = RangeGenerator(DataType.DATE, datetime(2007, 6, 1), datetime(2008, 7, 1), HOUR)
+range_dtt_1m_3 = RangeGenerator(DataType.DATE, datetime(2007, 10, 1), datetime(2008, 11, 1), HOUR)
+range_dtt_10y_1 = RangeGenerator(DataType.DATE, datetime(2006, 1, 1), datetime(2016, 1, 1), DAY)
+range_dtt_10y_2 = RangeGenerator(DataType.DATE, datetime(1995, 1, 1), datetime(2005, 1, 1), DAY)
+range_dtt_20y = RangeGenerator(DataType.DATE, datetime(1997, 10, 1), datetime(2017, 11, 1), MONTH)
+
+dt_distributions = []
+
+add_distribution(dt_distributions, DistributionType.UNIFORM, range_dtt_1y)
+add_distribution(dt_distributions, DistributionType.NORMAL, range_dtt_10y_1)
+
+dt_distributions.append(
+ RandomDistribution.mixed([
+ RandomDistribution.uniform(range_dtt_1y),
+ RandomDistribution.uniform(range_dtt_1m_1),
+ RandomDistribution.uniform(range_dtt_1m_2),
+ RandomDistribution.uniform(range_dtt_1m_3)
+ ], [1, 1, 1, 1]))
+
+dt_distributions.append(
+ RandomDistribution.mixed([
+ RandomDistribution.uniform(range_dtt_10y_1),
+ RandomDistribution.uniform(range_dtt_10y_2),
+ RandomDistribution.uniform(range_dtt_20y)
+ ], [1, 1, 1]))
+
+#######################
# String distributions
PRINTED_CHAR_MIN_CODE = ord('0')
@@ -173,14 +261,18 @@ d4 = RandomDistribution.uniform(range_int_20_30)
# Sets of strings where characters at different positions have different distances
string_sets = {}
-# 33 unique strings
-string_sets['set_1112_33'] = generate_str_by_distance(33, 'xxxx', d1, d1, d1, d2)
-string_sets['set_2221_33'] = generate_str_by_distance(33, 'azay', d2, d2, d3, d1)
-string_sets['set_5555_33'] = generate_str_by_distance(33, 'axbz', d4, d4, d4, d4)
+# 250 unique strings
+string_sets['string_1112_250'] = generate_str_by_distance(250, 'xxxx', d1, d1, d1, d2)
+string_sets['string_2221_250'] = generate_str_by_distance(250, 'azay', d2, d2, d3, d1)
+string_sets['string_5555_250'] = generate_str_by_distance(250, 'axbz', d4, d4, d4, d4)
# 1000 unique strings
-string_sets['set_1112_1000'] = generate_str_by_distance(1000, 'xxxx', d1, d1, d1, d2)
-string_sets['set_2221_1000'] = generate_str_by_distance(1000, 'azay', d2, d2, d3, d1)
-string_sets['set_5555_1000'] = generate_str_by_distance(1000, 'axbz', d4, d4, d4, d4)
+string_sets['string_1112_1000'] = generate_str_by_distance(1000, 'xxxx', d1, d1, d1, d2)
+string_sets['string_2221_1000'] = generate_str_by_distance(1000, 'azay', d2, d2, d3, d1)
+string_sets['string_5555_1000'] = generate_str_by_distance(1000, 'axbz', d4, d4, d4, d4)
+# 10000 unique strings
+string_sets['string_1112_10000'] = generate_str_by_distance(10000, 'xxxx', d1, d1, d1, d2)
+string_sets['string_2221_10000'] = generate_str_by_distance(10000, 'azay', d2, d2, d3, d1)
+string_sets['string_5555_10000'] = generate_str_by_distance(10000, 'axbz', d4, d4, d4, d4)
# Weights with different variance. For instance if the smallest weight is 1, and the biggest weight is 5
# then some values in a choice distribution will be picked with at most 5 times higher probability.
@@ -197,92 +289,121 @@ weights['unif_s'] = RandomDistribution.uniform(weight_range_s)
weights['unif_l'] = RandomDistribution.uniform(weight_range_l)
weights['norm_s'] = RandomDistribution.normal(weight_range_s)
weights['norm_l'] = RandomDistribution.normal(weight_range_l)
-weights['chi2_s'] = RandomDistribution.noncentral_chisquare(weight_range_s)
-weights['chi2_l'] = RandomDistribution.noncentral_chisquare(weight_range_l)
+#weights['chi2_s'] = RandomDistribution.noncentral_chisquare(weight_range_s)
+#weights['chi2_l'] = RandomDistribution.noncentral_chisquare(weight_range_l)
-def make_choice_distr(str_set: Sequence[str], weight_distr: RandomDistribution):
- return RandomDistribution.choice(str_set, weight_distr.generate(len(str_set)))
+
+def add_choice_distr(distr_set: Sequence[RandomDistribution], str_set: Sequence[str],
+                     weight_distr: RandomDistribution, v_name: str, w_name: str):
+    """Append a choice distribution over ``str_set`` to ``distr_set``.
+
+    The weights come from ``weight_distr.generate(len(str_set))``. ``v_name`` and ``w_name``
+    are passed through to ``RandomDistribution.choice`` as the names of the value set and of
+    the weight distribution.
+    """
+    weights = weight_distr.generate(len(str_set))
+    distr_set.append(RandomDistribution.choice(str_set, weights, v_name, w_name))
# String data distributions to be used for string generation
-str_distributions = {}
+str_distributions = []
for set_name, cur_set in string_sets.items():
- for weight_name, weight in weights.items():
- str_distributions[f'choice_str_{set_name}_{weight_name}'] = make_choice_distr(
- cur_set, weight)
+ for weight_name, cur_weight in weights.items():
+ add_choice_distr(str_distributions, cur_set, cur_weight, set_name, weight_name)
+#######################
# Array distributions
# array length distributions - they are all uniform
-arr_zero_size = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 0, 1, 1))
arr_len_dist_s = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 1, 6, 1))
arr_len_dist_m = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 90, 110, 3))
arr_len_dist_l = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 900, 1100, 10))
-arr_empty_distr = ArrayRandomDistribution(arr_zero_size, int_distributions['uniform_int_1000_1'])
-arr_distributions = {}
+def add_array_distr(distr_set: Sequence[RandomDistribution], lengths_distr: RandomDistribution,
+                    value_distr: RandomDistribution):
+    """Append an array distribution built from a length distribution and a value distribution."""
+    array_distr = ArrayRandomDistribution(lengths_distr, value_distr)
+    distr_set.append(array_distr)
+
+
+arr_distributions = []
# Arrays with integers
-arr_distributions["uniform_arr_int_10000_10_s"] = ArrayRandomDistribution(
- arr_len_dist_s, int_distributions['uniform_int_10000_10'])
-arr_distributions["uniform_arr_int_10000_10_m"] = ArrayRandomDistribution(
- arr_len_dist_m, int_distributions['uniform_int_10000_10'])
-arr_distributions["uniform_arr_int_10000_10_l"] = ArrayRandomDistribution(
- arr_len_dist_l, int_distributions['uniform_int_10000_10'])
-arr_distributions["mixed_arr_int_s"] = ArrayRandomDistribution(
- arr_len_dist_s, int_distributions['mixed_int_unf_norm_chi_2'])
-arr_distributions["mixed_arr_int_m"] = ArrayRandomDistribution(
- arr_len_dist_m, int_distributions['mixed_int_unf_norm_chi_2'])
-arr_distributions["mixed_arr_int_l"] = ArrayRandomDistribution(
- arr_len_dist_l, int_distributions['mixed_int_unf_norm_chi_2'])
+add_array_distr(arr_distributions, arr_len_dist_s, int_distributions[3])
+add_array_distr(arr_distributions, arr_len_dist_m, int_distributions[3])
+add_array_distr(arr_distributions, arr_len_dist_l, int_distributions[3])
+add_array_distr(arr_distributions, arr_len_dist_s, int_distributions[-1])
+add_array_distr(arr_distributions, arr_len_dist_m, int_distributions[-1])
+add_array_distr(arr_distributions, arr_len_dist_l, int_distributions[-1])
# Arrays with strings
-arr_distributions["choice_arr_str_set_1112_33_norm_l_s"] = ArrayRandomDistribution(
- arr_len_dist_s, str_distributions['choice_str_set_1112_33_norm_l'])
-arr_distributions["choice_arr_str_set_1112_33_norm_l_m"] = ArrayRandomDistribution(
- arr_len_dist_m, str_distributions['choice_str_set_1112_33_norm_l'])
-arr_distributions["choice_arr_str_set_1112_33_norm_l_l"] = ArrayRandomDistribution(
- arr_len_dist_l, str_distributions['choice_str_set_1112_33_norm_l'])
-arr_distributions["choice_arr_str_set_5555_1000_norm_l_s"] = ArrayRandomDistribution(
- arr_len_dist_s, str_distributions['choice_str_set_5555_1000_norm_l'])
-arr_distributions["choice_arr_str_set_5555_1000_norm_l_m"] = ArrayRandomDistribution(
- arr_len_dist_m, str_distributions['choice_str_set_5555_1000_norm_l'])
-arr_distributions["choice_arr_str_set_5555_1000_norm_l_l"] = ArrayRandomDistribution(
- arr_len_dist_l, str_distributions['choice_str_set_5555_1000_norm_l'])
+add_array_distr(arr_distributions, arr_len_dist_s, str_distributions[1])
+add_array_distr(arr_distributions, arr_len_dist_m, str_distributions[1])
+add_array_distr(arr_distributions, arr_len_dist_l, str_distributions[1])
+add_array_distr(arr_distributions, arr_len_dist_s, str_distributions[-1])
+add_array_distr(arr_distributions, arr_len_dist_m, str_distributions[-1])
+add_array_distr(arr_distributions, arr_len_dist_l, str_distributions[-1])
# 30% scalars, 70% arrays
-arr_distributions["mixed_arr_int_30_70_s"] = RandomDistribution.mixed(
- [int_distributions['uniform_int_10000_10'], arr_distributions["uniform_arr_int_10000_10_s"]],
- [0.3, 0.7])
-arr_distributions["mixed_arr_int_30_70_l"] = RandomDistribution.mixed(
- [int_distributions['uniform_int_10000_10'], arr_distributions["uniform_arr_int_10000_10_l"]],
- [0.3, 0.7])
-arr_distributions["mixed_arr_str_30_70"] = RandomDistribution.mixed([
- str_distributions['choice_str_set_2221_33_norm_l'],
- arr_distributions["choice_arr_str_set_1112_33_norm_l_s"]
-], [0.3, 0.7])
+arr_distributions.append(
+ RandomDistribution.mixed([int_distributions[0], arr_distributions[0]], [0.3, 0.7]))
+arr_distributions.append(
+ RandomDistribution.mixed([int_distributions[-1], arr_distributions[-1]], [0.3, 0.7]))
# 70% scalars, 30% arrays
-arr_distributions["mixed_arr_int_70_30_s"] = RandomDistribution.mixed(
- [int_distributions['uniform_int_10000_10'], arr_distributions["uniform_arr_int_10000_10_s"]],
- [0.7, 0.3])
-arr_distributions["mixed_arr_int_70_30_l"] = RandomDistribution.mixed(
- [int_distributions['uniform_int_10000_10'], arr_distributions["uniform_arr_int_10000_10_l"]],
- [0.7, 0.3])
-arr_distributions["mixed_arr_str_70_30"] = RandomDistribution.mixed([
- str_distributions['choice_str_set_2221_33_norm_l'],
- arr_distributions["choice_arr_str_set_1112_33_norm_l_s"]
-], [0.7, 0.3])
+arr_distributions.append(
+ RandomDistribution.mixed([int_distributions[0], arr_distributions[0]], [0.7, 0.3]))
+arr_distributions.append(
+ RandomDistribution.mixed([int_distributions[-1], arr_distributions[-1]], [0.7, 0.3]))
+
+arr_zero_size = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 0, 1, 1))
+arr_empty_distr = ArrayRandomDistribution(arr_zero_size, int_distributions[0])
# 20% empty arrays
-arr_distributions["uniform_arr_int_10000_10_s_empty_20"] = RandomDistribution.mixed(
- [arr_empty_distr, arr_distributions["uniform_arr_int_10000_10_s"]], [0.2, 0.8])
+arr_distributions.append(
+ RandomDistribution.mixed([arr_empty_distr, arr_distributions[2]], [0.2, 0.8]))
# 80% empty arrays
-arr_distributions["uniform_arr_int_10000_10_s_empty_80"] = RandomDistribution.mixed(
- [arr_empty_distr, arr_distributions["uniform_arr_int_10000_10_s"]], [0.8, 0.2])
+arr_distributions.append(
+ RandomDistribution.mixed([arr_empty_distr, arr_distributions[2]], [0.8, 0.2]))
+
+###############################
+# Mixed data type distributions
+
+mix_distributions = []
+
+# Integers + strings
+int_str_mix_1 = [int_distributions[0], str_distributions[0]]
+int_str_mix_2 = [int_distributions_offset[7], str_distributions[-1]]
+
+mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_1, weight=[0.5, 0.5]))
+mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_2, weight=[0.5, 0.5]))
+
+mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_1, weight=[0.1, 0.9]))
+mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_1, weight=[0.9, 0.1]))
+mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_2, weight=[0.1, 0.9]))
+mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_2, weight=[0.9, 0.1]))
+
+# Doubles and strings
+dbl_ascii_range = RangeGenerator(DataType.DOUBLE, float(PRINTED_CHAR_MIN_CODE),
+ float(PRINTED_CHAR_MAX_CODE), 0.01)
+ascii_double_range_distr = RandomDistribution.normal(dbl_ascii_range)
+
+dbl_str_mix_1 = [ascii_double_range_distr, str_distributions[1]]
+mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_1, weight=[0.5, 0.5]))
+mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_1, weight=[0.1, 0.9]))
+mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_1, weight=[0.9, 0.1]))
+
+dbl_str_mix_2 = [dbl_distributions[5], str_distributions[0]]
+mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_2, weight=[0.5, 0.5]))
+
+dbl_str_mix_3 = [dbl_distributions[5], str_distributions[5]]
+mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_3, weight=[0.5, 0.5]))
+
+# Doubles and/or strings and dates
+
+dbl_str_dt_mix_1 = [ascii_double_range_distr, str_distributions[4], dt_distributions[0]]
+mix_distributions.append(
+ RandomDistribution.mixed(children=dbl_str_dt_mix_1, weight=[0.5, 0.5, 0.5]))
+
+str_dt_mix_1 = [str_distributions[0], dt_distributions[-1]]
+mix_distributions.append(RandomDistribution.mixed(children=str_dt_mix_1, weight=[0.5, 0.5]))
+str_dt_mix_2 = [str_distributions[-1], dt_distributions[0]]
+mix_distributions.append(RandomDistribution.mixed(children=str_dt_mix_2, weight=[0.5, 0.5]))
################################################################################
# Collection templates
@@ -291,21 +412,33 @@ arr_distributions["uniform_arr_int_10000_10_s_empty_80"] = RandomDistribution.mi
# that is committed to git, by default we generate only 100 and 1000 document collections.
# These are not sufficient for actual CE accuracy testing. Whenever one needs to estimate CE
# accuracy, they should generate larger datasets offline. To achieve this, set
-# collection_cardinalities = [100, 1000, 10000, 100000]
+# collection_cardinalities = [1000, 10000, 100000]
# Notice that such sizes result in several minutes load time on the JS test side.
-collection_cardinalities = [100, 1000]
+collection_cardinalities = [500]
field_templates = [
- config.FieldTemplate(name=f'{dist_name}', data_type=config.DataType.INTEGER, distribution=dist,
- indexed=False) for dist_name, dist in int_distributions.items()
+ config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.INTEGER, distribution=dist,
+ indexed=False) for dist in int_distributions
+]
+field_templates += [
+ config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.STRING, distribution=dist,
+ indexed=False) for dist in str_distributions
+]
+field_templates += [
+ config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.ARRAY, distribution=dist,
+ indexed=False) for dist in arr_distributions
+]
+field_templates += [
+ config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.DOUBLE, distribution=dist,
+ indexed=False) for dist in dbl_distributions
]
field_templates += [
- config.FieldTemplate(name=f'{dist_name}', data_type=config.DataType.STRING, distribution=dist,
- indexed=False) for dist_name, dist in str_distributions.items()
+ config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.DATE, distribution=dist,
+ indexed=False) for dist in dt_distributions
]
field_templates += [
- config.FieldTemplate(name=f'{dist_name}', data_type=config.DataType.ARRAY, distribution=dist,
- indexed=False) for dist_name, dist in arr_distributions.items()
+ config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.MIXDATA, distribution=dist,
+ indexed=False) for dist in mix_distributions
]
ce_data = config.CollectionTemplate(name="ce_data", fields=field_templates, compound_indexes=[],
diff --git a/buildscripts/cost_model/ce_generate_data.py b/buildscripts/cost_model/ce_generate_data.py
index f9aea10e1a7..01346ff639a 100644
--- a/buildscripts/cost_model/ce_generate_data.py
+++ b/buildscripts/cost_model/ce_generate_data.py
@@ -29,6 +29,7 @@
import asyncio
import dataclasses
+from datetime import datetime
import json
import math
import os
@@ -38,10 +39,11 @@ from pathlib import Path
import seaborn as sns
import bson
import matplotlib.pyplot as plt
-from config import CollectionTemplate, FieldTemplate, DataType
+from config import CollectionTemplate, FieldTemplate
from data_generator import CollectionInfo, DataGenerator
from database_instance import DatabaseInstance
import parameters_extractor
+from random_generator import DataType
from ce_data_settings import database_config, data_generator_config
__all__ = []
@@ -73,6 +75,8 @@ class OidEncoder(json.JSONEncoder):
# Replace the OID with a consecutive int number as needed by the query generator
OidEncoder.cur_oid += 1
return OidEncoder.cur_oid
+ if isinstance(o, datetime):
+ return str(o)
return super(OidEncoder, self).default(o)
@@ -107,6 +111,8 @@ async def generate_histograms(coll_template, coll, dump_path):
doc_count = await coll.count_documents({})
for field in coll_template.fields:
field_data = []
+ if re.match('^mixeddata_.*', field.name):
+ continue
async for doc in coll.find({field.name: {"$exists": True}}, {"_id": 0, field.name: 1}):
field_val = doc[field.name]
if isinstance(field_val, str):
diff --git a/buildscripts/cost_model/config.py b/buildscripts/cost_model/config.py
index ed4bc99731e..fa548c84a70 100644
--- a/buildscripts/cost_model/config.py
+++ b/buildscripts/cost_model/config.py
@@ -31,7 +31,7 @@ from __future__ import annotations
from dataclasses import dataclass
from enum import Enum
from typing import Callable, Sequence
-from random_generator import RandomDistribution
+from random_generator import RandomDistribution, DataType
@dataclass
@@ -100,17 +100,6 @@ class FieldTemplate:
indexed: bool
-class DataType(Enum):
- """Data types."""
-
- INTEGER = 0
- STRING = 1
- ARRAY = 2
-
- def __str__(self):
- return self.name.lower()[:3]
-
-
@dataclass
class AbtNodeCalibrationConfig:
type: str
diff --git a/buildscripts/cost_model/data_generator.py b/buildscripts/cost_model/data_generator.py
index f24c7aef3d9..cf60e460e71 100644
--- a/buildscripts/cost_model/data_generator.py
+++ b/buildscripts/cost_model/data_generator.py
@@ -35,8 +35,8 @@ import asyncio
import pymongo
from pymongo import IndexModel
from motor.motor_asyncio import AsyncIOMotorCollection
-from random_generator import RandomDistribution
-from config import DataGeneratorConfig, DataType, WriteMode
+from random_generator import RandomDistribution, DataType
+from config import DataGeneratorConfig, WriteMode
from database_instance import DatabaseInstance
__all__ = ['DataGenerator']
diff --git a/buildscripts/cost_model/random_generator.py b/buildscripts/cost_model/random_generator.py
index 1dcc2bb004c..9710b38db6f 100644
--- a/buildscripts/cost_model/random_generator.py
+++ b/buildscripts/cost_model/random_generator.py
@@ -30,23 +30,50 @@
from __future__ import annotations
from ctypes import Union
from dataclasses import dataclass
+from datetime import datetime
from enum import Enum
from itertools import chain
from typing import Generic, Sequence, TypeVar
import numpy as np
+import random
__all__ = ['RangeGenerator', 'DataType', 'RandomDistribution']
-
-class DataType(Enum):
- """Data type enum for data generators."""
-
- STRING = 0
- INTEGER = 1
- FLOAT = 2
+TVar = TypeVar('TVar', str, int, float, datetime)
-TVar = TypeVar('TVar', str, int, float)
+class DataType(Enum):
+    """MongoDB data types of collection fields. Ordered according to BSON type order."""
+
+    DOUBLE = 1
+    STRING = 2
+    OBJECT = 3
+    ARRAY = 4
+    OBJECTID = 7
+    BOOLEAN = 8
+    DATE = 9
+    NULL = 10
+    INTEGER = 16  # Both 32 and 64 bit ints
+    TIMESTAMP = 17
+    DECIMAL128 = 19
+    MIXDATA = 42
+
+    def __str__(self):
+        """Return the short abbreviation of this type (e.g. 'dbl' for DOUBLE)."""
+        return _DATA_TYPE_ABBREVIATIONS[self]
+
+
+# Abbreviation of every data type. Defined outside the class body so that Enum does not turn
+# the mapping itself into a member.
+_DATA_TYPE_ABBREVIATIONS = {
+    DataType.DOUBLE: 'dbl',
+    DataType.STRING: 'str',
+    DataType.OBJECT: 'obj',
+    DataType.ARRAY: 'arr',
+    DataType.OBJECTID: 'oid',
+    DataType.BOOLEAN: 'bool',
+    DataType.DATE: 'dt',
+    DataType.NULL: 'null',
+    DataType.INTEGER: 'int',
+    DataType.TIMESTAMP: 'ts',
+    DataType.DECIMAL128: 'dec',
+    DataType.MIXDATA: 'mixdata',
+}
@dataclass
@@ -57,12 +84,33 @@ class RangeGenerator(Generic[TVar]):
interval_begin: TVar
interval_end: TVar
step: int = 1
+ ndv: int = -1
+
+    def __post_init__(self):
+        """Validate the interval ends and derive ``ndv``, the number of steps in the interval.
+
+        ``ndv`` is computed for numeric and datetime intervals. Datetime ends are converted to
+        POSIX timestamps first, so ``step`` is in seconds for them. For any other type (e.g.
+        strings) ``ndv`` keeps the value it was constructed with.
+
+        Raises:
+            AssertionError: if the two interval ends are of different types.
+        """
+        assert type(self.interval_begin) is type(
+            self.interval_end), 'Interval ends must be of the same type.'
+        begin, end = self.interval_begin, self.interval_end
+        if isinstance(begin, datetime):
+            # Measure date intervals in seconds.
+            begin, end = begin.timestamp(), end.timestamp()
+        elif not isinstance(begin, (int, float)):
+            # No arithmetic is defined for the remaining types; leave ndv untouched.
+            return
+        self.ndv = round((end - begin) / self.step)
def generate(self) -> Sequence[TVar]:
"""Generate the range."""
gen_range_dict = {
- DataType.STRING: ansi_range, DataType.INTEGER: range, DataType.FLOAT: np.arange
+ DataType.STRING:
+ ansi_range,
+ DataType.INTEGER:
+ range,
+ # The arange function produces equi-distant values which is too regular for CE testing.
+ # It is left here as a possible way of generating doubles.
+ # DataType.DOUBLE: np.arange
+ DataType.DOUBLE:
+ double_range,
+ DataType.DATE:
+ datetime_range,
}
gen_range = gen_range_dict.get(self.data_type)
@@ -71,6 +119,26 @@ class RangeGenerator(Generic[TVar]):
return list(gen_range(self.interval_begin, self.interval_end, self.step))
+    def __str__(self):
+        """Return a compact name of the range: ``<type>_<begin>-<end>-<step>``.
+
+        Datetime ends are rendered by their date part only. Dots are replaced with commas and
+        spaces with underscores so that the result can be used in field names.
+        """
+
+        # TODO: for now skip NDV from the name to make it shorter.
+
+        def endpoint_name(endpoint):
+            if isinstance(endpoint, datetime):
+                return str(endpoint.date())
+            return str(endpoint)
+
+        first = endpoint_name(self.interval_begin)
+        last = endpoint_name(self.interval_end)
+        name = f'{str(self.data_type)}_{first}-{last}-{self.step}'
+        return name.replace('.', ',').replace(' ', '_')
+
+
+def double_range(begin: float, end: float, step: float = 1.0):
+    """Draw uniformly distributed random doubles from the interval [begin, end).
+
+    ``step`` only determines how many values are produced: round((end - begin) / step).
+    The values themselves are random, so they are not equally spaced.
+    """
+
+    value_count = round((end - begin) / step)
+    rng = np.random.default_rng()
+    return rng.uniform(begin, end, value_count)
+
def ansi_range(begin: str, end: str, step: int = 1):
"""Produces a sequence of string from begin to end."""
@@ -122,14 +190,29 @@ def ansi_range(begin: str, end: str, step: int = 1):
yield f'{prefix}{int_to_ansi(number)}'
+def datetime_range(begin: datetime, end: datetime, step: int = 60):
+    """Yield random datetimes whose timestamps lie between ``begin`` and ``end``.
+
+    ``step`` is in seconds and only determines how many values are produced:
+    round(<seconds between begin and end> / step). Every value is drawn independently, so the
+    same datetime can be produced more than once.
+
+    Raises:
+        AssertionError: on first iteration, if the interval yields less than one value.
+    """
+    low_ts = begin.timestamp()
+    high_ts = end.timestamp()
+    value_count = round((high_ts - low_ts) / step)
+    assert value_count >= 1, "Datetime range must be bigger than the step."
+    # A duplicate-free alternative would be random.sample() over the integer timestamp range.
+    for _ in range(value_count):
+        yield datetime.fromtimestamp(np.random.randint(low_ts, high_ts))
+
+
class DistributionType(Enum):
"""An enum of distributions supported by Random Data Generator."""
CHOICE = 0
NORMAL = 1
- NONCENTRAL_CHISQUARE = 2
+ CHI2 = 2 # NONCENTRAL_CHISQUARE
UNIFORM = 3
- MIXED = 4
+ MIXDIST = 4
+
+ def __str__(self):
+ return self.name.lower()
_rng = np.random.default_rng()
@@ -142,12 +225,42 @@ class RandomDistribution:
distribution_type: DistributionType
values: Union[Sequence[TVar], RangeGenerator]
weights: Union[Sequence[float], None]
+ values_name: str = ''
+ weights_name: str = ''
+
+ def __str__(self):
+ def print_values(vals):
+ if isinstance(vals, RangeGenerator):
+ return str(vals)
+ elif isinstance(vals[0], RandomDistribution):
+ # Must be a mixed distribution
+ res = ''
+ for distr in vals:
+ res += f'{str(distr)}_'
+ return res
+ else:
+ # All values are of the same type because of how RangeGenerator works
+ return f'{type(vals[0]).__name__}_{min(vals)}_{max(vals)}_{len(vals)}'
+
+ range_str = ''
+ if hasattr(self, 'values'):
+ range_str = print_values(self.values)
+ elif self.values_name != '':
+ range_str = f'{self.values_name}'
+ if self.weights_name != '':
+ range_str += f'_{self.weights_name}'
+
+ distr_str = f'{str(self.distribution_type)}_{range_str}'
+ if isinstance(self, ArrayRandomDistribution):
+ distr_str += f'array_{str(self.value_distr)}'
+ return distr_str
@staticmethod
- def choice(values: Sequence[TVar], weights: Union[Sequence[float], RangeGenerator]):
+ def choice(values: Sequence[TVar], weights: Union[Sequence[float], RangeGenerator],
+ v_name: str = '', w_name: str = ''):
"""Create choice distribution."""
return RandomDistribution(distribution_type=DistributionType.CHOICE, values=values,
- weights=weights)
+ weights=weights, values_name=v_name, weights_name=w_name)
@staticmethod
def normal(values: Union[Sequence[TVar], RangeGenerator]):
@@ -158,8 +271,8 @@ class RandomDistribution:
@staticmethod
def noncentral_chisquare(values: Union[Sequence[TVar], RangeGenerator]):
"""Create Non Central Chi2 distribution."""
- return RandomDistribution(distribution_type=DistributionType.NONCENTRAL_CHISQUARE,
- values=values, weights=None)
+ return RandomDistribution(distribution_type=DistributionType.CHI2, values=values,
+ weights=None)
@staticmethod
def uniform(values: Union[Sequence[TVar], RangeGenerator]):
@@ -171,7 +284,7 @@ class RandomDistribution:
def mixed(children: Sequence[RandomDistribution],
weight: Union[Sequence[float], RangeGenerator]):
"""Create mixed distribution."""
- return RandomDistribution(distribution_type=DistributionType.MIXED, values=children,
+ return RandomDistribution(distribution_type=DistributionType.MIXDIST, values=children,
weights=weight)
def generate(self, size: int) -> Sequence[TVar]:
@@ -202,9 +315,9 @@ class RandomDistribution:
generators = {
DistributionType.CHOICE: RandomDistribution._choice,
DistributionType.NORMAL: RandomDistribution._normal,
- DistributionType.NONCENTRAL_CHISQUARE: RandomDistribution._noncentral_chisquare,
+ DistributionType.CHI2: RandomDistribution._noncentral_chisquare,
DistributionType.UNIFORM: RandomDistribution._uniform,
- DistributionType.MIXED: RandomDistribution._mixed,
+ DistributionType.MIXDIST: RandomDistribution._mixed,
}
gen = generators.get(self.distribution_type)
@@ -215,7 +328,7 @@ class RandomDistribution:
def get_values(self):
"""Return a list of values used to generate a random sequence."""
- if self.distribution_type == DistributionType.MIXED:
+ if self.distribution_type == DistributionType.MIXDIST:
result = []
for child in self.values:
result.append(child.get_values())
@@ -237,7 +350,7 @@ class RandomDistribution:
# According to the 68-95-99.7 rule, 99.7% of values lie within three standard deviations of the mean.
# Therefore, if we define stddev as `len(values) / 6`, 99.7% of the values will lie within our `values` array bounds.
# We define stddev as `len(values) / 6.5` to make sure that almost all values are
- # withing the boundaries and we don't have to cut the index too often.
+ # within the boundaries and we don't have to cut the index too often.
mean = len(values) / 2
stddev = len(values) / 6.5
@@ -283,29 +396,37 @@ class RandomDistribution:
@staticmethod
def _mixed(size: int, children: Sequence[RandomDistribution], probs: Sequence[float]):
if probs is None:
- raise ValueError("props must be specified for mixed distribution")
+ raise ValueError(f'probs must be specified for mixed distributions: {str(children)}')
result = []
for child_distr, prob in zip(children, probs):
if not isinstance(child_distr, RandomDistribution):
raise ValueError(
- "children must be of type RandomDistribution for mixed distribution")
+ f'children must be of type RandomDistribution for mixed distribution, child_distr: {child_distr}'
+ )
child_size = int(size * prob)
result.append(child_distr.generate(child_size))
return list(chain.from_iterable(result))
+_NO_DEFAULT = object()
+
+
@dataclass
class ArrayRandomDistribution(RandomDistribution):
"""Produces random array sequence of the specified values with the specified distribution."""
- lengths_distr: RandomDistribution
- value_distr: RandomDistribution
+ lengths_distr: RandomDistribution = _NO_DEFAULT
+ value_distr: RandomDistribution = _NO_DEFAULT
def __init__(self, lengths_distr: RandomDistribution, value_distr: RandomDistribution):
self.lengths_distr = lengths_distr
self.value_distr = value_distr
+ self.distribution_type = value_distr.distribution_type
+
+ def __str__(self):
+ return f'{super().__str__()}'
def generate(self, size: int):
"""Generate random array sequence of the given size."""
@@ -324,20 +445,24 @@ class ArrayRandomDistribution(RandomDistribution):
class DocumentRandomDistribution(RandomDistribution):
"""Produces random document sequence of the specified values with the specified distribution."""
- number_of_fields_distr: RandomDistribution
- fields_distr: RandomDistribution
- field_to_distribution: dict
+ number_of_fields_distr: RandomDistribution = _NO_DEFAULT
+ fields_distr: RandomDistribution = _NO_DEFAULT
+ field_to_distribution: dict = _NO_DEFAULT
def __init__(self, number_of_fields_distr: RandomDistribution, fields_distr: RandomDistribution,
field_to_distribution: dict):
self.number_of_fields_distr = number_of_fields_distr
self.fields_distr = fields_distr
self.field_to_distribution = field_to_distribution
+ self.distribution_type = fields_distr.distribution_type
for field in self.get_fields():
if field not in self.field_to_distribution:
raise ValueError("Must provide a RandomDistribution for each field")
+ def __str__(self):
+ return f'{super().__str__()}'
+
def generate(self, size: int):
"""Generate random document sequence of the given size."""
docs = []
@@ -373,14 +498,14 @@ if __name__ == '__main__':
def print_distr(title, distr, size=10000):
"""Print distribution."""
- print(f'\n{title}\n')
+ print(f'\n{title}: {str(distr)}\n')
rs = distr.generate(size)
has_arrays = any(isinstance(elem, list) for elem in rs)
has_dict = any(isinstance(elem, dict) for elem in rs)
if not has_arrays and not has_dict:
counter = Counter(rs)
- for value in distr.get_values():
+ for value in [*Counter(rs)]:
count = counter[value]
if isinstance(value, float):
print(f'{value:.2f}\t{count}\t{(count//10)*"*"}')
@@ -402,9 +527,24 @@ if __name__ == '__main__':
int_noncentral_chisquare = RandomDistribution.noncentral_chisquare(list(range(1, 30)))
print_distr("Noncentral Chisquare for integers", int_noncentral_chisquare)
- float_uniform = RandomDistribution.uniform(RangeGenerator(DataType.FLOAT, 0.1, 10.0, 0.37))
+ float_uniform = RandomDistribution.uniform(RangeGenerator(DataType.DOUBLE, 0.1, 10.0, 0.37))
print_distr("Uniform for floats", float_uniform)
+ float_normal = RandomDistribution.normal(RangeGenerator(DataType.DOUBLE, 0.1, 10.0, 0.37))
+ print_distr("Normal for floats", float_normal)
+
+ FOUR_DAYS_IN_SECONDS = 60 * 20 * 24 * 12
+
+ date_uniform = RandomDistribution.uniform(
+ RangeGenerator(DataType.DATE, datetime(2007, 1, 1), datetime(2008, 1, 1),
+ FOUR_DAYS_IN_SECONDS))
+ print_distr("Uniform for dates", date_uniform, size=1000)
+
+ date_normal = RandomDistribution.normal(
+ RangeGenerator(DataType.DATE, datetime(2007, 1, 1), datetime(2008, 1, 1),
+ FOUR_DAYS_IN_SECONDS))
+ print_distr("Normal for dates", date_normal, size=1000)
+
str_chisquare2 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "aa", "ba"))
str_normal2 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "ap", "bp"))
mixed = RandomDistribution.mixed(children=[float_uniform, str_chisquare2, str_normal2],