diff options
author | Timour Katchaounov <timour.katchaounov@mongodb.com> | 2023-02-15 15:01:33 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-02-15 16:32:12 +0000 |
commit | 5af5e4a1e8bc6930f837776cb533333d053a32f1 (patch) | |
tree | ce130cdfab062ee2c47856e1050df62cf57ab747 /buildscripts | |
parent | 9962320d74254ec0517ba4f478386f7f0bcb1f9e (diff) | |
download | mongo-5af5e4a1e8bc6930f837776cb533333d053a32f1.tar.gz |
SERVER-73031 Generate random data with mixed data types
* Added generation of random data with mixed data types
* Generation of random dates and doubles
* Some refactoring of the Python generation framework with respect to types
Diffstat (limited to 'buildscripts')
-rw-r--r-- | buildscripts/cost_model/calibration_settings.py | 2 | ||||
-rw-r--r-- | buildscripts/cost_model/ce_data_settings.py | 389 | ||||
-rw-r--r-- | buildscripts/cost_model/ce_generate_data.py | 8 | ||||
-rw-r--r-- | buildscripts/cost_model/config.py | 13 | ||||
-rw-r--r-- | buildscripts/cost_model/data_generator.py | 4 | ||||
-rw-r--r-- | buildscripts/cost_model/random_generator.py | 200 |
6 files changed, 442 insertions, 174 deletions
diff --git a/buildscripts/cost_model/calibration_settings.py b/buildscripts/cost_model/calibration_settings.py index 12e43b82b80..024b25220f0 100644 --- a/buildscripts/cost_model/calibration_settings.py +++ b/buildscripts/cost_model/calibration_settings.py @@ -29,7 +29,7 @@ import random import config -from random_generator import RangeGenerator, DataType, RandomDistribution, ArrayRandomDistribution +from random_generator import RangeGenerator, RandomDistribution, ArrayRandomDistribution, DataType __all__ = ['main_config', 'distributions'] diff --git a/buildscripts/cost_model/ce_data_settings.py b/buildscripts/cost_model/ce_data_settings.py index 4bd60f79c5d..f9f1e1c049b 100644 --- a/buildscripts/cost_model/ce_data_settings.py +++ b/buildscripts/cost_model/ce_data_settings.py @@ -28,10 +28,11 @@ """Configuration of data generation for CE accuracy testing.""" from pathlib import Path +from datetime import datetime import random from typing import Sequence import config -from random_generator import RangeGenerator, DataType, RandomDistribution, ArrayRandomDistribution +from random_generator import RangeGenerator, RandomDistribution, ArrayRandomDistribution, DataType, DistributionType __all__ = ['database_config', 'data_generator_config'] @@ -39,70 +40,157 @@ __all__ = ['database_config', 'data_generator_config'] # Data distributions ################################################################################ + +def add_distribution(distr_set: Sequence[RandomDistribution], distr_type: DistributionType, + rg: RangeGenerator): + distr = None + if distr_type == DistributionType.UNIFORM: + distr = RandomDistribution.uniform(rg) + elif distr_type == DistributionType.NORMAL: + distr = RandomDistribution.normal(rg) + elif distr_type == DistributionType.CHI2: + distr = RandomDistribution.noncentral_chisquare(rg) + else: + raise ValueError("Unknown distribution") + distr_set.append(distr) + + # Ranges +int_ranges_1 = [ + # 1K unique integers with different distances + 
RangeGenerator(DataType.INTEGER, 0, 1000, 1), + RangeGenerator(DataType.INTEGER, 0, 10000, 10), + RangeGenerator(DataType.INTEGER, 0, 100000, 100), + # 10K unique integers with different distances + RangeGenerator(DataType.INTEGER, 0, 10000, 1), + RangeGenerator(DataType.INTEGER, 0, 1000000, 10), + RangeGenerator(DataType.INTEGER, 0, 10000000, 100), +] -# 1K unique numbers with different distances -range_int_1000_1 = RangeGenerator(DataType.INTEGER, 0, 1000, 1) -range_int_1000_10 = RangeGenerator(DataType.INTEGER, 0, 10000, 10) -range_int_1000_100 = RangeGenerator(DataType.INTEGER, 0, 100000, 100) -range_int_1000_1000 = RangeGenerator(DataType.INTEGER, 0, 1000000, 1000) -# 10K unique numbers with different distances -range_int_10000_1 = RangeGenerator(DataType.INTEGER, 0, 10000, 1) -range_int_10000_10 = RangeGenerator(DataType.INTEGER, 0, 100000, 10) -range_int_10000_100 = RangeGenerator(DataType.INTEGER, 0, 1000000, 100) -range_int_10000_1000 = RangeGenerator(DataType.INTEGER, 0, 10000000, 1000) -int_ranges = [ - range_int_1000_1, range_int_1000_10, range_int_1000_100, range_int_1000_1000, range_int_10000_1, - range_int_10000_10, range_int_10000_100, range_int_10000_1000 +int_ranges_2 = [ + # 1K unique integers with different distances + RangeGenerator(DataType.INTEGER, 7000, 8000, 1), + RangeGenerator(DataType.INTEGER, 70000, 80000, 10), + RangeGenerator(DataType.INTEGER, 700000, 800000, 100), + # 10K unique integers with different distances + RangeGenerator(DataType.INTEGER, 70000, 80000, 1), + RangeGenerator(DataType.INTEGER, 700000, 800000, 10), + RangeGenerator(DataType.INTEGER, 7000000, 8000000, 100), ] +####################### # Integer distributions -int_distributions = {} -for int_range in int_ranges: - int_distributions[ - f'uniform_int_{int_range.interval_end - int_range.interval_begin}_{int_range.step}'] = RandomDistribution.uniform( - int_range) - int_distributions[ - f'normal_int_{int_range.interval_end - int_range.interval_begin}_{int_range.step}'] 
= RandomDistribution.normal( - int_range) - int_distributions[ - f'chi2_int_{int_range.interval_end - int_range.interval_begin}_{int_range.step}'] = RandomDistribution.noncentral_chisquare( - int_range) -# Mixes of distributions with different NDV and value distances +int_distributions = [] -unf_int_mix_1 = [ - int_distributions['uniform_int_10000_10'], int_distributions['uniform_int_100000_100'], - int_distributions['uniform_int_10000000_1000'] -] -int_distributions['mixed_int_uniform_1'] = RandomDistribution.mixed(children=unf_int_mix_1, - weight=[1, 1, 1]) +for range_gen in int_ranges_1: + add_distribution(int_distributions, DistributionType.UNIFORM, range_gen) + add_distribution(int_distributions, DistributionType.NORMAL, range_gen) + add_distribution(int_distributions, DistributionType.CHI2, range_gen) -unf_norm_int_mix_1 = [ - int_distributions['uniform_int_1000_1'], int_distributions['normal_int_100000_100'], - int_distributions['normal_int_10000000_1000'] -] -int_distributions['mixed_int_unf_norm_1'] = RandomDistribution.mixed(children=unf_norm_int_mix_1, - weight=[1, 1, 1]) +# Distributions to be used only in other mixed distributions +int_distributions_offset = [] +for range_gen in int_ranges_2: + add_distribution(int_distributions_offset, DistributionType.UNIFORM, range_gen) + add_distribution(int_distributions_offset, DistributionType.NORMAL, range_gen) + add_distribution(int_distributions_offset, DistributionType.CHI2, range_gen) -unf_norm_chi_int_mix_1 = [ - int_distributions['uniform_int_10000_10'], int_distributions['uniform_int_1000000_100'], - int_distributions['normal_int_10000_10'], int_distributions['normal_int_1000000_100'], - int_distributions['chi2_int_10000_10'], int_distributions['chi2_int_10000000_1000'] -] -int_distributions['mixed_int_unf_norm_chi_1'] = RandomDistribution.mixed( - children=unf_norm_chi_int_mix_1, weight=[1, 1, 1, 1, 1, 1]) - -unf_norm_chi_int_mix_2 = [ - int_distributions['uniform_int_10000_10'], - 
int_distributions['normal_int_10000_10'], - int_distributions['uniform_int_1000000_100'], - int_distributions['normal_int_1000000_100'], - int_distributions['chi2_int_1000000_100'], +# Mixes of distributions with different NDV and value distances +int_distributions.append( + RandomDistribution.mixed( + children=[int_distributions[0], int_distributions_offset[0], int_distributions[4]], + weight=[1, 1, 1])) + +int_distributions.append( + RandomDistribution.mixed( + children=[int_distributions[1], int_distributions[4], int_distributions[7]], + weight=[1, 1, 1])) + +int_distributions.append( + RandomDistribution.mixed( + children=[ + int_distributions[1], int_distributions_offset[1], int_distributions[3], + int_distributions[2], int_distributions_offset[2] + ], weight=[1, 1, 1, 1, 1])) + +int_distributions.append( + RandomDistribution.mixed( + children=[ + int_distributions[2], int_distributions[3], int_distributions[6], + int_distributions_offset[1], int_distributions_offset[2], int_distributions_offset[5] + ], weight=[1, 1, 1, 1, 1, 1])) + +############################# +# Double number distributions + +dbl_ranges = [ + # 1K unique doubles with different distances + RangeGenerator(DataType.DOUBLE, 0.0, 100.0, 0.1), + RangeGenerator(DataType.DOUBLE, 0.0, 10000.0, 10), + RangeGenerator(DataType.DOUBLE, 0.0, 1000000.0, 1000), + # 10K unique doubles with different distances + RangeGenerator(DataType.DOUBLE, 0.0, 1000.0, 0.1), + RangeGenerator(DataType.DOUBLE, 0.0, 100000.0, 10), + RangeGenerator(DataType.DOUBLE, 0.0, 10000000.0, 1000) ] -int_distributions['mixed_int_unf_norm_chi_2'] = RandomDistribution.mixed( - children=unf_norm_chi_int_mix_2, weight=[1, 1, 1, 1, 1]) +dbl_distributions = [] + +for range_gen in dbl_ranges: + add_distribution(dbl_distributions, DistributionType.UNIFORM, range_gen) + add_distribution(dbl_distributions, DistributionType.NORMAL, range_gen) + +dbl_distributions.append( + RandomDistribution.mixed( + children=[dbl_distributions[0], 
dbl_distributions[3], dbl_distributions[10]], + weight=[1, 1, 1])) + +dbl_distributions.append( + RandomDistribution.mixed( + children=[ + dbl_distributions[0], + dbl_distributions[4], + RandomDistribution.normal(RangeGenerator(DataType.DOUBLE, 500.0, 600.0, 0.1)), + RandomDistribution.normal(RangeGenerator(DataType.DOUBLE, 3000200.0, 5000100.0, 3030)), + ], weight=[1, 1, 1, 1])) + +############################# +# Date distributions + +MINUTE = 60 +HOUR = MINUTE * 60 +DAY = HOUR * 24 +MONTH = DAY * 30 + +range_dtt_1y = RangeGenerator(DataType.DATE, datetime(2007, 1, 1), datetime(2008, 1, 1), HOUR) +range_dtt_1m_1 = RangeGenerator(DataType.DATE, datetime(2007, 2, 1), datetime(2008, 3, 1), HOUR) +range_dtt_1m_2 = RangeGenerator(DataType.DATE, datetime(2007, 6, 1), datetime(2008, 7, 1), HOUR) +range_dtt_1m_3 = RangeGenerator(DataType.DATE, datetime(2007, 10, 1), datetime(2008, 11, 1), HOUR) +range_dtt_10y_1 = RangeGenerator(DataType.DATE, datetime(2006, 1, 1), datetime(2016, 1, 1), DAY) +range_dtt_10y_2 = RangeGenerator(DataType.DATE, datetime(1995, 1, 1), datetime(2005, 1, 1), DAY) +range_dtt_20y = RangeGenerator(DataType.DATE, datetime(1997, 10, 1), datetime(2017, 11, 1), MONTH) + +dt_distributions = [] + +add_distribution(dt_distributions, DistributionType.UNIFORM, range_dtt_1y) +add_distribution(dt_distributions, DistributionType.NORMAL, range_dtt_10y_1) + +dt_distributions.append( + RandomDistribution.mixed([ + RandomDistribution.uniform(range_dtt_1y), + RandomDistribution.uniform(range_dtt_1m_1), + RandomDistribution.uniform(range_dtt_1m_2), + RandomDistribution.uniform(range_dtt_1m_3) + ], [1, 1, 1, 1])) + +dt_distributions.append( + RandomDistribution.mixed([ + RandomDistribution.uniform(range_dtt_10y_1), + RandomDistribution.uniform(range_dtt_10y_2), + RandomDistribution.uniform(range_dtt_20y) + ], [1, 1, 1])) + +####################### # String distributions PRINTED_CHAR_MIN_CODE = ord('0') @@ -173,14 +261,18 @@ d4 = 
RandomDistribution.uniform(range_int_20_30) # Sets of strings where characters at different positions have different distances string_sets = {} -# 33 unique strings -string_sets['set_1112_33'] = generate_str_by_distance(33, 'xxxx', d1, d1, d1, d2) -string_sets['set_2221_33'] = generate_str_by_distance(33, 'azay', d2, d2, d3, d1) -string_sets['set_5555_33'] = generate_str_by_distance(33, 'axbz', d4, d4, d4, d4) +# 250 unique strings +string_sets['string_1112_250'] = generate_str_by_distance(250, 'xxxx', d1, d1, d1, d2) +string_sets['string_2221_250'] = generate_str_by_distance(250, 'azay', d2, d2, d3, d1) +string_sets['string_5555_250'] = generate_str_by_distance(250, 'axbz', d4, d4, d4, d4) # 1000 unique strings -string_sets['set_1112_1000'] = generate_str_by_distance(1000, 'xxxx', d1, d1, d1, d2) -string_sets['set_2221_1000'] = generate_str_by_distance(1000, 'azay', d2, d2, d3, d1) -string_sets['set_5555_1000'] = generate_str_by_distance(1000, 'axbz', d4, d4, d4, d4) +string_sets['string_1112_1000'] = generate_str_by_distance(1000, 'xxxx', d1, d1, d1, d2) +string_sets['string_2221_1000'] = generate_str_by_distance(1000, 'azay', d2, d2, d3, d1) +string_sets['string_5555_1000'] = generate_str_by_distance(1000, 'axbz', d4, d4, d4, d4) +# 10000 unique strings +string_sets['string_1112_10000'] = generate_str_by_distance(10000, 'xxxx', d1, d1, d1, d2) +string_sets['string_2221_10000'] = generate_str_by_distance(10000, 'azay', d2, d2, d3, d1) +string_sets['string_5555_10000'] = generate_str_by_distance(10000, 'axbz', d4, d4, d4, d4) # Weights with different variance. For instance if the smallest weight is 1, and the biggest weight is 5 # then some values in a choice distribution will be picked with at most 5 times higher probability. 
@@ -197,92 +289,121 @@ weights['unif_s'] = RandomDistribution.uniform(weight_range_s) weights['unif_l'] = RandomDistribution.uniform(weight_range_l) weights['norm_s'] = RandomDistribution.normal(weight_range_s) weights['norm_l'] = RandomDistribution.normal(weight_range_l) -weights['chi2_s'] = RandomDistribution.noncentral_chisquare(weight_range_s) -weights['chi2_l'] = RandomDistribution.noncentral_chisquare(weight_range_l) +#weights['chi2_s'] = RandomDistribution.noncentral_chisquare(weight_range_s) +#weights['chi2_l'] = RandomDistribution.noncentral_chisquare(weight_range_l) -def make_choice_distr(str_set: Sequence[str], weight_distr: RandomDistribution): - return RandomDistribution.choice(str_set, weight_distr.generate(len(str_set))) + +def add_choice_distr(distr_set: Sequence[RandomDistribution], str_set: Sequence[str], + weight_distr: RandomDistribution, v_name: str, w_name: str): + distr = RandomDistribution.choice(str_set, weight_distr.generate(len(str_set)), v_name, w_name) + distr_set.append(distr) # String data distributions to be used for string generation -str_distributions = {} +str_distributions = [] for set_name, cur_set in string_sets.items(): - for weight_name, weight in weights.items(): - str_distributions[f'choice_str_{set_name}_{weight_name}'] = make_choice_distr( - cur_set, weight) + for weight_name, cur_weight in weights.items(): + add_choice_distr(str_distributions, cur_set, cur_weight, set_name, weight_name) +####################### # Array distributions # array lenght distributions - they are all uniform -arr_zero_size = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 0, 1, 1)) arr_len_dist_s = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 1, 6, 1)) arr_len_dist_m = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 90, 110, 3)) arr_len_dist_l = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 900, 1100, 10)) -arr_empty_distr = ArrayRandomDistribution(arr_zero_size, 
int_distributions['uniform_int_1000_1']) -arr_distributions = {} +def add_array_distr(distr_set: Sequence[RandomDistribution], lengths_distr: RandomDistribution, + value_distr: RandomDistribution): + distr_set.append(ArrayRandomDistribution(lengths_distr, value_distr)) + + +arr_distributions = [] # Arrays with integers -arr_distributions["uniform_arr_int_10000_10_s"] = ArrayRandomDistribution( - arr_len_dist_s, int_distributions['uniform_int_10000_10']) -arr_distributions["uniform_arr_int_10000_10_m"] = ArrayRandomDistribution( - arr_len_dist_m, int_distributions['uniform_int_10000_10']) -arr_distributions["uniform_arr_int_10000_10_l"] = ArrayRandomDistribution( - arr_len_dist_l, int_distributions['uniform_int_10000_10']) -arr_distributions["mixed_arr_int_s"] = ArrayRandomDistribution( - arr_len_dist_s, int_distributions['mixed_int_unf_norm_chi_2']) -arr_distributions["mixed_arr_int_m"] = ArrayRandomDistribution( - arr_len_dist_m, int_distributions['mixed_int_unf_norm_chi_2']) -arr_distributions["mixed_arr_int_l"] = ArrayRandomDistribution( - arr_len_dist_l, int_distributions['mixed_int_unf_norm_chi_2']) +add_array_distr(arr_distributions, arr_len_dist_s, int_distributions[3]) +add_array_distr(arr_distributions, arr_len_dist_m, int_distributions[3]) +add_array_distr(arr_distributions, arr_len_dist_l, int_distributions[3]) +add_array_distr(arr_distributions, arr_len_dist_s, int_distributions[-1]) +add_array_distr(arr_distributions, arr_len_dist_m, int_distributions[-1]) +add_array_distr(arr_distributions, arr_len_dist_l, int_distributions[-1]) # Arrays with strings -arr_distributions["choice_arr_str_set_1112_33_norm_l_s"] = ArrayRandomDistribution( - arr_len_dist_s, str_distributions['choice_str_set_1112_33_norm_l']) -arr_distributions["choice_arr_str_set_1112_33_norm_l_m"] = ArrayRandomDistribution( - arr_len_dist_m, str_distributions['choice_str_set_1112_33_norm_l']) -arr_distributions["choice_arr_str_set_1112_33_norm_l_l"] = ArrayRandomDistribution( - 
arr_len_dist_l, str_distributions['choice_str_set_1112_33_norm_l']) -arr_distributions["choice_arr_str_set_5555_1000_norm_l_s"] = ArrayRandomDistribution( - arr_len_dist_s, str_distributions['choice_str_set_5555_1000_norm_l']) -arr_distributions["choice_arr_str_set_5555_1000_norm_l_m"] = ArrayRandomDistribution( - arr_len_dist_m, str_distributions['choice_str_set_5555_1000_norm_l']) -arr_distributions["choice_arr_str_set_5555_1000_norm_l_l"] = ArrayRandomDistribution( - arr_len_dist_l, str_distributions['choice_str_set_5555_1000_norm_l']) +add_array_distr(arr_distributions, arr_len_dist_s, str_distributions[1]) +add_array_distr(arr_distributions, arr_len_dist_m, str_distributions[1]) +add_array_distr(arr_distributions, arr_len_dist_l, str_distributions[1]) +add_array_distr(arr_distributions, arr_len_dist_s, str_distributions[-1]) +add_array_distr(arr_distributions, arr_len_dist_m, str_distributions[-1]) +add_array_distr(arr_distributions, arr_len_dist_l, str_distributions[-1]) # 30% scalars, 70% arrays -arr_distributions["mixed_arr_int_30_70_s"] = RandomDistribution.mixed( - [int_distributions['uniform_int_10000_10'], arr_distributions["uniform_arr_int_10000_10_s"]], - [0.3, 0.7]) -arr_distributions["mixed_arr_int_30_70_l"] = RandomDistribution.mixed( - [int_distributions['uniform_int_10000_10'], arr_distributions["uniform_arr_int_10000_10_l"]], - [0.3, 0.7]) -arr_distributions["mixed_arr_str_30_70"] = RandomDistribution.mixed([ - str_distributions['choice_str_set_2221_33_norm_l'], - arr_distributions["choice_arr_str_set_1112_33_norm_l_s"] -], [0.3, 0.7]) +arr_distributions.append( + RandomDistribution.mixed([int_distributions[0], arr_distributions[0]], [0.3, 0.7])) +arr_distributions.append( + RandomDistribution.mixed([int_distributions[-1], arr_distributions[-1]], [0.3, 0.7])) # 70% scalars, 30% arrays -arr_distributions["mixed_arr_int_70_30_s"] = RandomDistribution.mixed( - [int_distributions['uniform_int_10000_10'], 
arr_distributions["uniform_arr_int_10000_10_s"]], - [0.7, 0.3]) -arr_distributions["mixed_arr_int_70_30_l"] = RandomDistribution.mixed( - [int_distributions['uniform_int_10000_10'], arr_distributions["uniform_arr_int_10000_10_l"]], - [0.7, 0.3]) -arr_distributions["mixed_arr_str_70_30"] = RandomDistribution.mixed([ - str_distributions['choice_str_set_2221_33_norm_l'], - arr_distributions["choice_arr_str_set_1112_33_norm_l_s"] -], [0.7, 0.3]) +arr_distributions.append( + RandomDistribution.mixed([int_distributions[0], arr_distributions[0]], [0.7, 0.3])) +arr_distributions.append( + RandomDistribution.mixed([int_distributions[-1], arr_distributions[-1]], [0.7, 0.3])) + +arr_zero_size = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 0, 1, 1)) +arr_empty_distr = ArrayRandomDistribution(arr_zero_size, int_distributions[0]) # 20% empty arrays -arr_distributions["uniform_arr_int_10000_10_s_empty_20"] = RandomDistribution.mixed( - [arr_empty_distr, arr_distributions["uniform_arr_int_10000_10_s"]], [0.2, 0.8]) +arr_distributions.append( + RandomDistribution.mixed([arr_empty_distr, arr_distributions[2]], [0.2, 0.8])) # 80% empty arrays -arr_distributions["uniform_arr_int_10000_10_s_empty_80"] = RandomDistribution.mixed( - [arr_empty_distr, arr_distributions["uniform_arr_int_10000_10_s"]], [0.8, 0.2]) +arr_distributions.append( + RandomDistribution.mixed([arr_empty_distr, arr_distributions[2]], [0.8, 0.2])) + +############################### +# Mixed data type distributions + +mix_distributions = [] + +# Integers + strings +int_str_mix_1 = [int_distributions[0], str_distributions[0]] +int_str_mix_2 = [int_distributions_offset[7], str_distributions[-1]] + +mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_1, weight=[0.5, 0.5])) +mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_2, weight=[0.5, 0.5])) + +mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_1, weight=[0.1, 0.9])) 
+mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_1, weight=[0.9, 0.1])) +mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_2, weight=[0.1, 0.9])) +mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_2, weight=[0.9, 0.1])) + +# Doubles and strings +dbl_ascii_range = RangeGenerator(DataType.DOUBLE, float(PRINTED_CHAR_MIN_CODE), + float(PRINTED_CHAR_MAX_CODE), 0.01) +ascii_double_range_distr = RandomDistribution.normal(dbl_ascii_range) + +dbl_str_mix_1 = [ascii_double_range_distr, str_distributions[1]] +mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_1, weight=[0.5, 0.5])) +mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_1, weight=[0.1, 0.9])) +mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_1, weight=[0.9, 0.1])) + +dbl_str_mix_2 = [dbl_distributions[5], str_distributions[0]] +mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_2, weight=[0.5, 0.5])) + +dbl_str_mix_3 = [dbl_distributions[5], str_distributions[5]] +mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_3, weight=[0.5, 0.5])) + +# Doubles and/or strings and dates + +dbl_str_dt_mix_1 = [ascii_double_range_distr, str_distributions[4], dt_distributions[0]] +mix_distributions.append( + RandomDistribution.mixed(children=dbl_str_dt_mix_1, weight=[0.5, 0.5, 0.5])) + +str_dt_mix_1 = [str_distributions[0], dt_distributions[-1]] +mix_distributions.append(RandomDistribution.mixed(children=str_dt_mix_1, weight=[0.5, 0.5])) +str_dt_mix_2 = [str_distributions[-1], dt_distributions[0]] +mix_distributions.append(RandomDistribution.mixed(children=str_dt_mix_2, weight=[0.5, 0.5])) ################################################################################ # Collection templates @@ -291,21 +412,33 @@ arr_distributions["uniform_arr_int_10000_10_s_empty_80"] = RandomDistribution.mi # that is committed to git, by default we generate only 100 and 1000 
document collections. # These are not sufficient for actual CE accuracy testing. Whenever one needs to estimate CE # accuracy, they should generate larger datasets offline. To achieve this, set -# collection_cardinalities = [100, 1000, 10000, 100000] +# collection_cardinalities = [1000, 10000, 100000] # Notice that such sizes result in several minutes load time on the JS test side. -collection_cardinalities = [100, 1000] +collection_cardinalities = [500] field_templates = [ - config.FieldTemplate(name=f'{dist_name}', data_type=config.DataType.INTEGER, distribution=dist, - indexed=False) for dist_name, dist in int_distributions.items() + config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.INTEGER, distribution=dist, + indexed=False) for dist in int_distributions +] +field_templates += [ + config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.STRING, distribution=dist, + indexed=False) for dist in str_distributions +] +field_templates += [ + config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.ARRAY, distribution=dist, + indexed=False) for dist in arr_distributions +] +field_templates += [ + config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.DOUBLE, distribution=dist, + indexed=False) for dist in dbl_distributions ] field_templates += [ - config.FieldTemplate(name=f'{dist_name}', data_type=config.DataType.STRING, distribution=dist, - indexed=False) for dist_name, dist in str_distributions.items() + config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.DATE, distribution=dist, + indexed=False) for dist in dt_distributions ] field_templates += [ - config.FieldTemplate(name=f'{dist_name}', data_type=config.DataType.ARRAY, distribution=dist, - indexed=False) for dist_name, dist in arr_distributions.items() + config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.MIXDATA, distribution=dist, + indexed=False) for dist in mix_distributions ] ce_data = 
config.CollectionTemplate(name="ce_data", fields=field_templates, compound_indexes=[], diff --git a/buildscripts/cost_model/ce_generate_data.py b/buildscripts/cost_model/ce_generate_data.py index f9aea10e1a7..01346ff639a 100644 --- a/buildscripts/cost_model/ce_generate_data.py +++ b/buildscripts/cost_model/ce_generate_data.py @@ -29,6 +29,7 @@ import asyncio import dataclasses +from datetime import datetime import json import math import os @@ -38,10 +39,11 @@ from pathlib import Path import seaborn as sns import bson import matplotlib.pyplot as plt -from config import CollectionTemplate, FieldTemplate, DataType +from config import CollectionTemplate, FieldTemplate from data_generator import CollectionInfo, DataGenerator from database_instance import DatabaseInstance import parameters_extractor +from random_generator import DataType from ce_data_settings import database_config, data_generator_config __all__ = [] @@ -73,6 +75,8 @@ class OidEncoder(json.JSONEncoder): # Replace the OID with a consequtive int number as needed by the query generator OidEncoder.cur_oid += 1 return OidEncoder.cur_oid + if isinstance(o, datetime): + return str(o) return super(OidEncoder, self).default(o) @@ -107,6 +111,8 @@ async def generate_histograms(coll_template, coll, dump_path): doc_count = await coll.count_documents({}) for field in coll_template.fields: field_data = [] + if re.match('^mixeddata_.*', field.name): + continue async for doc in coll.find({field.name: {"$exists": True}}, {"_id": 0, field.name: 1}): field_val = doc[field.name] if isinstance(field_val, str): diff --git a/buildscripts/cost_model/config.py b/buildscripts/cost_model/config.py index ed4bc99731e..fa548c84a70 100644 --- a/buildscripts/cost_model/config.py +++ b/buildscripts/cost_model/config.py @@ -31,7 +31,7 @@ from __future__ import annotations from dataclasses import dataclass from enum import Enum from typing import Callable, Sequence -from random_generator import RandomDistribution +from random_generator 
import RandomDistribution, DataType @dataclass @@ -100,17 +100,6 @@ class FieldTemplate: indexed: bool -class DataType(Enum): - """Data types.""" - - INTEGER = 0 - STRING = 1 - ARRAY = 2 - - def __str__(self): - return self.name.lower()[:3] - - @dataclass class AbtNodeCalibrationConfig: type: str diff --git a/buildscripts/cost_model/data_generator.py b/buildscripts/cost_model/data_generator.py index f24c7aef3d9..cf60e460e71 100644 --- a/buildscripts/cost_model/data_generator.py +++ b/buildscripts/cost_model/data_generator.py @@ -35,8 +35,8 @@ import asyncio import pymongo from pymongo import IndexModel from motor.motor_asyncio import AsyncIOMotorCollection -from random_generator import RandomDistribution -from config import DataGeneratorConfig, DataType, WriteMode +from random_generator import RandomDistribution, DataType +from config import DataGeneratorConfig, WriteMode from database_instance import DatabaseInstance __all__ = ['DataGenerator'] diff --git a/buildscripts/cost_model/random_generator.py b/buildscripts/cost_model/random_generator.py index 1dcc2bb004c..9710b38db6f 100644 --- a/buildscripts/cost_model/random_generator.py +++ b/buildscripts/cost_model/random_generator.py @@ -30,23 +30,50 @@ from __future__ import annotations from ctypes import Union from dataclasses import dataclass +from datetime import datetime from enum import Enum from itertools import chain from typing import Generic, Sequence, TypeVar import numpy as np +import random __all__ = ['RangeGenerator', 'DataType', 'RandomDistribution'] - -class DataType(Enum): - """Data type enum for data generators.""" - - STRING = 0 - INTEGER = 1 - FLOAT = 2 +TVar = TypeVar('TVar', str, int, float, datetime) -TVar = TypeVar('TVar', str, int, float) +class DataType(Enum): + """MongoDB data types of collection fields. 
Ordered according to BSON type order.""" + + DOUBLE = 1 + STRING = 2 + OBJECT = 3 + ARRAY = 4 + OBJECTID = 7 + BOOLEAN = 8 + DATE = 9 + NULL = 10 + INTEGER = 16 # Both 32 and 64 bit ints + TIMESTAMP = 17 + DECIMAL128 = 19 + MIXDATA = 42 + + def __str__(self): + typenames = { + DataType.DOUBLE: 'dbl', + DataType.STRING: 'str', + DataType.OBJECT: 'obj', + DataType.ARRAY: 'arr', + DataType.OBJECTID: 'oid', + DataType.BOOLEAN: 'bool', + DataType.DATE: 'dt', + DataType.NULL: 'null', + DataType.INTEGER: 'int', + DataType.TIMESTAMP: 'ts', + DataType.DECIMAL128: 'dec', + DataType.MIXDATA: 'mixdata', + } + return typenames[self] @dataclass @@ -57,12 +84,33 @@ class RangeGenerator(Generic[TVar]): interval_begin: TVar interval_end: TVar step: int = 1 + ndv: int = -1 + + def __post_init__(self): + assert type(self.interval_begin) == type( + self.interval_end), 'Interval ends must of the same type.' + if type(self.interval_begin) == int or type(self.interval_begin) == float: + self.ndv = round((self.interval_end - self.interval_begin) / self.step) + elif type(self.interval_begin) == datetime: + begin_ts = self.interval_begin.timestamp() + end_ts = self.interval_end.timestamp() + self.ndv = round((end_ts - begin_ts) / self.step) def generate(self) -> Sequence[TVar]: """Generate the range.""" gen_range_dict = { - DataType.STRING: ansi_range, DataType.INTEGER: range, DataType.FLOAT: np.arange + DataType.STRING: + ansi_range, + DataType.INTEGER: + range, + # The arange function produces equi-distant values which is too regular for CE testing. + # It is left here as a possible way of generating doubles. + # DataType.DOUBLE: np.arange + DataType.DOUBLE: + double_range, + DataType.DATE: + datetime_range, } gen_range = gen_range_dict.get(self.data_type) @@ -71,6 +119,26 @@ class RangeGenerator(Generic[TVar]): return list(gen_range(self.interval_begin, self.interval_end, self.step)) + def __str__(self): + # TODO: for now skip NDV from the name to make it shorter. 
+ #ndv_str = "_" if self.ndv <= 0 else f'_{self.ndv}_' + begin_str = str(self.interval_begin.date()) if isinstance( + self.interval_begin, datetime) else str(self.interval_begin) + end_str = str(self.interval_end.date()) if isinstance(self.interval_end, datetime) else str( + self.interval_end) + + str_rep = f'{str(self.data_type)}_{begin_str}-{end_str}-{self.step}' + # Remove dots and spaces from field names. + str_rep = str_rep.replace('.', ',') + str_rep = str_rep.replace(' ', '_') + return str_rep + + +def double_range(begin: float, end: float, step: float = 1.0): + """Produce a sequence of double values within a range.""" + + return np.random.default_rng().uniform(begin, end, round((end - begin) / step)) + def ansi_range(begin: str, end: str, step: int = 1): """Produces a sequence of string from begin to end.""" @@ -122,14 +190,29 @@ def ansi_range(begin: str, end: str, step: int = 1): yield f'{prefix}{int_to_ansi(number)}' +def datetime_range(begin: datetime, end: datetime, step: int = 60): + begin_ts = begin.timestamp() + end_ts = end.timestamp() + num_values = round((end_ts - begin_ts) / step) + assert num_values >= 1, "Datetime range must be bigger than the step." 
+ for _ in range(0, num_values): + random_ts = np.random.randint(begin_ts, end_ts) + yield datetime.fromtimestamp(random_ts) + #random_dates = [datetime.fromtimestamp(random_ts) for random_ts in random.sample(range(int(begin_ts), int(end_ts)), num_values)] + #return random_dates + + class DistributionType(Enum): """An enum of distributions supported by Random Data Generator.""" CHOICE = 0 NORMAL = 1 - NONCENTRAL_CHISQUARE = 2 + CHI2 = 2 # NONCENTRAL_CHISQUARE UNIFORM = 3 - MIXED = 4 + MIXDIST = 4 + + def __str__(self): + return self.name.lower() _rng = np.random.default_rng() @@ -142,12 +225,42 @@ class RandomDistribution: distribution_type: DistributionType values: Union[Sequence[TVar], RangeGenerator] weights: Union[Sequence[float], None] + values_name: str = '' + weights_name: str = '' + + def __str__(self): + def print_values(vals): + if isinstance(vals, RangeGenerator): + return str(vals) + elif isinstance(vals[0], RandomDistribution): + # Must be a mixed distribution + res = '' + for distr in vals: + res += f'{str(distr)}_' + return res + else: + # All values are of the same type because of how RangeGenerator works + return f'{type(vals[0]).__name__}_{min(vals)}_{max(vals)}_{len(vals)}' + + range_str = '' + if hasattr(self, 'values'): + range_str = print_values(self.values) + elif self.values_name != '': + range_str = f'{self.values_name}' + if self.weights_name != '': + range_str += f'_{self.weights_name}' + + distr_str = f'{str(self.distribution_type)}_{range_str}' + if isinstance(self, ArrayRandomDistribution): + distr_str += f'array_{str(self.value_distr)}' + return distr_str @staticmethod - def choice(values: Sequence[TVar], weights: Union[Sequence[float], RangeGenerator]): + def choice(values: Sequence[TVar], weights: Union[Sequence[float], RangeGenerator], + v_name: str = '', w_name: str = ''): """Create choice distribution.""" return RandomDistribution(distribution_type=DistributionType.CHOICE, values=values, - weights=weights) + weights=weights, 
values_name=v_name, weights_name=w_name) @staticmethod def normal(values: Union[Sequence[TVar], RangeGenerator]): @@ -158,8 +271,8 @@ class RandomDistribution: @staticmethod def noncentral_chisquare(values: Union[Sequence[TVar], RangeGenerator]): """Create Non Central Chi2 distribution.""" - return RandomDistribution(distribution_type=DistributionType.NONCENTRAL_CHISQUARE, - values=values, weights=None) + return RandomDistribution(distribution_type=DistributionType.CHI2, values=values, + weights=None) @staticmethod def uniform(values: Union[Sequence[TVar], RangeGenerator]): @@ -171,7 +284,7 @@ class RandomDistribution: def mixed(children: Sequence[RandomDistribution], weight: Union[Sequence[float], RangeGenerator]): """Create mixed distribution.""" - return RandomDistribution(distribution_type=DistributionType.MIXED, values=children, + return RandomDistribution(distribution_type=DistributionType.MIXDIST, values=children, weights=weight) def generate(self, size: int) -> Sequence[TVar]: @@ -202,9 +315,9 @@ class RandomDistribution: generators = { DistributionType.CHOICE: RandomDistribution._choice, DistributionType.NORMAL: RandomDistribution._normal, - DistributionType.NONCENTRAL_CHISQUARE: RandomDistribution._noncentral_chisquare, + DistributionType.CHI2: RandomDistribution._noncentral_chisquare, DistributionType.UNIFORM: RandomDistribution._uniform, - DistributionType.MIXED: RandomDistribution._mixed, + DistributionType.MIXDIST: RandomDistribution._mixed, } gen = generators.get(self.distribution_type) @@ -215,7 +328,7 @@ class RandomDistribution: def get_values(self): """Return a list of values used to generate a random sequence.""" - if self.distribution_type == DistributionType.MIXED: + if self.distribution_type == DistributionType.MIXDIST: result = [] for child in self.values: result.append(child.get_values()) @@ -237,7 +350,7 @@ class RandomDistribution: # In according to the 68-95-99.7 rule 99.7% of values lie within three standard deviations of the mean. 
# Therefore, if we define stddev as `len(values) / 6` 99.7% of the values will lie within our `values` array bounds. # We define stddev as `len(values) / 6` to increase make sure that almost all values are - # withing the boundaries and we don't have to cut the index too often. + # within the boundaries and we don't have to cut the index too often. mean = len(values) / 2 stddev = len(values) / 6.5 @@ -283,29 +396,37 @@ class RandomDistribution: @staticmethod def _mixed(size: int, children: Sequence[RandomDistribution], probs: Sequence[float]): if probs is None: - raise ValueError("props must be specified for mixed distribution") + raise ValueError(f'probs must be specified for mixed distributions: {str(children)}') result = [] for child_distr, prob in zip(children, probs): if not isinstance(child_distr, RandomDistribution): raise ValueError( - "children must be of type RandomDistribution for mixed distribution") + f'children must be of type RandomDistribution for mixed distribution, child_distr: {child_distr}' + ) child_size = int(size * prob) result.append(child_distr.generate(child_size)) return list(chain.from_iterable(result)) +_NO_DEFAULT = object() + + @dataclass class ArrayRandomDistribution(RandomDistribution): """Produces random array sequence of the specified values with the specified distribution.""" - lengths_distr: RandomDistribution - value_distr: RandomDistribution + lengths_distr: RandomDistribution = _NO_DEFAULT + value_distr: RandomDistribution = _NO_DEFAULT def __init__(self, lengths_distr: RandomDistribution, value_distr: RandomDistribution): self.lengths_distr = lengths_distr self.value_distr = value_distr + self.distribution_type = value_distr.distribution_type + + def __str__(self): + return f'{super().__str__()}' def generate(self, size: int): """Generate random array sequence of the given size.""" @@ -324,20 +445,24 @@ class ArrayRandomDistribution(RandomDistribution): class DocumentRandomDistribution(RandomDistribution): """Produces random 
document sequence of the specified values with the specified distribution.""" - number_of_fields_distr: RandomDistribution - fields_distr: RandomDistribution - field_to_distribution: dict + number_of_fields_distr: RandomDistribution = _NO_DEFAULT + fields_distr: RandomDistribution = _NO_DEFAULT + field_to_distribution: dict = _NO_DEFAULT def __init__(self, number_of_fields_distr: RandomDistribution, fields_distr: RandomDistribution, field_to_distribution: dict): self.number_of_fields_distr = number_of_fields_distr self.fields_distr = fields_distr self.field_to_distribution = field_to_distribution + self.distribution_type = fields_distr.distribution_type for field in self.get_fields(): if field not in self.field_to_distribution: raise ValueError("Must provide a RandomDistribution for each field") + def __str__(self): + return f'{super().__str__()}' + def generate(self, size: int): """Generate random document sequence of the given size.""" docs = [] @@ -373,14 +498,14 @@ if __name__ == '__main__': def print_distr(title, distr, size=10000): """Print distribution.""" - print(f'\n{title}\n') + print(f'\n{title}: {str(distr)}\n') rs = distr.generate(size) has_arrays = any(isinstance(elem, list) for elem in rs) has_dict = any(isinstance(elem, dict) for elem in rs) if not has_arrays and not has_dict: counter = Counter(rs) - for value in distr.get_values(): + for value in [*Counter(rs)]: count = counter[value] if isinstance(value, float): print(f'{value:.2f}\t{count}\t{(count//10)*"*"}') @@ -402,9 +527,24 @@ if __name__ == '__main__': int_noncentral_chisquare = RandomDistribution.noncentral_chisquare(list(range(1, 30))) print_distr("Noncentral Chisquare for integers", int_noncentral_chisquare) - float_uniform = RandomDistribution.uniform(RangeGenerator(DataType.FLOAT, 0.1, 10.0, 0.37)) + float_uniform = RandomDistribution.uniform(RangeGenerator(DataType.DOUBLE, 0.1, 10.0, 0.37)) print_distr("Uniform for floats", float_uniform) + float_normal = 
RandomDistribution.normal(RangeGenerator(DataType.DOUBLE, 0.1, 10.0, 0.37)) + print_distr("Normal for floats", float_normal) + + FOUR_DAYS_IN_SECONDS = 60 * 20 * 24 * 12 + + date_uniform = RandomDistribution.uniform( + RangeGenerator(DataType.DATE, datetime(2007, 1, 1), datetime(2008, 1, 1), + FOUR_DAYS_IN_SECONDS)) + print_distr("Uniform for dates", date_uniform, size=1000) + + date_normal = RandomDistribution.normal( + RangeGenerator(DataType.DATE, datetime(2007, 1, 1), datetime(2008, 1, 1), + FOUR_DAYS_IN_SECONDS)) + print_distr("Normal for dates", date_normal, size=1000) + str_chisquare2 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "aa", "ba")) str_normal2 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "ap", "bp")) mixed = RandomDistribution.mixed(children=[float_uniform, str_chisquare2, str_normal2], |