summaryrefslogtreecommitdiff
path: root/buildscripts/cost_model/abt_calibrator.py
blob: 08d24032446ae11db24c59f79eaa1eb1e689e00c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Copyright (C) 2022-present MongoDB, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the Server Side Public License, version 1,
# as published by MongoDB, Inc.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# Server Side Public License for more details.
#
# You should have received a copy of the Server Side Public License
# along with this program. If not, see
# <http://www.mongodb.com/licensing/server-side-public-license>.
#
# As a special exception, the copyright holders give permission to link the
# code of portions of this program with the OpenSSL library under certain
# conditions as described in each individual source file and distribute
# linked combinations including the program with the OpenSSL library. You
# must comply with the Server Side Public License in all respects for
# all of the code used other than as permitted herein. If you modify file(s)
# with this exception, you may extend this exception to your version of the
# file(s), but you are not obligated to do so. If you do not wish to do so,
# delete this exception statement from your version. If you delete this
# exception statement from all source files in the program, then also delete
# it in the license file.
#
"""Calibrate ABT nodes."""

from __future__ import annotations
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from config import AbtCalibratorConfig, AbtNodeCalibrationConfig
from database_instance import DatabaseInstance
from cost_estimator import estimate
import experiment as exp

__all__ = ['calibrate']


async def calibrate(config: AbtCalibratorConfig, database: DatabaseInstance):
    """Calibrate every ABT node type listed in the calibrator configuration.

    Returns an empty dict when calibration is disabled in `config`. Otherwise returns a dict
    mapping each configured node type to the result of `calibrate_node` for that node.
    """
    # Guard clause: nothing is loaded from the database when calibration is switched off.
    if not config.enabled:
        return {}

    raw_df = await exp.load_calibration_data(database, config.input_collection_name)
    # Outliers are removed before the ABT nodes are extracted (0.0 and 0.90 are presumably the
    # lower/upper quantile bounds -- confirm against experiment.remove_outliers).
    abt_df = exp.extract_abt_nodes(exp.remove_outliers(raw_df, 0.0, 0.90))

    return {node.type: calibrate_node(abt_df, config, node) for node in config.nodes}


def calibrate_node(abt_df: pd.DataFrame, config: AbtCalibratorConfig,
                   node_config: AbtNodeCalibrationConfig):
    """Fit a non-negative linear cost model for one ABT node type.

    Rows of `abt_df` whose `abt_type` equals `node_config.type` are selected, optionally
    narrowed by `node_config.filter_function`, and `execution_time` is regressed on the chosen
    variables plus an explicit constant column.

    Args:
        abt_df: ABT node data. Must provide the `abt_type` and `execution_time` columns and
            every regression variable as a column.
        config: Calibrator settings; `test_size` and `trace` are forwarded to `estimate`.
        node_config: Per-node settings: `type` selects the rows, `filter_function` (if not
            None) post-filters them, and `variables_override` (if not None) replaces the
            default regression variable `n_processed`.

    Returns:
        Whatever `estimate` returns for the fitted model.
    """
    # pylint: disable=invalid-name
    abt_node_df = abt_df[abt_df.abt_type == node_config.type]
    if node_config.filter_function is not None:
        abt_node_df = node_config.filter_function(abt_node_df)

    if node_config.variables_override is None:
        variables = ['n_processed']
    else:
        variables = node_config.variables_override
    y = abt_node_df['execution_time']

    # has_constant='add' forces the intercept column to be prepended even when one of the
    # variables happens to be constant in this sample. The default ('skip') would silently
    # return the data unchanged in that case, so the coefficient vector would lose its
    # intercept slot and no longer line up as [const, *variables].
    X = sm.add_constant(abt_node_df[variables], has_constant='add')

    def fit(X, y):
        """Fit non-negative least squares; return the coefficients and the predict function."""
        # The intercept is already modelled by the explicit constant column, so the regressor
        # must not add its own; positive=True constrains every coefficient to be >= 0.
        nnls = LinearRegression(positive=True, fit_intercept=False)
        model = nnls.fit(X, y)
        return (model.coef_, model.predict)

    return estimate(fit, X.to_numpy(), y.to_numpy(), config.test_size, config.trace)