/**
 * Tests accuracy of pre-generated sample histograms against histograms built on the entire
 * collection.
 * @tags: [
 *   requires_cqf,
 * ]
 */

/**
 * Explains the pipeline [{$match: predicate}] on 'coll' in "executionStats" mode, logs the query,
 * and prints the summarized explain output.
 *
 * Returns a 2-element array [n, ce], where 'n' is the number of documents the explain reports as
 * returned and 'ce' is the root cardinality estimate, each passed through 'round2'.
 */
function getMatchCE(coll, predicate) {
    jsTestLog(`Query: ${coll.getName()} ${tojson(predicate)}`);

    const pipeline = [{$match: predicate}];
    const explainOutput = coll.explain("executionStats").aggregate(pipeline);

    // Actual cardinality first, then the optimizer's estimate for the root of the plan.
    const actualCount = round2(explainOutput.executionStats.nReturned);
    const estimatedCount = round2(getRootCE(explainOutput));

    print(tojson(summarizeExplainForCE(explainOutput)));
    return [actualCount, estimatedCount];
}

/**
 * Runs 'predicate' against both 'baseColl' and 'sampleColl', computes the estimation errors of
 * each collection's CE relative to the actual number of documents returned, and adds those errors
 * to the running totals 'totBaseErr' and 'totSampleErr' (both are mutated in place).
 *
 * Asserts that both collections return the same number of documents, then logs the estimates and
 * prints the per-query errors.
 */
function testMatchPredicate(baseColl, sampleColl, predicate, collSize, totSampleErr, totBaseErr) {
    // Folds one query's error metrics into a running total.
    const accumulate = (total, err) => {
        total.absError += err.absError;
        total.relError += err.relError;
        total.selError += err.selError;
    };

    // Both collections are expected to hold identical documents by the time this runs (the sample
    // collection is reset to the base collection's contents after its histogram is built), so
    // only the estimates may differ between the two, never the actual counts.
    const [actualFromBase, fullCE] = getMatchCE(baseColl, predicate);
    const [actualFromSample, sampledCE] = getMatchCE(sampleColl, predicate);
    assert.eq(actualFromBase, actualFromSample);

    // Measure each estimate against the true query cardinality.
    const fullErr =
        computeStrategyErrors({baseCE: fullCE, nReturned: actualFromBase}, "baseCE", collSize);
    const sampledErr = computeStrategyErrors(
        {sampleCE: sampledCE, nReturned: actualFromBase}, "sampleCE", collSize);

    accumulate(totSampleErr, sampledErr);
    accumulate(totBaseErr, fullErr);

    jsTestLog(`CE: ${tojson(predicate)}, base = ${fullCE}, sample = ${sampledCE}, actual = ${
        actualFromBase}`);
    print(`Base error: ${tojson(fullErr)}\n`);
    print(`Sample error: ${tojson(sampledErr)}`);
}

(function() {
load("jstests/libs/load_ce_test_data.js");  // For 'loadJSONDataset'.
load("jstests/libs/ce_stats_utils.js");     // For 'getRootCE', 'createHistogram', runHistogramsTest
load("jstests/query_golden/libs/compute_errors.js");  // For 'computeStrategyErrors'.

// Seed the shell's random number generator with a fixed value; the sampling loop below draws from
// 'Random.rand()'.
Random.setRandomSeed(6345);

const collData = 'ce_accuracy_test';
const dataDir = 'jstests/query_golden/libs/data/';
// Probability with which each base document is selected for the sample collection.
const sampleRate = 0.2;

load(`${dataDir}${collData}.schema`);  // For 'dbMetadata'.
load(`${dataDir}${collData}.data`);    // For 'chunkNames'.

/**
 * Main testing function. Initializes histograms and sample collection, and then executes a series
 * of queries against the 'base' collection, whose histograms include all values, and the 'sampled'
 * collection, whose histograms are built from only a sample of the documents (each document is
 * selected with probability 'sampleRate').
 */
runHistogramsTest(function testSampleHistogram() {
    const sampleDB = db.getSiblingDB("ce_sampled_histogram");
    const baseDB = db.getSiblingDB("ce_base_histogram");

    const collMetadata = dbMetadata[0];
    const collName = collMetadata.collectionName;
    assert.eq(collName, "ce_data_500");

    const sampleColl = sampleDB[collName];
    const baseColl = baseDB[collName];

    const fields = [
        "uniform_int_0-1000-1",
        "normal_int_0-1000-1",
        "chi2_int_0-1000-1",
        "mixdist_uniform_int_0-1000-1_uniform_int_7000-8000-1_normal_int_0-10000-10_",
        "mixdist_normal_int_0-1000-1_normal_int_0-10000-10_normal_int_0-100000-100_",
    ];

    // Initialize base collection.
    loadJSONDataset(baseDB, chunkNames, dataDir, dbMetadata);
    const collSize = baseColl.count();

    // Select approximately 'sampleRate'*collSize documents from the base collection to insert
    // into the sample collection. One random draw is made per candidate _id, in increasing order.
    // NOTE(review): this assumes the documents' _id values are the integers 0..collSize-1 --
    // confirm against the dataset.
    const sampledIds = [];
    for (let id = 0; id < collSize; id++) {
        if (Random.rand() < sampleRate) {
            sampledIds.push(id);
        }
    }
    baseColl.aggregate({$match: {_id: {$in: sampledIds}}},
                       {$out: {db: sampleDB.getName(), coll: collName}});

    // 'projection' keeps only the tested fields; 'sortFields' orders by them in the order listed
    // in 'fields'. Both are filled in while the histograms are built.
    const projection = {_id: 0};
    const sortFields = {};
    // Build histograms on the base and sample collections.
    for (const field of fields) {
        projection[field] = 1;
        sortFields[field] = 1;
        createHistogram(baseColl, field);
        createHistogram(sampleColl, field);
    }

    // Replace the sample coll with the full collection. In this way, we have a histogram on only a
    // sample of documents in the base collection. Note that this does not test $analyze sampling
    // logic, because that yields different results on every test run.
    baseColl.aggregate({$out: {db: sampleDB.getName(), coll: collName}});

    // Run some queries to demonstrate that the sample CE scales to approach the base CE.
    const totSampleErr = {absError: 0, relError: 0, selError: 0};
    const totBaseErr = {absError: 0, relError: 0, selError: 0};

    forceCE("histogram");
    let count = 0;
    // Sort the values to ensure a stable test result.
    const values =
        baseColl.find({_id: {$in: [3, 123, 405]}}, projection).sort(sortFields).toArray();
    for (const field of fields) {
        // Each pair of adjacent documents supplies the bounds for one group of queries.
        for (let i = 1; i < values.length; i++) {
            const prev = values[i - 1][field];
            const cur = values[i][field];

            const min = prev < cur ? prev : cur;
            const max = prev > cur ? prev : cur;

            // Test a variety of queries: a closed range, an open-ended range and an equality.
            const predicates = [
                {[field]: {$gte: min, $lte: max}},
                {[field]: {$lt: min}},
                {[field]: {$eq: min}},
            ];
            for (const predicate of predicates) {
                testMatchPredicate(
                    baseColl, sampleColl, predicate, collSize, totSampleErr, totBaseErr);
                // Counted per executed query so the averages below always match what was run.
                count++;
            }
        }
    }

    // Divides each accumulated error metric by the number of queries that were run.
    const averageErrors = (totals) => ({
        absError: round2(totals.absError / count),
        relError: round2(totals.relError / count),
        selError: round2(totals.selError / count)
    });
    const avgBaseErr = averageErrors(totBaseErr);
    const avgSampleErr = averageErrors(totSampleErr);

    jsTestLog(`Average errors (${count} queries):`);
    print(`Average base error: ${tojson(avgBaseErr)}\n`);
    print(`Average sample error: ${tojson(avgSampleErr)}`);
});
})();