/** * Copyright (C) 2018-present MongoDB, Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the Server Side Public License, version 1, * as published by MongoDB, Inc. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * Server Side Public License for more details. * * You should have received a copy of the Server Side Public License * along with this program. If not, see * . * * As a special exception, the copyright holders give permission to link the * code of portions of this program with the OpenSSL library under certain * conditions as described in each individual source file and distribute * linked combinations including the program with the OpenSSL library. You * must comply with the Server Side Public License in all respects for * all of the code used other than as permitted herein. If you modify file(s) * with this exception, you may extend this exception to your version of the * file(s), but you are not obligated to do so. If you do not wish to do so, * delete this exception statement from your version. If you delete this * exception statement from all source files in the program, then also delete * it in the license file. */ #include "mongo/db/fts/fts_spec.h" #include "mongo/db/bson/dotted_path_support.h" #include "mongo/util/str.h" namespace mongo { namespace fts { // // This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1 // text indexes. // namespace dps = ::mongo::dotted_path_support; namespace { void _addFTSStuff(BSONObjBuilder* b) { b->append("_fts", INDEX_NAME); b->append("_ftsx", 1); } } // namespace const FTSLanguage& FTSSpec::_getLanguageToUseV1(const BSONObj& userDoc) const { BSONElement e = userDoc[_languageOverrideField]; if (e.type() == String) { StringData x = e.valueStringData(); if (e.size() > 0) { // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail. return FTSLanguage::make(x, TEXT_INDEX_VERSION_1); } } return *_defaultLanguage; } void FTSSpec::_scoreStringV1(const Tools& tools, StringData raw, TermFrequencyMap* docScores, double weight) const { ScoreHelperMap terms; unsigned numTokens = 0; Tokenizer i(&tools.language, raw); while (i.more()) { Token t = i.next(); if (t.type != Token::TEXT) continue; std::string term = str::toLower(t.data); if (tools.stopwords->isStopWord(term)) continue; term = tools.stemmer->stem(term).toString(); ScoreHelperStruct& data = terms[term]; if (data.exp) data.exp *= 2; else data.exp = 1; data.count += 1; data.freq += (1 / data.exp); numTokens++; } for (ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i) { const std::string& term = i->first; const ScoreHelperStruct& data = i->second; // in order to adjust weights as a function of term count as it // relates to total field length. ie. is this the only word or // a frequently occuring term? or does it only show up once in // a long block of text? double coeff = (0.5 * data.count / numTokens) + 0.5; // if term is identical to the raw form of the // field (untokenized) give it a small boost. double adjustment = 1; if (raw.size() == term.length() && raw.equalCaseInsensitive(term)) adjustment += 0.1; double& score = (*docScores)[term]; score += (weight * data.freq * coeff * adjustment); verify(score <= MAX_WEIGHT); } } bool FTSSpec::_weightV1(StringData field, double* out) const { Weights::const_iterator i = _weights.find(field.toString()); if (i == _weights.end()) return false; *out = i->second; return true; } /* * Recurses over all fields of an obj (document in collection) * and fills term,score map term_freqs * @param tokenizer, tokenizer to tokenize a string into terms * @param obj, object being parsed * term_freqs, map to be filled up */ void FTSSpec::_scoreRecurseV1(const Tools& tools, const BSONObj& obj, TermFrequencyMap* term_freqs) const { BSONObjIterator j(obj); while (j.more()) { BSONElement x = j.next(); if (languageOverrideField() == x.fieldName()) continue; if (x.type() == String) { double w = 1; _weightV1(x.fieldName(), &w); _scoreStringV1(tools, x.valueStringData(), term_freqs, w); } else if (x.isABSONObj()) { _scoreRecurseV1(tools, x.Obj(), term_freqs); } } } void FTSSpec::_scoreDocumentV1(const BSONObj& obj, TermFrequencyMap* term_freqs) const { const FTSLanguage& language = _getLanguageToUseV1(obj); Stemmer stemmer(&language); Tools tools(language, &stemmer, StopWords::getStopWords(&language)); if (wildcard()) { // if * is specified for weight, we can recurse over all fields. _scoreRecurseV1(tools, obj, term_freqs); return; } // otherwise, we need to remember the different weights for each field // and act accordingly (in other words, call _score) for (Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++) { const char* leftOverName = i->first.c_str(); // name of field BSONElement e = dps::extractElementAtPath(obj, leftOverName); // weight associated to name of field double weight = i->second; if (e.eoo()) { // do nothing } else if (e.type() == Array) { BSONObjIterator j(e.Obj()); while (j.more()) { BSONElement x = j.next(); if (leftOverName[0] && x.isABSONObj()) x = dps::extractElementAtPath(x.Obj(), leftOverName); if (x.type() == String) _scoreStringV1(tools, x.valueStringData(), term_freqs, weight); } } else if (e.type() == String) { _scoreStringV1(tools, e.valueStringData(), term_freqs, weight); } } } StatusWith FTSSpec::_fixSpecV1(const BSONObj& spec) { std::map m; BSONObj keyPattern; { BSONObjBuilder b; bool addedFtsStuff = false; BSONObjIterator i(spec["key"].Obj()); while (i.more()) { BSONElement e = i.next(); if ((e.fieldNameStringData() == "_fts") || (e.fieldNameStringData() == "_ftsx")) { addedFtsStuff = true; b.append(e); } else if (e.type() == String && (e.valueStringData() == "fts" || e.valueStringData() == "text")) { if (!addedFtsStuff) { _addFTSStuff(&b); addedFtsStuff = true; } m[e.fieldName()] = 1; } else { b.append(e); } } if (!addedFtsStuff) _addFTSStuff(&b); keyPattern = b.obj(); } if (spec["weights"].isABSONObj()) { BSONObjIterator i(spec["weights"].Obj()); while (i.more()) { BSONElement e = i.next(); m[e.fieldName()] = e.numberInt(); } } else if (spec["weights"].str() == WILDCARD) { m[WILDCARD] = 1; } BSONObj weights; { BSONObjBuilder b; for (const auto& kv : m) { if (kv.second <= 0 || kv.second >= MAX_WORD_WEIGHT) { return {ErrorCodes::CannotCreateIndex, str::stream() << "text index weight must be in the exclusive interval (0," << MAX_WORD_WEIGHT << ") but found: " << kv.second}; } b.append(kv.first, kv.second); } weights = b.obj(); } std::string default_language(spec.getStringField("default_language")); if (default_language.empty()) default_language = "english"; std::string language_override(spec.getStringField("language_override")); if (language_override.empty()) language_override = "language"; int version = -1; int textIndexVersion = 1; BSONObjBuilder b; BSONObjIterator i(spec); while (i.more()) { BSONElement e = i.next(); StringData fieldName = e.fieldNameStringData(); if (fieldName == "key") { b.append("key", keyPattern); } else if (fieldName == "weights") { b.append("weights", weights); weights = BSONObj(); } else if (fieldName == "default_language") { b.append("default_language", default_language); default_language = ""; } else if (fieldName == "language_override") { b.append("language_override", language_override); language_override = ""; } else if (fieldName == "v") { version = e.numberInt(); } else if (fieldName == "textIndexVersion") { textIndexVersion = e.numberInt(); if (textIndexVersion != 1) { return {ErrorCodes::CannotCreateIndex, str::stream() << "bad textIndexVersion: " << textIndexVersion}; } } else { b.append(e); } } if (!weights.isEmpty()) b.append("weights", weights); if (!default_language.empty()) b.append("default_language", default_language); if (!language_override.empty()) b.append("language_override", language_override); if (version >= 0) b.append("v", version); b.append("textIndexVersion", textIndexVersion); return b.obj(); } } // namespace fts } // namespace mongo