// fts_spec.cpp /** * Copyright (C) 2012 10gen Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . * * As a special exception, the copyright holders give permission to link the * code of portions of this program with the OpenSSL library under certain * conditions as described in each individual source file and distribute * linked combinations including the program with the OpenSSL library. You * must comply with the GNU Affero General Public License in all respects for * all of the code used other than as permitted herein. If you modify file(s) * with this exception, you may extend this exception to your version of the * file(s), but you are not obligated to do so. If you do not wish to do so, * delete this exception statement from your version. If you delete this * exception statement from all source files in the program, then also delete * it in the license file. */ #include "mongo/platform/basic.h" #include "mongo/db/fts/fts_spec.h" #include "mongo/db/bson/dotted_path_support.h" #include "mongo/db/field_ref.h" #include "mongo/db/fts/fts_element_iterator.h" #include "mongo/db/fts/fts_tokenizer.h" #include "mongo/db/fts/fts_util.h" #include "mongo/db/matcher/expression_parser.h" #include "mongo/util/mongoutils/str.h" #include "mongo/util/stringutils.h" namespace mongo { namespace fts { using std::map; using std::string; using namespace mongoutils; namespace dps = ::mongo::dotted_path_support; const double DEFAULT_WEIGHT = 1; const double MAX_WEIGHT = 1000000000; const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000; namespace { // Default language. Used for new indexes. const std::string moduleDefaultLanguage("english"); /** Validate the given language override string. */ bool validateOverride(const string& override) { // The override field can't be empty, can't be prefixed with a dollar sign, and // can't contain a dot. return !override.empty()&& override[0] != '$' && override.find('.') == std::string::npos; } } FTSSpec::FTSSpec(const BSONObj& indexInfo) { // indexInfo is a text index spec. Text index specs pass through fixSpec() before // being saved to the system.indexes collection. fixSpec() enforces a schema, such that // required fields must exist and be of the correct type (e.g. weights, // textIndexVersion). massert(16739, "found invalid spec for text index", indexInfo["weights"].isABSONObj()); BSONElement textIndexVersionElt = indexInfo["textIndexVersion"]; massert(17367, "found invalid spec for text index, expected number for textIndexVersion", textIndexVersionElt.isNumber()); // We currently support TEXT_INDEX_VERSION_1 (deprecated), TEXT_INDEX_VERSION_2, and // TEXT_INDEX_VERSION_3. // Reject all other values. switch (textIndexVersionElt.numberInt()) { case TEXT_INDEX_VERSION_3: _textIndexVersion = TEXT_INDEX_VERSION_3; break; case TEXT_INDEX_VERSION_2: _textIndexVersion = TEXT_INDEX_VERSION_2; break; case TEXT_INDEX_VERSION_1: _textIndexVersion = TEXT_INDEX_VERSION_1; break; default: msgasserted(17364, str::stream() << "attempt to use unsupported textIndexVersion " << textIndexVersionElt.numberInt() << "; versions supported: " << TEXT_INDEX_VERSION_3 << ", " << TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1); } // Initialize _defaultLanguage. Note that the FTSLanguage constructor requires // textIndexVersion, since language parsing is version-specific. auto indexLanguage = indexInfo["default_language"].String(); auto swl = FTSLanguage::make(indexLanguage, _textIndexVersion); // This can fail if the user originally created the text index under an instance of // MongoDB that supports different languages then the current instance // TODO: consder propagating the index ns to here to improve the error message uassert(28682, str::stream() << "Unrecognized language " << indexLanguage << " found for text index. Verify mongod was started with the" " correct options.", swl.getStatus().isOK()); _defaultLanguage = swl.getValue(); _languageOverrideField = indexInfo["language_override"].valuestrsafe(); _wildcard = false; // in this block we fill in the _weights map { BSONObjIterator i(indexInfo["weights"].Obj()); while (i.more()) { BSONElement e = i.next(); verify(e.isNumber()); if (WILDCARD == e.fieldName()) { _wildcard = true; } else { double num = e.number(); _weights[e.fieldName()] = num; verify(num > 0 && num < MAX_WORD_WEIGHT); } } verify(_wildcard || _weights.size()); } // extra information { BSONObj keyPattern = indexInfo["key"].Obj(); verify(keyPattern.nFields() >= 2); BSONObjIterator i(keyPattern); bool passedFTS = false; while (i.more()) { BSONElement e = i.next(); if (str::equals(e.fieldName(), "_fts") || str::equals(e.fieldName(), "_ftsx")) { passedFTS = true; continue; } if (passedFTS) _extraAfter.push_back(e.fieldName()); else _extraBefore.push_back(e.fieldName()); } } } const FTSLanguage* FTSSpec::_getLanguageToUseV2(const BSONObj& userDoc, const FTSLanguage* currentLanguage) const { BSONElement e = userDoc[_languageOverrideField]; if (e.eoo()) { return currentLanguage; } uassert(17261, "found language override field in document with non-string type", e.type() == mongo::String); StatusWithFTSLanguage swl = FTSLanguage::make(e.String(), getTextIndexVersion()); uassert(17262, "language override unsupported: " + e.String(), swl.getStatus().isOK()); return swl.getValue(); } void FTSSpec::scoreDocument(const BSONObj& obj, TermFrequencyMap* term_freqs) const { if (_textIndexVersion == TEXT_INDEX_VERSION_1) { return _scoreDocumentV1(obj, term_freqs); } FTSElementIterator it(*this, obj); while (it.more()) { FTSIteratorValue val = it.next(); std::unique_ptr tokenizer(val._language->createTokenizer()); _scoreStringV2(tokenizer.get(), val._text, term_freqs, val._weight); } } void FTSSpec::_scoreStringV2(FTSTokenizer* tokenizer, StringData raw, TermFrequencyMap* docScores, double weight) const { ScoreHelperMap terms; unsigned numTokens = 0; tokenizer->reset(raw.rawData(), FTSTokenizer::kFilterStopWords); while (tokenizer->moveNext()) { StringData term = tokenizer->get(); ScoreHelperStruct& data = terms[term]; if (data.exp) { data.exp *= 2; } else { data.exp = 1; } data.count += 1; data.freq += (1 / data.exp); numTokens++; } for (ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i) { const string& term = i->first; const ScoreHelperStruct& data = i->second; // in order to adjust weights as a function of term count as it // relates to total field length. ie. is this the only word or // a frequently occuring term? or does it only show up once in // a long block of text? double coeff = (0.5 * data.count / numTokens) + 0.5; // if term is identical to the raw form of the // field (untokenized) give it a small boost. double adjustment = 1; if (raw.size() == term.length() && raw.equalCaseInsensitive(term)) adjustment += 0.1; double& score = (*docScores)[term]; score += (weight * data.freq * coeff * adjustment); verify(score <= MAX_WEIGHT); } } Status FTSSpec::getIndexPrefix(const BSONObj& query, BSONObj* out) const { if (numExtraBefore() == 0) { *out = BSONObj(); return Status::OK(); } BSONObjBuilder b; for (unsigned i = 0; i < numExtraBefore(); i++) { BSONElement e = dps::extractElementAtPath(query, extraBefore(i)); if (e.eoo()) return Status(ErrorCodes::BadValue, str::stream() << "need have an equality filter on: " << extraBefore(i)); if (e.isABSONObj() && MatchExpressionParser::parsePathAcceptingKeyword(e.Obj().firstElement())) return Status(ErrorCodes::BadValue, str::stream() << "need have an equality filter on: " << extraBefore(i)); b.append(e); } *out = b.obj(); return Status::OK(); } namespace { void _addFTSStuff(BSONObjBuilder* b) { b->append("_fts", INDEX_NAME); b->append("_ftsx", 1); } Status verifyFieldNameNotReserved(StringData s) { if (s == "_fts" || s == "_ftsx") { return {ErrorCodes::CannotCreateIndex, "text index with reserved fields _fts/_ftsx not allowed"}; } return Status::OK(); } } StatusWith FTSSpec::fixSpec(const BSONObj& spec) { if (spec["textIndexVersion"].numberInt() == TEXT_INDEX_VERSION_1) { return _fixSpecV1(spec); } map m; BSONObj keyPattern; { BSONObjBuilder b; // Populate m and keyPattern. { bool addedFtsStuff = false; BSONObjIterator i(spec["key"].Obj()); while (i.more()) { BSONElement e = i.next(); if (str::equals(e.fieldName(), "_fts")) { if (INDEX_NAME != e.valuestrsafe()) { return {ErrorCodes::CannotCreateIndex, "expecting _fts:\"text\""}; } addedFtsStuff = true; b.append(e); } else if (str::equals(e.fieldName(), "_ftsx")) { if (e.numberInt() != 1) { return {ErrorCodes::CannotCreateIndex, "expecting _ftsx:1"}; } b.append(e); } else if (e.type() == String && INDEX_NAME == e.valuestr()) { if (!addedFtsStuff) { _addFTSStuff(&b); addedFtsStuff = true; } m[e.fieldName()] = 1; } else { if (e.numberInt() != 1 && e.numberInt() != -1) { return {ErrorCodes::CannotCreateIndex, "expected value 1 or -1 for non-text key in compound index"}; } b.append(e); } } verify(addedFtsStuff); } keyPattern = b.obj(); // Verify that index key is in the correct format: extraBefore fields, then text // fields, then extraAfter fields. { BSONObjIterator i(spec["key"].Obj()); verify(i.more()); BSONElement e = i.next(); // extraBefore fields while (String != e.type()) { Status notReservedStatus = verifyFieldNameNotReserved(e.fieldNameStringData()); if (!notReservedStatus.isOK()) { return notReservedStatus; } if (!i.more()) { return {ErrorCodes::CannotCreateIndex, "expected additional fields in text index key pattern"}; } e = i.next(); } // text fields bool alreadyFixed = str::equals(e.fieldName(), "_fts"); if (alreadyFixed) { if (!i.more()) { return {ErrorCodes::CannotCreateIndex, "expected _ftsx after _fts"}; } e = i.next(); if (!str::equals(e.fieldName(), "_ftsx")) { return {ErrorCodes::CannotCreateIndex, "expected _ftsx after _fts"}; } e = i.next(); } else { do { Status notReservedStatus = verifyFieldNameNotReserved(e.fieldNameStringData()); if (!notReservedStatus.isOK()) { return notReservedStatus; } e = i.next(); } while (!e.eoo() && e.type() == String); } // extraAfterFields while (!e.eoo()) { if (e.type() == BSONType::String) { return {ErrorCodes::CannotCreateIndex, "'text' fields in index must all be adjacent"}; } Status notReservedStatus = verifyFieldNameNotReserved(e.fieldNameStringData()); if (!notReservedStatus.isOK()) { return notReservedStatus; } e = i.next(); } } } if (spec["weights"].type() == Object) { BSONObjIterator i(spec["weights"].Obj()); while (i.more()) { BSONElement e = i.next(); if (!e.isNumber()) { return {ErrorCodes::CannotCreateIndex, "weight for text index needs numeric type"}; } m[e.fieldName()] = e.numberInt(); } } else if (spec["weights"].str() == WILDCARD) { m[WILDCARD] = 1; } else if (!spec["weights"].eoo()) { return {ErrorCodes::CannotCreateIndex, "text index option 'weights' must be an object"}; } if (m.empty()) { return {ErrorCodes::CannotCreateIndex, "text index option 'weights' must specify fields or the wildcard"}; } BSONObj weights; { BSONObjBuilder b; for (map::iterator i = m.begin(); i != m.end(); ++i) { if (i->second <= 0 || i->second >= MAX_WORD_WEIGHT) { return {ErrorCodes::CannotCreateIndex, str::stream() << "text index weight must be in the exclusive interval (0," << MAX_WORD_WEIGHT << ") but found: " << i->second}; } // Verify weight refers to a valid field. if (i->first != "$**") { FieldRef keyField(i->first); if (keyField.numParts() == 0) { return {ErrorCodes::CannotCreateIndex, "weight cannot be on an empty field"}; } for (size_t partNum = 0; partNum < keyField.numParts(); partNum++) { StringData part = keyField.getPart(partNum); if (part.empty()) { return {ErrorCodes::CannotCreateIndex, "weight cannot have empty path component"}; } if (part.startsWith("$")) { return {ErrorCodes::CannotCreateIndex, "weight cannot have path component with $ prefix"}; } } } b.append(i->first, i->second); } weights = b.obj(); } BSONElement default_language_elt = spec["default_language"]; string default_language(default_language_elt.str()); if (default_language_elt.eoo()) { default_language = moduleDefaultLanguage; } else if (default_language_elt.type() != BSONType::String) { return {ErrorCodes::CannotCreateIndex, "default_language needs a string type"}; } if (!FTSLanguage::make(default_language, TEXT_INDEX_VERSION_3).getStatus().isOK()) { return {ErrorCodes::CannotCreateIndex, "default_language is not valid"}; } BSONElement language_override_elt = spec["language_override"]; string language_override(language_override_elt.str()); if (language_override_elt.eoo()) { language_override = "language"; } else if (language_override_elt.type() != BSONType::String) { return {ErrorCodes::CannotCreateIndex, "language_override must be a string"}; } else if (!validateOverride(language_override)) { return {ErrorCodes::CannotCreateIndex, "language_override is not valid"}; } int version = -1; int textIndexVersion = TEXT_INDEX_VERSION_3; // default text index version BSONObjBuilder b; BSONObjIterator i(spec); while (i.more()) { BSONElement e = i.next(); if (str::equals(e.fieldName(), "key")) { b.append("key", keyPattern); } else if (str::equals(e.fieldName(), "weights")) { b.append("weights", weights); weights = BSONObj(); } else if (str::equals(e.fieldName(), "default_language")) { b.append("default_language", default_language); default_language = ""; } else if (str::equals(e.fieldName(), "language_override")) { b.append("language_override", language_override); language_override = ""; } else if (str::equals(e.fieldName(), "v")) { version = e.numberInt(); } else if (str::equals(e.fieldName(), "textIndexVersion")) { if (!e.isNumber()) { return {ErrorCodes::CannotCreateIndex, "text index option 'textIndexVersion' must be a number"}; } textIndexVersion = e.numberInt(); if (textIndexVersion != TEXT_INDEX_VERSION_2 && textIndexVersion != TEXT_INDEX_VERSION_3) { return {ErrorCodes::CannotCreateIndex, str::stream() << "bad textIndexVersion: " << textIndexVersion}; } } else { b.append(e); } } if (!weights.isEmpty()) { b.append("weights", weights); } if (!default_language.empty()) { b.append("default_language", default_language); } if (!language_override.empty()) { b.append("language_override", language_override); } if (version >= 0) { b.append("v", version); } b.append("textIndexVersion", textIndexVersion); return b.obj(); } } }