// fts_spec.cpp
/**
* Copyright (C) 2012 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#include "mongo/platform/basic.h"
#include "mongo/db/fts/fts_spec.h"
#include "mongo/db/bson/dotted_path_support.h"
#include "mongo/db/field_ref.h"
#include "mongo/db/fts/fts_element_iterator.h"
#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/db/matcher/expression_parser.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"
namespace mongo {
namespace fts {
using std::map;
using std::string;
using namespace mongoutils;
namespace dps = ::mongo::dotted_path_support;
// Weight constants used when validating and scoring text indexes.

// NOTE(review): presumably the weight applied to a field that has no explicit weight; it is not
// referenced in this file -- confirm against the callers.
const double DEFAULT_WEIGHT = 1.0;

// Upper bound asserted on a term's accumulated score in _scoreStringV2().
const double MAX_WEIGHT = 1000000000.0;

// Exclusive upper bound on a single field's weight; checked by the FTSSpec constructor and by
// fixSpec().
const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000.0;
namespace {
// Language given to a new text index whose spec does not name a default_language.
const std::string moduleDefaultLanguage("english");

/**
 * Returns true if 'override' may be used as the name of the language override field.
 *
 * A name is rejected when it is empty, when its first character is '$', or when it contains a
 * '.' anywhere.
 */
bool validateOverride(const string& override) {
    if (override.empty()) {
        return false;
    }
    if (override[0] == '$') {
        return false;
    }
    return override.find('.') == std::string::npos;
}
}  // namespace
/**
 * Builds an FTSSpec from a text index spec.
 *
 * Text index specs pass through fixSpec() before being saved, and fixSpec() enforces a schema
 * (required fields exist and have the correct type, e.g. weights, textIndexVersion). A spec that
 * violates that schema here therefore trips massert/verify instead of producing a user-facing
 * validation error. The one user assertion (28682) covers a default language that this server
 * does not recognise.
 */
FTSSpec::FTSSpec(const BSONObj& indexInfo) {
    massert(16739, "found invalid spec for text index", indexInfo["weights"].isABSONObj());

    BSONElement versionElt = indexInfo["textIndexVersion"];
    massert(17367,
            "found invalid spec for text index, expected number for textIndexVersion",
            versionElt.isNumber());

    // Only TEXT_INDEX_VERSION_1 (deprecated), TEXT_INDEX_VERSION_2 and TEXT_INDEX_VERSION_3 are
    // supported; every other value is rejected.
    switch (versionElt.numberInt()) {
        case TEXT_INDEX_VERSION_1:
            _textIndexVersion = TEXT_INDEX_VERSION_1;
            break;
        case TEXT_INDEX_VERSION_2:
            _textIndexVersion = TEXT_INDEX_VERSION_2;
            break;
        case TEXT_INDEX_VERSION_3:
            _textIndexVersion = TEXT_INDEX_VERSION_3;
            break;
        default:
            msgasserted(17364,
                        str::stream() << "attempt to use unsupported textIndexVersion "
                                      << versionElt.numberInt()
                                      << "; versions supported: "
                                      << TEXT_INDEX_VERSION_3
                                      << ", "
                                      << TEXT_INDEX_VERSION_2
                                      << ", "
                                      << TEXT_INDEX_VERSION_1);
    }

    // Resolve the default language. This has to happen after _textIndexVersion is known because
    // language parsing is version-specific.
    auto languageName = indexInfo["default_language"].String();
    auto languageStatus = FTSLanguage::make(languageName, _textIndexVersion);

    // The lookup can fail if the index was originally created under an instance of MongoDB that
    // supports different languages than the current instance.
    // TODO: consider propagating the index ns to here to improve the error message
    uassert(28682,
            str::stream() << "Unrecognized language " << languageName
                          << " found for text index. Verify mongod was started with the"
                             " correct options.",
            languageStatus.getStatus().isOK());
    _defaultLanguage = languageStatus.getValue();

    _languageOverrideField = indexInfo["language_override"].valuestrsafe();

    // Fill in the _weights map. The wildcard entry only sets a flag; it is not stored as a weight.
    _wildcard = false;
    for (BSONObjIterator weightIt(indexInfo["weights"].Obj()); weightIt.more();) {
        BSONElement weightElt = weightIt.next();
        verify(weightElt.isNumber());

        if (WILDCARD == weightElt.fieldName()) {
            _wildcard = true;
            continue;
        }

        double fieldWeight = weightElt.number();
        _weights[weightElt.fieldName()] = fieldWeight;
        verify(fieldWeight > 0 && fieldWeight < MAX_WORD_WEIGHT);
    }
    verify(_wildcard || _weights.size());

    // Extra information: remember the non-text key fields, split by whether they come before or
    // after the _fts/_ftsx pair in the key pattern.
    BSONObj keyPattern = indexInfo["key"].Obj();
    verify(keyPattern.nFields() >= 2);

    bool seenFts = false;
    for (BSONObjIterator keyIt(keyPattern); keyIt.more();) {
        BSONElement keyElt = keyIt.next();
        const bool isFtsKey =
            str::equals(keyElt.fieldName(), "_fts") || str::equals(keyElt.fieldName(), "_ftsx");

        if (isFtsKey) {
            seenFts = true;
        } else if (seenFts) {
            _extraAfter.push_back(keyElt.fieldName());
        } else {
            _extraBefore.push_back(keyElt.fieldName());
        }
    }
}
/**
 * Chooses the language to apply to 'userDoc'.
 *
 * If the document has no field named by _languageOverrideField, 'currentLanguage' is returned
 * unchanged. Otherwise the field must be a string (user assertion 17261) that FTSLanguage::make()
 * accepts for this index's text index version (user assertion 17262), and the language it
 * resolves to is returned.
 */
const FTSLanguage* FTSSpec::_getLanguageToUseV2(const BSONObj& userDoc,
                                                const FTSLanguage* currentLanguage) const {
    BSONElement overrideElt = userDoc[_languageOverrideField];
    if (!overrideElt.eoo()) {
        uassert(17261,
                "found language override field in document with non-string type",
                overrideElt.type() == mongo::String);

        StatusWithFTSLanguage requested =
            FTSLanguage::make(overrideElt.String(), getTextIndexVersion());
        uassert(17262,
                "language override unsupported: " + overrideElt.String(),
                requested.getStatus().isOK());
        return requested.getValue();
    }

    // No override present in this document.
    return currentLanguage;
}
void FTSSpec::scoreDocument(const BSONObj& obj, TermFrequencyMap* term_freqs) const {
if (_textIndexVersion == TEXT_INDEX_VERSION_1) {
return _scoreDocumentV1(obj, term_freqs);
}
FTSElementIterator it(*this, obj);
while (it.more()) {
FTSIteratorValue val = it.next();
std::unique_ptr tokenizer(val._language->createTokenizer());
_scoreStringV2(tokenizer.get(), val._text, term_freqs, val._weight);
}
}
/**
 * Tokenizes 'raw' with 'tokenizer' (stop words filtered) and adds each distinct term's score,
 * scaled by 'weight', to '*docScores'.
 *
 * Each further occurrence of a term contributes half as much as the one before it
 * (1, 1/2, 1/4, ...). The result is scaled by a coefficient in (0.5, 1] that grows with the share
 * of tokens the term accounts for, and by a 10% boost when the term equals the whole field
 * (case-insensitively). Every accumulated score is verified to stay within MAX_WEIGHT.
 */
void FTSSpec::_scoreStringV2(FTSTokenizer* tokenizer,
                             StringData raw,
                             TermFrequencyMap* docScores,
                             double weight) const {
    ScoreHelperMap terms;
    unsigned numTokens = 0;

    // NOTE(review): only the raw pointer is handed over here, not the length; presumably this
    // relies on 'raw' being NUL-terminated -- confirm against FTSTokenizer::reset().
    tokenizer->reset(raw.rawData(), FTSTokenizer::kFilterStopWords);

    // First pass: collect per-term statistics.
    // Assumes a newly inserted ScoreHelperStruct starts out zeroed, so 'exp' is 0 on first sight.
    while (tokenizer->moveNext()) {
        StringData token = tokenizer->get();
        ScoreHelperStruct& stats = terms[token];

        // 'exp' doubles on every repeat: 1, 2, 4, ...
        if (!stats.exp) {
            stats.exp = 1;
        } else {
            stats.exp *= 2;
        }
        stats.count += 1;
        stats.freq += (1 / stats.exp);
        numTokens++;
    }

    // Second pass: turn the statistics into scores.
    for (ScoreHelperMap::const_iterator termIt = terms.begin(); termIt != terms.end(); ++termIt) {
        const string& term = termIt->first;
        const ScoreHelperStruct& stats = termIt->second;

        // Adjust the weight as a function of term count relative to total field length: is this
        // the only word, a frequently occurring term, or one that shows up once in a long block
        // of text?
        double coeff = (0.5 * stats.count / numTokens) + 0.5;

        // If the term is identical to the raw (untokenized) form of the field, give it a small
        // boost.
        const bool isWholeField = raw.size() == term.length() && raw.equalCaseInsensitive(term);
        double adjustment = 1;
        if (isWholeField) {
            adjustment += 0.1;
        }

        double& score = (*docScores)[term];
        score += (weight * stats.freq * coeff * adjustment);
        verify(score <= MAX_WEIGHT);
    }
}
/**
 * Builds the index key prefix for 'query' from the fields that precede the text fields in the
 * key pattern, and stores it in '*out'.
 *
 * With no such fields, '*out' is set to an empty object. Otherwise every one of those fields must
 * be present in 'query' and must not be an object whose first element parses as a path-accepting
 * keyword; if either condition fails, BadValue is returned and '*out' is left untouched.
 */
Status FTSSpec::getIndexPrefix(const BSONObj& query, BSONObj* out) const {
    if (numExtraBefore() == 0) {
        *out = BSONObj();
        return Status::OK();
    }

    BSONObjBuilder prefix;
    for (unsigned idx = 0; idx < numExtraBefore(); ++idx) {
        BSONElement elt = dps::extractElementAtPath(query, extraBefore(idx));

        // The field must be present, and must not be an operator expression.
        const bool usable = !elt.eoo() &&
            !(elt.isABSONObj() &&
              MatchExpressionParser::parsePathAcceptingKeyword(elt.Obj().firstElement()));
        if (!usable) {
            return Status(ErrorCodes::BadValue,
                          str::stream() << "need have an equality filter on: "
                                        << extraBefore(idx));
        }

        prefix.append(elt);
    }

    *out = prefix.obj();
    return Status::OK();
}
namespace {
/** Appends the two synthetic text-index key fields, "_fts" (INDEX_NAME) and "_ftsx" (1), to 'b'. */
void _addFTSStuff(BSONObjBuilder* b) {
    BSONObjBuilder& builder = *b;
    builder.append("_fts", INDEX_NAME);
    builder.append("_ftsx", 1);
}

/**
 * Returns OK unless 's' is one of the reserved key names "_fts" or "_ftsx", in which case
 * CannotCreateIndex is returned.
 */
Status verifyFieldNameNotReserved(StringData s) {
    const bool reserved = (s == "_fts") || (s == "_ftsx");
    if (!reserved) {
        return Status::OK();
    }
    return {ErrorCodes::CannotCreateIndex,
            "text index with reserved fields _fts/_ftsx not allowed"};
}
}  // namespace
/**
 * Validates a user-supplied text index spec and returns its normalized form, or a
 * CannotCreateIndex status describing the first problem found.
 *
 * Specs whose textIndexVersion is TEXT_INDEX_VERSION_1 are delegated to _fixSpecV1(). For all
 * others the result is built as follows:
 *   - "key": every field whose value is the string INDEX_NAME is removed and recorded with an
 *     implicit weight of 1; the first such field is replaced by the _fts/_ftsx pair. All other
 *     key fields must have the value 1 or -1 and are kept as they are. Text fields must be
 *     adjacent, and no user field may be named _fts or _ftsx.
 *   - "weights": explicit weights override the implicit ones. Each weight is read with
 *     numberInt(), must lie in the exclusive interval (0, MAX_WORD_WEIGHT), and must name a path
 *     without empty or '$'-prefixed components (the wildcard "$**" is exempt from the path check).
 *   - "default_language": defaults to moduleDefaultLanguage; must be a string accepted by
 *     FTSLanguage::make() at TEXT_INDEX_VERSION_3.
 *   - "language_override": defaults to "language"; must be a string accepted by
 *     validateOverride().
 *   - "textIndexVersion": defaults to TEXT_INDEX_VERSION_3; only versions 2 and 3 are accepted
 *     on this path.
 * All remaining fields of 'spec' are copied through. "v" (when present and non-negative) and
 * "textIndexVersion" are emitted last.
 */
StatusWith<BSONObj> FTSSpec::fixSpec(const BSONObj& spec) {
    if (spec["textIndexVersion"].numberInt() == TEXT_INDEX_VERSION_1) {
        return _fixSpecV1(spec);
    }

    // Field name -> weight. Seeded with weight 1 for every text field in the key pattern, then
    // overridden by the explicit "weights" option below.
    map<string, int> m;

    BSONObj keyPattern;
    {
        BSONObjBuilder b;

        // Populate m and keyPattern.
        {
            bool addedFtsStuff = false;
            BSONObjIterator i(spec["key"].Obj());
            while (i.more()) {
                BSONElement e = i.next();
                if (str::equals(e.fieldName(), "_fts")) {
                    // The key pattern has already been normalized; just validate it.
                    if (INDEX_NAME != e.valuestrsafe()) {
                        return {ErrorCodes::CannotCreateIndex, "expecting _fts:\"text\""};
                    }
                    addedFtsStuff = true;
                    b.append(e);
                } else if (str::equals(e.fieldName(), "_ftsx")) {
                    if (e.numberInt() != 1) {
                        return {ErrorCodes::CannotCreateIndex, "expecting _ftsx:1"};
                    }
                    b.append(e);
                } else if (e.type() == String && INDEX_NAME == e.valuestr()) {
                    // A text field: the first one is replaced by the _fts/_ftsx pair, and each
                    // one receives an implicit weight of 1.
                    if (!addedFtsStuff) {
                        _addFTSStuff(&b);
                        addedFtsStuff = true;
                    }
                    m[e.fieldName()] = 1;
                } else {
                    if (e.numberInt() != 1 && e.numberInt() != -1) {
                        return {ErrorCodes::CannotCreateIndex,
                                "expected value 1 or -1 for non-text key in compound index"};
                    }
                    b.append(e);
                }
            }
            verify(addedFtsStuff);
        }
        keyPattern = b.obj();

        // Verify that index key is in the correct format: extraBefore fields, then text
        // fields, then extraAfter fields.
        {
            BSONObjIterator i(spec["key"].Obj());
            verify(i.more());
            BSONElement e = i.next();

            // extraBefore fields
            while (String != e.type()) {
                Status notReservedStatus = verifyFieldNameNotReserved(e.fieldNameStringData());
                if (!notReservedStatus.isOK()) {
                    return notReservedStatus;
                }

                if (!i.more()) {
                    return {ErrorCodes::CannotCreateIndex,
                            "expected additional fields in text index key pattern"};
                }

                e = i.next();
            }

            // text fields
            bool alreadyFixed = str::equals(e.fieldName(), "_fts");
            if (alreadyFixed) {
                if (!i.more()) {
                    return {ErrorCodes::CannotCreateIndex, "expected _ftsx after _fts"};
                }
                e = i.next();
                if (!str::equals(e.fieldName(), "_ftsx")) {
                    return {ErrorCodes::CannotCreateIndex, "expected _ftsx after _fts"};
                }
                e = i.next();
            } else {
                do {
                    Status notReservedStatus = verifyFieldNameNotReserved(e.fieldNameStringData());
                    if (!notReservedStatus.isOK()) {
                        return notReservedStatus;
                    }
                    e = i.next();
                } while (!e.eoo() && e.type() == String);
            }

            // extraAfterFields
            while (!e.eoo()) {
                if (e.type() == BSONType::String) {
                    return {ErrorCodes::CannotCreateIndex,
                            "'text' fields in index must all be adjacent"};
                }
                Status notReservedStatus = verifyFieldNameNotReserved(e.fieldNameStringData());
                if (!notReservedStatus.isOK()) {
                    return notReservedStatus;
                }
                e = i.next();
            }
        }
    }

    // Merge in the explicit "weights" option. A bare wildcard string is accepted as shorthand for
    // a wildcard weight of 1.
    if (spec["weights"].type() == Object) {
        BSONObjIterator i(spec["weights"].Obj());
        while (i.more()) {
            BSONElement e = i.next();
            if (!e.isNumber()) {
                return {ErrorCodes::CannotCreateIndex, "weight for text index needs numeric type"};
            }
            m[e.fieldName()] = e.numberInt();
        }
    } else if (spec["weights"].str() == WILDCARD) {
        m[WILDCARD] = 1;
    } else if (!spec["weights"].eoo()) {
        return {ErrorCodes::CannotCreateIndex, "text index option 'weights' must be an object"};
    }

    if (m.empty()) {
        return {ErrorCodes::CannotCreateIndex,
                "text index option 'weights' must specify fields or the wildcard"};
    }

    BSONObj weights;
    {
        BSONObjBuilder b;
        for (map<string, int>::iterator i = m.begin(); i != m.end(); ++i) {
            if (i->second <= 0 || i->second >= MAX_WORD_WEIGHT) {
                return {ErrorCodes::CannotCreateIndex,
                        str::stream() << "text index weight must be in the exclusive interval (0,"
                                      << MAX_WORD_WEIGHT
                                      << ") but found: "
                                      << i->second};
            }

            // Verify weight refers to a valid field.
            if (i->first != "$**") {
                FieldRef keyField(i->first);
                if (keyField.numParts() == 0) {
                    return {ErrorCodes::CannotCreateIndex, "weight cannot be on an empty field"};
                }
                for (size_t partNum = 0; partNum < keyField.numParts(); partNum++) {
                    StringData part = keyField.getPart(partNum);
                    if (part.empty()) {
                        return {ErrorCodes::CannotCreateIndex,
                                "weight cannot have empty path component"};
                    }
                    if (part.startsWith("$")) {
                        return {ErrorCodes::CannotCreateIndex,
                                "weight cannot have path component with $ prefix"};
                    }
                }
            }

            b.append(i->first, i->second);
        }
        weights = b.obj();
    }

    BSONElement default_language_elt = spec["default_language"];
    string default_language(default_language_elt.str());
    if (default_language_elt.eoo()) {
        default_language = moduleDefaultLanguage;
    } else if (default_language_elt.type() != BSONType::String) {
        return {ErrorCodes::CannotCreateIndex, "default_language needs a string type"};
    }

    if (!FTSLanguage::make(default_language, TEXT_INDEX_VERSION_3).getStatus().isOK()) {
        return {ErrorCodes::CannotCreateIndex, "default_language is not valid"};
    }

    BSONElement language_override_elt = spec["language_override"];
    string language_override(language_override_elt.str());
    if (language_override_elt.eoo()) {
        language_override = "language";
    } else if (language_override_elt.type() != BSONType::String) {
        return {ErrorCodes::CannotCreateIndex, "language_override must be a string"};
    } else if (!validateOverride(language_override)) {
        return {ErrorCodes::CannotCreateIndex, "language_override is not valid"};
    }

    int version = -1;
    int textIndexVersion = TEXT_INDEX_VERSION_3;  // default text index version

    // Rebuild the spec. Options that appear in 'spec' are replaced in place by their normalized
    // value and then cleared, so that the "append if still set" checks after the loop only add
    // the options that were absent from 'spec'.
    BSONObjBuilder b;
    BSONObjIterator i(spec);
    while (i.more()) {
        BSONElement e = i.next();
        if (str::equals(e.fieldName(), "key")) {
            b.append("key", keyPattern);
        } else if (str::equals(e.fieldName(), "weights")) {
            b.append("weights", weights);
            weights = BSONObj();
        } else if (str::equals(e.fieldName(), "default_language")) {
            b.append("default_language", default_language);
            default_language = "";
        } else if (str::equals(e.fieldName(), "language_override")) {
            b.append("language_override", language_override);
            language_override = "";
        } else if (str::equals(e.fieldName(), "v")) {
            // Held back so that "v" is emitted after the text options.
            version = e.numberInt();
        } else if (str::equals(e.fieldName(), "textIndexVersion")) {
            if (!e.isNumber()) {
                return {ErrorCodes::CannotCreateIndex,
                        "text index option 'textIndexVersion' must be a number"};
            }

            textIndexVersion = e.numberInt();
            if (textIndexVersion != TEXT_INDEX_VERSION_2 &&
                textIndexVersion != TEXT_INDEX_VERSION_3) {
                return {ErrorCodes::CannotCreateIndex,
                        str::stream() << "bad textIndexVersion: " << textIndexVersion};
            }
        } else {
            b.append(e);
        }
    }

    if (!weights.isEmpty()) {
        b.append("weights", weights);
    }
    if (!default_language.empty()) {
        b.append("default_language", default_language);
    }
    if (!language_override.empty()) {
        b.append("language_override", language_override);
    }
    if (version >= 0) {
        b.append("v", version);
    }
    b.append("textIndexVersion", textIndexVersion);

    return b.obj();
}
}
}