// fts_spec.cpp
/**
* Copyright (C) 2012 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see .
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#include "mongo/platform/basic.h"
#include "mongo/db/fts/fts_spec.h"
#include "mongo/db/field_ref.h"
#include "mongo/db/fts/fts_element_iterator.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"
namespace mongo {
namespace fts {
using std::map;
using std::string;
using namespace mongoutils;
const double DEFAULT_WEIGHT = 1;
const double MAX_WEIGHT = 1000000000;
const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000;
namespace {
// Default language. Used for new indexes.
const std::string moduleDefaultLanguage( "english" );
/** Validate the given language override string. */
bool validateOverride( const string& override ) {
// The override field can't be empty, can't be prefixed with a dollar sign, and
// can't contain a dot.
return !override.empty() &&
override[0] != '$' &&
override.find('.') == std::string::npos;
}
}
FTSSpec::FTSSpec( const BSONObj& indexInfo ) {
// indexInfo is a text index spec. Text index specs pass through fixSpec() before
// being saved to the system.indexes collection. fixSpec() enforces a schema, such that
// required fields must exist and be of the correct type (e.g. weights,
// textIndexVersion).
massert( 16739, "found invalid spec for text index",
indexInfo["weights"].isABSONObj() );
BSONElement textIndexVersionElt = indexInfo["textIndexVersion"];
massert( 17367,
"found invalid spec for text index, expected number for textIndexVersion",
textIndexVersionElt.isNumber() );
// We currently support TEXT_INDEX_VERSION_1 (deprecated) and TEXT_INDEX_VERSION_2.
// Reject all other values.
massert( 17364,
str::stream() << "attempt to use unsupported textIndexVersion " <<
textIndexVersionElt.numberInt() << "; versions supported: " <<
TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1,
textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 ||
textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_1 );
_textIndexVersion = ( textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 ) ?
TEXT_INDEX_VERSION_2 : TEXT_INDEX_VERSION_1;
// Initialize _defaultLanguage. Note that the FTSLanguage constructor requires
// textIndexVersion, since language parsing is version-specific.
StatusWithFTSLanguage swl =
FTSLanguage::make( indexInfo["default_language"].String(), _textIndexVersion );
verify( swl.getStatus().isOK() ); // should not fail, since validated by fixSpec().
_defaultLanguage = swl.getValue();
_languageOverrideField = indexInfo["language_override"].valuestrsafe();
_wildcard = false;
// in this block we fill in the _weights map
{
BSONObjIterator i( indexInfo["weights"].Obj() );
while ( i.more() ) {
BSONElement e = i.next();
verify( e.isNumber() );
if ( WILDCARD == e.fieldName() ) {
_wildcard = true;
}
else {
double num = e.number();
_weights[ e.fieldName() ] = num;
verify( num > 0 && num < MAX_WORD_WEIGHT );
}
}
verify( _wildcard || _weights.size() );
}
// extra information
{
BSONObj keyPattern = indexInfo["key"].Obj();
verify( keyPattern.nFields() >= 2 );
BSONObjIterator i( keyPattern );
bool passedFTS = false;
while ( i.more() ) {
BSONElement e = i.next();
if ( str::equals( e.fieldName(), "_fts" ) ||
str::equals( e.fieldName(), "_ftsx" ) ) {
passedFTS = true;
continue;
}
if ( passedFTS )
_extraAfter.push_back( e.fieldName() );
else
_extraBefore.push_back( e.fieldName() );
}
}
}
const FTSLanguage* FTSSpec::_getLanguageToUseV2( const BSONObj& userDoc,
const FTSLanguage* currentLanguage ) const {
BSONElement e = userDoc[_languageOverrideField];
if ( e.eoo() ) {
return currentLanguage;
}
uassert( 17261,
"found language override field in document with non-string type",
e.type() == mongo::String );
StatusWithFTSLanguage swl = FTSLanguage::make( e.String(), TEXT_INDEX_VERSION_2 );
uassert( 17262,
"language override unsupported: " + e.String(),
swl.getStatus().isOK() );
return swl.getValue();
}
void FTSSpec::scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const {
if ( _textIndexVersion == TEXT_INDEX_VERSION_1 ) {
return _scoreDocumentV1( obj, term_freqs );
}
FTSElementIterator it( *this, obj );
while ( it.more() ) {
FTSIteratorValue val = it.next();
Stemmer stemmer( *val._language );
Tools tools( *val._language, &stemmer, StopWords::getStopWords( *val._language ) );
_scoreStringV2( tools, val._text, term_freqs, val._weight );
}
}
void FTSSpec::_scoreStringV2( const Tools& tools,
const StringData& raw,
TermFrequencyMap* docScores,
double weight ) const {
ScoreHelperMap terms;
unsigned numTokens = 0;
Tokenizer i( tools.language, raw );
while ( i.more() ) {
Token t = i.next();
if ( t.type != Token::TEXT )
continue;
string term = t.data.toString();
makeLower( &term );
if ( tools.stopwords->isStopWord( term ) ) {
continue;
}
term = tools.stemmer->stem( term );
ScoreHelperStruct& data = terms[term];
if ( data.exp ) {
data.exp *= 2;
}
else {
data.exp = 1;
}
data.count += 1;
data.freq += ( 1 / data.exp );
numTokens++;
}
for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) {
const string& term = i->first;
const ScoreHelperStruct& data = i->second;
// in order to adjust weights as a function of term count as it
// relates to total field length. ie. is this the only word or
// a frequently occuring term? or does it only show up once in
// a long block of text?
double coeff = ( 0.5 * data.count / numTokens ) + 0.5;
// if term is identical to the raw form of the
// field (untokenized) give it a small boost.
double adjustment = 1;
if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) )
adjustment += 0.1;
double& score = (*docScores)[term];
score += ( weight * data.freq * coeff * adjustment );
verify( score <= MAX_WEIGHT );
}
}
Status FTSSpec::getIndexPrefix( const BSONObj& query, BSONObj* out ) const {
if ( numExtraBefore() == 0 ) {
*out = BSONObj();
return Status::OK();
}
BSONObjBuilder b;
for ( unsigned i = 0; i < numExtraBefore(); i++ ) {
BSONElement e = query.getFieldDotted(extraBefore(i));
if ( e.eoo() )
return Status( ErrorCodes::BadValue,
str::stream()
<< "need have an equality filter on: "
<< extraBefore(i) );
if ( e.isABSONObj() && e.Obj().firstElement().getGtLtOp( -1 ) != -1 )
return Status( ErrorCodes::BadValue,
str::stream()
<< "need have an equality filter on: "
<< extraBefore(i) );
b.append( e );
}
*out = b.obj();
return Status::OK();
}
namespace {
void _addFTSStuff( BSONObjBuilder* b ) {
b->append( "_fts", INDEX_NAME );
b->append( "_ftsx", 1 );
}
void verifyFieldNameNotReserved( StringData s ) {
uassert( 17289,
"text index with reserved fields _fts/_ftsx not allowed",
s != "_fts" && s != "_ftsx" );
}
}
BSONObj FTSSpec::fixSpec( const BSONObj& spec ) {
if ( spec["textIndexVersion"].numberInt() == TEXT_INDEX_VERSION_1 ) {
return _fixSpecV1( spec );
}
map m;
BSONObj keyPattern;
{
BSONObjBuilder b;
// Populate m and keyPattern.
{
bool addedFtsStuff = false;
BSONObjIterator i( spec["key"].Obj() );
while ( i.more() ) {
BSONElement e = i.next();
if ( str::equals( e.fieldName(), "_fts" ) ) {
uassert( 17271,
"expecting _fts:\"text\"",
INDEX_NAME == e.valuestrsafe() );
addedFtsStuff = true;
b.append( e );
}
else if ( str::equals( e.fieldName(), "_ftsx" ) ) {
uassert( 17272, "expecting _ftsx:1", e.numberInt() == 1 );
b.append( e );
}
else if ( e.type() == String && INDEX_NAME == e.valuestr() ) {
if ( !addedFtsStuff ) {
_addFTSStuff( &b );
addedFtsStuff = true;
}
m[e.fieldName()] = 1;
}
else {
uassert( 17273,
"expected value 1 or -1 for non-text key in compound index",
e.numberInt() == 1 || e.numberInt() == -1 );
b.append( e );
}
}
verify( addedFtsStuff );
}
keyPattern = b.obj();
// Verify that index key is in the correct format: extraBefore fields, then text
// fields, then extraAfter fields.
{
BSONObjIterator i( spec["key"].Obj() );
verify( i.more() );
BSONElement e = i.next();
// extraBefore fields
while ( String != e.type() ) {
verifyFieldNameNotReserved( e.fieldNameStringData() );
verify( i.more() );
e = i.next();
}
// text fields
bool alreadyFixed = str::equals( e.fieldName(), "_fts" );
if ( alreadyFixed ) {
uassert( 17288, "expected _ftsx after _fts", i.more() );
e = i.next();
uassert( 17274,
"expected _ftsx after _fts",
str::equals( e.fieldName(), "_ftsx" ) );
e = i.next();
}
else {
do {
verifyFieldNameNotReserved( e.fieldNameStringData() );
e = i.next();
} while ( !e.eoo() && e.type() == String );
}
// extraAfterFields
while ( !e.eoo() ) {
uassert( 17389,
"'text' fields in index must all be adjacent",
e.type() != String );
verifyFieldNameNotReserved( e.fieldNameStringData() );
e = i.next();
}
}
}
if ( spec["weights"].type() == Object ) {
BSONObjIterator i( spec["weights"].Obj() );
while ( i.more() ) {
BSONElement e = i.next();
uassert( 17283,
"weight for text index needs numeric type",
e.isNumber() );
m[e.fieldName()] = e.numberInt();
}
}
else if ( spec["weights"].str() == WILDCARD ) {
m[WILDCARD] = 1;
}
else if ( !spec["weights"].eoo() ) {
uasserted( 17284, "text index option 'weights' must be an object" );
}
BSONObj weights;
{
BSONObjBuilder b;
for ( map::iterator i = m.begin(); i != m.end(); ++i ) {
uassert( 16674, "score for word too high",
i->second > 0 && i->second < MAX_WORD_WEIGHT );
// Verify weight refers to a valid field.
if ( i->first != "$**" ) {
FieldRef keyField( i->first );
uassert( 17294,
"weight cannot be on an empty field",
keyField.numParts() != 0 );
for ( size_t partNum = 0; partNum < keyField.numParts(); partNum++ ) {
StringData part = keyField.getPart(partNum);
uassert( 17291,
"weight cannot have empty path component",
!part.empty() );
uassert( 17292,
"weight cannot have path component with $ prefix",
!part.startsWith( "$" ) );
}
}
b.append( i->first, i->second );
}
weights = b.obj();
}
BSONElement default_language_elt = spec["default_language"];
string default_language( default_language_elt.str() );
if ( default_language_elt.eoo() ) {
default_language = moduleDefaultLanguage;
}
else {
uassert( 17263,
"default_language needs a string type",
default_language_elt.type() == String );
}
uassert( 17264,
"default_language is not valid",
FTSLanguage::make( default_language,
TEXT_INDEX_VERSION_2 ).getStatus().isOK() );
BSONElement language_override_elt = spec["language_override"];
string language_override( language_override_elt.str() );
if ( language_override_elt.eoo() ) {
language_override = "language";
}
else {
uassert( 17136,
"language_override is not valid",
language_override_elt.type() == String
&& validateOverride( language_override ) );
}
int version = -1;
int textIndexVersion = TEXT_INDEX_VERSION_2;
BSONObjBuilder b;
BSONObjIterator i( spec );
while ( i.more() ) {
BSONElement e = i.next();
if ( str::equals( e.fieldName(), "key" ) ) {
b.append( "key", keyPattern );
}
else if ( str::equals( e.fieldName(), "weights" ) ) {
b.append( "weights", weights );
weights = BSONObj();
}
else if ( str::equals( e.fieldName(), "default_language" ) ) {
b.append( "default_language", default_language);
default_language = "";
}
else if ( str::equals( e.fieldName(), "language_override" ) ) {
b.append( "language_override", language_override);
language_override = "";
}
else if ( str::equals( e.fieldName(), "v" ) ) {
version = e.numberInt();
}
else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) {
uassert( 17293,
"text index option 'textIndexVersion' must be a number",
e.isNumber() );
textIndexVersion = e.numberInt();
uassert( 16730,
str::stream() << "bad textIndexVersion: " << textIndexVersion,
textIndexVersion == TEXT_INDEX_VERSION_2 );
}
else {
b.append( e );
}
}
if ( !weights.isEmpty() ) {
b.append( "weights", weights );
}
if ( !default_language.empty() ) {
b.append( "default_language", default_language);
}
if ( !language_override.empty() ) {
b.append( "language_override", language_override);
}
if ( version >= 0 ) {
b.append( "v", version );
}
b.append( "textIndexVersion", textIndexVersion );
return b.obj();
}
}
}