// fts_spec.cpp

/**
*    Copyright (C) 2012 10gen Inc.
*
*    This program is free software: you can redistribute it and/or modify
*    it under the terms of the GNU Affero General Public License, version 3,
*    as published by the Free Software Foundation.
*
*    This program is distributed in the hope that it will be useful,
*    but WITHOUT ANY WARRANTY; without even the implied warranty of
*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*    GNU Affero General Public License for more details.
*
*    You should have received a copy of the GNU Affero General Public License
*    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*
*    As a special exception, the copyright holders give permission to link the
*    code of portions of this program with the OpenSSL library under certain
*    conditions as described in each individual source file and distribute
*    linked combinations including the program with the OpenSSL library. You
*    must comply with the GNU Affero General Public License in all respects for
*    all of the code used other than as permitted herein. If you modify file(s)
*    with this exception, you may extend this exception to your version of the
*    file(s), but you are not obligated to do so. If you do not wish to do so,
*    delete this exception statement from your version. If you delete this
*    exception statement from all source files in the program, then also delete
*    it in the license file.
*/

#include "mongo/pch.h"

#include "mongo/db/fts/fts_spec.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"

namespace mongo {

    namespace fts {

        using namespace mongoutils;

        const double DEFAULT_WEIGHT = 1;
        const double MAX_WEIGHT = 1000000000;
        const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000;

        namespace {
            // Default language.  Used for new indexes.
            const std::string moduleDefaultLanguage( "english" );
        }
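        // Illustrative sketch (not from the original file): FTSSpec is constructed from an
        // index spec of roughly the shape produced by FTSSpec::fixSpec() further down, e.g.
        //
        //   { key: { _fts: "text", _ftsx: 1 },
        //     weights: { title: 10, body: 1 },
        //     default_language: "english",
        //     language_override: "language",
        //     textIndexVersion: 2 }
        //
        // The field names "title" and "body" and their weights are hypothetical.  The
        // constructor requires a "weights" object and a string "default_language"; a missing
        // "language_override" falls back to the field name "language".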
        FTSSpec::FTSSpec( const BSONObj& indexInfo ) {

            massert( 16739, "found invalid spec for text index",
                     indexInfo["weights"].isABSONObj() );

            Status status = _defaultLanguage.init( indexInfo["default_language"].String() );
            verify( status.isOK() );

            _languageOverrideField = indexInfo["language_override"].valuestrsafe();
            if ( _languageOverrideField.size() == 0 )
                _languageOverrideField = "language";

            _wildcard = false;

            // in this block we fill in the _weights map
            {
                BSONObjIterator i( indexInfo["weights"].Obj() );
                while ( i.more() ) {
                    BSONElement e = i.next();
                    verify( e.isNumber() );

                    if ( WILDCARD == e.fieldName() ) {
                        _wildcard = true;
                    }
                    else {
                        double num = e.number();
                        _weights[ e.fieldName() ] = num;
                        verify( num > 0 && num < MAX_WORD_WEIGHT );
                    }
                }
                verify( _wildcard || _weights.size() );
            }

            // extra information
            {
                BSONObj keyPattern = indexInfo["key"].Obj();
                verify( keyPattern.nFields() >= 2 );
                BSONObjIterator i( keyPattern );

                bool passedFTS = false;

                while ( i.more() ) {
                    BSONElement e = i.next();
                    if ( str::equals( e.fieldName(), "_fts" ) ||
                         str::equals( e.fieldName(), "_ftsx" ) ) {
                        passedFTS = true;
                        continue;
                    }

                    if ( passedFTS )
                        _extraAfter.push_back( e.fieldName() );
                    else
                        _extraBefore.push_back( e.fieldName() );
                }
            }
        }

        const FTSLanguage FTSSpec::getLanguageToUse( const BSONObj& userDoc,
                                                     const FTSLanguage currentLanguage ) const {
            BSONElement e = userDoc[_languageOverrideField];
            if ( e.eoo() ) {
                return currentLanguage;
            }
            uassert( 17261,
                     "found language override field in document with non-string type",
                     e.type() == mongo::String );
            StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( e.String() );
            uassert( 17262,
                     "language override unsupported: " + e.String(),
                     swl.getStatus().isOK() );
            return swl.getValue();
        }

        namespace {
            /**
             * Check for exact match or path prefix match.
             */
            inline bool _matchPrefix( const string& dottedName, const string& weight ) {
                if ( weight == dottedName ) {
                    return true;
                }
                return str::startsWith( weight, dottedName + '.' );
            }
        }

        void FTSSpec::scoreDocument( const BSONObj& obj,
                                     const FTSLanguage parentLanguage,
                                     const string& parentPath,
                                     bool isArray,
                                     TermFrequencyMap* term_freqs ) const {
            const FTSLanguage language = getLanguageToUse( obj, parentLanguage );

            Stemmer stemmer( language );
            Tools tools( language, &stemmer, StopWords::getStopWords( language ) );

            // Perform a depth-first traversal of obj, skipping fields not touched by this spec.
            BSONObjIterator j( obj );
            while ( j.more() ) {
                BSONElement elem = j.next();
                string fieldName = elem.fieldName();

                // Skip "language" specifier fields if wildcard.
                if ( wildcard() && languageOverrideField() == fieldName ) {
                    continue;
                }

                // Compose the dotted name of the current field:
                // 1. parent path empty (top level): use the current field name
                // 2. parent path non-empty and obj is an array: use the parent path
                // 3. parent path non-empty and obj is a sub-doc: append field name to parent path
                string dottedName = ( parentPath.empty() ? fieldName
                                                         : isArray ? parentPath
                                                                   : parentPath + '.' + fieldName );

                // Find lower bound of dottedName in _weights.  lower_bound leaves us at the first
                // weight that could possibly match or be a prefix of dottedName.  And if this
                // element fails to match, then no subsequent weight can match, since the weights
                // are lexicographically ordered.
                Weights::const_iterator i = _weights.lower_bound( dottedName );
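                // Illustrative example (hypothetical weights): with
                // _weights = { "a.b": 5, "title": 10 }, a dottedName of "a" lands on "a.b",
                // a prefix match that allows descending into a sub-document named "a";
                // "title" lands on an exact match; and "author" lands on "title", which fails
                // _matchPrefix() and is skipped below (in the non-wildcard case).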
                // possibleWeightMatch is set if the weight map contains either a match or some
                // item lexicographically larger than fieldName.  This boolean acts as a guard on
                // dereferences of iterator 'i'.
                bool possibleWeightMatch = ( i != _weights.end() );

                // Optimize away two cases, when not wildcard:
                // 1. lower_bound seeks to end(): no prefix match possible
                // 2. lower_bound seeks to a name which is not a prefix
                if ( !wildcard() ) {
                    if ( !possibleWeightMatch ) {
                        continue;
                    }
                    else if ( !_matchPrefix( dottedName, i->first ) ) {
                        continue;
                    }
                }

                // Is the current field an exact match on a weight?
                bool exactMatch = ( possibleWeightMatch && i->first == dottedName );

                double weight = ( possibleWeightMatch ? i->second : DEFAULT_WEIGHT );

                switch ( elem.type() ) {
                case String:
                    // Only index strings on exact match or wildcard.
                    if ( exactMatch || wildcard() ) {
                        _scoreString( tools, elem.valuestr(), term_freqs, weight );
                    }
                    break;
                case Object:
                    // Only descend into a sub-document on proper prefix or wildcard.  Note that
                    // !exactMatch is a sufficient test for proper prefix match, because of
                    // matchPrefix() continue block above.
                    if ( !exactMatch || wildcard() ) {
                        scoreDocument( elem.Obj(), language, dottedName, false, term_freqs );
                    }
                    break;
                case Array:
                    // Only descend into arrays from non-array parents or on wildcard.
                    if ( !isArray || wildcard() ) {
                        scoreDocument( elem.Obj(), language, dottedName, true, term_freqs );
                    }
                    break;
                default:
                    // Skip over all other BSON types.
                    break;
                }
            }
        }

        namespace {
            struct ScoreHelperStruct {
                ScoreHelperStruct()
                    : freq(0), count(0), exp(0) {
                }
                double freq;
                double count;
                double exp;
            };
            typedef unordered_map<string, ScoreHelperStruct> ScoreHelperMap;
        }

        void FTSSpec::_scoreString( const Tools& tools,
                                    const StringData& raw,
                                    TermFrequencyMap* docScores,
                                    double weight ) const {

            ScoreHelperMap terms;

            unsigned numTokens = 0;

            Tokenizer i( tools.language, raw );
            while ( i.more() ) {
                Token t = i.next();
                if ( t.type != Token::TEXT )
                    continue;

                string term = t.data.toString();
                makeLower( &term );
                if ( tools.stopwords->isStopWord( term ) )
                    continue;
                term = tools.stemmer->stem( term );

                ScoreHelperStruct& data = terms[term];

                if ( data.exp )
                    data.exp *= 2;
                else
                    data.exp = 1;
                data.count += 1;
                data.freq += ( 1 / data.exp );
                numTokens++;
            }

            for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) {

                const string& term = i->first;
                const ScoreHelperStruct& data = i->second;

                // Adjust the weight as a function of term count relative to total field
                // length: is this the only word, a frequently occurring term, or does it
                // show up only once in a long block of text?
                double coeff = ( 0.5 * data.count / numTokens ) + 0.5;
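                // Illustrative example (hypothetical input, assuming the English stemmer and
                // stop word list leave these words unchanged): for raw = "cat cat dog" and
                // weight = 1, the term "cat" has count = 2, freq = 1/1 + 1/2 = 1.5, and
                // numTokens = 3, so coeff = ( 0.5 * 2 / 3 ) + 0.5 ~= 0.83; with the default
                // adjustment of 1 computed below, its contribution to the score is
                // 1 * 1.5 * 0.83 * 1 ~= 1.25.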
                // if term is identical to the raw form of the
                // field (untokenized) give it a small boost.
                double adjustment = 1;
                if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) )
                    adjustment += 0.1;

                double& score = (*docScores)[term];
                score += ( weight * data.freq * coeff * adjustment );
                verify( score <= MAX_WEIGHT );
            }
        }

        Status FTSSpec::getIndexPrefix( const BSONObj& query, BSONObj* out ) const {
            if ( numExtraBefore() == 0 ) {
                *out = BSONObj();
                return Status::OK();
            }

            BSONObjBuilder b;
            for ( unsigned i = 0; i < numExtraBefore(); i++ ) {
                BSONElement e = query.getFieldDotted( extraBefore(i) );
                if ( e.eoo() )
                    return Status( ErrorCodes::BadValue,
                                   str::stream()
                                   << "need to have an equality filter on: "
                                   << extraBefore(i) );

                if ( e.isABSONObj() && e.Obj().firstElement().getGtLtOp( -1 ) != -1 )
                    return Status( ErrorCodes::BadValue,
                                   str::stream()
                                   << "need to have an equality filter on: "
                                   << extraBefore(i) );

                b.append( e );
            }
            *out = b.obj();
            return Status::OK();
        }

        void _addFTSStuff( BSONObjBuilder* b ) {
            b->append( "_fts", INDEX_NAME );
            b->append( "_ftsx", 1 );
        }

        BSONObj FTSSpec::fixSpec( const BSONObj& spec ) {
            map<string, int> m;

            BSONObj keyPattern;
            {
                BSONObjBuilder b;
                bool addedFtsStuff = false;

                BSONObjIterator i( spec["key"].Obj() );
                while ( i.more() ) {
                    BSONElement e = i.next();
                    if ( str::equals( e.fieldName(), "_fts" ) ||
                         str::equals( e.fieldName(), "_ftsx" ) ) {
                        addedFtsStuff = true;
                        b.append( e );
                    }
                    else if ( e.type() == String &&
                              ( str::equals( "fts", e.valuestr() ) ||
                                str::equals( "text", e.valuestr() ) ) ) {

                        if ( !addedFtsStuff ) {
                            _addFTSStuff( &b );
                            addedFtsStuff = true;
                        }

                        m[e.fieldName()] = 1;
                    }
                    else {
                        b.append( e );
                    }
                }

                if ( !addedFtsStuff )
                    _addFTSStuff( &b );

                keyPattern = b.obj();
            }

            if ( spec["weights"].isABSONObj() ) {
                BSONObjIterator i( spec["weights"].Obj() );
                while ( i.more() ) {
                    BSONElement e = i.next();
                    m[e.fieldName()] = e.numberInt();
                }
            }
            else if ( spec["weights"].str() == WILDCARD ) {
                m[WILDCARD] = 1;
            }

            BSONObj weights;
            {
                BSONObjBuilder b;
                for ( map<string, int>::iterator i = m.begin(); i != m.end(); ++i ) {
                    uassert( 16674, "score for word too high",
                             i->second > 0 && i->second < MAX_WORD_WEIGHT );
                    b.append( i->first, i->second );
                }
                weights = b.obj();
            }

            BSONElement default_language_elt = spec["default_language"];
            string default_language( default_language_elt.str() );
            if ( default_language_elt.eoo() ) {
                default_language = moduleDefaultLanguage;
            }
            else {
                uassert( 17263,
                         "default_language needs a string type",
                         default_language_elt.type() == String );
            }
            uassert( 17264,
                     "default_language is not valid",
                     FTSLanguage::makeFTSLanguage( default_language ).getStatus().isOK() );

            string language_override( spec.getStringField("language_override") );
            if ( language_override.empty() )
                language_override = "language";

            int version = -1;
            int textIndexVersion = 2;

            BSONObjBuilder b;
            BSONObjIterator i( spec );
            while ( i.more() ) {
                BSONElement e = i.next();
                if ( str::equals( e.fieldName(), "key" ) ) {
                    b.append( "key", keyPattern );
                }
                else if ( str::equals( e.fieldName(), "weights" ) ) {
                    b.append( "weights", weights );
                    weights = BSONObj();
                }
                else if ( str::equals( e.fieldName(), "default_language" ) ) {
                    b.append( "default_language", default_language );
                    default_language = "";
                }
                else if ( str::equals( e.fieldName(), "language_override" ) ) {
                    b.append( "language_override", language_override );
                    language_override = "";
                }
                else if ( str::equals( e.fieldName(), "v" ) ) {
                    version = e.numberInt();
                }
                else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) {
                    textIndexVersion = e.numberInt();
                    uassert( 16730,
                             str::stream() << "bad textIndexVersion: " << textIndexVersion,
                             textIndexVersion == 2 );
                }
                else {
                    b.append( e );
                }
            }
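            // Illustrative example (hypothetical spec): for an input of
            // { key: { title: "text" } }, none of "weights", "default_language",
            // "language_override", or "textIndexVersion" appear in the loop above, so the
            // appends below produce
            //   { key: { _fts: "text", _ftsx: 1 }, weights: { title: 1 },
            //     default_language: "english", language_override: "language",
            //     textIndexVersion: 2 }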
            if ( !weights.isEmpty() )
                b.append( "weights", weights );
            if ( !default_language.empty() )
                b.append( "default_language", default_language );
            if ( !language_override.empty() )
                b.append( "language_override", language_override );
            if ( version >= 0 )
                b.append( "v", version );

            b.append( "textIndexVersion", textIndexVersion );

            return b.obj();
        }

    }
}
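// Illustrative usage sketch (not part of the original file; the field name "title" and the
// sample document are hypothetical, and shown only to tie fixSpec(), the constructor, and
// scoreDocument() together):
//
//   BSONObj spec = mongo::fts::FTSSpec::fixSpec( BSON( "key" << BSON( "title" << "text" ) ) );
//   mongo::fts::FTSSpec ftsSpec( spec );
//   mongo::fts::TermFrequencyMap freqs;
//   ftsSpec.scoreDocument( BSON( "title" << "cats and dogs" ),
//                          mongo::fts::FTSLanguage::makeFTSLanguage( "english" ).getValue(),
//                          "", false, &freqs );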