/**
 *    Copyright (C) 2014 MongoDB Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU Affero General Public License, version 3,
 *    as published by the Free Software Foundation.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *    GNU Affero General Public License for more details.
 *
 *    You should have received a copy of the GNU Affero General Public License
 *    along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the GNU Affero General Public License in all respects for
 *    all of the code used other than as permitted herein. If you modify file(s)
 *    with this exception, you may extend this exception to your version of the
 *    file(s), but you are not obligated to do so. If you do not wish to do so,
 *    delete this exception statement from your version. If you delete this
 *    exception statement from all source files in the program, then also delete
 *    it in the license file.
 */

#include "mongo/db/fts/fts_spec.h"

#include "mongo/util/mongoutils/str.h"

namespace mongo {

    namespace fts {

        //
        // This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1
        // text indexes.
        //

        using namespace mongoutils;

        namespace {
            /**
             * Appends the two key-pattern entries every text index carries:
             * "_fts" (valued INDEX_NAME) followed by "_ftsx" (valued 1).
             */
            void _addFTSStuff( BSONObjBuilder* b ) {
                b->append( "_fts", INDEX_NAME );
                b->append( "_ftsx", 1 );
            }
        }

        /**
         * Picks the language used to tokenize/stem 'userDoc'.
         *
         * If the document's language-override field holds a non-empty string, that
         * string is resolved through FTSLanguage::make(); in every other case the
         * index's default language is returned.
         */
        const FTSLanguage& FTSSpec::_getLanguageToUseV1( const BSONObj& userDoc ) const {
            BSONElement e = userDoc[_languageOverrideField];
            if ( e.type() == String ) {
                const char * x = e.valuestrsafe();
                if ( strlen( x ) > 0 ) {
                    StatusWithFTSLanguage swl = FTSLanguage::make( x, TEXT_INDEX_VERSION_1 );
                    dassert( swl.isOK() ); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail.
                    return *swl.getValue();
                }
            }
            return *_defaultLanguage;
        }

        /**
         * Tokenizes 'raw' and accumulates a score for each resulting term into
         * 'docScores'.
         *
         * @param tools      language, stemmer and stop-word list to use
         * @param raw        the string value being indexed
         * @param docScores  term -> score map; existing entries are added to
         * @param weight     multiplier applied to every term found in 'raw'
         *
         * Only Token::TEXT tokens are considered. Each one is lower-cased, dropped
         * if it is a stop word, and stemmed before being counted.
         */
        void FTSSpec::_scoreStringV1( const Tools& tools,
                                      const StringData& raw,
                                      TermFrequencyMap* docScores,
                                      double weight ) const {

            ScoreHelperMap terms;

            // Counts only the tokens that survive the TEXT / stop-word filters.
            unsigned numTokens = 0;

            Tokenizer tokenizer( tools.language, raw );
            while ( tokenizer.more() ) {
                Token t = tokenizer.next();
                if ( t.type != Token::TEXT )
                    continue;

                string term = t.data.toString();
                makeLower( &term );
                if ( tools.stopwords->isStopWord( term ) )
                    continue;
                term = tools.stemmer->stem( term );

                ScoreHelperStruct& data = terms[term];

                // 'exp' doubles on every repeat of the term, so successive
                // occurrences add 1, 1/2, 1/4, ... to 'freq'.
                // NOTE(review): relies on a freshly inserted ScoreHelperStruct
                // starting with exp/count/freq at zero -- confirm in fts_spec.h.
                if ( data.exp )
                    data.exp *= 2;
                else
                    data.exp = 1;
                data.count += 1;
                data.freq += ( 1 / data.exp );

                numTokens++;
            }

            for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) {

                const string& term = i->first;
                const ScoreHelperStruct& data = i->second;

                // in order to adjust weights as a function of term count as it
                // relates to total field length. ie. is this the only word or
                // a frequently occurring term? or does it only show up once in
                // a long block of text?
                // (numTokens is non-zero here: every entry in 'terms' bumped it.)
                double coeff = ( 0.5 * data.count / numTokens ) + 0.5;

                // if term is identical to the raw form of the
                // field (untokenized) give it a small boost.
                double adjustment = 1;
                if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) )
                    adjustment += 0.1;

                double& score = (*docScores)[term];
                score += ( weight * data.freq * coeff * adjustment );
                verify( score <= MAX_WEIGHT );
            }
        }

        /**
         * Looks up the weight configured for 'field'.
         *
         * @return true and stores the weight in '*out' when 'field' has an entry in
         *         _weights; false (leaving '*out' untouched) otherwise.
         */
        bool FTSSpec::_weightV1( const StringData& field, double* out ) const {
            Weights::const_iterator i = _weights.find( field.toString() );
            if ( i == _weights.end() )
                return false;
            *out = i->second;
            return true;
        }

        /*
         * Recurses over all fields of an obj (document in collection)
         *    and fills term,score map term_freqs
         * @param tools, language/stemmer/stop words used to turn a string into terms
         * @param obj, object being parsed
         * @param term_freqs, map to be filled up
         *
         * The language-override field is skipped. String fields are scored with the
         * weight registered for their own field name (as returned by fieldName()),
         * falling back to 1 when there is none. Elements for which isABSONObj() is
         * true are descended into; all other element types are ignored.
         */
        void FTSSpec::_scoreRecurseV1( const Tools& tools,
                                       const BSONObj& obj,
                                       TermFrequencyMap* term_freqs ) const {
            BSONObjIterator j( obj );
            while ( j.more() ) {
                BSONElement x = j.next();

                if ( languageOverrideField() == x.fieldName() )
                    continue;

                if (x.type() == String) {
                    // 'w' keeps its default of 1 when _weightV1 finds no entry.
                    double w = 1;
                    _weightV1( x.fieldName(), &w );
                    _scoreStringV1(tools, x.valuestr(), term_freqs, w);
                }
                else if ( x.isABSONObj() ) {
                    _scoreRecurseV1( tools, x.Obj(), term_freqs);
                }
            }
        }

        /**
         * Computes the term -> score map for one document.
         *
         * @param obj         the document being indexed
         * @param term_freqs  map that receives the accumulated scores
         *
         * With a wildcard spec every field of the document is visited recursively.
         * Otherwise only the fields listed in _weights are looked up and scored,
         * each with its own configured weight.
         */
        void FTSSpec::_scoreDocumentV1( const BSONObj& obj,
                                        TermFrequencyMap* term_freqs ) const {

            const FTSLanguage& language = _getLanguageToUseV1( obj );

            Stemmer stemmer(language);
            Tools tools(language, &stemmer, StopWords::getStopWords( language ));

            if ( wildcard() ) {
                // if * is specified for weight, we can recurse over all fields.
                _scoreRecurseV1(tools, obj, term_freqs);
                return;
            }

            // otherwise, we need to remember the different weights for each field
            // and act accordingly (in other words, call _score)
            for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); ++i ) {

                const char * leftOverName = i->first.c_str();
                // name of field
                // NOTE(review): getFieldDottedOrArray presumably advances
                // 'leftOverName' past the part of the path it consumed; the
                // leftOverName[0] test below depends on that -- confirm.
                BSONElement e = obj.getFieldDottedOrArray(leftOverName);
                // weight associated to name of field
                double weight = i->second;

                if ( e.eoo() ) {
                    // do nothing
                }
                else if ( e.type() == Array ) {
                    BSONObjIterator j( e.Obj() );
                    while ( j.more() ) {
                        BSONElement x = j.next();
                        // If part of the path is still unresolved, look it up
                        // inside each embedded object of the array.
                        if ( leftOverName[0] && x.isABSONObj() )
                            x = x.Obj().getFieldDotted( leftOverName );
                        if ( x.type() == String )
                            _scoreStringV1( tools, x.valuestr(), term_freqs, weight );
                    }
                }
                else if ( e.type() == String ) {
                    _scoreStringV1( tools, e.valuestr(), term_freqs, weight );
                }
            }
        }

        /**
         * Rewrites a text index spec into its normalized form.
         *
         * @param spec  the index spec to normalize
         * @return a new spec in which:
         *   - "key" has every field valued "fts" or "text" removed and the
         *     "_fts"/"_ftsx" pair present (inserted where the first such field
         *     was, or appended if nothing triggered it);
         *   - "weights" holds a weight of 1 for each removed key field, overridden
         *     or extended by the entries of the incoming "weights" object, or the
         *     single WILDCARD entry when "weights" is the WILDCARD string;
         *   - "default_language" defaults to "english" and "language_override"
         *     to "language" when absent or empty;
         *   - "v" (when >= 0) and "textIndexVersion" are emitted last.
         *   All other fields are copied through in their original order.
         *
         * uasserts 17365 if any weight is not in (0, MAX_WORD_WEIGHT) and 17366 if
         * "textIndexVersion" is present with a value other than 1.
         */
        BSONObj FTSSpec::_fixSpecV1( const BSONObj& spec ) {
            // field name -> integer weight, ordered by field name.
            map<string,int> m;

            BSONObj keyPattern;
            {
                BSONObjBuilder b;
                bool addedFtsStuff = false;

                BSONObjIterator i( spec["key"].Obj() );
                while ( i.more() ) {
                    BSONElement e = i.next();
                    if ( str::equals( e.fieldName(), "_fts" ) ||
                         str::equals( e.fieldName(), "_ftsx" ) ) {
                        // Already-normalized key entries are kept verbatim.
                        addedFtsStuff = true;
                        b.append( e );
                    }
                    else if ( e.type() == String &&
                              ( str::equals( "fts", e.valuestr() ) ||
                                str::equals( "text", e.valuestr() ) ) ) {

                        if ( !addedFtsStuff ) {
                            _addFTSStuff( &b );
                            addedFtsStuff = true;
                        }

                        // The text field moves out of the key and into the weights.
                        m[e.fieldName()] = 1;
                    }
                    else {
                        b.append( e );
                    }
                }

                if ( !addedFtsStuff )
                    _addFTSStuff( &b );

                keyPattern = b.obj();
            }

            if ( spec["weights"].isABSONObj() ) {
                BSONObjIterator i( spec["weights"].Obj() );
                while ( i.more() ) {
                    BSONElement e = i.next();
                    m[e.fieldName()] = e.numberInt();
                }
            }
            else if ( spec["weights"].str() == WILDCARD ) {
                m[WILDCARD] = 1;
            }

            BSONObj weights;
            {
                BSONObjBuilder b;
                for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) {
                    uassert( 17365, "score for word too high",
                             i->second > 0 && i->second < MAX_WORD_WEIGHT );
                    b.append( i->first, i->second );
                }
                weights = b.obj();
            }

            string default_language(spec.getStringField("default_language"));
            if ( default_language.empty() )
                default_language = "english";

            string language_override(spec.getStringField("language_override"));
            if ( language_override.empty() )
                language_override = "language";

            int version = -1;
            int textIndexVersion = 1;

            BSONObjBuilder b;
            BSONObjIterator i( spec );
            while ( i.more() ) {
                BSONElement e = i.next();
                if ( str::equals( e.fieldName(), "key" ) ) {
                    b.append( "key", keyPattern );
                }
                else if ( str::equals( e.fieldName(), "weights" ) ) {
                    b.append( "weights", weights );
                    // Cleared so the "not yet emitted" check below skips it.
                    weights = BSONObj();
                }
                else if ( str::equals( e.fieldName(), "default_language" ) ) {
                    b.append( "default_language", default_language);
                    default_language = "";
                }
                else if ( str::equals( e.fieldName(), "language_override" ) ) {
                    b.append( "language_override", language_override);
                    language_override = "";
                }
                else if ( str::equals( e.fieldName(), "v" ) ) {
                    version = e.numberInt();
                }
                else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) {
                    textIndexVersion = e.numberInt();
                    uassert( 17366,
                             str::stream() << "bad textIndexVersion: " << textIndexVersion,
                             textIndexVersion == 1 );
                }
                else {
                    b.append( e );
                }
            }

            // Anything the incoming spec did not mention is appended here.
            if ( !weights.isEmpty() )
                b.append( "weights", weights );
            if ( !default_language.empty() )
                b.append( "default_language", default_language);
            if ( !language_override.empty() )
                b.append( "language_override", language_override);

            if ( version >= 0 )
                b.append( "v", version );

            b.append( "textIndexVersion", textIndexVersion );

            return b.obj();
        }
    }
}