diff options
author | Jason Rassi <rassi@10gen.com> | 2013-10-11 21:54:58 -0700 |
---|---|---|
committer | Jason Rassi <rassi@10gen.com> | 2013-10-11 21:55:01 -0700 |
commit | bf0f29709b19565245be370aa3f8c46f0332de91 (patch) | |
tree | 632455643b23dbb7d24c1964facddf8e2b420b85 /src/mongo | |
parent | e9542d111dcd02f93113ad448ca15a1c9b95f1e7 (diff) | |
download | mongo-bf0f29709b19565245be370aa3f8c46f0332de91.tar.gz |
SERVER-9390 Text search support for multi-language documents
FTSIndexFormat::getKeys() now descends into subdocuments to find the
language field to apply to the given subdocument.
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/fts/fts_index_format.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_spec.cpp | 162 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_spec.h | 41 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_spec_test.cpp | 243 |
4 files changed, 353 insertions, 95 deletions
diff --git a/src/mongo/db/fts/fts_index_format.cpp b/src/mongo/db/fts/fts_index_format.cpp index ce6f3164c48..62a2fbe699e 100644 --- a/src/mongo/db/fts/fts_index_format.cpp +++ b/src/mongo/db/fts/fts_index_format.cpp @@ -79,7 +79,7 @@ namespace mongo { TermFrequencyMap term_freqs; - spec.scoreDocument( obj, &term_freqs ); + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &term_freqs ); // create index keys from raw scores // only 1 per string diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp index a66af81090d..3120a9c7ed2 100644 --- a/src/mongo/db/fts/fts_spec.cpp +++ b/src/mongo/db/fts/fts_spec.cpp @@ -33,6 +33,7 @@ #include "mongo/db/fts/fts_spec.h" #include "mongo/db/fts/fts_util.h" #include "mongo/util/mongoutils/str.h" +#include "mongo/util/stringutils.h" namespace mongo { @@ -40,7 +41,8 @@ namespace mongo { using namespace mongoutils; - const double MAX_WEIGHT = 1000000000.0; + const double DEFAULT_WEIGHT = 1; + const double MAX_WEIGHT = 1000000000; const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000; FTSSpec::FTSSpec( const BSONObj& indexInfo ) { @@ -101,101 +103,113 @@ namespace mongo { } } - bool FTSSpec::weight( const StringData& field, double* out ) const { - Weights::const_iterator i = _weights.find( field.toString() ); - if ( i == _weights.end() ) - return false; - *out = i->second; - return true; - } - - string FTSSpec::getLanguageToUse( const BSONObj& userDoc ) const { + string FTSSpec::getLanguageToUse( const BSONObj& userDoc, + const string& currentLanguage ) const { BSONElement e = userDoc[_languageOverrideField]; if ( e.type() == String ) { const char * x = e.valuestrsafe(); if ( strlen( x ) > 0 ) return x; } - return _defaultLanguage; + return currentLanguage; } - /* - * Calculates the score for all terms in a document of a collection - * @param obj, the document in the collection being parsed - * @param term_freqs, map<string,double> to fill up - */ - void FTSSpec::scoreDocument( const BSONObj& obj, 
TermFrequencyMap* term_freqs ) const { - - string language = getLanguageToUse( obj ); - - Stemmer stemmer(language); - Tools tools(language); - tools.stemmer = &stemmer; - tools.stopwords = StopWords::getStopWords( language ); - - if ( wildcard() ) { - // if * is specified for weight, we can recurse over all fields. - _scoreRecurse(tools, obj, term_freqs); - return; - } - // otherwise, we need to remember the different weights for each field - // and act accordingly (in other words, call _score) - for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++ ) { - const char * leftOverName = i->first.c_str(); - // name of field - BSONElement e = obj.getFieldDottedOrArray(leftOverName); - // weight associated to name of field - double weight = i->second; - - if ( e.eoo() ) { - // do nothing - } - else if ( e.type() == Array ) { - BSONObjIterator j( e.Obj() ); - while ( j.more() ) { - BSONElement x = j.next(); - if ( leftOverName[0] && x.isABSONObj() ) - x = x.Obj().getFieldDotted( leftOverName ); - if ( x.type() == String ) - _scoreString( tools, x.valuestr(), term_freqs, weight ); - } - } - else if ( e.type() == String ) { - _scoreString( tools, e.valuestr(), term_freqs, weight ); + namespace { + /** + * Check for exact match or path prefix match. + */ + inline bool _matchPrefix( const string& dottedName, const string& weight ) { + if ( weight == dottedName ) { + return true; } - + return str::startsWith( weight, dottedName + '.' 
); } } + void FTSSpec::scoreDocument( const BSONObj& obj, + const string& parentLanguage, + const string& parentPath, + bool isArray, + TermFrequencyMap* term_freqs ) const { + string language = getLanguageToUse( obj, parentLanguage ); + Stemmer stemmer( language ); + Tools tools( language, &stemmer, StopWords::getStopWords( language ) ); - /* - * Recurses over all fields of an obj (document in collection) - * and fills term,score map term_freqs - * @param tokenizer, tokenizer to tokenize a string into terms - * @param obj, object being parsed - * term_freqs, map <term,score> to be filled up - */ - void FTSSpec::_scoreRecurse(const Tools& tools, - const BSONObj& obj, - TermFrequencyMap* term_freqs ) const { + // Perform a depth-first traversal of obj, skipping fields not touched by this spec. BSONObjIterator j( obj ); while ( j.more() ) { - BSONElement x = j.next(); - if ( languageOverrideField() == x.fieldName() ) - continue; + BSONElement elem = j.next(); + string fieldName = elem.fieldName(); - if (x.type() == String) { - double w = 1; - weight( x.fieldName(), &w ); - _scoreString(tools, x.valuestr(), term_freqs, w); + // Skip "language" specifier fields if wildcard. + if ( wildcard() && languageOverrideField() == fieldName ) { + continue; } - else if ( x.isABSONObj() ) { - _scoreRecurse( tools, x.Obj(), term_freqs); + + // Compose the dotted name of the current field: + // 1. parent path empty (top level): use the current field name + // 2. parent path non-empty and obj is an array: use the parent path + // 3. parent path non-empty and obj is a sub-doc: append field name to parent path + string dottedName = ( parentPath.empty() ? fieldName + : isArray ? parentPath + : parentPath + '.' + fieldName ); + + // Find lower bound of dottedName in _weights. lower_bound leaves us at the first + // weight that could possibly match or be a prefix of dottedName. 
And if this + // element fails to match, then no subsequent weight can match, since the weights + // are lexicographically ordered. + Weights::const_iterator i = _weights.lower_bound( dottedName ); + + // possibleWeightMatch is set if the weight map contains either a match or some item + // lexicographically larger than fieldName. This boolean acts as a guard on + // dereferences of iterator 'i'. + bool possibleWeightMatch = ( i != _weights.end() ); + + // Optimize away two cases, when not wildcard: + // 1. lower_bound seeks to end(): no prefix match possible + // 2. lower_bound seeks to a name which is not a prefix + if ( !wildcard() ) { + if ( !possibleWeightMatch ) { + continue; + } + else if ( !_matchPrefix( dottedName, i->first ) ) { + continue; + } } + // Is the current field an exact match on a weight? + bool exactMatch = ( possibleWeightMatch && i->first == dottedName ); + + double weight = ( possibleWeightMatch ? i->second : DEFAULT_WEIGHT ); + + switch ( elem.type() ) { + case String: + // Only index strings on exact match or wildcard. + if ( exactMatch || wildcard() ) { + _scoreString( tools, elem.valuestr(), term_freqs, weight ); + } + break; + case Object: + // Only descend into a sub-document on proper prefix or wildcard. Note that + // !exactMatch is a sufficient test for proper prefix match, because of + // matchPrefix() continue block above. + if ( !exactMatch || wildcard() ) { + scoreDocument( elem.Obj(), language, dottedName, false, term_freqs ); + } + break; + case Array: + // Only descend into arrays from non-array parents or on wildcard. + if ( !isArray || wildcard() ) { + scoreDocument( elem.Obj(), language, dottedName, true, term_freqs ); + } + break; + default: + // Skip over all other BSON types. 
+ break; + } } } diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h index e867dc04246..d13281eac9f 100644 --- a/src/mongo/db/fts/fts_spec.h +++ b/src/mongo/db/fts/fts_spec.h @@ -54,8 +54,13 @@ namespace mongo { class FTSSpec { struct Tools { - Tools( string language ) - : language( language ){} + Tools( string _language, + const Stemmer* _stemmer, + const StopWords* _stopwords ) + : language( _language ) + , stemmer( _stemmer ) + , stopwords( _stopwords ) {} + const std::string& language; const Stemmer* stemmer; const StopWords* stopwords; @@ -74,9 +79,26 @@ namespace mongo { size_t numExtraAfter() const { return _extraAfter.size(); } const std::string& extraAfter( unsigned i ) const { return _extraAfter[i]; } - string getLanguageToUse( const BSONObj& userDoc ) const; + /** + * Find a "language" field, if any, in a given BSON doc. If the language is not on the + * list of valid languages, return current. + */ + string getLanguageToUse( const BSONObj& userDoc, + const std::string& currentLanguage ) const; - void scoreDocument( const BSONObj& obj, TermFrequencyMap* scores ) const; + /** + * Calculates term/score pairs for a BSONObj as applied to this spec. 
+ * - "obj": the BSONObj to traverse; can be a subdocument or array + * - "parentLanguage": nearest enclosing document "language" spec for obj + * - "parentPath": obj's dotted path in containing document + * - "isArray": true if obj is an array + * - "term_freqs": out-parameter to store results + */ + void scoreDocument( const BSONObj& obj, + const string& parentLanguage, + const string& parentPath, + bool isArray, + TermFrequencyMap* term_freqs ) const; /** * given a query, pulls out the pieces (in order) that go in the index first @@ -85,19 +107,8 @@ namespace mongo { const Weights& weights() const { return _weights; } - /** - * @param out - untouched if field isn't present - * @return if field is here - */ - bool weight( const StringData& field, double* out ) const; - - static BSONObj fixSpec( const BSONObj& spec ); private: - void _scoreRecurse(const Tools& tools, - const BSONObj& obj, - TermFrequencyMap* term_freqs ) const; - void _scoreString( const Tools& tools, const StringData& raw, TermFrequencyMap* term_freqs, diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp index 12a814a29a3..b8bb292c777 100644 --- a/src/mongo/db/fts/fts_spec_test.cpp +++ b/src/mongo/db/fts/fts_spec_test.cpp @@ -31,6 +31,7 @@ #include "mongo/pch.h" #include "mongo/db/fts/fts_spec.h" +#include "mongo/db/json.h" #include "mongo/unittest/unittest.h" namespace mongo { @@ -54,7 +55,8 @@ namespace mongo { FTSSpec spec( FTSSpec::fixSpec( user ) ); TermFrequencyMap m; - spec.scoreDocument( BSON( "title" << "cat sat run" ), &m ); + spec.scoreDocument( BSON( "title" << "cat sat run" ), + "english", "", false, &m ); ASSERT_EQUALS( 3U, m.size() ); ASSERT_EQUALS( m["cat"], m["sat"] ); ASSERT_EQUALS( m["cat"], m["run"] ); @@ -69,9 +71,8 @@ namespace mongo { FTSSpec spec( FTSSpec::fixSpec( user ) ); TermFrequencyMap m; - spec.scoreDocument( BSON( "title" << "cat sat run" - << "text" << "cat book" ), - &m ); + spec.scoreDocument( BSON( "title" << "cat sat run" << 
"text" << "cat book" ), + "english", "", false, &m ); ASSERT_EQUALS( 4U, m.size() ); ASSERT_EQUALS( m["sat"], m["run"] ); @@ -92,7 +93,8 @@ namespace mongo { FTSSpec spec( FTSSpec::fixSpec( user ) ); TermFrequencyMap m; - spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ), &m ); + spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ), + "english", "", false, &m ); ASSERT_EQUALS( 3U, m.size() ); ASSERT( m["cat"] > 0 ); ASSERT( m["sat"] > m["cat"] ); @@ -151,5 +153,236 @@ namespace mongo { ASSERT( !spec.getIndexPrefix( BSONObj(), &prefix ).isOK() ); } + // Test for correct behavior when encountering nested arrays (both directly nested and + // indirectly nested). + + TEST( FTSSpec, NestedArraysPos1 ) { + BSONObj user = BSON( "key" << BSON( "a.b" << "fts" ) ); + FTSSpec spec( FTSSpec::fixSpec( user ) ); + + // The following document matches {"a.b": {$type: 2}}, so "term" should be indexed. + BSONObj obj = fromjson("{a: [{b: ['term']}]}"); // indirectly nested arrays + TermFrequencyMap m; + spec.scoreDocument( obj, "english", "", false, &m ); + ASSERT_EQUALS( 1U, m.size() ); + } + + TEST( FTSSpec, NestedArraysPos2 ) { + BSONObj user = BSON( "key" << BSON( "$**" << "fts" ) ); + FTSSpec spec( FTSSpec::fixSpec( user ) ); + + // The wildcard spec implies a full recursive traversal, so "term" should be indexed. + BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays + TermFrequencyMap m; + spec.scoreDocument( obj, "english", "", false, &m ); + ASSERT_EQUALS( 1U, m.size() ); + } + + TEST( FTSSpec, NestedArraysNeg1 ) { + BSONObj user = BSON( "key" << BSON( "a.b" << "fts" ) ); + FTSSpec spec( FTSSpec::fixSpec( user ) ); + + // The following document does not match {"a.b": {$type: 2}}, so "term" should not be + // indexed. 
+ BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays + TermFrequencyMap m; + spec.scoreDocument( obj, "english", "", false, &m ); + ASSERT_EQUALS( 0U, m.size() ); + } + + // Multi-language test_1: test independent stemming per sub-document + TEST( FTSSpec, NestedLanguages_PerArrayItemStemming ) { + BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "fts" ) ); + FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ a :" + " { b :" + " [ { c : \"walked\", language : \"english\" }," + " { c : \"camminato\", language : \"italian\" }," + " { c : \"ging\", language : \"german\" } ]" + " }" + " }" ); + + spec.scoreDocument( obj, "english", "", false, &tfm ); + + set<string> hits; + hits.insert("walk"); + hits.insert("cammin"); + hits.insert("ging"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS( 1U, hits.count( term ) ); + } + + } + + // Multi-language test_2: test nested stemming per sub-document + TEST( FTSSpec, NestedLanguages_PerSubdocStemming ) { + BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "fts" ) ); + FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " a :" + " { language : \"danish\"," + " b :" + " [ { c : \"foredrag\" }," + " { c : \"foredragsholder\" }," + " { c : \"lector\" } ]" + " }" + "}" ); + + spec.scoreDocument( obj, "english", "", false, &tfm ); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS( 1U, hits.count( term ) ); + } + + } + + // Multi-language test_3: test nested arrays + TEST( FTSSpec, NestedLanguages_NestedArrays ) { + BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "fts" ) ); + FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); + 
TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " a : [" + " { language : \"danish\"," + " b :" + " [ { c : [\"foredrag\"] }," + " { c : [\"foredragsholder\"] }," + " { c : [\"lector\"] } ]" + " } ]" + "}" ); + + spec.scoreDocument( obj, "english", "", false, &tfm ); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS( 1U, hits.count( term ) ); + } + + } + + // Multi-language test_4: test pruning + TEST( FTSSpec, NestedLanguages_PathPruning ) { + BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "fts" ) ); + FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " a : " + " { language : \"danish\"," + " bc : \"foo\"," + " b : { d: \"bar\" }," + " b :" + " [ { c : \"foredrag\" }," + " { c : \"foredragsholder\" }," + " { c : \"lector\" } ]" + " }" + "}" ); + + spec.scoreDocument( obj, "english", "", false, &tfm ); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS( 1U, hits.count( term ) ); + } + + } + + // Multi-language test_5: test wildcard spec + TEST( FTSSpec, NestedLanguages_Wildcard ) { + BSONObj indexSpec = BSON( "key" << BSON( "$**" << "fts" ) ); + FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " b : \"walking\"," + " c : { e: \"walked\" }," + " d : " + " { language : \"danish\"," + " e :" + " [ { f : \"foredrag\" }," + " { f : \"foredragsholder\" }," + " { f : \"lector\" } ]" + " }" + "}" ); + + spec.scoreDocument( obj, "english", "", false, &tfm ); + + set<string> hits; + hits.insert("foredrag"); + 
hits.insert("foredragshold"); + hits.insert("lector"); + hits.insert("walk"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS( 1U, hits.count( term ) ); + } + + } + + // Multi-language test_6: test wildcard spec with override + TEST( FTSSpec, NestedLanguages_WildcardOverride ) { + BSONObj indexSpec = BSON( "key" << BSON( "$**" << "fts" ) << + "weights" << BSON( "d.e.f" << 20 ) ); + FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " b : \"walking\"," + " c : { e: \"walked\" }," + " d : " + " { language : \"danish\"," + " e :" + " [ { f : \"foredrag\" }," + " { f : \"foredragsholder\" }," + " { f : \"lector\" } ]" + " }" + "}" ); + + spec.scoreDocument( obj, "english", "", false, &tfm ); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + hits.insert("walk"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS( 1U, hits.count( term ) ); + } + + } + + } } |