summaryrefslogtreecommitdiff
path: root/src/mongo/db/fts
diff options
context:
space:
mode:
authorJason Rassi <rassi@10gen.com>2013-10-11 21:54:58 -0700
committerJason Rassi <rassi@10gen.com>2013-10-11 21:55:01 -0700
commitbf0f29709b19565245be370aa3f8c46f0332de91 (patch)
tree632455643b23dbb7d24c1964facddf8e2b420b85 /src/mongo/db/fts
parente9542d111dcd02f93113ad448ca15a1c9b95f1e7 (diff)
downloadmongo-bf0f29709b19565245be370aa3f8c46f0332de91.tar.gz
SERVER-9390 Text search support for multi-language documents
FTSIndexFormat::getKeys() now descends into subdocuments to find the language field to apply to each subdocument.
Diffstat (limited to 'src/mongo/db/fts')
-rw-r--r--src/mongo/db/fts/fts_index_format.cpp2
-rw-r--r--src/mongo/db/fts/fts_spec.cpp162
-rw-r--r--src/mongo/db/fts/fts_spec.h41
-rw-r--r--src/mongo/db/fts/fts_spec_test.cpp243
4 files changed, 353 insertions, 95 deletions
diff --git a/src/mongo/db/fts/fts_index_format.cpp b/src/mongo/db/fts/fts_index_format.cpp
index ce6f3164c48..62a2fbe699e 100644
--- a/src/mongo/db/fts/fts_index_format.cpp
+++ b/src/mongo/db/fts/fts_index_format.cpp
@@ -79,7 +79,7 @@ namespace mongo {
TermFrequencyMap term_freqs;
- spec.scoreDocument( obj, &term_freqs );
+ spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &term_freqs );
// create index keys from raw scores
// only 1 per string
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index a66af81090d..3120a9c7ed2 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -33,6 +33,7 @@
#include "mongo/db/fts/fts_spec.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/stringutils.h"
namespace mongo {
@@ -40,7 +41,8 @@ namespace mongo {
using namespace mongoutils;
- const double MAX_WEIGHT = 1000000000.0;
+ const double DEFAULT_WEIGHT = 1;
+ const double MAX_WEIGHT = 1000000000;
const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000;
FTSSpec::FTSSpec( const BSONObj& indexInfo ) {
@@ -101,101 +103,113 @@ namespace mongo {
}
}
- bool FTSSpec::weight( const StringData& field, double* out ) const {
- Weights::const_iterator i = _weights.find( field.toString() );
- if ( i == _weights.end() )
- return false;
- *out = i->second;
- return true;
- }
-
- string FTSSpec::getLanguageToUse( const BSONObj& userDoc ) const {
+ string FTSSpec::getLanguageToUse( const BSONObj& userDoc,
+ const string& currentLanguage ) const {
BSONElement e = userDoc[_languageOverrideField];
if ( e.type() == String ) {
const char * x = e.valuestrsafe();
if ( strlen( x ) > 0 )
return x;
}
- return _defaultLanguage;
+ return currentLanguage;
}
- /*
- * Calculates the score for all terms in a document of a collection
- * @param obj, the document in the collection being parsed
- * @param term_freqs, map<string,double> to fill up
- */
- void FTSSpec::scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const {
-
- string language = getLanguageToUse( obj );
-
- Stemmer stemmer(language);
- Tools tools(language);
- tools.stemmer = &stemmer;
- tools.stopwords = StopWords::getStopWords( language );
-
- if ( wildcard() ) {
- // if * is specified for weight, we can recurse over all fields.
- _scoreRecurse(tools, obj, term_freqs);
- return;
- }
- // otherwise, we need to remember the different weights for each field
- // and act accordingly (in other words, call _score)
- for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++ ) {
- const char * leftOverName = i->first.c_str();
- // name of field
- BSONElement e = obj.getFieldDottedOrArray(leftOverName);
- // weight associated to name of field
- double weight = i->second;
-
- if ( e.eoo() ) {
- // do nothing
- }
- else if ( e.type() == Array ) {
- BSONObjIterator j( e.Obj() );
- while ( j.more() ) {
- BSONElement x = j.next();
- if ( leftOverName[0] && x.isABSONObj() )
- x = x.Obj().getFieldDotted( leftOverName );
- if ( x.type() == String )
- _scoreString( tools, x.valuestr(), term_freqs, weight );
- }
- }
- else if ( e.type() == String ) {
- _scoreString( tools, e.valuestr(), term_freqs, weight );
+ namespace {
+ /**
+ * Check for exact match or path prefix match.
+ */
+ inline bool _matchPrefix( const string& dottedName, const string& weight ) {
+ if ( weight == dottedName ) {
+ return true;
}
-
+ return str::startsWith( weight, dottedName + '.' );
}
}
+ void FTSSpec::scoreDocument( const BSONObj& obj,
+ const string& parentLanguage,
+ const string& parentPath,
+ bool isArray,
+ TermFrequencyMap* term_freqs ) const {
+ string language = getLanguageToUse( obj, parentLanguage );
+ Stemmer stemmer( language );
+ Tools tools( language, &stemmer, StopWords::getStopWords( language ) );
- /*
- * Recurses over all fields of an obj (document in collection)
- * and fills term,score map term_freqs
- * @param tokenizer, tokenizer to tokenize a string into terms
- * @param obj, object being parsed
- * term_freqs, map <term,score> to be filled up
- */
- void FTSSpec::_scoreRecurse(const Tools& tools,
- const BSONObj& obj,
- TermFrequencyMap* term_freqs ) const {
+ // Perform a depth-first traversal of obj, skipping fields not touched by this spec.
BSONObjIterator j( obj );
while ( j.more() ) {
- BSONElement x = j.next();
- if ( languageOverrideField() == x.fieldName() )
- continue;
+ BSONElement elem = j.next();
+ string fieldName = elem.fieldName();
- if (x.type() == String) {
- double w = 1;
- weight( x.fieldName(), &w );
- _scoreString(tools, x.valuestr(), term_freqs, w);
+ // Skip "language" specifier fields if wildcard.
+ if ( wildcard() && languageOverrideField() == fieldName ) {
+ continue;
}
- else if ( x.isABSONObj() ) {
- _scoreRecurse( tools, x.Obj(), term_freqs);
+
+ // Compose the dotted name of the current field:
+ // 1. parent path empty (top level): use the current field name
+ // 2. parent path non-empty and obj is an array: use the parent path
+ // 3. parent path non-empty and obj is a sub-doc: append field name to parent path
+ string dottedName = ( parentPath.empty() ? fieldName
+ : isArray ? parentPath
+ : parentPath + '.' + fieldName );
+
+ // Find lower bound of dottedName in _weights. lower_bound leaves us at the first
+ // weight that could possibly match or be a prefix of dottedName. And if this
+ // element fails to match, then no subsequent weight can match, since the weights
+ // are lexicographically ordered.
+ Weights::const_iterator i = _weights.lower_bound( dottedName );
+
+ // possibleWeightMatch is set if the weight map contains either a match or some item
+ // lexicographically larger than dottedName. This boolean acts as a guard on
+ // dereferences of iterator 'i'.
+ bool possibleWeightMatch = ( i != _weights.end() );
+
+ // Optimize away two cases, when not wildcard:
+ // 1. lower_bound seeks to end(): no prefix match possible
+ // 2. lower_bound seeks to a name which is not a prefix
+ if ( !wildcard() ) {
+ if ( !possibleWeightMatch ) {
+ continue;
+ }
+ else if ( !_matchPrefix( dottedName, i->first ) ) {
+ continue;
+ }
}
+ // Is the current field an exact match on a weight?
+ bool exactMatch = ( possibleWeightMatch && i->first == dottedName );
+
+ double weight = ( possibleWeightMatch ? i->second : DEFAULT_WEIGHT );
+
+ switch ( elem.type() ) {
+ case String:
+ // Only index strings on exact match or wildcard.
+ if ( exactMatch || wildcard() ) {
+ _scoreString( tools, elem.valuestr(), term_freqs, weight );
+ }
+ break;
+ case Object:
+ // Only descend into a sub-document on proper prefix or wildcard. Note that
+ // !exactMatch is a sufficient test for proper prefix match, because of
+ // _matchPrefix() continue block above.
+ if ( !exactMatch || wildcard() ) {
+ scoreDocument( elem.Obj(), language, dottedName, false, term_freqs );
+ }
+ break;
+ case Array:
+ // Only descend into arrays from non-array parents or on wildcard.
+ if ( !isArray || wildcard() ) {
+ scoreDocument( elem.Obj(), language, dottedName, true, term_freqs );
+ }
+ break;
+ default:
+ // Skip over all other BSON types.
+ break;
+ }
}
}
diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h
index e867dc04246..d13281eac9f 100644
--- a/src/mongo/db/fts/fts_spec.h
+++ b/src/mongo/db/fts/fts_spec.h
@@ -54,8 +54,13 @@ namespace mongo {
class FTSSpec {
struct Tools {
- Tools( string language )
- : language( language ){}
+ Tools( string _language,
+ const Stemmer* _stemmer,
+ const StopWords* _stopwords )
+ : language( _language )
+ , stemmer( _stemmer )
+ , stopwords( _stopwords ) {}
+
const std::string& language;
const Stemmer* stemmer;
const StopWords* stopwords;
@@ -74,9 +79,26 @@ namespace mongo {
size_t numExtraAfter() const { return _extraAfter.size(); }
const std::string& extraAfter( unsigned i ) const { return _extraAfter[i]; }
- string getLanguageToUse( const BSONObj& userDoc ) const;
+ /**
+ * Find a "language" field, if any, in a given BSON doc. If the language is not on the
+ * list of valid languages, return current.
+ */
+ string getLanguageToUse( const BSONObj& userDoc,
+ const std::string& currentLanguage ) const;
- void scoreDocument( const BSONObj& obj, TermFrequencyMap* scores ) const;
+ /**
+ * Calculates term/score pairs for a BSONObj as applied to this spec.
+ * - "obj": the BSONObj to traverse; can be a subdocument or array
+ * - "parentLanguage": nearest enclosing document "language" spec for obj
+ * - "parentPath": obj's dotted path in containing document
+ * - "isArray": true if obj is an array
+ * - "term_freqs": out-parameter to store results
+ */
+ void scoreDocument( const BSONObj& obj,
+ const string& parentLanguage,
+ const string& parentPath,
+ bool isArray,
+ TermFrequencyMap* term_freqs ) const;
/**
* given a query, pulls out the pieces (in order) that go in the index first
@@ -85,19 +107,8 @@ namespace mongo {
const Weights& weights() const { return _weights; }
- /**
- * @param out - untouched if field isn't present
- * @return if field is here
- */
- bool weight( const StringData& field, double* out ) const;
-
-
static BSONObj fixSpec( const BSONObj& spec );
private:
- void _scoreRecurse(const Tools& tools,
- const BSONObj& obj,
- TermFrequencyMap* term_freqs ) const;
-
void _scoreString( const Tools& tools,
const StringData& raw,
TermFrequencyMap* term_freqs,
diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp
index 12a814a29a3..b8bb292c777 100644
--- a/src/mongo/db/fts/fts_spec_test.cpp
+++ b/src/mongo/db/fts/fts_spec_test.cpp
@@ -31,6 +31,7 @@
#include "mongo/pch.h"
#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/json.h"
#include "mongo/unittest/unittest.h"
namespace mongo {
@@ -54,7 +55,8 @@ namespace mongo {
FTSSpec spec( FTSSpec::fixSpec( user ) );
TermFrequencyMap m;
- spec.scoreDocument( BSON( "title" << "cat sat run" ), &m );
+ spec.scoreDocument( BSON( "title" << "cat sat run" ),
+ "english", "", false, &m );
ASSERT_EQUALS( 3U, m.size() );
ASSERT_EQUALS( m["cat"], m["sat"] );
ASSERT_EQUALS( m["cat"], m["run"] );
@@ -69,9 +71,8 @@ namespace mongo {
FTSSpec spec( FTSSpec::fixSpec( user ) );
TermFrequencyMap m;
- spec.scoreDocument( BSON( "title" << "cat sat run"
- << "text" << "cat book" ),
- &m );
+ spec.scoreDocument( BSON( "title" << "cat sat run" << "text" << "cat book" ),
+ "english", "", false, &m );
ASSERT_EQUALS( 4U, m.size() );
ASSERT_EQUALS( m["sat"], m["run"] );
@@ -92,7 +93,8 @@ namespace mongo {
FTSSpec spec( FTSSpec::fixSpec( user ) );
TermFrequencyMap m;
- spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ), &m );
+ spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ),
+ "english", "", false, &m );
ASSERT_EQUALS( 3U, m.size() );
ASSERT( m["cat"] > 0 );
ASSERT( m["sat"] > m["cat"] );
@@ -151,5 +153,236 @@ namespace mongo {
ASSERT( !spec.getIndexPrefix( BSONObj(), &prefix ).isOK() );
}
+ // Test for correct behavior when encountering nested arrays (both directly nested and
+ // indirectly nested).
+
+ TEST( FTSSpec, NestedArraysPos1 ) {
+ BSONObj user = BSON( "key" << BSON( "a.b" << "fts" ) );
+ FTSSpec spec( FTSSpec::fixSpec( user ) );
+
+ // The following document matches {"a.b": {$type: 2}}, so "term" should be indexed.
+ BSONObj obj = fromjson("{a: [{b: ['term']}]}"); // indirectly nested arrays
+ TermFrequencyMap m;
+ spec.scoreDocument( obj, "english", "", false, &m );
+ ASSERT_EQUALS( 1U, m.size() );
+ }
+
+ TEST( FTSSpec, NestedArraysPos2 ) {
+ BSONObj user = BSON( "key" << BSON( "$**" << "fts" ) );
+ FTSSpec spec( FTSSpec::fixSpec( user ) );
+
+ // The wildcard spec implies a full recursive traversal, so "term" should be indexed.
+ BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays
+ TermFrequencyMap m;
+ spec.scoreDocument( obj, "english", "", false, &m );
+ ASSERT_EQUALS( 1U, m.size() );
+ }
+
+ TEST( FTSSpec, NestedArraysNeg1 ) {
+ BSONObj user = BSON( "key" << BSON( "a.b" << "fts" ) );
+ FTSSpec spec( FTSSpec::fixSpec( user ) );
+
+ // The following document does not match {"a.b": {$type: 2}}, so "term" should not be
+ // indexed.
+ BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays
+ TermFrequencyMap m;
+ spec.scoreDocument( obj, "english", "", false, &m );
+ ASSERT_EQUALS( 0U, m.size() );
+ }
+
+ // Multi-language test_1: test independent stemming per sub-document
+ TEST( FTSSpec, NestedLanguages_PerArrayItemStemming ) {
+ BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "fts" ) );
+ FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
+ TermFrequencyMap tfm;
+
+ BSONObj obj = fromjson(
+ "{ a :"
+ " { b :"
+ " [ { c : \"walked\", language : \"english\" },"
+ " { c : \"camminato\", language : \"italian\" },"
+ " { c : \"ging\", language : \"german\" } ]"
+ " }"
+ " }" );
+
+ spec.scoreDocument( obj, "english", "", false, &tfm );
+
+ set<string> hits;
+ hits.insert("walk");
+ hits.insert("cammin");
+ hits.insert("ging");
+
+ for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) {
+ string term = i->first;
+ ASSERT_EQUALS( 1U, hits.count( term ) );
+ }
+
+ }
+
+ // Multi-language test_2: test nested stemming per sub-document
+ TEST( FTSSpec, NestedLanguages_PerSubdocStemming ) {
+ BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "fts" ) );
+ FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
+ TermFrequencyMap tfm;
+
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " a :"
+ " { language : \"danish\","
+ " b :"
+ " [ { c : \"foredrag\" },"
+ " { c : \"foredragsholder\" },"
+ " { c : \"lector\" } ]"
+ " }"
+ "}" );
+
+ spec.scoreDocument( obj, "english", "", false, &tfm );
+
+ set<string> hits;
+ hits.insert("foredrag");
+ hits.insert("foredragshold");
+ hits.insert("lector");
+
+ for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) {
+ string term = i->first;
+ ASSERT_EQUALS( 1U, hits.count( term ) );
+ }
+
+ }
+
+ // Multi-language test_3: test nested arrays
+ TEST( FTSSpec, NestedLanguages_NestedArrays ) {
+ BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "fts" ) );
+ FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
+ TermFrequencyMap tfm;
+
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " a : ["
+ " { language : \"danish\","
+ " b :"
+ " [ { c : [\"foredrag\"] },"
+ " { c : [\"foredragsholder\"] },"
+ " { c : [\"lector\"] } ]"
+ " } ]"
+ "}" );
+
+ spec.scoreDocument( obj, "english", "", false, &tfm );
+
+ set<string> hits;
+ hits.insert("foredrag");
+ hits.insert("foredragshold");
+ hits.insert("lector");
+
+ for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) {
+ string term = i->first;
+ ASSERT_EQUALS( 1U, hits.count( term ) );
+ }
+
+ }
+
+ // Multi-language test_4: test pruning
+ TEST( FTSSpec, NestedLanguages_PathPruning ) {
+ BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "fts" ) );
+ FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
+ TermFrequencyMap tfm;
+
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " a : "
+ " { language : \"danish\","
+ " bc : \"foo\","
+ " b : { d: \"bar\" },"
+ " b :"
+ " [ { c : \"foredrag\" },"
+ " { c : \"foredragsholder\" },"
+ " { c : \"lector\" } ]"
+ " }"
+ "}" );
+
+ spec.scoreDocument( obj, "english", "", false, &tfm );
+
+ set<string> hits;
+ hits.insert("foredrag");
+ hits.insert("foredragshold");
+ hits.insert("lector");
+
+ for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) {
+ string term = i->first;
+ ASSERT_EQUALS( 1U, hits.count( term ) );
+ }
+
+ }
+
+ // Multi-language test_5: test wildcard spec
+ TEST( FTSSpec, NestedLanguages_Wildcard ) {
+ BSONObj indexSpec = BSON( "key" << BSON( "$**" << "fts" ) );
+ FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
+ TermFrequencyMap tfm;
+
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " b : \"walking\","
+ " c : { e: \"walked\" },"
+ " d : "
+ " { language : \"danish\","
+ " e :"
+ " [ { f : \"foredrag\" },"
+ " { f : \"foredragsholder\" },"
+ " { f : \"lector\" } ]"
+ " }"
+ "}" );
+
+ spec.scoreDocument( obj, "english", "", false, &tfm );
+
+ set<string> hits;
+ hits.insert("foredrag");
+ hits.insert("foredragshold");
+ hits.insert("lector");
+ hits.insert("walk");
+
+ for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) {
+ string term = i->first;
+ ASSERT_EQUALS( 1U, hits.count( term ) );
+ }
+
+ }
+
+ // Multi-language test_6: test wildcard spec with override
+ TEST( FTSSpec, NestedLanguages_WildcardOverride ) {
+ BSONObj indexSpec = BSON( "key" << BSON( "$**" << "fts" ) <<
+ "weights" << BSON( "d.e.f" << 20 ) );
+ FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
+ TermFrequencyMap tfm;
+
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " b : \"walking\","
+ " c : { e: \"walked\" },"
+ " d : "
+ " { language : \"danish\","
+ " e :"
+ " [ { f : \"foredrag\" },"
+ " { f : \"foredragsholder\" },"
+ " { f : \"lector\" } ]"
+ " }"
+ "}" );
+
+ spec.scoreDocument( obj, "english", "", false, &tfm );
+
+ set<string> hits;
+ hits.insert("foredrag");
+ hits.insert("foredragshold");
+ hits.insert("lector");
+ hits.insert("walk");
+
+ for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) {
+ string term = i->first;
+ ASSERT_EQUALS( 1U, hits.count( term ) );
+ }
+
+ }
+
+
}
}