diff options
author | Mark Benvenuto <mark.benvenuto@mongodb.com> | 2015-04-01 14:33:08 -0400 |
---|---|---|
committer | Mark Benvenuto <mark.benvenuto@mongodb.com> | 2015-04-01 14:58:14 -0400 |
commit | 937b2bdc5b85095734a9cc08fccc9a8586e871cd (patch) | |
tree | 00ec134c5b43041456435960a6e95ef4ff0b9fa8 /src/mongo/db | |
parent | 72598f750d732c08c98f5f578bf1335acd78e10e (diff) | |
download | mongo-937b2bdc5b85095734a9cc08fccc9a8586e871cd.tar.gz |
SERVER-17520: Add support for FTS Tokenizer stop word filtering
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- | src/mongo/db/fts/fts_basic_tokenizer.cpp | 9 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_basic_tokenizer.h | 4 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_matcher.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_matcher_test.cpp | 20 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_query.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_spec.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_tokenizer.h | 24 |
7 files changed, 58 insertions, 13 deletions
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp index 296f473f144..b0b1448ab60 100644 --- a/src/mongo/db/fts/fts_basic_tokenizer.cpp +++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp @@ -48,8 +48,8 @@ namespace fts { : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) { } - void BasicFTSTokenizer::reset(const char* document, bool generateCaseSensitiveTokens) { - _generateCaseSensitiveTokens = generateCaseSensitiveTokens; + void BasicFTSTokenizer::reset(const char* document, Options options) { + _options = options; _tokenizer = stdx::make_unique<Tokenizer>(_language, document); } @@ -69,11 +69,12 @@ namespace fts { // Stop words are case-sensitive so we need them to be lower cased to check // against the stop word list - if (_stopWords->isStopWord(word)) { + if ((_options & FTSTokenizer::FilterStopWords) && + _stopWords->isStopWord(word)) { continue; } - if (_generateCaseSensitiveTokens) { + if (_options & FTSTokenizer::GenerateCaseSensitiveTokens) { word = token.data.toString(); } diff --git a/src/mongo/db/fts/fts_basic_tokenizer.h b/src/mongo/db/fts/fts_basic_tokenizer.h index fd59a4583fc..7e89289488f 100644 --- a/src/mongo/db/fts/fts_basic_tokenizer.h +++ b/src/mongo/db/fts/fts_basic_tokenizer.h @@ -58,7 +58,7 @@ namespace fts { public: BasicFTSTokenizer(const FTSLanguage* language); - void reset(const char* document, bool generateCaseSensitiveTokens) override; + void reset(const char* document, Options options) override; bool moveNext() override; @@ -70,7 +70,7 @@ namespace fts { const StopWords* const _stopWords; std::unique_ptr<Tokenizer> _tokenizer; - bool _generateCaseSensitiveTokens; + Options _options; std::string _stem; }; diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp index 634bcf345cd..c2aa234cd51 100644 --- a/src/mongo/db/fts/fts_matcher.cpp +++ b/src/mongo/db/fts/fts_matcher.cpp @@ -99,7 +99,8 @@ namespace mongo { const string& raw ) 
const { std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); - tokenizer->reset(raw.c_str(), _query.getCaseSensitive()); + tokenizer->reset(raw.c_str(), _query.getCaseSensitive() ? + FTSTokenizer::GenerateCaseSensitiveTokens : FTSTokenizer::None); while (tokenizer->moveNext()) { string word = tokenizer->get().toString(); @@ -131,7 +132,8 @@ namespace mongo { const string& raw ) const { std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); - tokenizer->reset(raw.c_str(), _query.getCaseSensitive()); + tokenizer->reset(raw.c_str(), _query.getCaseSensitive() ? + FTSTokenizer::GenerateCaseSensitiveTokens : FTSTokenizer::None); while (tokenizer->moveNext()) { string word = tokenizer->get().toString(); diff --git a/src/mongo/db/fts/fts_matcher_test.cpp b/src/mongo/db/fts/fts_matcher_test.cpp index 9f8becb83cd..0ea0fbe9e7e 100644 --- a/src/mongo/db/fts/fts_matcher_test.cpp +++ b/src/mongo/db/fts/fts_matcher_test.cpp @@ -94,6 +94,26 @@ namespace mongo { ASSERT( m.hasNegativeTerm( BSON( "x" << "gladly" ) ) ); } + // Test the matcher does not filter out stop words from negative terms + TEST( FTSMatcher, MatcherDoesNotFilterStopWordsNeg ) { + FTSQuery q; + ASSERT_OK( q.parse( "-the", "none", false, TEXT_INDEX_VERSION_2 ) ); + FTSMatcher m( q, + FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) ); + + ASSERT( m.hasNegativeTerm( BSON( "x" << "the" ) ) ); + } + + // Test the matcher does not filter out stop words from positive terms + TEST( FTSMatcher, MatcherDoesNotFilterStopWordsPos ) { + FTSQuery q; + ASSERT_OK( q.parse( "the", "none", false, TEXT_INDEX_VERSION_2 ) ); + FTSMatcher m( q, + FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) ); + + ASSERT( m.hasPositiveTerm( BSON( "x" << "the" ) ) ); + } + + // Returns whether a document indexed with text data 'doc' contains any positive terms from // case-sensitive text query 'search'. 
static bool docHasPositiveTermWithCase( const std::string& doc, diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp index e05aa5693cc..b3384d4b40c 100644 --- a/src/mongo/db/fts/fts_query.cpp +++ b/src/mongo/db/fts/fts_query.cpp @@ -125,7 +125,7 @@ namespace mongo { void FTSQuery::_addTerm( FTSTokenizer* tokenizer, const string& token, bool negated ) { - tokenizer->reset(token.c_str(), false); + tokenizer->reset(token.c_str(), FTSTokenizer::FilterStopWords); auto& activeTerms = negated ? _negatedTerms : _positiveTerms; @@ -152,7 +152,9 @@ namespace mongo { return; } - tokenizer->reset(token.c_str(), true); + tokenizer->reset(token.c_str(), static_cast<FTSTokenizer::Options>( + FTSTokenizer::FilterStopWords + | FTSTokenizer::GenerateCaseSensitiveTokens)); // If we want case-sensitivity, get the case-sensitive token while (tokenizer->moveNext()) { diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp index 9e68835e83b..332d6d198e0 100644 --- a/src/mongo/db/fts/fts_spec.cpp +++ b/src/mongo/db/fts/fts_spec.cpp @@ -182,7 +182,7 @@ namespace mongo { unsigned numTokens = 0; - tokenizer->reset(raw.rawData(), FTSTokenizer::FilterStopWords ); + tokenizer->reset(raw.rawData(), FTSTokenizer::FilterStopWords ); while (tokenizer->moveNext()) { string term = tokenizer->get().toString(); diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h index 65833aff0cb..6db2014b5fb 100644 --- a/src/mongo/db/fts/fts_tokenizer.h +++ b/src/mongo/db/fts/fts_tokenizer.h @@ -42,17 +42,37 @@ namespace fts { * FTSTokenizer * An iterator of "documents" where a document contains space delimited words. * For each word returns a stem or lemma version of a word optimized for full text indexing. - * Optionally supports returning case sensitive search terms. + * Supports various options to control how tokens are generated. 
*/ class FTSTokenizer { public: virtual ~FTSTokenizer() = default; /** + * Options for generating tokens + */ + enum Options { + /** + * Default means lower cased, and stop words are not filtered. + */ + None = 0, + + /** + * Do not lower case terms. + */ + GenerateCaseSensitiveTokens = 1 << 0, + + /** + * Filter out stop words from returned tokens. + */ + FilterStopWords = 1 << 1, + }; + + /** * Process a new document, and discards any previous results. * May be called multiple times on an instance of an iterator. */ - virtual void reset(const char* document, bool generateCaseSensitiveTokens) = 0; + virtual void reset(const char* document, Options options) = 0; /** * Moves to the next token in the iterator. |