summaryrefslogtreecommitdiff
path: root/src/mongo/db
diff options
context:
space:
mode:
authorMark Benvenuto <mark.benvenuto@mongodb.com>2015-04-01 14:33:08 -0400
committerMark Benvenuto <mark.benvenuto@mongodb.com>2015-04-01 14:58:14 -0400
commit937b2bdc5b85095734a9cc08fccc9a8586e871cd (patch)
tree00ec134c5b43041456435960a6e95ef4ff0b9fa8 /src/mongo/db
parent72598f750d732c08c98f5f578bf1335acd78e10e (diff)
downloadmongo-937b2bdc5b85095734a9cc08fccc9a8586e871cd.tar.gz
SERVER-17520: Add support for FTS Tokenizer stop word filtering
Diffstat (limited to 'src/mongo/db')
-rw-r--r--src/mongo/db/fts/fts_basic_tokenizer.cpp9
-rw-r--r--src/mongo/db/fts/fts_basic_tokenizer.h4
-rw-r--r--src/mongo/db/fts/fts_matcher.cpp6
-rw-r--r--src/mongo/db/fts/fts_matcher_test.cpp20
-rw-r--r--src/mongo/db/fts/fts_query.cpp6
-rw-r--r--src/mongo/db/fts/fts_spec.cpp2
-rw-r--r--src/mongo/db/fts/fts_tokenizer.h24
7 files changed, 58 insertions, 13 deletions
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp
index 296f473f144..b0b1448ab60 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer.cpp
+++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp
@@ -48,8 +48,8 @@ namespace fts {
: _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) {
}
- void BasicFTSTokenizer::reset(const char* document, bool generateCaseSensitiveTokens) {
- _generateCaseSensitiveTokens = generateCaseSensitiveTokens;
+ void BasicFTSTokenizer::reset(const char* document, Options options) {
+ _options = options;
_tokenizer = stdx::make_unique<Tokenizer>(_language, document);
}
@@ -69,11 +69,12 @@ namespace fts {
// Stop words are case-sensitive so we need them to be lower cased to check
// against the stop word list
- if (_stopWords->isStopWord(word)) {
+ if ((_options & FTSTokenizer::FilterStopWords) &&
+ _stopWords->isStopWord(word)) {
continue;
}
- if (_generateCaseSensitiveTokens) {
+ if (_options & FTSTokenizer::GenerateCaseSensitiveTokens) {
word = token.data.toString();
}
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.h b/src/mongo/db/fts/fts_basic_tokenizer.h
index fd59a4583fc..7e89289488f 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer.h
+++ b/src/mongo/db/fts/fts_basic_tokenizer.h
@@ -58,7 +58,7 @@ namespace fts {
public:
BasicFTSTokenizer(const FTSLanguage* language);
- void reset(const char* document, bool generateCaseSensitiveTokens) override;
+ void reset(const char* document, Options options) override;
bool moveNext() override;
@@ -70,7 +70,7 @@ namespace fts {
const StopWords* const _stopWords;
std::unique_ptr<Tokenizer> _tokenizer;
- bool _generateCaseSensitiveTokens;
+ Options _options;
std::string _stem;
};
diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp
index 634bcf345cd..c2aa234cd51 100644
--- a/src/mongo/db/fts/fts_matcher.cpp
+++ b/src/mongo/db/fts/fts_matcher.cpp
@@ -99,7 +99,8 @@ namespace mongo {
const string& raw ) const {
std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
- tokenizer->reset(raw.c_str(), _query.getCaseSensitive());
+ tokenizer->reset(raw.c_str(), _query.getCaseSensitive() ?
+ FTSTokenizer::GenerateCaseSensitiveTokens : FTSTokenizer::None);
while (tokenizer->moveNext()) {
string word = tokenizer->get().toString();
@@ -131,7 +132,8 @@ namespace mongo {
const string& raw ) const {
std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
- tokenizer->reset(raw.c_str(), _query.getCaseSensitive());
+ tokenizer->reset(raw.c_str(), _query.getCaseSensitive() ?
+ FTSTokenizer::GenerateCaseSensitiveTokens : FTSTokenizer::None);
while (tokenizer->moveNext()) {
string word = tokenizer->get().toString();
diff --git a/src/mongo/db/fts/fts_matcher_test.cpp b/src/mongo/db/fts/fts_matcher_test.cpp
index 9f8becb83cd..0ea0fbe9e7e 100644
--- a/src/mongo/db/fts/fts_matcher_test.cpp
+++ b/src/mongo/db/fts/fts_matcher_test.cpp
@@ -94,6 +94,26 @@ namespace mongo {
ASSERT( m.hasNegativeTerm( BSON( "x" << "gladly" ) ) );
}
+ // Test the matcher does not filter out stop words from negative terms
+ TEST( FTSMatcher, MatcherDoesNotFilterStopWordsNeg ) {
+ FTSQuery q;
+ ASSERT_OK( q.parse( "-the", "none", false, TEXT_INDEX_VERSION_2 ) );
+ FTSMatcher m( q,
+ FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) );
+
+ ASSERT( m.hasNegativeTerm( BSON( "x" << "the" ) ) );
+ }
+
+ // Test the matcher does not filter out stop words from positive terms
+ TEST( FTSMatcher, MatcherDoesNotFilterStopWordsPos ) {
+ FTSQuery q;
+ ASSERT_OK( q.parse( "the", "none", false, TEXT_INDEX_VERSION_2 ) );
+ FTSMatcher m( q,
+ FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) );
+
+ ASSERT( m.hasPositiveTerm( BSON( "x" << "the" ) ) );
+ }
+
// Returns whether a document indexed with text data 'doc' contains any positive terms from
// case-sensitive text query 'search'.
static bool docHasPositiveTermWithCase( const std::string& doc,
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
index e05aa5693cc..b3384d4b40c 100644
--- a/src/mongo/db/fts/fts_query.cpp
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -125,7 +125,7 @@ namespace mongo {
void FTSQuery::_addTerm( FTSTokenizer* tokenizer,
const string& token,
bool negated ) {
- tokenizer->reset(token.c_str(), false);
+ tokenizer->reset(token.c_str(), FTSTokenizer::FilterStopWords);
auto& activeTerms = negated ? _negatedTerms : _positiveTerms;
@@ -152,7 +152,9 @@ namespace mongo {
return;
}
- tokenizer->reset(token.c_str(), true);
+ tokenizer->reset(token.c_str(), static_cast<FTSTokenizer::Options>(
+ FTSTokenizer::FilterStopWords
+ | FTSTokenizer::GenerateCaseSensitiveTokens));
// If we want case-sensitivity, get the case-sensitive token
while (tokenizer->moveNext()) {
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index 9e68835e83b..332d6d198e0 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -182,7 +182,7 @@ namespace mongo {
unsigned numTokens = 0;
- tokenizer->reset(raw.rawData(), false );
+ tokenizer->reset(raw.rawData(), FTSTokenizer::FilterStopWords );
while (tokenizer->moveNext()) {
string term = tokenizer->get().toString();
diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h
index 65833aff0cb..6db2014b5fb 100644
--- a/src/mongo/db/fts/fts_tokenizer.h
+++ b/src/mongo/db/fts/fts_tokenizer.h
@@ -42,17 +42,37 @@ namespace fts {
* FTSTokenizer
* An iterator of "documents" where a document contains space delimited words.
* For each word returns a stem or lemma version of a word optimized for full text indexing.
- * Optionally supports returning case sensitive search terms.
+ * Supports various options to control how tokens are generated.
*/
class FTSTokenizer {
public:
virtual ~FTSTokenizer() = default;
/**
+ * Options for generating tokens
+ */
+ enum Options {
+ /**
+ * Default means lower cased, and stop words are not filtered.
+ */
+ None = 0,
+
+ /**
+ * Do not lower case terms.
+ */
+ GenerateCaseSensitiveTokens = 1 << 0,
+
+ /**
+ * Filter out stop words from returned tokens.
+ */
+ FilterStopWords = 1 << 1,
+ };
+
+ /**
* Process a new document, and discards any previous results.
* May be called multiple times on an instance of an iterator.
*/
- virtual void reset(const char* document, bool generateCaseSensitiveTokens) = 0;
+ virtual void reset(const char* document, Options options) = 0;
/**
* Moves to the next token in the iterator.