summaryrefslogtreecommitdiff
path: root/src/mongo/db/fts
diff options
context:
space:
mode:
Diffstat (limited to 'src/mongo/db/fts')
-rw-r--r--src/mongo/db/fts/fts_basic_tokenizer.cpp84
-rw-r--r--src/mongo/db/fts/fts_basic_tokenizer.h67
-rw-r--r--src/mongo/db/fts/fts_basic_tokenizer_test.cpp78
-rw-r--r--src/mongo/db/fts/fts_element_iterator.cpp269
-rw-r--r--src/mongo/db/fts/fts_element_iterator.h246
-rw-r--r--src/mongo/db/fts/fts_element_iterator_test.cpp536
-rw-r--r--src/mongo/db/fts/fts_enabled.cpp68
-rw-r--r--src/mongo/db/fts/fts_index_format.cpp318
-rw-r--r--src/mongo/db/fts/fts_index_format.h62
-rw-r--r--src/mongo/db/fts/fts_index_format_test.cpp323
-rw-r--r--src/mongo/db/fts/fts_language.cpp395
-rw-r--r--src/mongo/db/fts/fts_language.h203
-rw-r--r--src/mongo/db/fts/fts_language_test.cpp197
-rw-r--r--src/mongo/db/fts/fts_matcher.cpp212
-rw-r--r--src/mongo/db/fts/fts_matcher.h138
-rw-r--r--src/mongo/db/fts/fts_matcher_test.cpp383
-rw-r--r--src/mongo/db/fts/fts_query.cpp345
-rw-r--r--src/mongo/db/fts/fts_query.h109
-rw-r--r--src/mongo/db/fts/fts_query_parser.cpp112
-rw-r--r--src/mongo/db/fts/fts_query_parser.h89
-rw-r--r--src/mongo/db/fts/fts_query_test.cpp456
-rw-r--r--src/mongo/db/fts/fts_spec.cpp753
-rw-r--r--src/mongo/db/fts/fts_spec.h264
-rw-r--r--src/mongo/db/fts/fts_spec_legacy.cpp470
-rw-r--r--src/mongo/db/fts/fts_spec_test.cpp1087
-rw-r--r--src/mongo/db/fts/fts_tokenizer.h84
-rw-r--r--src/mongo/db/fts/fts_util.cpp10
-rw-r--r--src/mongo/db/fts/fts_util.h18
-rw-r--r--src/mongo/db/fts/stemmer.cpp51
-rw-r--r--src/mongo/db/fts/stemmer.h39
-rw-r--r--src/mongo/db/fts/stemmer_test.cpp25
-rw-r--r--src/mongo/db/fts/stop_words.cpp55
-rw-r--r--src/mongo/db/fts/stop_words.h34
-rw-r--r--src/mongo/db/fts/stop_words_test.cpp15
-rw-r--r--src/mongo/db/fts/tokenizer.cpp194
-rw-r--r--src/mongo/db/fts/tokenizer.h54
-rw-r--r--src/mongo/db/fts/tokenizer_test.cpp129
37 files changed, 3928 insertions, 4044 deletions
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp
index 2d5cc493123..9fc41923d40 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer.cpp
+++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp
@@ -42,56 +42,54 @@
namespace mongo {
namespace fts {
- using std::string;
-
- BasicFTSTokenizer::BasicFTSTokenizer(const FTSLanguage* language)
- : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) {
- }
-
- void BasicFTSTokenizer::reset(StringData document, Options options) {
- _options = options;
- _document = document.toString();
- _tokenizer = stdx::make_unique<Tokenizer>(_language, _document);
- }
-
- bool BasicFTSTokenizer::moveNext() {
- while (true) {
- bool hasMore = _tokenizer->more();
- if (!hasMore) {
- _stem = "";
- return false;
- }
-
- Token token = _tokenizer->next();
+using std::string;
+
+BasicFTSTokenizer::BasicFTSTokenizer(const FTSLanguage* language)
+ : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) {}
+
+void BasicFTSTokenizer::reset(StringData document, Options options) {
+ _options = options;
+ _document = document.toString();
+ _tokenizer = stdx::make_unique<Tokenizer>(_language, _document);
+}
+
+bool BasicFTSTokenizer::moveNext() {
+ while (true) {
+ bool hasMore = _tokenizer->more();
+ if (!hasMore) {
+ _stem = "";
+ return false;
+ }
- // Do not return delimiters
- if (token.type != Token::TEXT) {
- continue;
- }
+ Token token = _tokenizer->next();
- string word = token.data.toString();
+ // Do not return delimiters
+ if (token.type != Token::TEXT) {
+ continue;
+ }
- word = tolowerString(token.data);
+ string word = token.data.toString();
- // Stop words are case-sensitive so we need them to be lower cased to check
- // against the stop word list
- if ((_options & FTSTokenizer::FilterStopWords) &&
- _stopWords->isStopWord(word)) {
- continue;
- }
+ word = tolowerString(token.data);
- if (_options & FTSTokenizer::GenerateCaseSensitiveTokens) {
- word = token.data.toString();
- }
+ // Stop words are case-sensitive so we need them to be lower cased to check
+ // against the stop word list
+ if ((_options & FTSTokenizer::FilterStopWords) && _stopWords->isStopWord(word)) {
+ continue;
+ }
- _stem = _stemmer.stem(word);
- return true;
+ if (_options & FTSTokenizer::GenerateCaseSensitiveTokens) {
+ word = token.data.toString();
}
- }
- StringData BasicFTSTokenizer::get() const {
- return _stem;
+ _stem = _stemmer.stem(word);
+ return true;
}
+}
+
+StringData BasicFTSTokenizer::get() const {
+ return _stem;
+}
-} // namespace fts
-} // namespace mongo
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.h b/src/mongo/db/fts/fts_basic_tokenizer.h
index 45b3ad8e074..221de72bb8c 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer.h
+++ b/src/mongo/db/fts/fts_basic_tokenizer.h
@@ -37,44 +37,45 @@
namespace mongo {
namespace fts {
- class FTSLanguage;
- class StopWords;
+class FTSLanguage;
+class StopWords;
- /**
- * BasicFTSTokenizer
- * A iterator of "documents" where a document contains ASCII space (U+0020) delimited words.
- * Uses
- * - Tokenizer for tokenizing words via ASCII space (ie, U+0020 space).
- * - tolower from the C standard libary to lower letters, ie, it only supports lower casing
- * - ASCII letters (U+0000 - U+007F)
- * - Stemmer (ie, Snowball Stemmer) to stem words.
- * - Embeded stop word lists for each language in StopWord class
- *
- * For each word returns a stem version of a word optimized for full text indexing.
- * Optionally supports returning case sensitive search terms.
- */
- class BasicFTSTokenizer : public FTSTokenizer {
- MONGO_DISALLOW_COPYING(BasicFTSTokenizer);
- public:
- BasicFTSTokenizer(const FTSLanguage* language);
+/**
+ * BasicFTSTokenizer
+ * A iterator of "documents" where a document contains ASCII space (U+0020) delimited words.
+ * Uses
+ * - Tokenizer for tokenizing words via ASCII space (ie, U+0020 space).
+ * - tolower from the C standard libary to lower letters, ie, it only supports lower casing
+ * - ASCII letters (U+0000 - U+007F)
+ * - Stemmer (ie, Snowball Stemmer) to stem words.
+ * - Embeded stop word lists for each language in StopWord class
+ *
+ * For each word returns a stem version of a word optimized for full text indexing.
+ * Optionally supports returning case sensitive search terms.
+ */
+class BasicFTSTokenizer : public FTSTokenizer {
+ MONGO_DISALLOW_COPYING(BasicFTSTokenizer);
+
+public:
+ BasicFTSTokenizer(const FTSLanguage* language);
- void reset(StringData document, Options options) final;
+ void reset(StringData document, Options options) final;
- bool moveNext() final;
+ bool moveNext() final;
- StringData get() const final;
+ StringData get() const final;
- private:
- const FTSLanguage* const _language;
- const Stemmer _stemmer;
- const StopWords* const _stopWords;
+private:
+ const FTSLanguage* const _language;
+ const Stemmer _stemmer;
+ const StopWords* const _stopWords;
- std::string _document;
- std::unique_ptr<Tokenizer> _tokenizer;
- Options _options;
+ std::string _document;
+ std::unique_ptr<Tokenizer> _tokenizer;
+ Options _options;
- std::string _stem;
- };
+ std::string _stem;
+};
-} // namespace fts
-} // namespace mongo
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp
index 384be225f28..5feab67face 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp
+++ b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp
@@ -33,53 +33,51 @@
namespace mongo {
namespace fts {
- std::vector<std::string> tokenizeString(const char* str, const char* language) {
- StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_2);
- ASSERT_OK(swl);
+std::vector<std::string> tokenizeString(const char* str, const char* language) {
+ StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_2);
+ ASSERT_OK(swl);
- std::unique_ptr<FTSTokenizer> tokenizer(swl.getValue()->createTokenizer());
+ std::unique_ptr<FTSTokenizer> tokenizer(swl.getValue()->createTokenizer());
- tokenizer->reset(str, FTSTokenizer::None);
+ tokenizer->reset(str, FTSTokenizer::None);
- std::vector<std::string> terms;
+ std::vector<std::string> terms;
- while (tokenizer->moveNext()) {
- terms.push_back(tokenizer->get().toString());
- }
-
- return terms;
+ while (tokenizer->moveNext()) {
+ terms.push_back(tokenizer->get().toString());
}
- // Ensure punctuation is filtered out of the indexed document
- // and the 's is not separated
- TEST(FtsBasicTokenizer, English) {
- std::vector<std::string> terms = tokenizeString("Do you see Mark's dog running?",
- "english");
+ return terms;
+}
- ASSERT_EQUALS(6U, terms.size());
- ASSERT_EQUALS("do", terms[0]);
- ASSERT_EQUALS("you", terms[1]);
- ASSERT_EQUALS("see", terms[2]);
- ASSERT_EQUALS("mark", terms[3]);
- ASSERT_EQUALS("dog", terms[4]);
- ASSERT_EQUALS("run", terms[5]);
- }
+// Ensure punctuation is filtered out of the indexed document
+// and the 's is not separated
+TEST(FtsBasicTokenizer, English) {
+ std::vector<std::string> terms = tokenizeString("Do you see Mark's dog running?", "english");
- // Ensure punctuation is filtered out of the indexed document
- // and the 's is separated
- TEST(FtsBasicTokenizer, French) {
- std::vector<std::string> terms = tokenizeString("Do you see Mark's dog running?",
- "french");
+ ASSERT_EQUALS(6U, terms.size());
+ ASSERT_EQUALS("do", terms[0]);
+ ASSERT_EQUALS("you", terms[1]);
+ ASSERT_EQUALS("see", terms[2]);
+ ASSERT_EQUALS("mark", terms[3]);
+ ASSERT_EQUALS("dog", terms[4]);
+ ASSERT_EQUALS("run", terms[5]);
+}
- ASSERT_EQUALS(7U, terms.size());
- ASSERT_EQUALS("do", terms[0]);
- ASSERT_EQUALS("you", terms[1]);
- ASSERT_EQUALS("se", terms[2]);
- ASSERT_EQUALS("mark", terms[3]);
- ASSERT_EQUALS("s", terms[4]);
- ASSERT_EQUALS("dog", terms[5]);
- ASSERT_EQUALS("running", terms[6]);
- }
+// Ensure punctuation is filtered out of the indexed document
+// and the 's is separated
+TEST(FtsBasicTokenizer, French) {
+ std::vector<std::string> terms = tokenizeString("Do you see Mark's dog running?", "french");
+
+ ASSERT_EQUALS(7U, terms.size());
+ ASSERT_EQUALS("do", terms[0]);
+ ASSERT_EQUALS("you", terms[1]);
+ ASSERT_EQUALS("se", terms[2]);
+ ASSERT_EQUALS("mark", terms[3]);
+ ASSERT_EQUALS("s", terms[4]);
+ ASSERT_EQUALS("dog", terms[5]);
+ ASSERT_EQUALS("running", terms[6]);
+}
-} // namespace fts
-} // namespace mongo
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_element_iterator.cpp b/src/mongo/db/fts/fts_element_iterator.cpp
index f57e1097c14..4df642dc66a 100644
--- a/src/mongo/db/fts/fts_element_iterator.cpp
+++ b/src/mongo/db/fts/fts_element_iterator.cpp
@@ -37,152 +37,149 @@
namespace mongo {
- namespace fts {
+namespace fts {
+
+using std::string;
+
+extern const double DEFAULT_WEIGHT;
+extern const double MAX_WEIGHT;
+
+std::ostream& operator<<(std::ostream& os, FTSElementIterator::FTSIteratorFrame& frame) {
+ BSONObjIterator it = frame._it;
+ return os << "FTSIteratorFrame["
+ " element=" << (*it).toString() << ", _language=" << frame._language->str()
+ << ", _parentPath=" << frame._parentPath << ", _isArray=" << frame._isArray << "]";
+}
+
+FTSElementIterator::FTSElementIterator(const FTSSpec& spec, const BSONObj& obj)
+ : _frame(obj, spec, &spec.defaultLanguage(), "", false),
+ _spec(spec),
+ _currentValue(advance()) {}
+
+namespace {
+/** Check for exact match or path prefix match. */
+inline bool _matchPrefix(const string& dottedName, const string& weight) {
+ if (weight == dottedName) {
+ return true;
+ }
+ return mongoutils::str::startsWith(weight, dottedName + '.');
+}
+}
+
+bool FTSElementIterator::more() {
+ //_currentValue = advance();
+ return _currentValue.valid();
+}
+
+FTSIteratorValue FTSElementIterator::next() {
+ FTSIteratorValue result = _currentValue;
+ _currentValue = advance();
+ return result;
+}
- using std::string;
-
- extern const double DEFAULT_WEIGHT;
- extern const double MAX_WEIGHT;
-
- std::ostream& operator<<( std::ostream& os, FTSElementIterator::FTSIteratorFrame& frame ) {
- BSONObjIterator it = frame._it;
- return os << "FTSIteratorFrame["
- " element=" << (*it).toString() <<
- ", _language=" << frame._language->str() <<
- ", _parentPath=" << frame._parentPath <<
- ", _isArray=" << frame._isArray << "]";
- }
-
- FTSElementIterator::FTSElementIterator( const FTSSpec& spec, const BSONObj& obj )
- : _frame( obj, spec, &spec.defaultLanguage(), "", false ),
- _spec( spec ),
- _currentValue( advance() )
- { }
-
- namespace {
- /** Check for exact match or path prefix match. */
- inline bool _matchPrefix( const string& dottedName, const string& weight ) {
- if ( weight == dottedName ) {
- return true;
- }
- return mongoutils::str::startsWith( weight, dottedName + '.' );
- }
- }
-
- bool FTSElementIterator::more() {
- //_currentValue = advance();
- return _currentValue.valid();
+/**
+ * Helper method:
+ * if (current object iterator not exhausted) return true;
+ * while (frame stack not empty) {
+ * resume object iterator popped from stack;
+ * if (resumed iterator not exhausted) return true;
+ * }
+ * return false;
+ */
+bool FTSElementIterator::moreFrames() {
+ if (_frame._it.more())
+ return true;
+ while (!_frameStack.empty()) {
+ _frame = _frameStack.top();
+ _frameStack.pop();
+ if (_frame._it.more()) {
+ return true;
}
-
- FTSIteratorValue FTSElementIterator::next() {
- FTSIteratorValue result = _currentValue;
- _currentValue = advance();
- return result;
+ }
+ return false;
+}
+
+FTSIteratorValue FTSElementIterator::advance() {
+ while (moreFrames()) {
+ BSONElement elem = _frame._it.next();
+ string fieldName = elem.fieldName();
+
+ // Skip "language" specifier fields if wildcard.
+ if (_spec.wildcard() && _spec.languageOverrideField() == fieldName) {
+ continue;
}
- /**
- * Helper method:
- * if (current object iterator not exhausted) return true;
- * while (frame stack not empty) {
- * resume object iterator popped from stack;
- * if (resumed iterator not exhausted) return true;
- * }
- * return false;
- */
- bool FTSElementIterator::moreFrames() {
- if (_frame._it.more()) return true;
- while (!_frameStack.empty()) {
- _frame = _frameStack.top();
- _frameStack.pop();
- if (_frame._it.more()) {
- return true;
- }
+ // Compose the dotted name of the current field:
+ // 1. parent path empty (top level): use the current field name
+ // 2. parent path non-empty and obj is an array: use the parent path
+ // 3. parent path non-empty and obj is a sub-doc: append field name to parent path
+ string dottedName = (_frame._parentPath.empty() ? fieldName : _frame._isArray
+ ? _frame._parentPath
+ : _frame._parentPath + '.' + fieldName);
+
+ // Find lower bound of dottedName in _weights. lower_bound leaves us at the first
+ // weight that could possibly match or be a prefix of dottedName. And if this
+ // element fails to match, then no subsequent weight can match, since the weights
+ // are lexicographically ordered.
+ Weights::const_iterator i =
+ _spec.weights().lower_bound(elem.type() == Object ? dottedName + '.' : dottedName);
+
+ // possibleWeightMatch is set if the weight map contains either a match or some item
+ // lexicographically larger than fieldName. This boolean acts as a guard on
+ // dereferences of iterator 'i'.
+ bool possibleWeightMatch = (i != _spec.weights().end());
+
+ // Optimize away two cases, when not wildcard:
+ // 1. lower_bound seeks to end(): no prefix match possible
+ // 2. lower_bound seeks to a name which is not a prefix
+ if (!_spec.wildcard()) {
+ if (!possibleWeightMatch) {
+ continue;
+ } else if (!_matchPrefix(dottedName, i->first)) {
+ continue;
}
- return false;
}
- FTSIteratorValue FTSElementIterator::advance() {
- while ( moreFrames() ) {
-
- BSONElement elem = _frame._it.next();
- string fieldName = elem.fieldName();
+ // Is the current field an exact match on a weight?
+ bool exactMatch = (possibleWeightMatch && i->first == dottedName);
+ double weight = (possibleWeightMatch ? i->second : DEFAULT_WEIGHT);
- // Skip "language" specifier fields if wildcard.
- if ( _spec.wildcard() && _spec.languageOverrideField() == fieldName ) {
- continue;
+ switch (elem.type()) {
+ case String:
+ // Only index strings on exact match or wildcard.
+ if (exactMatch || _spec.wildcard()) {
+ return FTSIteratorValue(elem.valuestr(), _frame._language, weight);
}
-
- // Compose the dotted name of the current field:
- // 1. parent path empty (top level): use the current field name
- // 2. parent path non-empty and obj is an array: use the parent path
- // 3. parent path non-empty and obj is a sub-doc: append field name to parent path
- string dottedName = ( _frame._parentPath.empty() ? fieldName
- : _frame._isArray ? _frame._parentPath
- : _frame._parentPath + '.' + fieldName );
-
- // Find lower bound of dottedName in _weights. lower_bound leaves us at the first
- // weight that could possibly match or be a prefix of dottedName. And if this
- // element fails to match, then no subsequent weight can match, since the weights
- // are lexicographically ordered.
- Weights::const_iterator i = _spec.weights().lower_bound( elem.type() == Object
- ? dottedName + '.'
- : dottedName );
-
- // possibleWeightMatch is set if the weight map contains either a match or some item
- // lexicographically larger than fieldName. This boolean acts as a guard on
- // dereferences of iterator 'i'.
- bool possibleWeightMatch = ( i != _spec.weights().end() );
-
- // Optimize away two cases, when not wildcard:
- // 1. lower_bound seeks to end(): no prefix match possible
- // 2. lower_bound seeks to a name which is not a prefix
- if ( !_spec.wildcard() ) {
- if ( !possibleWeightMatch ) {
- continue;
- }
- else if ( !_matchPrefix( dottedName, i->first ) ) {
- continue;
- }
+ break;
+
+ case Object:
+ // Only descend into a sub-document on proper prefix or wildcard. Note that
+ // !exactMatch is a sufficient test for proper prefix match, because of
+ // if ( !matchPrefix( dottedName, i->first ) ) continue;
+ // block above.
+ if (!exactMatch || _spec.wildcard()) {
+ _frameStack.push(_frame);
+ _frame =
+ FTSIteratorFrame(elem.Obj(), _spec, _frame._language, dottedName, false);
}
-
- // Is the current field an exact match on a weight?
- bool exactMatch = ( possibleWeightMatch && i->first == dottedName );
- double weight = ( possibleWeightMatch ? i->second : DEFAULT_WEIGHT );
-
- switch ( elem.type() ) {
- case String:
- // Only index strings on exact match or wildcard.
- if ( exactMatch || _spec.wildcard() ) {
- return FTSIteratorValue( elem.valuestr(), _frame._language, weight );
- }
- break;
-
- case Object:
- // Only descend into a sub-document on proper prefix or wildcard. Note that
- // !exactMatch is a sufficient test for proper prefix match, because of
- // if ( !matchPrefix( dottedName, i->first ) ) continue;
- // block above.
- if ( !exactMatch || _spec.wildcard() ) {
- _frameStack.push( _frame );
- _frame = FTSIteratorFrame( elem.Obj(), _spec, _frame._language, dottedName, false );
- }
- break;
-
- case Array:
- // Only descend into arrays from non-array parents or on wildcard.
- if ( !_frame._isArray || _spec.wildcard() ) {
- _frameStack.push( _frame );
- _frame = FTSIteratorFrame( elem.Obj(), _spec, _frame._language, dottedName, true );
- }
- break;
-
- default:
- // Skip over all other BSON types.
- break;
+ break;
+
+ case Array:
+ // Only descend into arrays from non-array parents or on wildcard.
+ if (!_frame._isArray || _spec.wildcard()) {
+ _frameStack.push(_frame);
+ _frame =
+ FTSIteratorFrame(elem.Obj(), _spec, _frame._language, dottedName, true);
}
- }
- return FTSIteratorValue(); // valid()==false
+ break;
+
+ default:
+ // Skip over all other BSON types.
+ break;
}
+ }
+ return FTSIteratorValue(); // valid()==false
+}
- } // namespace fts
-} // namespace mongo
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_element_iterator.h b/src/mongo/db/fts/fts_element_iterator.h
index 17f72fff7f9..21e1e446627 100644
--- a/src/mongo/db/fts/fts_element_iterator.h
+++ b/src/mongo/db/fts/fts_element_iterator.h
@@ -40,133 +40,121 @@
namespace mongo {
- namespace fts {
-
- /**
- * Encapsulates data fields returned by FTSElementIterator
- */
- struct FTSIteratorValue {
-
- FTSIteratorValue( const char* text,
- const FTSLanguage* language,
- double weight )
- : _text(text),
- _language(language),
- _weight(weight),
- _valid(true)
- {}
-
- FTSIteratorValue()
- : _text(NULL),
- _language(),
- _weight(0.0),
- _valid(false)
- {}
-
- bool valid() const { return _valid; }
-
- const char* _text;
- const FTSLanguage* _language;
- double _weight;
- bool _valid;
- };
-
- /**
- * Iterator pattern for walking through text-indexed fields of a
- * BSON document.
- *
- * Example usage:
- * FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- * FTSElementIterator it( spec, obj );
- * while ( it.more() ) {
- * FTSIteratorValue val = it.next();
- * std::cout << val._text << '[' << val._language.str()
- * << ',' << val._weight << ']' << std::endl;
- * }
- *
- */
- class FTSElementIterator {
- public:
- /**
- * Iterator constructor
- *
- * Note: Caller must ensure that the constructed FTSElementIterator
- * does >not< outlive either spec or obj.
- *
- * @arg spec text index specifier
- * @arg obj document that the iterator will traverse
- */
- FTSElementIterator( const FTSSpec& spec, const BSONObj& obj);
-
- /**
- * Iterator interface: returns false iff there are no further text-indexable fields.
- */
- bool more();
-
- /**
- * Iterator interface: advances to the next text-indexable field.
- */
- FTSIteratorValue next();
-
- /**
- * Iterator frame needed for iterative implementation of
- * recursive sub-documents.
- */
- struct FTSIteratorFrame {
- FTSIteratorFrame( const BSONObj& obj,
- const FTSSpec& spec,
- const FTSLanguage* parentLanguage,
- const std::string& parentPath,
- bool isArray )
- : _it( obj ),
- _language( spec._getLanguageToUseV2( obj, parentLanguage ) ),
- _parentPath( parentPath ),
- _isArray( isArray )
- {}
-
- friend std::ostream& operator<<(std::ostream&, FTSIteratorFrame&);
-
- BSONObjIterator _it;
- const FTSLanguage* _language;
- std::string _parentPath;
- bool _isArray;
- };
-
- private:
- /**
- * Helper method:
- * returns false iff all FTSIteratorFrames on _frameStack are exhausted.
- */
- bool moreFrames();
-
- /**
- * Helper method:
- * advances to the next text-indexable field, possibly pushing frames as
- * needed for recursive sub-documents.
- */
- FTSIteratorValue advance();
-
- /**
- * Stack used by iterative implementation of recursive sub-document traversal.
- */
- std::stack<FTSIteratorFrame> _frameStack;
-
- /**
- * Current frame, not yet pushed to stack.
- */
- FTSIteratorFrame _frame;
-
- /**
- * Constructor input parameter: text index specification.
- */
- const FTSSpec& _spec;
-
- /**
- * Current iterator return value, computed by 'more()', returned by 'next()'.
- */
- FTSIteratorValue _currentValue;
- };
-
- } // namespace fts
-} // namespace mongo
+namespace fts {
+/**
+ * Encapsulates data fields returned by FTSElementIterator
+ */
+struct FTSIteratorValue {
+ FTSIteratorValue(const char* text, const FTSLanguage* language, double weight)
+ : _text(text), _language(language), _weight(weight), _valid(true) {}
+
+ FTSIteratorValue() : _text(NULL), _language(), _weight(0.0), _valid(false) {}
+
+ bool valid() const {
+ return _valid;
+ }
+
+ const char* _text;
+ const FTSLanguage* _language;
+ double _weight;
+ bool _valid;
+};
+
+/**
+ * Iterator pattern for walking through text-indexed fields of a
+ * BSON document.
+ *
+ * Example usage:
+ * FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
+ * FTSElementIterator it( spec, obj );
+ * while ( it.more() ) {
+ * FTSIteratorValue val = it.next();
+ * std::cout << val._text << '[' << val._language.str()
+ * << ',' << val._weight << ']' << std::endl;
+ * }
+ *
+ */
+class FTSElementIterator {
+public:
+ /**
+ * Iterator constructor
+ *
+ * Note: Caller must ensure that the constructed FTSElementIterator
+ * does >not< outlive either spec or obj.
+ *
+ * @arg spec text index specifier
+ * @arg obj document that the iterator will traverse
+ */
+ FTSElementIterator(const FTSSpec& spec, const BSONObj& obj);
+
+ /**
+ * Iterator interface: returns false iff there are no further text-indexable fields.
+ */
+ bool more();
+
+ /**
+ * Iterator interface: advances to the next text-indexable field.
+ */
+ FTSIteratorValue next();
+
+ /**
+ * Iterator frame needed for iterative implementation of
+ * recursive sub-documents.
+ */
+ struct FTSIteratorFrame {
+ FTSIteratorFrame(const BSONObj& obj,
+ const FTSSpec& spec,
+ const FTSLanguage* parentLanguage,
+ const std::string& parentPath,
+ bool isArray)
+ : _it(obj),
+ _language(spec._getLanguageToUseV2(obj, parentLanguage)),
+ _parentPath(parentPath),
+ _isArray(isArray) {}
+
+ friend std::ostream& operator<<(std::ostream&, FTSIteratorFrame&);
+
+ BSONObjIterator _it;
+ const FTSLanguage* _language;
+ std::string _parentPath;
+ bool _isArray;
+ };
+
+private:
+ /**
+ * Helper method:
+ * returns false iff all FTSIteratorFrames on _frameStack are exhausted.
+ */
+ bool moreFrames();
+
+ /**
+ * Helper method:
+ * advances to the next text-indexable field, possibly pushing frames as
+ * needed for recursive sub-documents.
+ */
+ FTSIteratorValue advance();
+
+ /**
+ * Stack used by iterative implementation of recursive sub-document traversal.
+ */
+ std::stack<FTSIteratorFrame> _frameStack;
+
+ /**
+ * Current frame, not yet pushed to stack.
+ */
+ FTSIteratorFrame _frame;
+
+ /**
+ * Constructor input parameter: text index specification.
+ */
+ const FTSSpec& _spec;
+
+ /**
+ * Current iterator return value, computed by 'more()', returned by 'next()'.
+ */
+ FTSIteratorValue _currentValue;
+};
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_element_iterator_test.cpp b/src/mongo/db/fts/fts_element_iterator_test.cpp
index 6d5694c5990..2a16c14b5a5 100644
--- a/src/mongo/db/fts/fts_element_iterator_test.cpp
+++ b/src/mongo/db/fts/fts_element_iterator_test.cpp
@@ -34,279 +34,267 @@
#include "mongo/unittest/unittest.h"
namespace mongo {
- namespace fts {
-
- using std::string;
-
- TEST( FTSElementIterator, Test1 ) {
-
- BSONObj obj = fromjson(
- "{ b : \"walking\","
- " c : { e: \"walked\" },"
- " d : \"walker\""
- " }" );
-
- BSONObj indexSpec = fromjson(
- "{ key : { a : \"text\" }, weights : { b : 10, d : 5 } }" );
-
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- Weights::const_iterator itt = spec.weights().begin();
- ASSERT( itt != spec.weights().end() );
- ASSERT_EQUALS( "a", itt->first );
- ASSERT_EQUALS( 1, itt->second );
- ++itt;
- ASSERT( itt != spec.weights().end() );
- ASSERT_EQUALS( "b", itt->first );
- ASSERT_EQUALS( 10, itt->second );
- ++itt;
- ASSERT( itt != spec.weights().end() );
- ASSERT_EQUALS( "d", itt->first );
- ASSERT_EQUALS( 5, itt->second );
- ++itt;
-
- FTSElementIterator it( spec, obj );
-
- ASSERT( it.more() );
- FTSIteratorValue val = it.next();
- ASSERT_EQUALS( "walking", string(val._text) );
- ASSERT_EQUALS( "english", val._language->str() );
- ASSERT_EQUALS( 10, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "walker", string(val._text) );
- ASSERT_EQUALS( "english", val._language->str() );
- ASSERT_EQUALS( 5, val._weight );
- }
-
- // Multi-language : test
- TEST( FTSElementIterator, Test2 ) {
-
- BSONObj obj = fromjson(
- "{ a :"
- " { b :"
- " [ { c : \"walked\", language : \"english\" },"
- " { c : \"camminato\", language : \"italian\" },"
- " { c : \"ging\", language : \"german\" } ]"
- " },"
- " d : \"Feliz Año Nuevo!\","
- " language : \"spanish\""
- " }" );
-
- BSONObj indexSpec = fromjson(
- "{ key : { \"a.b.c\" : \"text\", d : \"text\" } }" );
-
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
-
- FTSElementIterator it( spec, obj );
-
- ASSERT( it.more() );
- FTSIteratorValue val = it.next();
- ASSERT_EQUALS( "walked", string(val._text) );
- ASSERT_EQUALS( "english", val._language->str() );
- ASSERT_EQUALS( 1, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "camminato", string(val._text) );
- ASSERT_EQUALS( "italian", val._language->str() );
- ASSERT_EQUALS( 1, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "ging", string(val._text) );
- ASSERT_EQUALS( "german", val._language->str() );
- ASSERT_EQUALS( 1, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "Feliz Año Nuevo!", string(val._text) );
- ASSERT_EQUALS( "spanish", val._language->str() );
- ASSERT_EQUALS( 1, val._weight );
- }
-
- // Multi-language : test nested stemming per sub-document
- TEST( FTSElementIterator, Test3 ) {
-
- BSONObj obj = fromjson(
- "{ language : \"english\","
- " a :"
- " { language : \"danish\","
- " b :"
- " [ { c : \"foredrag\" },"
- " { c : \"foredragsholder\" },"
- " { c : \"lector\" } ]"
- " }"
- "}" );
-
- BSONObj indexSpec = fromjson(
- "{ key : { a : \"text\", \"a.b.c\" : \"text\" }, weights : { \"a.b.c\" : 5 } }" );
-
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- Weights::const_iterator itt = spec.weights().begin();
- ASSERT( itt != spec.weights().end() );
- ASSERT_EQUALS( "a", itt->first );
- ASSERT_EQUALS( 1, itt->second );
- ++itt;
- ASSERT( itt != spec.weights().end() );
- ASSERT_EQUALS( "a.b.c", itt->first );
- ASSERT_EQUALS( 5, itt->second );
-
- FTSElementIterator it( spec, obj );
-
- ASSERT( it.more() );
- FTSIteratorValue val = it.next();
- ASSERT_EQUALS( "foredrag", string(val._text) );
- ASSERT_EQUALS( "danish", val._language->str() );
- ASSERT_EQUALS( 5, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "foredragsholder", string(val._text) );
- ASSERT_EQUALS( "danish", val._language->str() );
- ASSERT_EQUALS( 5, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "lector", string(val._text) );
- ASSERT_EQUALS( "danish", val._language->str() );
- ASSERT_EQUALS( 5, val._weight );
-
- }
-
- // Multi-language : test nested arrays
- TEST( FTSElementIterator, Test4 ) {
-
- BSONObj obj = fromjson(
- "{ language : \"english\","
- " a : ["
- " { language : \"danish\","
- " b :"
- " [ { c : [\"foredrag\"] },"
- " { c : [\"foredragsholder\"] },"
- " { c : [\"lector\"] } ]"
- " } ]"
- "}" );
-
- BSONObj indexSpec = fromjson(
- "{ key : { \"a.b.c\" : \"text\" }, weights : { \"a.b.c\" : 5 } }" );
-
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- FTSElementIterator it( spec, obj );
-
- ASSERT( it.more() );
- FTSIteratorValue val = it.next();
- ASSERT_EQUALS( "foredrag", string(val._text) );
- ASSERT_EQUALS( "danish", val._language->str() );
- ASSERT_EQUALS( 5, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "foredragsholder", string(val._text) );
- ASSERT_EQUALS( "danish", val._language->str() );
- ASSERT_EQUALS( 5, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "lector", string(val._text) );
- ASSERT_EQUALS( "danish", val._language->str() );
- ASSERT_EQUALS( 5, val._weight );
-
- }
-
- // Multi-language : test wildcard spec
- TEST( FTSElementIterator, Test5 ) {
-
- BSONObj obj = fromjson(
- "{ language : \"english\","
- " b : \"these boots were made for walking\","
- " c : { e: \"I walked half way to the market before seeing the sunrise\" },"
- " d : "
- " { language : \"danish\","
- " e :"
- " [ { f : \"foredrag\", g : 12 },"
- " { f : \"foredragsholder\", g : 13 },"
- " { f : \"lector\", g : 14 } ]"
- " }"
- "}" );
-
- BSONObj indexSpec = fromjson(
- "{ key : { a : \"text\" }, weights : { b : 20, c : 10, \"d.e.f\" : 5 } }" );
-
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- FTSElementIterator it( spec, obj );
-
- ASSERT( it.more() );
- FTSIteratorValue val = it.next();
- ASSERT_EQUALS( "these boots were made for walking", string(val._text) );
- ASSERT_EQUALS( "english", val._language->str() );
- ASSERT_EQUALS( 20, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "foredrag", string(val._text) );
- ASSERT_EQUALS( "danish", val._language->str() );
- ASSERT_EQUALS( 5, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "foredragsholder", string(val._text) );
- ASSERT_EQUALS( "danish", val._language->str() );
- ASSERT_EQUALS( 5, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "lector", string(val._text) );
- ASSERT_EQUALS( "danish", val._language->str() );
- ASSERT_EQUALS( 5, val._weight );
- }
-
- // Multi-language : test wildcard spec
- TEST( FTSElementIterator, Test6 ) {
-
- BSONObj obj = fromjson(
- "{ language : \"english\","
- " b : \"these boots were made for walking\","
- " c : { e: \"I walked half way to the market before seeing the sunrise\" },"
- " d : "
- " { language : \"danish\","
- " e :"
- " [ { f : \"foredrag\", g : 12 },"
- " { f : \"foredragsholder\", g : 13 },"
- " { f : \"lector\", g : 14 } ]"
- " }"
- "}" );
-
- BSONObj indexSpec = fromjson(
- "{ key : { a : \"text\" }, weights : { b : 20, c : 10, \"d.e.f\" : 5 } }" );
-
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- FTSElementIterator it( spec, obj );
-
- ASSERT( it.more() );
- FTSIteratorValue val = it.next();
- ASSERT_EQUALS( "these boots were made for walking", string(val._text) );
- ASSERT_EQUALS( "english", val._language->str() );
- ASSERT_EQUALS( 20, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "foredrag", string(val._text) );
- ASSERT_EQUALS( "danish", val._language->str() );
- ASSERT_EQUALS( 5, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "foredragsholder", string(val._text) );
- ASSERT_EQUALS( "danish", val._language->str() );
- ASSERT_EQUALS( 5, val._weight );
-
- ASSERT( it.more() );
- val = it.next();
- ASSERT_EQUALS( "lector", string(val._text) );
- ASSERT_EQUALS( "danish", val._language->str() );
- ASSERT_EQUALS( 5, val._weight );
- }
- }
+namespace fts {
+
+using std::string;
+
+TEST(FTSElementIterator, Test1) {
+ BSONObj obj = fromjson(
+ "{ b : \"walking\","
+ " c : { e: \"walked\" },"
+ " d : \"walker\""
+ " }");
+
+ BSONObj indexSpec = fromjson("{ key : { a : \"text\" }, weights : { b : 10, d : 5 } }");
+
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ Weights::const_iterator itt = spec.weights().begin();
+ ASSERT(itt != spec.weights().end());
+ ASSERT_EQUALS("a", itt->first);
+ ASSERT_EQUALS(1, itt->second);
+ ++itt;
+ ASSERT(itt != spec.weights().end());
+ ASSERT_EQUALS("b", itt->first);
+ ASSERT_EQUALS(10, itt->second);
+ ++itt;
+ ASSERT(itt != spec.weights().end());
+ ASSERT_EQUALS("d", itt->first);
+ ASSERT_EQUALS(5, itt->second);
+ ++itt;
+
+ FTSElementIterator it(spec, obj);
+
+ ASSERT(it.more());
+ FTSIteratorValue val = it.next();
+ ASSERT_EQUALS("walking", string(val._text));
+ ASSERT_EQUALS("english", val._language->str());
+ ASSERT_EQUALS(10, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("walker", string(val._text));
+ ASSERT_EQUALS("english", val._language->str());
+ ASSERT_EQUALS(5, val._weight);
}
+// Multi-language : test
+TEST(FTSElementIterator, Test2) {
+ BSONObj obj = fromjson(
+ "{ a :"
+ " { b :"
+ " [ { c : \"walked\", language : \"english\" },"
+ " { c : \"camminato\", language : \"italian\" },"
+ " { c : \"ging\", language : \"german\" } ]"
+ " },"
+ " d : \"Feliz Año Nuevo!\","
+ " language : \"spanish\""
+ " }");
+
+ BSONObj indexSpec = fromjson("{ key : { \"a.b.c\" : \"text\", d : \"text\" } }");
+
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+
+ FTSElementIterator it(spec, obj);
+
+ ASSERT(it.more());
+ FTSIteratorValue val = it.next();
+ ASSERT_EQUALS("walked", string(val._text));
+ ASSERT_EQUALS("english", val._language->str());
+ ASSERT_EQUALS(1, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("camminato", string(val._text));
+ ASSERT_EQUALS("italian", val._language->str());
+ ASSERT_EQUALS(1, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("ging", string(val._text));
+ ASSERT_EQUALS("german", val._language->str());
+ ASSERT_EQUALS(1, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("Feliz Año Nuevo!", string(val._text));
+ ASSERT_EQUALS("spanish", val._language->str());
+ ASSERT_EQUALS(1, val._weight);
+}
+
+// Multi-language : test nested stemming per sub-document
+TEST(FTSElementIterator, Test3) {
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " a :"
+ " { language : \"danish\","
+ " b :"
+ " [ { c : \"foredrag\" },"
+ " { c : \"foredragsholder\" },"
+ " { c : \"lector\" } ]"
+ " }"
+ "}");
+
+ BSONObj indexSpec =
+ fromjson("{ key : { a : \"text\", \"a.b.c\" : \"text\" }, weights : { \"a.b.c\" : 5 } }");
+
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ Weights::const_iterator itt = spec.weights().begin();
+ ASSERT(itt != spec.weights().end());
+ ASSERT_EQUALS("a", itt->first);
+ ASSERT_EQUALS(1, itt->second);
+ ++itt;
+ ASSERT(itt != spec.weights().end());
+ ASSERT_EQUALS("a.b.c", itt->first);
+ ASSERT_EQUALS(5, itt->second);
+
+ FTSElementIterator it(spec, obj);
+
+ ASSERT(it.more());
+ FTSIteratorValue val = it.next();
+ ASSERT_EQUALS("foredrag", string(val._text));
+ ASSERT_EQUALS("danish", val._language->str());
+ ASSERT_EQUALS(5, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("foredragsholder", string(val._text));
+ ASSERT_EQUALS("danish", val._language->str());
+ ASSERT_EQUALS(5, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("lector", string(val._text));
+ ASSERT_EQUALS("danish", val._language->str());
+ ASSERT_EQUALS(5, val._weight);
+}
+
+// Multi-language : test nested arrays
+TEST(FTSElementIterator, Test4) {
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " a : ["
+ " { language : \"danish\","
+ " b :"
+ " [ { c : [\"foredrag\"] },"
+ " { c : [\"foredragsholder\"] },"
+ " { c : [\"lector\"] } ]"
+ " } ]"
+ "}");
+
+ BSONObj indexSpec = fromjson("{ key : { \"a.b.c\" : \"text\" }, weights : { \"a.b.c\" : 5 } }");
+
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ FTSElementIterator it(spec, obj);
+
+ ASSERT(it.more());
+ FTSIteratorValue val = it.next();
+ ASSERT_EQUALS("foredrag", string(val._text));
+ ASSERT_EQUALS("danish", val._language->str());
+ ASSERT_EQUALS(5, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("foredragsholder", string(val._text));
+ ASSERT_EQUALS("danish", val._language->str());
+ ASSERT_EQUALS(5, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("lector", string(val._text));
+ ASSERT_EQUALS("danish", val._language->str());
+ ASSERT_EQUALS(5, val._weight);
+}
+
+// Multi-language : test wildcard spec
+TEST(FTSElementIterator, Test5) {
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " b : \"these boots were made for walking\","
+ " c : { e: \"I walked half way to the market before seeing the sunrise\" },"
+ " d : "
+ " { language : \"danish\","
+ " e :"
+ " [ { f : \"foredrag\", g : 12 },"
+ " { f : \"foredragsholder\", g : 13 },"
+ " { f : \"lector\", g : 14 } ]"
+ " }"
+ "}");
+
+ BSONObj indexSpec =
+ fromjson("{ key : { a : \"text\" }, weights : { b : 20, c : 10, \"d.e.f\" : 5 } }");
+
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ FTSElementIterator it(spec, obj);
+
+ ASSERT(it.more());
+ FTSIteratorValue val = it.next();
+ ASSERT_EQUALS("these boots were made for walking", string(val._text));
+ ASSERT_EQUALS("english", val._language->str());
+ ASSERT_EQUALS(20, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("foredrag", string(val._text));
+ ASSERT_EQUALS("danish", val._language->str());
+ ASSERT_EQUALS(5, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("foredragsholder", string(val._text));
+ ASSERT_EQUALS("danish", val._language->str());
+ ASSERT_EQUALS(5, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("lector", string(val._text));
+ ASSERT_EQUALS("danish", val._language->str());
+ ASSERT_EQUALS(5, val._weight);
+}
+
+// Multi-language : test wildcard spec
+TEST(FTSElementIterator, Test6) {
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " b : \"these boots were made for walking\","
+ " c : { e: \"I walked half way to the market before seeing the sunrise\" },"
+ " d : "
+ " { language : \"danish\","
+ " e :"
+ " [ { f : \"foredrag\", g : 12 },"
+ " { f : \"foredragsholder\", g : 13 },"
+ " { f : \"lector\", g : 14 } ]"
+ " }"
+ "}");
+
+ BSONObj indexSpec =
+ fromjson("{ key : { a : \"text\" }, weights : { b : 20, c : 10, \"d.e.f\" : 5 } }");
+
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ FTSElementIterator it(spec, obj);
+
+ ASSERT(it.more());
+ FTSIteratorValue val = it.next();
+ ASSERT_EQUALS("these boots were made for walking", string(val._text));
+ ASSERT_EQUALS("english", val._language->str());
+ ASSERT_EQUALS(20, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("foredrag", string(val._text));
+ ASSERT_EQUALS("danish", val._language->str());
+ ASSERT_EQUALS(5, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("foredragsholder", string(val._text));
+ ASSERT_EQUALS("danish", val._language->str());
+ ASSERT_EQUALS(5, val._weight);
+
+ ASSERT(it.more());
+ val = it.next();
+ ASSERT_EQUALS("lector", string(val._text));
+ ASSERT_EQUALS("danish", val._language->str());
+ ASSERT_EQUALS(5, val._weight);
+}
+}
+}
diff --git a/src/mongo/db/fts/fts_enabled.cpp b/src/mongo/db/fts/fts_enabled.cpp
index b8e071bd62a..fb261194db1 100644
--- a/src/mongo/db/fts/fts_enabled.cpp
+++ b/src/mongo/db/fts/fts_enabled.cpp
@@ -35,46 +35,42 @@
#include "mongo/util/log.h"
namespace mongo {
- namespace fts {
- namespace {
+namespace fts {
+namespace {
- bool dummyEnabledFlag = true; // Unused, needed for server parameter.
+bool dummyEnabledFlag = true; // Unused, needed for server parameter.
- /**
- * Declaration for the "textSearchEnabled" server parameter, which is now deprecated.
- * Note that:
- * - setting to true performs a no-op and logs a deprecation message.
- * - setting to false will fail.
- */
- class ExportedTextSearchEnabledParameter : public ExportedServerParameter<bool> {
- public:
- ExportedTextSearchEnabledParameter() :
- ExportedServerParameter<bool>( ServerParameterSet::getGlobal(),
- "textSearchEnabled",
- &dummyEnabledFlag,
- true,
- true ) {}
-
- virtual Status validate( const bool& potentialNewValue ) {
- if ( !potentialNewValue ) {
- return Status( ErrorCodes::BadValue,
- "textSearchEnabled cannot be set to false");
- }
-
- log() << "Attempted to set textSearchEnabled server parameter.";
- log() << "Text search is enabled by default and cannot be disabled.";
- log() << "The following are now deprecated and will be removed in a future "
- << "release:";
- log() << "- the \"textSearchEnabled\" server parameter (setting it has no "
- << "effect)";
- log() << "- the \"text\" command (has been replaced by the $text query "
- "operator)";
+/**
+ * Declaration for the "textSearchEnabled" server parameter, which is now deprecated.
+ * Note that:
+ * - setting to true performs a no-op and logs a deprecation message.
+ * - setting to false will fail.
+ */
+class ExportedTextSearchEnabledParameter : public ExportedServerParameter<bool> {
+public:
+ ExportedTextSearchEnabledParameter()
+ : ExportedServerParameter<bool>(
+ ServerParameterSet::getGlobal(), "textSearchEnabled", &dummyEnabledFlag, true, true) {
+ }
- return Status::OK();
- }
+ virtual Status validate(const bool& potentialNewValue) {
+ if (!potentialNewValue) {
+ return Status(ErrorCodes::BadValue, "textSearchEnabled cannot be set to false");
+ }
- } exportedTextSearchEnabledParam;
+ log() << "Attempted to set textSearchEnabled server parameter.";
+ log() << "Text search is enabled by default and cannot be disabled.";
+ log() << "The following are now deprecated and will be removed in a future "
+ << "release:";
+ log() << "- the \"textSearchEnabled\" server parameter (setting it has no "
+ << "effect)";
+ log() << "- the \"text\" command (has been replaced by the $text query "
+ "operator)";
- }
+ return Status::OK();
}
+
+} exportedTextSearchEnabledParam;
+}
+}
}
diff --git a/src/mongo/db/fts/fts_index_format.cpp b/src/mongo/db/fts/fts_index_format.cpp
index fc0e703b84d..f7110d80858 100644
--- a/src/mongo/db/fts/fts_index_format.cpp
+++ b/src/mongo/db/fts/fts_index_format.cpp
@@ -40,178 +40,168 @@
namespace mongo {
- namespace fts {
-
- using std::string;
- using std::vector;
-
- namespace {
- BSONObj nullObj;
- BSONElement nullElt;
-
- // New in textIndexVersion 2.
- // If the term is longer than 32 characters, it may
- // result in the generated key being too large
- // for the index. In that case, we generate a 64-character key
- // from the concatenation of the first 32 characters
- // and the hex string of the murmur3 hash value of the entire
- // term value.
- const size_t termKeyPrefixLength = 32U;
- // 128-bit hash value expressed in hex = 32 characters
- const size_t termKeySuffixLength = 32U;
- const size_t termKeyLength = termKeyPrefixLength + termKeySuffixLength;
-
- /**
- * Returns size of buffer required to store term in index key.
- * In version 1, terms are stored verbatim in key.
- * In version 2, terms longer than 32 characters are hashed and combined
- * with a prefix.
- */
- int guessTermSize( const std::string& term, TextIndexVersion textIndexVersion ) {
- if ( TEXT_INDEX_VERSION_1 == textIndexVersion ) {
- return term.size();
- }
- else {
- invariant( TEXT_INDEX_VERSION_2 == textIndexVersion );
- if ( term.size() <= termKeyPrefixLength ) {
- return term.size();
- }
- return termKeyLength;
- }
- }
- }
+namespace fts {
+
+using std::string;
+using std::vector;
+
+namespace {
+BSONObj nullObj;
+BSONElement nullElt;
+
+// New in textIndexVersion 2.
+// If the term is longer than 32 characters, it may
+// result in the generated key being too large
+// for the index. In that case, we generate a 64-character key
+// from the concatenation of the first 32 characters
+// and the hex string of the murmur3 hash value of the entire
+// term value.
+const size_t termKeyPrefixLength = 32U;
+// 128-bit hash value expressed in hex = 32 characters
+const size_t termKeySuffixLength = 32U;
+const size_t termKeyLength = termKeyPrefixLength + termKeySuffixLength;
- MONGO_INITIALIZER( FTSIndexFormat )( InitializerContext* context ) {
- BSONObjBuilder b;
- b.appendNull( "" );
- nullObj = b.obj();
- nullElt = nullObj.firstElement();
- return Status::OK();
+/**
+ * Returns size of buffer required to store term in index key.
+ * In version 1, terms are stored verbatim in key.
+ * In version 2, terms longer than 32 characters are hashed and combined
+ * with a prefix.
+ */
+int guessTermSize(const std::string& term, TextIndexVersion textIndexVersion) {
+ if (TEXT_INDEX_VERSION_1 == textIndexVersion) {
+ return term.size();
+ } else {
+ invariant(TEXT_INDEX_VERSION_2 == textIndexVersion);
+ if (term.size() <= termKeyPrefixLength) {
+ return term.size();
}
+ return termKeyLength;
+ }
+}
+}
- void FTSIndexFormat::getKeys( const FTSSpec& spec,
- const BSONObj& obj,
- BSONObjSet* keys ) {
-
- int extraSize = 0;
- vector<BSONElement> extrasBefore;
- vector<BSONElement> extrasAfter;
-
- // compute the non FTS key elements
- for ( unsigned i = 0; i < spec.numExtraBefore(); i++ ) {
- BSONElement e = obj.getFieldDotted(spec.extraBefore(i));
- if ( e.eoo() )
- e = nullElt;
- uassert( 16675, "cannot have a multi-key as a prefix to a text index",
- e.type() != Array );
- extrasBefore.push_back(e);
- extraSize += e.size();
- }
- for ( unsigned i = 0; i < spec.numExtraAfter(); i++ ) {
- BSONElement e = obj.getFieldDotted(spec.extraAfter(i));
- if ( e.eoo() )
- e = nullElt;
- extrasAfter.push_back(e);
- extraSize += e.size();
- }
-
-
- TermFrequencyMap term_freqs;
- spec.scoreDocument( obj, &term_freqs );
-
- // create index keys from raw scores
- // only 1 per string
-
- uassert( 16732,
- mongoutils::str::stream() << "too many unique keys for a single document to"
- << " have a text index, max is " << term_freqs.size() << obj["_id"],
- term_freqs.size() <= 400000 );
-
- long long keyBSONSize = 0;
- const int MaxKeyBSONSizeMB = 4;
-
- for ( TermFrequencyMap::const_iterator i = term_freqs.begin(); i != term_freqs.end(); ++i ) {
-
- const string& term = i->first;
- double weight = i->second;
-
- // guess the total size of the btree entry based on the size of the weight, term tuple
- int guess =
- 5 /* bson overhead */ +
- 10 /* weight */ +
- 8 /* term overhead */ +
- /* term size (could be truncated/hashed) */
- guessTermSize( term, spec.getTextIndexVersion() ) +
- extraSize;
-
- BSONObjBuilder b(guess); // builds a BSON object with guess length.
- for ( unsigned k = 0; k < extrasBefore.size(); k++ ) {
- b.appendAs( extrasBefore[k], "" );
- }
- _appendIndexKey( b, weight, term, spec.getTextIndexVersion() );
- for ( unsigned k = 0; k < extrasAfter.size(); k++ ) {
- b.appendAs( extrasAfter[k], "" );
- }
- BSONObj res = b.obj();
-
- verify( guess >= res.objsize() );
-
- keys->insert( res );
- keyBSONSize += res.objsize();
-
- uassert( 16733,
- mongoutils::str::stream()
- << "trying to index text where term list is too big, max is "
- << MaxKeyBSONSizeMB << "mb " << obj["_id"],
- keyBSONSize <= ( MaxKeyBSONSizeMB * 1024 * 1024 ) );
-
- }
- }
+MONGO_INITIALIZER(FTSIndexFormat)(InitializerContext* context) {
+ BSONObjBuilder b;
+ b.appendNull("");
+ nullObj = b.obj();
+ nullElt = nullObj.firstElement();
+ return Status::OK();
+}
+
+void FTSIndexFormat::getKeys(const FTSSpec& spec, const BSONObj& obj, BSONObjSet* keys) {
+ int extraSize = 0;
+ vector<BSONElement> extrasBefore;
+ vector<BSONElement> extrasAfter;
+
+ // compute the non FTS key elements
+ for (unsigned i = 0; i < spec.numExtraBefore(); i++) {
+ BSONElement e = obj.getFieldDotted(spec.extraBefore(i));
+ if (e.eoo())
+ e = nullElt;
+ uassert(16675, "cannot have a multi-key as a prefix to a text index", e.type() != Array);
+ extrasBefore.push_back(e);
+ extraSize += e.size();
+ }
+ for (unsigned i = 0; i < spec.numExtraAfter(); i++) {
+ BSONElement e = obj.getFieldDotted(spec.extraAfter(i));
+ if (e.eoo())
+ e = nullElt;
+ extrasAfter.push_back(e);
+ extraSize += e.size();
+ }
+
+
+ TermFrequencyMap term_freqs;
+ spec.scoreDocument(obj, &term_freqs);
+
+ // create index keys from raw scores
+ // only 1 per string
+
+ uassert(16732,
+ mongoutils::str::stream() << "too many unique keys for a single document to"
+ << " have a text index, max is " << term_freqs.size()
+ << obj["_id"],
+ term_freqs.size() <= 400000);
+
+ long long keyBSONSize = 0;
+ const int MaxKeyBSONSizeMB = 4;
- BSONObj FTSIndexFormat::getIndexKey( double weight,
- const string& term,
- const BSONObj& indexPrefix,
- TextIndexVersion textIndexVersion ) {
- BSONObjBuilder b;
+ for (TermFrequencyMap::const_iterator i = term_freqs.begin(); i != term_freqs.end(); ++i) {
+ const string& term = i->first;
+ double weight = i->second;
- BSONObjIterator i( indexPrefix );
- while ( i.more() ) {
- b.appendAs( i.next(), "" );
- }
+ // guess the total size of the btree entry based on the size of the weight, term tuple
+ int guess = 5 /* bson overhead */ + 10 /* weight */ + 8 /* term overhead */ +
+ /* term size (could be truncated/hashed) */
+ guessTermSize(term, spec.getTextIndexVersion()) + extraSize;
- _appendIndexKey( b, weight, term, textIndexVersion );
- return b.obj();
+ BSONObjBuilder b(guess); // builds a BSON object with guess length.
+ for (unsigned k = 0; k < extrasBefore.size(); k++) {
+ b.appendAs(extrasBefore[k], "");
}
+ _appendIndexKey(b, weight, term, spec.getTextIndexVersion());
+ for (unsigned k = 0; k < extrasAfter.size(); k++) {
+ b.appendAs(extrasAfter[k], "");
+ }
+ BSONObj res = b.obj();
+
+ verify(guess >= res.objsize());
+
+ keys->insert(res);
+ keyBSONSize += res.objsize();
+
+ uassert(16733,
+ mongoutils::str::stream()
+ << "trying to index text where term list is too big, max is "
+ << MaxKeyBSONSizeMB << "mb " << obj["_id"],
+ keyBSONSize <= (MaxKeyBSONSizeMB * 1024 * 1024));
+ }
+}
+
+BSONObj FTSIndexFormat::getIndexKey(double weight,
+ const string& term,
+ const BSONObj& indexPrefix,
+ TextIndexVersion textIndexVersion) {
+ BSONObjBuilder b;
- void FTSIndexFormat::_appendIndexKey( BSONObjBuilder& b, double weight, const string& term,
- TextIndexVersion textIndexVersion ) {
- verify( weight >= 0 && weight <= MAX_WEIGHT ); // FTSmaxweight = defined in fts_header
- // Terms are added to index key verbatim.
- if ( TEXT_INDEX_VERSION_1 == textIndexVersion ) {
- b.append( "", term );
- b.append( "", weight );
- }
- // See comments at the top of file for termKeyPrefixLength.
- // Apply hash for text index version 2 to long terms (longer than 32 characters).
- else {
- invariant( TEXT_INDEX_VERSION_2 == textIndexVersion );
- if ( term.size() <= termKeyPrefixLength ) {
- b.append( "", term );
- }
- else {
- union {
- uint64_t hash[2];
- char data[16];
- } t;
- uint32_t seed = 0;
- MurmurHash3_x64_128( term.data(), term.size(), seed, t.hash );
- string keySuffix = mongo::toHexLower( t.data, sizeof( t.data ) );
- invariant( termKeySuffixLength == keySuffix.size() );
- b.append( "", term.substr( 0, termKeyPrefixLength ) +
- keySuffix );
- }
- b.append( "", weight );
- }
+ BSONObjIterator i(indexPrefix);
+ while (i.more()) {
+ b.appendAs(i.next(), "");
+ }
+
+ _appendIndexKey(b, weight, term, textIndexVersion);
+ return b.obj();
+}
+
+void FTSIndexFormat::_appendIndexKey(BSONObjBuilder& b,
+ double weight,
+ const string& term,
+ TextIndexVersion textIndexVersion) {
+ verify(weight >= 0 && weight <= MAX_WEIGHT); // FTSmaxweight = defined in fts_header
+ // Terms are added to index key verbatim.
+ if (TEXT_INDEX_VERSION_1 == textIndexVersion) {
+ b.append("", term);
+ b.append("", weight);
+ }
+ // See comments at the top of file for termKeyPrefixLength.
+ // Apply hash for text index version 2 to long terms (longer than 32 characters).
+ else {
+ invariant(TEXT_INDEX_VERSION_2 == textIndexVersion);
+ if (term.size() <= termKeyPrefixLength) {
+ b.append("", term);
+ } else {
+ union {
+ uint64_t hash[2];
+ char data[16];
+ } t;
+ uint32_t seed = 0;
+ MurmurHash3_x64_128(term.data(), term.size(), seed, t.hash);
+ string keySuffix = mongo::toHexLower(t.data, sizeof(t.data));
+ invariant(termKeySuffixLength == keySuffix.size());
+ b.append("", term.substr(0, termKeyPrefixLength) + keySuffix);
}
+ b.append("", weight);
}
}
+}
+}
diff --git a/src/mongo/db/fts/fts_index_format.h b/src/mongo/db/fts/fts_index_format.h
index 579afb2d673..82be9ad03f5 100644
--- a/src/mongo/db/fts/fts_index_format.h
+++ b/src/mongo/db/fts/fts_index_format.h
@@ -37,40 +37,38 @@
namespace mongo {
- namespace fts {
+namespace fts {
- class FTSSpec;
+class FTSSpec;
- class FTSIndexFormat {
- public:
+class FTSIndexFormat {
+public:
+ static void getKeys(const FTSSpec& spec, const BSONObj& document, BSONObjSet* keys);
- static void getKeys( const FTSSpec& spec,
- const BSONObj& document,
- BSONObjSet* keys );
+ /**
+ * Helper method to get return entry from the FTSIndex as a BSONObj
+ * @param weight, the weight of the term in the entry
+ * @param term, the std::string term in the entry
+ * @param indexPrefix, the fields that go in the index first
+ * @param textIndexVersion, index version. affects key format.
+ */
+ static BSONObj getIndexKey(double weight,
+ const std::string& term,
+ const BSONObj& indexPrefix,
+ TextIndexVersion textIndexVersion);
- /**
- * Helper method to get return entry from the FTSIndex as a BSONObj
- * @param weight, the weight of the term in the entry
- * @param term, the std::string term in the entry
- * @param indexPrefix, the fields that go in the index first
- * @param textIndexVersion, index version. affects key format.
- */
- static BSONObj getIndexKey( double weight,
- const std::string& term,
- const BSONObj& indexPrefix,
- TextIndexVersion textIndexVersion );
-
- private:
- /**
- * Helper method to get return entry from the FTSIndex as a BSONObj
- * @param b, reference to the BSONOBjBuilder
- * @param weight, the weight of the term in the entry
- * @param term, the std::string term in the entry
- * @param textIndexVersion, index version. affects key format.
- */
- static void _appendIndexKey( BSONObjBuilder& b, double weight, const std::string& term,
- TextIndexVersion textIndexVersion );
- };
-
- }
+private:
+ /**
+ * Helper method to get return entry from the FTSIndex as a BSONObj
+ * @param b, reference to the BSONOBjBuilder
+ * @param weight, the weight of the term in the entry
+ * @param term, the std::string term in the entry
+ * @param textIndexVersion, index version. affects key format.
+ */
+ static void _appendIndexKey(BSONObjBuilder& b,
+ double weight,
+ const std::string& term,
+ TextIndexVersion textIndexVersion);
+};
+}
}
diff --git a/src/mongo/db/fts/fts_index_format_test.cpp b/src/mongo/db/fts/fts_index_format_test.cpp
index a15d014e98c..f7c8a5fa432 100644
--- a/src/mongo/db/fts/fts_index_format_test.cpp
+++ b/src/mongo/db/fts/fts_index_format_test.cpp
@@ -42,165 +42,184 @@
namespace mongo {
- namespace fts {
-
- using std::string;
-
- TEST( FTSIndexFormat, Simple1 ) {
- FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) ) ) );
- BSONObjSet keys;
- FTSIndexFormat::getKeys( spec, BSON( "data" << "cat sat" ), &keys );
-
- ASSERT_EQUALS( 2U, keys.size() );
- for ( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
- BSONObj key = *i;
- ASSERT_EQUALS( 2, key.nFields() );
- ASSERT_EQUALS( String, key.firstElement().type() );
- }
- }
-
- TEST( FTSIndexFormat, ExtraBack1 ) {
- FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" <<
- "x" << 1 ) ) ) );
- BSONObjSet keys;
- FTSIndexFormat::getKeys( spec, BSON( "data" << "cat" << "x" << 5 ), &keys );
-
- ASSERT_EQUALS( 1U, keys.size() );
- BSONObj key = *(keys.begin());
- ASSERT_EQUALS( 3, key.nFields() );
- BSONObjIterator i( key );
- ASSERT_EQUALS( StringData("cat"), i.next().valuestr() );
- ASSERT( i.next().numberDouble() > 0 );
- ASSERT_EQUALS( 5, i.next().numberInt() );
- }
+namespace fts {
+
+using std::string;
+
+TEST(FTSIndexFormat, Simple1) {
+ FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("data"
+ << "text"))));
+ BSONObjSet keys;
+ FTSIndexFormat::getKeys(spec,
+ BSON("data"
+ << "cat sat"),
+ &keys);
+
+ ASSERT_EQUALS(2U, keys.size());
+ for (BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i) {
+ BSONObj key = *i;
+ ASSERT_EQUALS(2, key.nFields());
+ ASSERT_EQUALS(String, key.firstElement().type());
+ }
+}
- /*
- TEST( FTSIndexFormat, ExtraBackArray1 ) {
- FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" <<
- "x.y" << 1 ) ) ) );
- BSONObjSet keys;
- FTSIndexFormat::getKeys( spec,
- BSON( "data" << "cat" <<
- "x" << BSON_ARRAY( BSON( "y" << 1 ) <<
- BSON( "y" << 2 ) ) ),
- &keys );
-
- ASSERT_EQUALS( 1U, keys.size() );
- BSONObj key = *(keys.begin());
- log() << "e: " << key << endl;
- ASSERT_EQUALS( 3, key.nFields() );
- BSONObjIterator i( key );
- ASSERT_EQUALS( StringData("cat"), i.next().valuestr() );
- ASSERT( i.next().numberDouble() > 0 );
- ASSERT_EQUALS( 5, i.next().numberInt() );
- }
- */
-
- TEST( FTSIndexFormat, ExtraFront1 ) {
- FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << 1 <<
- "data" << "text" ) ) ) );
- BSONObjSet keys;
- FTSIndexFormat::getKeys( spec, BSON( "data" << "cat" << "x" << 5 ), &keys );
-
- ASSERT_EQUALS( 1U, keys.size() );
- BSONObj key = *(keys.begin());
- ASSERT_EQUALS( 3, key.nFields() );
- BSONObjIterator i( key );
- ASSERT_EQUALS( 5, i.next().numberInt() );
- ASSERT_EQUALS( StringData("cat"), i.next().valuestr() );
- ASSERT( i.next().numberDouble() > 0 );
- }
+TEST(FTSIndexFormat, ExtraBack1) {
+ FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("data"
+ << "text"
+ << "x" << 1))));
+ BSONObjSet keys;
+ FTSIndexFormat::getKeys(spec,
+ BSON("data"
+ << "cat"
+ << "x" << 5),
+ &keys);
+
+ ASSERT_EQUALS(1U, keys.size());
+ BSONObj key = *(keys.begin());
+ ASSERT_EQUALS(3, key.nFields());
+ BSONObjIterator i(key);
+ ASSERT_EQUALS(StringData("cat"), i.next().valuestr());
+ ASSERT(i.next().numberDouble() > 0);
+ ASSERT_EQUALS(5, i.next().numberInt());
+}
- TEST( FTSIndexFormat, StopWords1 ) {
- FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) ) ) );
+/*
+TEST( FTSIndexFormat, ExtraBackArray1 ) {
+ FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" <<
+ "x.y" << 1 ) ) ) );
+ BSONObjSet keys;
+ FTSIndexFormat::getKeys( spec,
+ BSON( "data" << "cat" <<
+ "x" << BSON_ARRAY( BSON( "y" << 1 ) <<
+ BSON( "y" << 2 ) ) ),
+ &keys );
+
+ ASSERT_EQUALS( 1U, keys.size() );
+ BSONObj key = *(keys.begin());
+ log() << "e: " << key << endl;
+ ASSERT_EQUALS( 3, key.nFields() );
+ BSONObjIterator i( key );
+ ASSERT_EQUALS( StringData("cat"), i.next().valuestr() );
+ ASSERT( i.next().numberDouble() > 0 );
+ ASSERT_EQUALS( 5, i.next().numberInt() );
+}
+*/
- BSONObjSet keys1;
- FTSIndexFormat::getKeys( spec, BSON( "data" << "computer" ), &keys1 );
- ASSERT_EQUALS( 1U, keys1.size() );
+TEST(FTSIndexFormat, ExtraFront1) {
+ FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("x" << 1 << "data"
+ << "text"))));
+ BSONObjSet keys;
+ FTSIndexFormat::getKeys(spec,
+ BSON("data"
+ << "cat"
+ << "x" << 5),
+ &keys);
+
+ ASSERT_EQUALS(1U, keys.size());
+ BSONObj key = *(keys.begin());
+ ASSERT_EQUALS(3, key.nFields());
+ BSONObjIterator i(key);
+ ASSERT_EQUALS(5, i.next().numberInt());
+ ASSERT_EQUALS(StringData("cat"), i.next().valuestr());
+ ASSERT(i.next().numberDouble() > 0);
+}
- BSONObjSet keys2;
- FTSIndexFormat::getKeys( spec, BSON( "data" << "any computer" ), &keys2 );
- ASSERT_EQUALS( 1U, keys2.size() );
- }
+TEST(FTSIndexFormat, StopWords1) {
+ FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("data"
+ << "text"))));
+
+ BSONObjSet keys1;
+ FTSIndexFormat::getKeys(spec,
+ BSON("data"
+ << "computer"),
+ &keys1);
+ ASSERT_EQUALS(1U, keys1.size());
+
+ BSONObjSet keys2;
+ FTSIndexFormat::getKeys(spec,
+ BSON("data"
+ << "any computer"),
+ &keys2);
+ ASSERT_EQUALS(1U, keys2.size());
+}
- /**
- * Helper function to compare keys returned in getKeys() result
- * with expected values.
- */
- void assertEqualsIndexKeys( std::set<std::string>& expectedKeys, const BSONObjSet& keys ) {
- ASSERT_EQUALS( expectedKeys.size(), keys.size() );
- for ( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
- BSONObj key = *i;
- ASSERT_EQUALS( 2, key.nFields() );
- ASSERT_EQUALS( String, key.firstElement().type() );
- string s = key.firstElement().String();
- std::set<string>::const_iterator j = expectedKeys.find(s);
- if (j == expectedKeys.end()) {
- mongoutils::str::stream ss;
- ss << "unexpected key " << s << " in FTSIndexFormat::getKeys result. "
- << "expected keys:";
- for (std::set<string>::const_iterator k = expectedKeys.begin();
- k != expectedKeys.end(); ++k) {
- ss << "\n " << *k;
- }
- FAIL(ss);
- }
+/**
+ * Helper function to compare keys returned in getKeys() result
+ * with expected values.
+ */
+void assertEqualsIndexKeys(std::set<std::string>& expectedKeys, const BSONObjSet& keys) {
+ ASSERT_EQUALS(expectedKeys.size(), keys.size());
+ for (BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i) {
+ BSONObj key = *i;
+ ASSERT_EQUALS(2, key.nFields());
+ ASSERT_EQUALS(String, key.firstElement().type());
+ string s = key.firstElement().String();
+ std::set<string>::const_iterator j = expectedKeys.find(s);
+ if (j == expectedKeys.end()) {
+ mongoutils::str::stream ss;
+ ss << "unexpected key " << s << " in FTSIndexFormat::getKeys result. "
+ << "expected keys:";
+ for (std::set<string>::const_iterator k = expectedKeys.begin(); k != expectedKeys.end();
+ ++k) {
+ ss << "\n " << *k;
}
+ FAIL(ss);
}
+ }
+}
- /**
- * Tests keys for long terms using text index version 1.
- * Terms that are too long are not truncated in version 1.
- */
- TEST( FTSIndexFormat, LongWordsTextIndexVersion1 ) {
- FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) <<
- "textIndexVersion" << 1 ) ) );
- BSONObjSet keys;
- string longPrefix( 1024U, 'a' );
- // "aaa...aaacat"
- string longWordCat = longPrefix + "cat";
- // "aaa...aaasat"
- string longWordSat = longPrefix + "sat";
- string text = mongoutils::str::stream() << longWordCat << " " << longWordSat;
- FTSIndexFormat::getKeys( spec, BSON( "data" << text ), &keys );
-
- // Hard-coded expected computed keys for future-proofing.
- std::set<string> expectedKeys;
- // cat
- expectedKeys.insert( longWordCat );
- // sat
- expectedKeys.insert( longWordSat );
-
- assertEqualsIndexKeys( expectedKeys, keys);
- }
-
- /**
- * Tests keys for long terms using text index version 2.
- * In version 2, long terms (longer than 32 characters)
- * are hashed with murmur3 and appended to the first 32
- * characters of the term to form the index key.
- */
- TEST( FTSIndexFormat, LongWordTextIndexVersion2 ) {
- FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) <<
- "textIndexVersion" << 2 ) ) );
- BSONObjSet keys;
- string longPrefix( 1024U, 'a' );
- // "aaa...aaacat"
- string longWordCat = longPrefix + "cat";
- // "aaa...aaasat"
- string longWordSat = longPrefix + "sat";
- string text = mongoutils::str::stream() << longWordCat << " " << longWordSat;
- FTSIndexFormat::getKeys( spec, BSON( "data" << text ), &keys );
-
- // Hard-coded expected computed keys for future-proofing.
- std::set<string> expectedKeys;
- // cat
- expectedKeys.insert( "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab8e78455d827ebb87cbe87f392bf45f6" );
- // sat
- expectedKeys.insert( "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaf2d6f58bb3b81b97e611ae7ccac6dea7" );
-
- assertEqualsIndexKeys( expectedKeys, keys);
- }
+/**
+ * Tests keys for long terms using text index version 1.
+ * Terms that are too long are not truncated in version 1.
+ */
+TEST(FTSIndexFormat, LongWordsTextIndexVersion1) {
+ FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("data"
+ << "text") << "textIndexVersion" << 1)));
+ BSONObjSet keys;
+ string longPrefix(1024U, 'a');
+ // "aaa...aaacat"
+ string longWordCat = longPrefix + "cat";
+ // "aaa...aaasat"
+ string longWordSat = longPrefix + "sat";
+ string text = mongoutils::str::stream() << longWordCat << " " << longWordSat;
+ FTSIndexFormat::getKeys(spec, BSON("data" << text), &keys);
+
+ // Hard-coded expected computed keys for future-proofing.
+ std::set<string> expectedKeys;
+ // cat
+ expectedKeys.insert(longWordCat);
+ // sat
+ expectedKeys.insert(longWordSat);
+
+ assertEqualsIndexKeys(expectedKeys, keys);
+}
- }
+/**
+ * Tests keys for long terms using text index version 2.
+ * In version 2, long terms (longer than 32 characters)
+ * are hashed with murmur3 and appended to the first 32
+ * characters of the term to form the index key.
+ */
+TEST(FTSIndexFormat, LongWordTextIndexVersion2) {
+ FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("data"
+ << "text") << "textIndexVersion" << 2)));
+ BSONObjSet keys;
+ string longPrefix(1024U, 'a');
+ // "aaa...aaacat"
+ string longWordCat = longPrefix + "cat";
+ // "aaa...aaasat"
+ string longWordSat = longPrefix + "sat";
+ string text = mongoutils::str::stream() << longWordCat << " " << longWordSat;
+ FTSIndexFormat::getKeys(spec, BSON("data" << text), &keys);
+
+ // Hard-coded expected computed keys for future-proofing.
+ std::set<string> expectedKeys;
+ // cat
+ expectedKeys.insert("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab8e78455d827ebb87cbe87f392bf45f6");
+ // sat
+ expectedKeys.insert("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaf2d6f58bb3b81b97e611ae7ccac6dea7");
+
+ assertEqualsIndexKeys(expectedKeys, keys);
+}
+}
}
diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp
index 4b1966d6458..7a0c64ab1cf 100644
--- a/src/mongo/db/fts/fts_language.cpp
+++ b/src/mongo/db/fts/fts_language.cpp
@@ -31,7 +31,7 @@
#include "mongo/db/fts/fts_language.h"
#include <string>
-
+
#include "mongo/base/init.h"
#include "mongo/db/fts/fts_basic_tokenizer.h"
#include "mongo/stdx/memory.h"
@@ -42,225 +42,220 @@
namespace mongo {
- namespace fts {
+namespace fts {
- namespace {
+namespace {
- /**
- * Case-insensitive StringData comparator.
- */
- struct LanguageStringCompare {
- /** Returns true if lhs < rhs. */
- bool operator()( std::string lhs, std::string rhs ) const {
- size_t minSize = std::min( lhs.size(), rhs.size() );
+/**
+ * Case-insensitive StringData comparator.
+ */
+struct LanguageStringCompare {
+ /** Returns true if lhs < rhs. */
+ bool operator()(std::string lhs, std::string rhs) const {
+ size_t minSize = std::min(lhs.size(), rhs.size());
- for ( size_t x = 0; x < minSize; x++ ) {
- char a = tolower( lhs[x] );
- char b = tolower( rhs[x] );
- if ( a < b ) {
- return true;
- }
- if ( a > b ) {
- return false;
- }
- }
+ for (size_t x = 0; x < minSize; x++) {
+ char a = tolower(lhs[x]);
+ char b = tolower(rhs[x]);
+ if (a < b) {
+ return true;
+ }
+ if (a > b) {
+ return false;
+ }
+ }
- return lhs.size() < rhs.size();
- }
- };
+ return lhs.size() < rhs.size();
+ }
+};
- // Lookup table from user language string (case-insensitive) to FTSLanguage. Populated
- // by initializers in group FTSAllLanguagesRegistered and initializer
- // FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes only.
- typedef std::map<std::string, const FTSLanguage*, LanguageStringCompare> LanguageMapV2;
- LanguageMapV2 languageMapV2;
+// Lookup table from user language string (case-insensitive) to FTSLanguage. Populated
+// by initializers in group FTSAllLanguagesRegistered and initializer
+// FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes only.
+typedef std::map<std::string, const FTSLanguage*, LanguageStringCompare> LanguageMapV2;
+LanguageMapV2 languageMapV2;
- // Like languageMapV2, but for use with TEXT_INDEX_VERSION_1 text indexes.
- // Case-sensitive by lookup key.
- typedef std::map<StringData, const FTSLanguage*> LanguageMapV1;
- LanguageMapV1 languageMapV1;
- }
+// Like languageMapV2, but for use with TEXT_INDEX_VERSION_1 text indexes.
+// Case-sensitive by lookup key.
+typedef std::map<StringData, const FTSLanguage*> LanguageMapV1;
+LanguageMapV1 languageMapV1;
+}
- std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const {
- return stdx::make_unique<BasicFTSTokenizer>(this);
- }
+std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const {
+ return stdx::make_unique<BasicFTSTokenizer>(this);
+}
- MONGO_INITIALIZER_GROUP( FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES,
- MONGO_NO_DEPENDENTS );
+MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS);
- //
- // Register supported languages' canonical names for TEXT_INDEX_VERSION_2.
- //
+//
+// Register supported languages' canonical names for TEXT_INDEX_VERSION_2.
+//
- MONGO_FTS_LANGUAGE_DECLARE( languageNoneV2, "none", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageDanishV2, "danish", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageDutchV2, "dutch", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageEnglishV2, "english", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageFinnishV2, "finnish", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageFrenchV2, "french", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageGermanV2, "german", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageHungarianV2, "hungarian", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageItalianV2, "italian", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageNorwegianV2, "norwegian", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languagePortugueseV2, "portuguese", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageRomanianV2, "romanian", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageRussianV2, "russian", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageSpanishV2, "spanish", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageSwedishV2, "swedish", TEXT_INDEX_VERSION_2 );
- MONGO_FTS_LANGUAGE_DECLARE( languageTurkishV2, "turkish", TEXT_INDEX_VERSION_2 );
+MONGO_FTS_LANGUAGE_DECLARE(languageNoneV2, "none", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageDanishV2, "danish", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageDutchV2, "dutch", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageEnglishV2, "english", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageFinnishV2, "finnish", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageFrenchV2, "french", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageGermanV2, "german", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageHungarianV2, "hungarian", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageItalianV2, "italian", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageNorwegianV2, "norwegian", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languagePortugueseV2, "portuguese", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageRomanianV2, "romanian", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageRussianV2, "russian", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageSpanishV2, "spanish", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageSwedishV2, "swedish", TEXT_INDEX_VERSION_2);
+MONGO_FTS_LANGUAGE_DECLARE(languageTurkishV2, "turkish", TEXT_INDEX_VERSION_2);
- //
- // Register all Snowball language modules for TEXT_INDEX_VERSION_1. Note that only the full
- // names are recognized by the StopWords class (as such, the language string "dan" in
- // TEXT_INDEX_VERSION_1 will generate the Danish stemmer and the empty stopword list).
- //
+//
+// Register all Snowball language modules for TEXT_INDEX_VERSION_1. Note that only the full
+// names are recognized by the StopWords class (as such, the language string "dan" in
+// TEXT_INDEX_VERSION_1 will generate the Danish stemmer and the empty stopword list).
+//
- MONGO_FTS_LANGUAGE_DECLARE( languageNoneV1, "none", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageDaV1, "da", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageDanV1, "dan", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageDanishV1, "danish", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageDeV1, "de", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageDeuV1, "deu", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageDutV1, "dut", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageDutchV1, "dutch", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageEnV1, "en", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageEngV1, "eng", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageEnglishV1, "english", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageEsV1, "es", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageEslV1, "esl", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageFiV1, "fi", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageFinV1, "fin", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageFinnishV1, "finnish", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageFrV1, "fr", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageFraV1, "fra", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageFreV1, "fre", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageFrenchV1, "french", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageGerV1, "ger", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageGermanV1, "german", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageHuV1, "hu", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageHunV1, "hun", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageHungarianV1, "hungarian", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageItV1, "it", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageItaV1, "ita", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageItalianV1, "italian", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageNlV1, "nl", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageNldV1, "nld", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageNoV1, "no", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageNorV1, "nor", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageNorwegianV1, "norwegian", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languagePorV1, "por", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languagePorterV1, "porter", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languagePortugueseV1, "portuguese", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languagePtV1, "pt", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageRoV1, "ro", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageRomanianV1, "romanian", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageRonV1, "ron", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageRuV1, "ru", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageRumV1, "rum", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageRusV1, "rus", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageRussianV1, "russian", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageSpaV1, "spa", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageSpanishV1, "spanish", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageSvV1, "sv", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageSweV1, "swe", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageSwedishV1, "swedish", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageTrV1, "tr", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageTurV1, "tur", TEXT_INDEX_VERSION_1 );
- MONGO_FTS_LANGUAGE_DECLARE( languageTurkishV1, "turkish", TEXT_INDEX_VERSION_1 );
+MONGO_FTS_LANGUAGE_DECLARE(languageNoneV1, "none", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageDaV1, "da", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageDanV1, "dan", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageDanishV1, "danish", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageDeV1, "de", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageDeuV1, "deu", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageDutV1, "dut", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageDutchV1, "dutch", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageEnV1, "en", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageEngV1, "eng", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageEnglishV1, "english", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageEsV1, "es", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageEslV1, "esl", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageFiV1, "fi", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageFinV1, "fin", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageFinnishV1, "finnish", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageFrV1, "fr", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageFraV1, "fra", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageFreV1, "fre", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageFrenchV1, "french", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageGerV1, "ger", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageGermanV1, "german", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageHuV1, "hu", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageHunV1, "hun", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageHungarianV1, "hungarian", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageItV1, "it", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageItaV1, "ita", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageItalianV1, "italian", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageNlV1, "nl", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageNldV1, "nld", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageNoV1, "no", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageNorV1, "nor", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageNorwegianV1, "norwegian", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languagePorV1, "por", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languagePorterV1, "porter", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languagePortugueseV1, "portuguese", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languagePtV1, "pt", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageRoV1, "ro", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageRomanianV1, "romanian", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageRonV1, "ron", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageRuV1, "ru", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageRumV1, "rum", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageRusV1, "rus", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageRussianV1, "russian", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageSpaV1, "spa", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageSpanishV1, "spanish", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageSvV1, "sv", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageSweV1, "swe", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageSwedishV1, "swedish", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageTrV1, "tr", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageTurV1, "tur", TEXT_INDEX_VERSION_1);
+MONGO_FTS_LANGUAGE_DECLARE(languageTurkishV1, "turkish", TEXT_INDEX_VERSION_1);
- MONGO_INITIALIZER_WITH_PREREQUISITES( FTSRegisterLanguageAliases,
- ( "FTSAllLanguagesRegistered" ) )
- ( InitializerContext* context ) {
- // Register language aliases for TEXT_INDEX_VERSION_2.
- FTSLanguage::registerLanguageAlias( &languageDanishV2, "da", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languageDutchV2, "nl", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languageEnglishV2, "en", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languageFinnishV2, "fi", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languageFrenchV2, "fr", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languageGermanV2, "de", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languageHungarianV2, "hu", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languageItalianV2, "it", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languageNorwegianV2, "nb", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languagePortugueseV2, "pt", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languageRomanianV2, "ro", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languageRussianV2, "ru", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languageSpanishV2, "es", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languageSwedishV2, "sv", TEXT_INDEX_VERSION_2 );
- FTSLanguage::registerLanguageAlias( &languageTurkishV2, "tr", TEXT_INDEX_VERSION_2 );
- return Status::OK();
- }
-
- // static
- void FTSLanguage::registerLanguage( StringData languageName,
- TextIndexVersion textIndexVersion,
- FTSLanguage* language ) {
- verify( !languageName.empty() );
- language->_canonicalName = languageName.toString();
- switch ( textIndexVersion ) {
- case TEXT_INDEX_VERSION_2:
- languageMapV2[ languageName.toString() ] = language;
- return;
- case TEXT_INDEX_VERSION_1:
- verify( languageMapV1.find( languageName ) == languageMapV1.end() );
- languageMapV1[ languageName ] = language;
- return;
- }
- verify( false );
- }
+MONGO_INITIALIZER_WITH_PREREQUISITES(FTSRegisterLanguageAliases, ("FTSAllLanguagesRegistered"))
+(InitializerContext* context) {
+ // Register language aliases for TEXT_INDEX_VERSION_2.
+ FTSLanguage::registerLanguageAlias(&languageDanishV2, "da", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languageDutchV2, "nl", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languageEnglishV2, "en", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languageFinnishV2, "fi", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languageFrenchV2, "fr", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languageGermanV2, "de", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languageHungarianV2, "hu", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languageItalianV2, "it", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languageNorwegianV2, "nb", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languagePortugueseV2, "pt", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languageRomanianV2, "ro", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languageRussianV2, "ru", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languageSpanishV2, "es", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languageSwedishV2, "sv", TEXT_INDEX_VERSION_2);
+ FTSLanguage::registerLanguageAlias(&languageTurkishV2, "tr", TEXT_INDEX_VERSION_2);
+ return Status::OK();
+}
- // static
- void FTSLanguage::registerLanguageAlias( const FTSLanguage* language,
- StringData alias,
- TextIndexVersion textIndexVersion ) {
- switch ( textIndexVersion ) {
- case TEXT_INDEX_VERSION_2:
- languageMapV2[ alias.toString() ] = language;
- return;
- case TEXT_INDEX_VERSION_1:
- verify( languageMapV1.find( alias ) == languageMapV1.end() );
- languageMapV1[ alias ] = language;
- return;
- }
- verify( false );
- }
+// static
+void FTSLanguage::registerLanguage(StringData languageName,
+ TextIndexVersion textIndexVersion,
+ FTSLanguage* language) {
+ verify(!languageName.empty());
+ language->_canonicalName = languageName.toString();
+ switch (textIndexVersion) {
+ case TEXT_INDEX_VERSION_2:
+ languageMapV2[languageName.toString()] = language;
+ return;
+ case TEXT_INDEX_VERSION_1:
+ verify(languageMapV1.find(languageName) == languageMapV1.end());
+ languageMapV1[languageName] = language;
+ return;
+ }
+ verify(false);
+}
- FTSLanguage::FTSLanguage() : _canonicalName() {
- }
+// static
+void FTSLanguage::registerLanguageAlias(const FTSLanguage* language,
+ StringData alias,
+ TextIndexVersion textIndexVersion) {
+ switch (textIndexVersion) {
+ case TEXT_INDEX_VERSION_2:
+ languageMapV2[alias.toString()] = language;
+ return;
+ case TEXT_INDEX_VERSION_1:
+ verify(languageMapV1.find(alias) == languageMapV1.end());
+ languageMapV1[alias] = language;
+ return;
+ }
+ verify(false);
+}
- const std::string& FTSLanguage::str() const {
- verify( !_canonicalName.empty() );
- return _canonicalName;
- }
+FTSLanguage::FTSLanguage() : _canonicalName() {}
- // static
- StatusWithFTSLanguage FTSLanguage::make( StringData langName,
- TextIndexVersion textIndexVersion ) {
- switch ( textIndexVersion ) {
- case TEXT_INDEX_VERSION_2: {
- LanguageMapV2::const_iterator it = languageMapV2.find( langName.toString() );
- if ( it == languageMapV2.end() ) {
- // TEXT_INDEX_VERSION_2 rejects unrecognized language strings.
- Status status = Status( ErrorCodes::BadValue,
- mongoutils::str::stream() <<
- "unsupported language: \"" << langName <<
- "\"" );
- return StatusWithFTSLanguage( status );
- }
+const std::string& FTSLanguage::str() const {
+ verify(!_canonicalName.empty());
+ return _canonicalName;
+}
- return StatusWithFTSLanguage( it->second );
- }
- case TEXT_INDEX_VERSION_1: {
- LanguageMapV1::const_iterator it = languageMapV1.find( langName );
- if ( it == languageMapV1.end() ) {
- // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none".
- return StatusWithFTSLanguage( &languageNoneV1 );
- }
- return StatusWithFTSLanguage( it->second );
- }
+// static
+StatusWithFTSLanguage FTSLanguage::make(StringData langName, TextIndexVersion textIndexVersion) {
+ switch (textIndexVersion) {
+ case TEXT_INDEX_VERSION_2: {
+ LanguageMapV2::const_iterator it = languageMapV2.find(langName.toString());
+ if (it == languageMapV2.end()) {
+ // TEXT_INDEX_VERSION_2 rejects unrecognized language strings.
+ Status status = Status(ErrorCodes::BadValue,
+ mongoutils::str::stream() << "unsupported language: \""
+ << langName << "\"");
+ return StatusWithFTSLanguage(status);
}
- verify( false );
- return StatusWithFTSLanguage( Status::OK() );
+ return StatusWithFTSLanguage(it->second);
+ }
+ case TEXT_INDEX_VERSION_1: {
+ LanguageMapV1::const_iterator it = languageMapV1.find(langName);
+ if (it == languageMapV1.end()) {
+ // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none".
+ return StatusWithFTSLanguage(&languageNoneV1);
+ }
+ return StatusWithFTSLanguage(it->second);
}
}
+
+ verify(false);
+ return StatusWithFTSLanguage(Status::OK());
+}
+}
}
diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h
index ce45e0b812a..facdb8c9ce0 100644
--- a/src/mongo/db/fts/fts_language.h
+++ b/src/mongo/db/fts/fts_language.h
@@ -37,108 +37,107 @@
namespace mongo {
- namespace fts {
-
- class FTSTokenizer;
-
- #define MONGO_FTS_LANGUAGE_DECLARE( language, name, version ) \
- BasicFTSLanguage language; \
- MONGO_INITIALIZER_GENERAL( language, MONGO_NO_PREREQUISITES, \
- ( "FTSAllLanguagesRegistered" ) ) \
- ( ::mongo::InitializerContext* context ) { \
- FTSLanguage::registerLanguage( name, version, &language ); \
- return Status::OK(); \
- }
-
- /**
- * A FTSLanguage represents a language for a text-indexed document or a text search.
- * FTSLanguage objects are not copyable.
- *
- * Recommended usage:
- *
- * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_2 );
- * if ( !swl.getStatus().isOK() ) {
- * // Error.
- * }
- * else {
- * const FTSLanguage* language = swl.getValue();
- * // Use language.
- * }
- */
- class FTSLanguage {
- // Use make() instead of copying.
- MONGO_DISALLOW_COPYING( FTSLanguage );
- public:
- /** Create an uninitialized language. */
- FTSLanguage();
-
- virtual ~FTSLanguage() {}
-
- /**
- * Returns the language as a std::string in canonical form (lowercased English name). It is
- * an error to call str() on an uninitialized language.
- */
- const std::string& str() const;
-
- /**
- * Returns a new FTSTokenizer instance for this language.
- * Lifetime is scoped to FTSLanguage (which are currently all process lifetime)
- */
- virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0;
-
- /**
- * Register std::string 'languageName' as a new language with text index version
- * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'.
- * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language
- * string.
- */
- static void registerLanguage( StringData languageName,
- TextIndexVersion textIndexVersion,
- FTSLanguage *languageOut );
-
- /**
- * Register 'alias' as an alias for 'language' with text index version
- * 'textIndexVersion'. Subsequent calls to FTSLanguage::make() will recognize the
- * newly-registered alias.
- */
- static void registerLanguageAlias( const FTSLanguage* language,
- StringData alias,
- TextIndexVersion textIndexVersion );
-
- /**
- * Return the FTSLanguage associated with the given language string. Returns an error
- * Status if an invalid language std::string is passed.
- *
- * For textIndexVersion=TEXT_INDEX_VERSION_2, language strings are
- * case-insensitive, and need to be in one of the two following forms:
- * - English name, like "spanish".
- * - Two-letter code, like "es".
- *
- * For textIndexVersion=TEXT_INDEX_VERSION_1, no validation or normalization of
- * language strings is performed. This is necessary to preserve indexing behavior for
- * documents with language strings like "en": for compatibility, text data in these
- * documents needs to be processed with the English stemmer and the empty stopword list
- * (since "en" is recognized by Snowball but not the stopword processing logic).
- */
- static StatusWith<const FTSLanguage*> make( StringData langName,
- TextIndexVersion textIndexVersion );
-
- private:
- // std::string representation of language in canonical form.
- std::string _canonicalName;
- };
-
- typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage;
-
-
- class BasicFTSLanguage : public FTSLanguage {
- public:
- std::unique_ptr<FTSTokenizer> createTokenizer() const override;
- };
-
- extern BasicFTSLanguage languagePorterV1;
- extern BasicFTSLanguage languageEnglishV2;
- extern BasicFTSLanguage languageFrenchV2;
+namespace fts {
+class FTSTokenizer;
+
+#define MONGO_FTS_LANGUAGE_DECLARE(language, name, version) \
+ BasicFTSLanguage language; \
+ MONGO_INITIALIZER_GENERAL(language, MONGO_NO_PREREQUISITES, ("FTSAllLanguagesRegistered")) \
+ (::mongo::InitializerContext * context) { \
+ FTSLanguage::registerLanguage(name, version, &language); \
+ return Status::OK(); \
}
+
+/**
+ * A FTSLanguage represents a language for a text-indexed document or a text search.
+ * FTSLanguage objects are not copyable.
+ *
+ * Recommended usage:
+ *
+ * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_2 );
+ * if ( !swl.getStatus().isOK() ) {
+ * // Error.
+ * }
+ * else {
+ * const FTSLanguage* language = swl.getValue();
+ * // Use language.
+ * }
+ */
+class FTSLanguage {
+ // Use make() instead of copying.
+ MONGO_DISALLOW_COPYING(FTSLanguage);
+
+public:
+ /** Create an uninitialized language. */
+ FTSLanguage();
+
+ virtual ~FTSLanguage() {}
+
+ /**
+ * Returns the language as a std::string in canonical form (lowercased English name). It is
+ * an error to call str() on an uninitialized language.
+ */
+ const std::string& str() const;
+
+ /**
+ * Returns a new FTSTokenizer instance for this language.
+ * Lifetime is scoped to FTSLanguage (which are currently all process lifetime)
+ */
+ virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0;
+
+ /**
+ * Register std::string 'languageName' as a new language with text index version
+ * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'.
+ * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language
+ * string.
+ */
+ static void registerLanguage(StringData languageName,
+ TextIndexVersion textIndexVersion,
+ FTSLanguage* languageOut);
+
+ /**
+ * Register 'alias' as an alias for 'language' with text index version
+ * 'textIndexVersion'. Subsequent calls to FTSLanguage::make() will recognize the
+ * newly-registered alias.
+ */
+ static void registerLanguageAlias(const FTSLanguage* language,
+ StringData alias,
+ TextIndexVersion textIndexVersion);
+
+ /**
+ * Return the FTSLanguage associated with the given language string. Returns an error
+ * Status if an invalid language std::string is passed.
+ *
+ * For textIndexVersion=TEXT_INDEX_VERSION_2, language strings are
+ * case-insensitive, and need to be in one of the two following forms:
+ * - English name, like "spanish".
+ * - Two-letter code, like "es".
+ *
+ * For textIndexVersion=TEXT_INDEX_VERSION_1, no validation or normalization of
+ * language strings is performed. This is necessary to preserve indexing behavior for
+ * documents with language strings like "en": for compatibility, text data in these
+ * documents needs to be processed with the English stemmer and the empty stopword list
+ * (since "en" is recognized by Snowball but not the stopword processing logic).
+ */
+ static StatusWith<const FTSLanguage*> make(StringData langName,
+ TextIndexVersion textIndexVersion);
+
+private:
+ // std::string representation of language in canonical form.
+ std::string _canonicalName;
+};
+
+typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage;
+
+
+class BasicFTSLanguage : public FTSLanguage {
+public:
+ std::unique_ptr<FTSTokenizer> createTokenizer() const override;
+};
+
+extern BasicFTSLanguage languagePorterV1;
+extern BasicFTSLanguage languageEnglishV2;
+extern BasicFTSLanguage languageFrenchV2;
+}
}
diff --git a/src/mongo/db/fts/fts_language_test.cpp b/src/mongo/db/fts/fts_language_test.cpp
index 0fb46ef2df7..c24f02ff7fd 100644
--- a/src/mongo/db/fts/fts_language_test.cpp
+++ b/src/mongo/db/fts/fts_language_test.cpp
@@ -35,103 +35,102 @@
namespace mongo {
- namespace fts {
-
- // Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2.
-
- TEST( FTSLanguageV2, ExactLanguage ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "spanish", TEXT_INDEX_VERSION_2 );
- ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue()->str(), "spanish" );
- }
-
- TEST( FTSLanguageV2, ExactCode ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "es", TEXT_INDEX_VERSION_2 );
- ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue()->str(), "spanish" );
- }
-
- TEST( FTSLanguageV2, UpperCaseLanguage ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "SPANISH", TEXT_INDEX_VERSION_2 );
- ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue()->str(), "spanish" );
- }
-
- TEST( FTSLanguageV2, UpperCaseCode ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "ES", TEXT_INDEX_VERSION_2 );
- ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue()->str(), "spanish" );
- }
-
- TEST( FTSLanguageV2, NoneLanguage ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "none", TEXT_INDEX_VERSION_2 );
- ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue()->str(), "none" );
- }
-
- // Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2.
-
- TEST( FTSLanguageV2, Unknown ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "spanglish", TEXT_INDEX_VERSION_2 );
- ASSERT( !swl.getStatus().isOK() );
- }
-
- TEST( FTSLanguageV2, Empty ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "", TEXT_INDEX_VERSION_2 );
- ASSERT( !swl.getStatus().isOK() );
- }
-
- // Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1.
-
- TEST( FTSLanguageV1, ExactLanguage ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "spanish", TEXT_INDEX_VERSION_1 );
- ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue()->str(), "spanish" );
- }
-
- TEST( FTSLanguageV1, DeprecatedLanguage ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "porter", TEXT_INDEX_VERSION_1 );
- ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue()->str(), "porter" );
- }
-
- TEST( FTSLanguageV1, StemmerOnlyLanguage1 ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_1 );
- ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue()->str(), "en" );
- }
-
- TEST( FTSLanguageV1, StemmerOnlyLanguage2 ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "eng", TEXT_INDEX_VERSION_1 );
- ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue()->str(), "eng" );
- }
-
- TEST( FTSLanguageV1, NoneLanguage ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "none", TEXT_INDEX_VERSION_1 );
- ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue()->str(), "none" );
- }
-
- // Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1.
-
- TEST( FTSLanguageV1, CaseSensitive ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "SPANISH", TEXT_INDEX_VERSION_1 );
- ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue()->str(), "none" );
- }
-
- TEST( FTSLanguageV1, Unknown ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "asdf", TEXT_INDEX_VERSION_1 );
- ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue()->str(), "none" );
- }
-
- TEST( FTSLanguageV1, Empty ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( "", TEXT_INDEX_VERSION_1 );
- ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue()->str(), "none" );
- }
-
- }
+namespace fts {
+
+// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2.
+
+TEST(FTSLanguageV2, ExactLanguage) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_2);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "spanish");
+}
+
+TEST(FTSLanguageV2, ExactCode) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("es", TEXT_INDEX_VERSION_2);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "spanish");
+}
+
+TEST(FTSLanguageV2, UpperCaseLanguage) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_2);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "spanish");
+}
+
+TEST(FTSLanguageV2, UpperCaseCode) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("ES", TEXT_INDEX_VERSION_2);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "spanish");
+}
+
+TEST(FTSLanguageV2, NoneLanguage) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_2);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "none");
+}
+
+// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2.
+
+TEST(FTSLanguageV2, Unknown) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("spanglish", TEXT_INDEX_VERSION_2);
+ ASSERT(!swl.getStatus().isOK());
+}
+
+TEST(FTSLanguageV2, Empty) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_2);
+ ASSERT(!swl.getStatus().isOK());
+}
+
+// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1.
+
+TEST(FTSLanguageV1, ExactLanguage) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_1);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "spanish");
+}
+
+TEST(FTSLanguageV1, DeprecatedLanguage) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("porter", TEXT_INDEX_VERSION_1);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "porter");
+}
+
+TEST(FTSLanguageV1, StemmerOnlyLanguage1) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("en", TEXT_INDEX_VERSION_1);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "en");
+}
+
+TEST(FTSLanguageV1, StemmerOnlyLanguage2) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("eng", TEXT_INDEX_VERSION_1);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "eng");
+}
+
+TEST(FTSLanguageV1, NoneLanguage) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_1);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "none");
+}
+
+// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1.
+
+TEST(FTSLanguageV1, CaseSensitive) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_1);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "none");
+}
+
+TEST(FTSLanguageV1, Unknown) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("asdf", TEXT_INDEX_VERSION_1);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "none");
+}
+
+TEST(FTSLanguageV1, Empty) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_1);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "none");
+}
+}
}
diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp
index c2aa234cd51..544ef93cf36 100644
--- a/src/mongo/db/fts/fts_matcher.cpp
+++ b/src/mongo/db/fts/fts_matcher.cpp
@@ -37,144 +37,138 @@
namespace mongo {
- namespace fts {
-
- using std::string;
-
- /**
- * Does the string 'phrase' occur in the string 'haystack'? Match is case-insensitive if
- * 'caseSensitive' is false; otherwise, an exact substring match is performed.
- */
- static bool phraseMatches( const string& phrase,
- const string& haystack,
- bool caseSensitive ) {
- if ( caseSensitive ) {
- return haystack.find( phrase ) != string::npos;
- }
- return strcasestr( haystack.c_str(), phrase.c_str() ) != NULL;
- }
-
- FTSMatcher::FTSMatcher( const FTSQuery& query, const FTSSpec& spec )
- : _query( query ),
- _spec( spec ) {
- }
+namespace fts {
- bool FTSMatcher::matches( const BSONObj& obj ) const {
- if ( canSkipPositiveTermCheck() ) {
- // We can assume that 'obj' has at least one positive term, and dassert as a sanity
- // check.
- dassert( hasPositiveTerm( obj ) );
- }
- else {
- if ( !hasPositiveTerm( obj ) ) {
- return false;
- }
- }
-
- if ( hasNegativeTerm( obj ) ) {
- return false;
- }
-
- if ( !positivePhrasesMatch( obj ) ) {
- return false;
- }
-
- return negativePhrasesMatch( obj );
- }
+using std::string;
- bool FTSMatcher::hasPositiveTerm( const BSONObj& obj ) const {
- FTSElementIterator it( _spec, obj );
+/**
+ * Does the string 'phrase' occur in the string 'haystack'? Match is case-insensitive if
+ * 'caseSensitive' is false; otherwise, an exact substring match is performed.
+ */
+static bool phraseMatches(const string& phrase, const string& haystack, bool caseSensitive) {
+ if (caseSensitive) {
+ return haystack.find(phrase) != string::npos;
+ }
+ return strcasestr(haystack.c_str(), phrase.c_str()) != NULL;
+}
- while ( it.more() ) {
- FTSIteratorValue val = it.next();
- if ( _hasPositiveTerm_string( val._language, val._text ) ) {
- return true;
- }
- }
+FTSMatcher::FTSMatcher(const FTSQuery& query, const FTSSpec& spec) : _query(query), _spec(spec) {}
+bool FTSMatcher::matches(const BSONObj& obj) const {
+ if (canSkipPositiveTermCheck()) {
+ // We can assume that 'obj' has at least one positive term, and dassert as a sanity
+ // check.
+ dassert(hasPositiveTerm(obj));
+ } else {
+ if (!hasPositiveTerm(obj)) {
return false;
}
+ }
- bool FTSMatcher::_hasPositiveTerm_string( const FTSLanguage* language,
- const string& raw ) const {
- std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
+ if (hasNegativeTerm(obj)) {
+ return false;
+ }
- tokenizer->reset(raw.c_str(), _query.getCaseSensitive() ?
- FTSTokenizer::GenerateCaseSensitiveTokens : FTSTokenizer::None);
+ if (!positivePhrasesMatch(obj)) {
+ return false;
+ }
- while (tokenizer->moveNext()) {
- string word = tokenizer->get().toString();
- if (_query.getPositiveTerms().count(word) > 0) {
- return true;
- }
- }
- return false;
+ return negativePhrasesMatch(obj);
+}
+
+bool FTSMatcher::hasPositiveTerm(const BSONObj& obj) const {
+ FTSElementIterator it(_spec, obj);
+
+ while (it.more()) {
+ FTSIteratorValue val = it.next();
+ if (_hasPositiveTerm_string(val._language, val._text)) {
+ return true;
}
+ }
- bool FTSMatcher::hasNegativeTerm( const BSONObj& obj ) const {
- if ( _query.getNegatedTerms().size() == 0 ) {
- return false;
- }
+ return false;
+}
- FTSElementIterator it( _spec, obj );
+bool FTSMatcher::_hasPositiveTerm_string(const FTSLanguage* language, const string& raw) const {
+ std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
- while ( it.more() ) {
- FTSIteratorValue val = it.next();
- if ( _hasNegativeTerm_string( val._language, val._text ) ) {
- return true;
- }
- }
+ tokenizer->reset(raw.c_str(),
+ _query.getCaseSensitive() ? FTSTokenizer::GenerateCaseSensitiveTokens
+ : FTSTokenizer::None);
- return false;
+ while (tokenizer->moveNext()) {
+ string word = tokenizer->get().toString();
+ if (_query.getPositiveTerms().count(word) > 0) {
+ return true;
}
+ }
+ return false;
+}
- bool FTSMatcher::_hasNegativeTerm_string( const FTSLanguage* language,
- const string& raw ) const {
- std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
+bool FTSMatcher::hasNegativeTerm(const BSONObj& obj) const {
+ if (_query.getNegatedTerms().size() == 0) {
+ return false;
+ }
- tokenizer->reset(raw.c_str(), _query.getCaseSensitive() ?
- FTSTokenizer::GenerateCaseSensitiveTokens : FTSTokenizer::None);
+ FTSElementIterator it(_spec, obj);
- while (tokenizer->moveNext()) {
- string word = tokenizer->get().toString();
- if ( _query.getNegatedTerms().count( word ) > 0 ) {
- return true;
- }
- }
- return false;
+ while (it.more()) {
+ FTSIteratorValue val = it.next();
+ if (_hasNegativeTerm_string(val._language, val._text)) {
+ return true;
}
+ }
- bool FTSMatcher::positivePhrasesMatch( const BSONObj& obj ) const {
- for ( size_t i = 0; i < _query.getPositivePhr().size(); i++ ) {
- if ( !_phraseMatch( _query.getPositivePhr()[i], obj ) ) {
- return false;
- }
- }
+ return false;
+}
- return true;
- }
+bool FTSMatcher::_hasNegativeTerm_string(const FTSLanguage* language, const string& raw) const {
+ std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
- bool FTSMatcher::negativePhrasesMatch( const BSONObj& obj ) const {
- for ( size_t i = 0; i < _query.getNegatedPhr().size(); i++ ) {
- if ( _phraseMatch( _query.getNegatedPhr()[i], obj ) ) {
- return false;
- }
- }
+ tokenizer->reset(raw.c_str(),
+ _query.getCaseSensitive() ? FTSTokenizer::GenerateCaseSensitiveTokens
+ : FTSTokenizer::None);
+ while (tokenizer->moveNext()) {
+ string word = tokenizer->get().toString();
+ if (_query.getNegatedTerms().count(word) > 0) {
return true;
}
+ }
+ return false;
+}
- bool FTSMatcher::_phraseMatch( const string& phrase, const BSONObj& obj ) const {
- FTSElementIterator it( _spec, obj );
+bool FTSMatcher::positivePhrasesMatch(const BSONObj& obj) const {
+ for (size_t i = 0; i < _query.getPositivePhr().size(); i++) {
+ if (!_phraseMatch(_query.getPositivePhr()[i], obj)) {
+ return false;
+ }
+ }
- while ( it.more() ) {
- FTSIteratorValue val = it.next();
- if ( phraseMatches( phrase, val._text, _query.getCaseSensitive() ) ) {
- return true;
- }
- }
+ return true;
+}
+bool FTSMatcher::negativePhrasesMatch(const BSONObj& obj) const {
+ for (size_t i = 0; i < _query.getNegatedPhr().size(); i++) {
+ if (_phraseMatch(_query.getNegatedPhr()[i], obj)) {
return false;
}
}
+
+ return true;
+}
+
+bool FTSMatcher::_phraseMatch(const string& phrase, const BSONObj& obj) const {
+ FTSElementIterator it(_spec, obj);
+
+ while (it.more()) {
+ FTSIteratorValue val = it.next();
+ if (phraseMatches(phrase, val._text, _query.getCaseSensitive())) {
+ return true;
+ }
+ }
+
+ return false;
+}
+}
}
diff --git a/src/mongo/db/fts/fts_matcher.h b/src/mongo/db/fts/fts_matcher.h
index 058dcc7bcb6..00fe8291c4d 100644
--- a/src/mongo/db/fts/fts_matcher.h
+++ b/src/mongo/db/fts/fts_matcher.h
@@ -36,74 +36,74 @@
namespace mongo {
- namespace fts {
-
- class FTSMatcher {
- MONGO_DISALLOW_COPYING( FTSMatcher );
- public:
- FTSMatcher( const FTSQuery& query, const FTSSpec& spec );
-
- /**
- * Returns whether 'obj' matches the query. An object is considered to match the query
- * if all four of the following conditions hold:
- * 1) The object contains at least one positive term.
- * 2) The object contains zero negative terms.
- * 3) The object contains all positive phrases.
- * 4) The object contains zero negative phrases.
- */
- bool matches( const BSONObj& obj ) const;
-
- /**
- * Returns whether 'obj' contains at least one positive term.
- */
- bool hasPositiveTerm( const BSONObj& obj ) const;
-
- /**
- * Returns whether 'obj' contains at least one negative term.
- */
- bool hasNegativeTerm( const BSONObj& obj ) const;
-
- /**
- * Returns whether 'obj' contains all positive phrases.
- */
- bool positivePhrasesMatch( const BSONObj& obj ) const;
-
- /**
- * Returns whether 'obj' contains zero negative phrases.
- */
- bool negativePhrasesMatch( const BSONObj& obj ) const;
-
- private:
- /**
- * For matching, can we skip the positive term check? This is done as optimization when
- * we have a-priori knowledge that all documents being matched pass the positive term
- * check.
- */
- bool canSkipPositiveTermCheck() const { return !_query.getCaseSensitive(); }
-
- /**
- * Returns whether the string 'raw' contains any positive terms from the query.
- * 'language' specifies the language for 'raw'.
- */
- bool _hasPositiveTerm_string( const FTSLanguage* language,
- const std::string& raw ) const;
-
- /**
- * Returns whether the string 'raw' contains any negative terms from the query.
- * 'language' specifies the language for 'raw'.
- */
- bool _hasNegativeTerm_string( const FTSLanguage* language,
- const std::string& raw ) const;
-
- /**
- * Returns whether 'obj' contains the exact string 'phrase' in any indexed fields.
- */
- bool _phraseMatch( const std::string& phrase, const BSONObj& obj ) const;
-
- // TODO These should be unowned pointers instead of owned copies.
- const FTSQuery _query;
- const FTSSpec _spec;
- };
-
+namespace fts {
+
+class FTSMatcher {
+ MONGO_DISALLOW_COPYING(FTSMatcher);
+
+public:
+ FTSMatcher(const FTSQuery& query, const FTSSpec& spec);
+
+ /**
+ * Returns whether 'obj' matches the query. An object is considered to match the query
+ * if all four of the following conditions hold:
+ * 1) The object contains at least one positive term.
+ * 2) The object contains zero negative terms.
+ * 3) The object contains all positive phrases.
+ * 4) The object contains zero negative phrases.
+ */
+ bool matches(const BSONObj& obj) const;
+
+ /**
+ * Returns whether 'obj' contains at least one positive term.
+ */
+ bool hasPositiveTerm(const BSONObj& obj) const;
+
+ /**
+ * Returns whether 'obj' contains at least one negative term.
+ */
+ bool hasNegativeTerm(const BSONObj& obj) const;
+
+ /**
+ * Returns whether 'obj' contains all positive phrases.
+ */
+ bool positivePhrasesMatch(const BSONObj& obj) const;
+
+ /**
+ * Returns whether 'obj' contains zero negative phrases.
+ */
+ bool negativePhrasesMatch(const BSONObj& obj) const;
+
+private:
+ /**
+ * For matching, can we skip the positive term check? This is done as optimization when
+ * we have a-priori knowledge that all documents being matched pass the positive term
+ * check.
+ */
+ bool canSkipPositiveTermCheck() const {
+ return !_query.getCaseSensitive();
}
+
+ /**
+ * Returns whether the string 'raw' contains any positive terms from the query.
+ * 'language' specifies the language for 'raw'.
+ */
+ bool _hasPositiveTerm_string(const FTSLanguage* language, const std::string& raw) const;
+
+ /**
+ * Returns whether the string 'raw' contains any negative terms from the query.
+ * 'language' specifies the language for 'raw'.
+ */
+ bool _hasNegativeTerm_string(const FTSLanguage* language, const std::string& raw) const;
+
+ /**
+ * Returns whether 'obj' contains the exact string 'phrase' in any indexed fields.
+ */
+ bool _phraseMatch(const std::string& phrase, const BSONObj& obj) const;
+
+ // TODO These should be unowned pointers instead of owned copies.
+ const FTSQuery _query;
+ const FTSSpec _spec;
+};
+}
}
diff --git a/src/mongo/db/fts/fts_matcher_test.cpp b/src/mongo/db/fts/fts_matcher_test.cpp
index 0ea0fbe9e7e..13eb74609dc 100644
--- a/src/mongo/db/fts/fts_matcher_test.cpp
+++ b/src/mongo/db/fts/fts_matcher_test.cpp
@@ -34,187 +34,204 @@
#include "mongo/unittest/unittest.h"
namespace mongo {
- namespace fts {
-
- TEST( FTSMatcher, NegWild1 ) {
- FTSQuery q;
- ASSERT_OK( q.parse( "foo -bar", "english", false, TEXT_INDEX_VERSION_2 ) );
- FTSMatcher m( q,
- FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "text" ) ) ) ) );
-
- ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) );
- ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) );
- }
-
- // Regression test for SERVER-11994.
- TEST( FTSMatcher, NegWild2 ) {
- FTSQuery q;
- ASSERT_OK( q.parse( "pizza -restaurant", "english", false, TEXT_INDEX_VERSION_2 ) );
- FTSMatcher m( q,
- FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "text" ) ) ) ) );
-
- ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "pizza restaurant" ) ) ) );
- ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "PIZZA RESTAURANT" ) ) ) );
- }
-
- TEST( FTSMatcher, Phrase1 ) {
- FTSQuery q;
- ASSERT_OK( q.parse( "foo \"table top\"", "english", false, TEXT_INDEX_VERSION_2 ) );
- FTSMatcher m( q,
- FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "text" ) ) ) ) );
-
- ASSERT( m.positivePhrasesMatch( BSON( "x" << "table top" ) ) );
- ASSERT( m.positivePhrasesMatch( BSON( "x" << " asd table top asd" ) ) );
- ASSERT( !m.positivePhrasesMatch( BSON( "x" << "tablz top" ) ) );
- ASSERT( !m.positivePhrasesMatch( BSON( "x" << " asd tablz top asd" ) ) );
-
- ASSERT( m.positivePhrasesMatch( BSON( "x" << "table top" ) ) );
- ASSERT( !m.positivePhrasesMatch( BSON( "x" << "table a top" ) ) );
-
- }
-
- TEST( FTSMatcher, Phrase2 ) {
- FTSQuery q;
- ASSERT_OK( q.parse( "foo \"table top\"", "english", false, TEXT_INDEX_VERSION_2 ) );
- FTSMatcher m( q,
- FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) );
- ASSERT( m.positivePhrasesMatch( BSON( "x" << BSON_ARRAY( "table top" ) ) ) );
- }
-
- // Test that the matcher parses the document with the document language, not the search
- // language.
- TEST( FTSMatcher, ParsesUsingDocLanguage ) {
- FTSQuery q;
- ASSERT_OK( q.parse( "-glad", "none", false, TEXT_INDEX_VERSION_2 ) );
- FTSMatcher m( q,
- FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) );
-
- // Even though the search language is "none", the document {x: "gladly"} should be
- // parsed using the English stemmer, and as such should match the negated term "glad".
- ASSERT( m.hasNegativeTerm( BSON( "x" << "gladly" ) ) );
- }
-
- // Test the matcher does not filter out stop words from positive terms
- TEST( FTSMatcher, MatcherDoesNotFilterStopWordsNeg ) {
- FTSQuery q;
- ASSERT_OK( q.parse( "-the", "none", false, TEXT_INDEX_VERSION_2 ) );
- FTSMatcher m( q,
- FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) );
-
- ASSERT( m.hasNegativeTerm( BSON( "x" << "the" ) ) );
- }
-
- // Test the matcher does not filter out stop words from negative terms
- TEST( FTSMatcher, MatcherDoesNotFilterStopWordsPos ) {
- FTSQuery q;
- ASSERT_OK( q.parse( "the", "none", false, TEXT_INDEX_VERSION_2 ) );
- FTSMatcher m( q,
- FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) );
-
- ASSERT( m.hasPositiveTerm( BSON( "x" << "the" ) ) );
- }
-
- // Returns whether a document indexed with text data 'doc' contains any positive terms from
- // case-sensitive text query 'search'.
- static bool docHasPositiveTermWithCase( const std::string& doc,
- const std::string& search ) {
- FTSQuery q;
- ASSERT_OK( q.parse( search, "english", true, TEXT_INDEX_VERSION_2 ) );
- FTSMatcher m( q,
- FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) );
-
- return m.hasPositiveTerm( BSON( "x" << doc ) );
- }
-
- TEST( FTSMatcher, HasPositiveTermCaseSensitive ) {
- ASSERT_TRUE( docHasPositiveTermWithCase( "hello world", "hello" ) );
- ASSERT_TRUE( docHasPositiveTermWithCase( "Hello World", "Hello" ) );
- ASSERT_TRUE( docHasPositiveTermWithCase( "Hello World", "World Hello" ) );
- ASSERT_TRUE( docHasPositiveTermWithCase( "Hello World", "World GoodBye" ) );
- ASSERT_TRUE( docHasPositiveTermWithCase( "John Runs", "Runs" ) );
- ASSERT_TRUE( docHasPositiveTermWithCase( "John Runs", "Running" ) );
- ASSERT_TRUE( docHasPositiveTermWithCase( "John Runs", "Run" ) );
-
- ASSERT_FALSE( docHasPositiveTermWithCase( "John Runs", "run" ) );
- ASSERT_FALSE( docHasPositiveTermWithCase( "Hello World", "HELLO" ) );
- ASSERT_FALSE( docHasPositiveTermWithCase( "hello world", "Hello" ) );
- ASSERT_FALSE( docHasPositiveTermWithCase( "Hello World", "hello" ) );
- }
-
- // Returns whether a document indexed with text data 'doc' contains any negative terms from
- // case-sensitive text query 'search'.
- static bool docHasNegativeTermWithCase( const std::string& doc,
- const std::string& search ) {
- FTSQuery q;
- ASSERT_OK( q.parse( search, "english", true, TEXT_INDEX_VERSION_2 ) );
- FTSMatcher m( q,
- FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) );
-
- return m.hasNegativeTerm( BSON( "x" << doc ) );
- }
-
- TEST( FTSMatcher, HasNegativeTermCaseSensitive ) {
- ASSERT_TRUE( docHasNegativeTermWithCase( "hello world", "hello -world" ) );
- ASSERT_TRUE( docHasNegativeTermWithCase( "Hello World", "Hello -World" ) );
- ASSERT_TRUE( docHasNegativeTermWithCase( "Hello World", "-World -Hello" ) );
- ASSERT_TRUE( docHasNegativeTermWithCase( "Hello World", "-Goodbye -World" ) );
- ASSERT_TRUE( docHasNegativeTermWithCase( "John Runs", "-Runs" ) );
- ASSERT_TRUE( docHasNegativeTermWithCase( "John Runs", "-Running" ) );
- ASSERT_TRUE( docHasNegativeTermWithCase( "John Runs", "-Run" ) );
-
- ASSERT_FALSE( docHasNegativeTermWithCase( "John Runs", "-run" ) );
- ASSERT_FALSE( docHasNegativeTermWithCase( "Hello World", "Hello -WORLD" ) );
- ASSERT_FALSE( docHasNegativeTermWithCase( "hello world", "hello -World" ) );
- ASSERT_FALSE( docHasNegativeTermWithCase( "Hello World", "Hello -world" ) );
- }
-
- // Returns whether a document indexed with text data 'doc' contains all positive phrases
- // from case-sensitive text query 'search'.
- static bool docPositivePhrasesMatchWithCase( const std::string& doc,
- const std::string& search ) {
- FTSQuery q;
- ASSERT_OK( q.parse( search, "english", true, TEXT_INDEX_VERSION_2 ) );
- FTSMatcher m( q,
- FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) );
-
- return m.positivePhrasesMatch( BSON( "x" << doc ) );
- }
-
- TEST( FTSMatcher, PositivePhrasesMatchWithCase ) {
- ASSERT_TRUE( docPositivePhrasesMatchWithCase( "John Runs", "\"John Runs\"" ) );
- ASSERT_TRUE( docPositivePhrasesMatchWithCase( "John Runs", "\"John Run\"" ) );
- ASSERT_TRUE( docPositivePhrasesMatchWithCase( "John Runs", "\"John\" \"Run\"" ) );
- ASSERT_TRUE( docPositivePhrasesMatchWithCase( "John Runs", "\"n R\"" ) );
-
- ASSERT_FALSE( docPositivePhrasesMatchWithCase( "John Runs", "\"john runs\"" ) );
- ASSERT_FALSE( docPositivePhrasesMatchWithCase( "john runs", "\"John Runs\"" ) );
- ASSERT_FALSE( docPositivePhrasesMatchWithCase( "John Runs", "\"John\" \"Running\"" ) );
- }
-
- // Returns whether a document indexed with text data 'doc' contains zero negative phrases
- // from case-sensitive text query 'search'.
- static bool docNegativePhrasesMatchWithCase( const std::string& doc,
- const std::string& search ) {
- FTSQuery q;
- ASSERT_OK( q.parse( search, "english", true, TEXT_INDEX_VERSION_2 ) );
- FTSMatcher m( q,
- FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) );
-
- return m.negativePhrasesMatch( BSON( "x" << doc ) );
- }
-
- TEST( FTSMatcher, NegativePhrasesMatchWithCase ) {
- ASSERT_TRUE( docNegativePhrasesMatchWithCase( "John Runs", "-\"john runs\"" ) );
- ASSERT_TRUE( docNegativePhrasesMatchWithCase( "john runs", "-\"John Runs\"" ) );
- ASSERT_TRUE( docNegativePhrasesMatchWithCase( "john runs", "-\"John\" -\"Runs\"" ) );
-
- ASSERT_FALSE( docNegativePhrasesMatchWithCase( "John Runs", "-\"John Runs\"" ) );
- ASSERT_FALSE( docNegativePhrasesMatchWithCase( "John Runs", "-\"John Run\"" ) );
- ASSERT_FALSE( docNegativePhrasesMatchWithCase( "John Runs", "-\"John\" -\"Run\"" ) );
- ASSERT_FALSE( docNegativePhrasesMatchWithCase( "John Runs", "-\"n R\"" ) );
- ASSERT_FALSE( docNegativePhrasesMatchWithCase( "John Runs",
- "-\"John\" -\"Running\"" ) );
- }
-
- }
+namespace fts {
+
+TEST(FTSMatcher, NegWild1) {
+ FTSQuery q;
+ ASSERT_OK(q.parse("foo -bar", "english", false, TEXT_INDEX_VERSION_2));
+ FTSMatcher m(q,
+ FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**"
+ << "text")))));
+
+ ASSERT(m.hasNegativeTerm(BSON("x" << BSON("y"
+ << "bar"))));
+ ASSERT(m.hasNegativeTerm(BSON("x" << BSON("y"
+ << "bar"))));
+}
+
+// Regression test for SERVER-11994.
+TEST(FTSMatcher, NegWild2) {
+ FTSQuery q;
+ ASSERT_OK(q.parse("pizza -restaurant", "english", false, TEXT_INDEX_VERSION_2));
+ FTSMatcher m(q,
+ FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**"
+ << "text")))));
+
+ ASSERT(m.hasNegativeTerm(BSON("x" << BSON("y"
+ << "pizza restaurant"))));
+ ASSERT(m.hasNegativeTerm(BSON("x" << BSON("y"
+ << "PIZZA RESTAURANT"))));
+}
+
+TEST(FTSMatcher, Phrase1) {
+ FTSQuery q;
+ ASSERT_OK(q.parse("foo \"table top\"", "english", false, TEXT_INDEX_VERSION_2));
+ FTSMatcher m(q,
+ FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**"
+ << "text")))));
+
+ ASSERT(m.positivePhrasesMatch(BSON("x"
+ << "table top")));
+ ASSERT(m.positivePhrasesMatch(BSON("x"
+ << " asd table top asd")));
+ ASSERT(!m.positivePhrasesMatch(BSON("x"
+ << "tablz top")));
+ ASSERT(!m.positivePhrasesMatch(BSON("x"
+ << " asd tablz top asd")));
+
+ ASSERT(m.positivePhrasesMatch(BSON("x"
+ << "table top")));
+ ASSERT(!m.positivePhrasesMatch(BSON("x"
+ << "table a top")));
+}
+
+TEST(FTSMatcher, Phrase2) {
+ FTSQuery q;
+ ASSERT_OK(q.parse("foo \"table top\"", "english", false, TEXT_INDEX_VERSION_2));
+ FTSMatcher m(q,
+ FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
+ << "text")))));
+ ASSERT(m.positivePhrasesMatch(BSON("x" << BSON_ARRAY("table top"))));
+}
+
+// Test that the matcher parses the document with the document language, not the search
+// language.
+TEST(FTSMatcher, ParsesUsingDocLanguage) {
+ FTSQuery q;
+ ASSERT_OK(q.parse("-glad", "none", false, TEXT_INDEX_VERSION_2));
+ FTSMatcher m(q,
+ FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
+ << "text")))));
+
+ // Even though the search language is "none", the document {x: "gladly"} should be
+ // parsed using the English stemmer, and as such should match the negated term "glad".
+ ASSERT(m.hasNegativeTerm(BSON("x"
+ << "gladly")));
+}
+
+// Test the matcher does not filter out stop words from negative terms
+TEST(FTSMatcher, MatcherDoesNotFilterStopWordsNeg) {
+ FTSQuery q;
+ ASSERT_OK(q.parse("-the", "none", false, TEXT_INDEX_VERSION_2));
+ FTSMatcher m(q,
+ FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
+ << "text")))));
+
+ ASSERT(m.hasNegativeTerm(BSON("x"
+ << "the")));
+}
+
+// Test the matcher does not filter out stop words from positive terms
+TEST(FTSMatcher, MatcherDoesNotFilterStopWordsPos) {
+ FTSQuery q;
+ ASSERT_OK(q.parse("the", "none", false, TEXT_INDEX_VERSION_2));
+ FTSMatcher m(q,
+ FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
+ << "text")))));
+
+ ASSERT(m.hasPositiveTerm(BSON("x"
+ << "the")));
+}
+
+// Returns whether a document indexed with text data 'doc' contains any positive terms from
+// case-sensitive text query 'search'.
+static bool docHasPositiveTermWithCase(const std::string& doc, const std::string& search) {
+ FTSQuery q;
+ ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2));
+ FTSMatcher m(q,
+ FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
+ << "text")))));
+
+ return m.hasPositiveTerm(BSON("x" << doc));
+}
+
+TEST(FTSMatcher, HasPositiveTermCaseSensitive) {
+ ASSERT_TRUE(docHasPositiveTermWithCase("hello world", "hello"));
+ ASSERT_TRUE(docHasPositiveTermWithCase("Hello World", "Hello"));
+ ASSERT_TRUE(docHasPositiveTermWithCase("Hello World", "World Hello"));
+ ASSERT_TRUE(docHasPositiveTermWithCase("Hello World", "World GoodBye"));
+ ASSERT_TRUE(docHasPositiveTermWithCase("John Runs", "Runs"));
+ ASSERT_TRUE(docHasPositiveTermWithCase("John Runs", "Running"));
+ ASSERT_TRUE(docHasPositiveTermWithCase("John Runs", "Run"));
+
+ ASSERT_FALSE(docHasPositiveTermWithCase("John Runs", "run"));
+ ASSERT_FALSE(docHasPositiveTermWithCase("Hello World", "HELLO"));
+ ASSERT_FALSE(docHasPositiveTermWithCase("hello world", "Hello"));
+ ASSERT_FALSE(docHasPositiveTermWithCase("Hello World", "hello"));
+}
+
+// Returns whether a document indexed with text data 'doc' contains any negative terms from
+// case-sensitive text query 'search'.
+static bool docHasNegativeTermWithCase(const std::string& doc, const std::string& search) {
+ FTSQuery q;
+ ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2));
+ FTSMatcher m(q,
+ FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
+ << "text")))));
+
+ return m.hasNegativeTerm(BSON("x" << doc));
+}
+
+TEST(FTSMatcher, HasNegativeTermCaseSensitive) {
+ ASSERT_TRUE(docHasNegativeTermWithCase("hello world", "hello -world"));
+ ASSERT_TRUE(docHasNegativeTermWithCase("Hello World", "Hello -World"));
+ ASSERT_TRUE(docHasNegativeTermWithCase("Hello World", "-World -Hello"));
+ ASSERT_TRUE(docHasNegativeTermWithCase("Hello World", "-Goodbye -World"));
+ ASSERT_TRUE(docHasNegativeTermWithCase("John Runs", "-Runs"));
+ ASSERT_TRUE(docHasNegativeTermWithCase("John Runs", "-Running"));
+ ASSERT_TRUE(docHasNegativeTermWithCase("John Runs", "-Run"));
+
+ ASSERT_FALSE(docHasNegativeTermWithCase("John Runs", "-run"));
+ ASSERT_FALSE(docHasNegativeTermWithCase("Hello World", "Hello -WORLD"));
+ ASSERT_FALSE(docHasNegativeTermWithCase("hello world", "hello -World"));
+ ASSERT_FALSE(docHasNegativeTermWithCase("Hello World", "Hello -world"));
+}
+
+// Returns whether a document indexed with text data 'doc' contains all positive phrases
+// from case-sensitive text query 'search'.
+static bool docPositivePhrasesMatchWithCase(const std::string& doc, const std::string& search) {
+ FTSQuery q;
+ ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2));
+ FTSMatcher m(q,
+ FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
+ << "text")))));
+
+ return m.positivePhrasesMatch(BSON("x" << doc));
+}
+
+TEST(FTSMatcher, PositivePhrasesMatchWithCase) {
+ ASSERT_TRUE(docPositivePhrasesMatchWithCase("John Runs", "\"John Runs\""));
+ ASSERT_TRUE(docPositivePhrasesMatchWithCase("John Runs", "\"John Run\""));
+ ASSERT_TRUE(docPositivePhrasesMatchWithCase("John Runs", "\"John\" \"Run\""));
+ ASSERT_TRUE(docPositivePhrasesMatchWithCase("John Runs", "\"n R\""));
+
+ ASSERT_FALSE(docPositivePhrasesMatchWithCase("John Runs", "\"john runs\""));
+ ASSERT_FALSE(docPositivePhrasesMatchWithCase("john runs", "\"John Runs\""));
+ ASSERT_FALSE(docPositivePhrasesMatchWithCase("John Runs", "\"John\" \"Running\""));
+}
+
+// Returns whether a document indexed with text data 'doc' contains zero negative phrases
+// from case-sensitive text query 'search'.
+static bool docNegativePhrasesMatchWithCase(const std::string& doc, const std::string& search) {
+ FTSQuery q;
+ ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2));
+ FTSMatcher m(q,
+ FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
+ << "text")))));
+
+ return m.negativePhrasesMatch(BSON("x" << doc));
+}
+
+TEST(FTSMatcher, NegativePhrasesMatchWithCase) {
+ ASSERT_TRUE(docNegativePhrasesMatchWithCase("John Runs", "-\"john runs\""));
+ ASSERT_TRUE(docNegativePhrasesMatchWithCase("john runs", "-\"John Runs\""));
+ ASSERT_TRUE(docNegativePhrasesMatchWithCase("john runs", "-\"John\" -\"Runs\""));
+
+ ASSERT_FALSE(docNegativePhrasesMatchWithCase("John Runs", "-\"John Runs\""));
+ ASSERT_FALSE(docNegativePhrasesMatchWithCase("John Runs", "-\"John Run\""));
+ ASSERT_FALSE(docNegativePhrasesMatchWithCase("John Runs", "-\"John\" -\"Run\""));
+ ASSERT_FALSE(docNegativePhrasesMatchWithCase("John Runs", "-\"n R\""));
+ ASSERT_FALSE(docNegativePhrasesMatchWithCase("John Runs", "-\"John\" -\"Running\""));
+}
+}
}
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
index bbaac9b2f1e..8dec8e29204 100644
--- a/src/mongo/db/fts/fts_query.cpp
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -40,219 +40,208 @@
namespace mongo {
- namespace fts {
+namespace fts {
- using namespace mongoutils;
+using namespace mongoutils;
- using std::set;
- using std::string;
- using std::stringstream;
- using std::vector;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
- const bool FTSQuery::caseSensitiveDefault = false;
+const bool FTSQuery::caseSensitiveDefault = false;
- Status FTSQuery::parse(const string& query, StringData language, bool caseSensitive,
- TextIndexVersion textIndexVersion) {
- StatusWithFTSLanguage swl = FTSLanguage::make( language, textIndexVersion );
- if ( !swl.getStatus().isOK() ) {
- return swl.getStatus();
- }
- _language = swl.getValue();
- _caseSensitive = caseSensitive;
-
- // Build a space delimited list of words to have the FtsTokenizer tokenize
- string positiveTermSentence;
- string negativeTermSentence;
-
- bool inNegation = false;
- bool inPhrase = false;
-
- unsigned quoteOffset = 0;
-
- FTSQueryParser i(query);
- while ( i.more() ) {
- QueryToken t = i.next();
-
- if ( t.type == QueryToken::TEXT ) {
- string s = t.data.toString();
-
- if ( inPhrase && inNegation ) {
- // don't add term
- }
- else {
- if (inNegation) {
- negativeTermSentence.append(s);
- negativeTermSentence.push_back(' ');
- }
- else {
- positiveTermSentence.append(s);
- positiveTermSentence.push_back(' ');
- }
- }
-
- if ( inNegation && !inPhrase )
- inNegation = false;
+Status FTSQuery::parse(const string& query,
+ StringData language,
+ bool caseSensitive,
+ TextIndexVersion textIndexVersion) {
+ StatusWithFTSLanguage swl = FTSLanguage::make(language, textIndexVersion);
+ if (!swl.getStatus().isOK()) {
+ return swl.getStatus();
+ }
+ _language = swl.getValue();
+ _caseSensitive = caseSensitive;
+
+ // Build a space delimited list of words to have the FtsTokenizer tokenize
+ string positiveTermSentence;
+ string negativeTermSentence;
+
+ bool inNegation = false;
+ bool inPhrase = false;
+
+ unsigned quoteOffset = 0;
+
+ FTSQueryParser i(query);
+ while (i.more()) {
+ QueryToken t = i.next();
+
+ if (t.type == QueryToken::TEXT) {
+ string s = t.data.toString();
+
+ if (inPhrase && inNegation) {
+ // don't add term
+ } else {
+ if (inNegation) {
+ negativeTermSentence.append(s);
+ negativeTermSentence.push_back(' ');
+ } else {
+ positiveTermSentence.append(s);
+ positiveTermSentence.push_back(' ');
}
- else if ( t.type == QueryToken::DELIMITER ) {
- char c = t.data[0];
- if ( c == '-' ) {
- if ( !inPhrase && t.previousWhiteSpace ) {
- // phrases can be negated, and terms not in phrases can be negated.
- // terms in phrases can not be negated.
- inNegation = true;
- }
- }
- else if ( c == '"' ) {
- if ( inPhrase ) {
- // end of a phrase
- unsigned phraseStart = quoteOffset + 1;
- unsigned phraseLength = t.offset - phraseStart;
- StringData phrase = StringData( query ).substr( phraseStart,
- phraseLength );
- if ( inNegation )
- _negatedPhrases.push_back( normalizeString( phrase ) );
- else
- _positivePhrases.push_back( normalizeString( phrase ) );
- inNegation = false;
- inPhrase = false;
- }
- else {
- // start of a phrase
- inPhrase = true;
- quoteOffset = t.offset;
- }
- }
+ }
+
+ if (inNegation && !inPhrase)
+ inNegation = false;
+ } else if (t.type == QueryToken::DELIMITER) {
+ char c = t.data[0];
+ if (c == '-') {
+ if (!inPhrase && t.previousWhiteSpace) {
+ // phrases can be negated, and terms not in phrases can be negated.
+ // terms in phrases can not be negated.
+ inNegation = true;
}
- else {
- invariant( false );
+ } else if (c == '"') {
+ if (inPhrase) {
+ // end of a phrase
+ unsigned phraseStart = quoteOffset + 1;
+ unsigned phraseLength = t.offset - phraseStart;
+ StringData phrase = StringData(query).substr(phraseStart, phraseLength);
+ if (inNegation)
+ _negatedPhrases.push_back(normalizeString(phrase));
+ else
+ _positivePhrases.push_back(normalizeString(phrase));
+ inNegation = false;
+ inPhrase = false;
+ } else {
+ // start of a phrase
+ inPhrase = true;
+ quoteOffset = t.offset;
}
}
-
- std::unique_ptr<FTSTokenizer> tokenizer(_language->createTokenizer());
-
- _addTerms(tokenizer.get(), positiveTermSentence, false);
- _addTerms(tokenizer.get(), negativeTermSentence, true);
-
- return Status::OK();
+ } else {
+ invariant(false);
}
+ }
- void FTSQuery::_addTerms( FTSTokenizer* tokenizer,
- const string& sentence,
- bool negated ) {
-
- tokenizer->reset(sentence.c_str(), FTSTokenizer::FilterStopWords);
+ std::unique_ptr<FTSTokenizer> tokenizer(_language->createTokenizer());
- auto& activeTerms = negated ? _negatedTerms : _positiveTerms;
+ _addTerms(tokenizer.get(), positiveTermSentence, false);
+ _addTerms(tokenizer.get(), negativeTermSentence, true);
- // First, get all the terms for indexing, ie, lower cased words
- // If we are case-insensitive, we can also used this for positive, and negative terms
- // Some terms may be expanded into multiple words in some non-English languages
- while (tokenizer->moveNext()) {
+ return Status::OK();
+}
- string word = tokenizer->get().toString();
+void FTSQuery::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool negated) {
+ tokenizer->reset(sentence.c_str(), FTSTokenizer::FilterStopWords);
- if (!negated) {
- _termsForBounds.insert(word);
- }
+ auto& activeTerms = negated ? _negatedTerms : _positiveTerms;
- // Compute the string corresponding to 'token' that will be used for the matcher.
- // For case-insensitive queries, this is the same string as 'boundsTerm' computed
- // above.
- if (!_caseSensitive) {
- activeTerms.insert(word);
- }
- }
+        // First, get all the terms for indexing, i.e., lower cased words
+        // If we are case-insensitive, we can also use this for positive and negative terms
+ // Some terms may be expanded into multiple words in some non-English languages
+ while (tokenizer->moveNext()) {
+ string word = tokenizer->get().toString();
- if (!_caseSensitive) {
- return;
- }
+ if (!negated) {
+ _termsForBounds.insert(word);
+ }
- tokenizer->reset(sentence.c_str(), static_cast<FTSTokenizer::Options>(
- FTSTokenizer::FilterStopWords
- | FTSTokenizer::GenerateCaseSensitiveTokens));
+ // Compute the string corresponding to 'token' that will be used for the matcher.
+ // For case-insensitive queries, this is the same string as 'boundsTerm' computed
+ // above.
+ if (!_caseSensitive) {
+ activeTerms.insert(word);
+ }
+ }
- // If we want case-sensitivity, get the case-sensitive token
- while (tokenizer->moveNext()) {
+ if (!_caseSensitive) {
+ return;
+ }
- string word = tokenizer->get().toString();
+ tokenizer->reset(sentence.c_str(),
+ static_cast<FTSTokenizer::Options>(FTSTokenizer::FilterStopWords |
+ FTSTokenizer::GenerateCaseSensitiveTokens));
- activeTerms.insert(word);
- }
- }
+ // If we want case-sensitivity, get the case-sensitive token
+ while (tokenizer->moveNext()) {
+ string word = tokenizer->get().toString();
- string FTSQuery::normalizeString(StringData str) const {
- if (_caseSensitive) {
- return str.toString();
- }
- return tolowerString(str);
- }
+ activeTerms.insert(word);
+ }
+}
- namespace {
- void _debugHelp( stringstream& ss, const set<string>& s, const string& sep ) {
- bool first = true;
- for ( set<string>::const_iterator i = s.begin(); i != s.end(); ++i ) {
- if ( first )
- first = false;
- else
- ss << sep;
- ss << *i;
- }
- }
+string FTSQuery::normalizeString(StringData str) const {
+ if (_caseSensitive) {
+ return str.toString();
+ }
+ return tolowerString(str);
+}
- void _debugHelp( stringstream& ss, const vector<string>& v, const string& sep ) {
- set<string> s( v.begin(), v.end() );
- _debugHelp( ss, s, sep );
- }
+namespace {
+void _debugHelp(stringstream& ss, const set<string>& s, const string& sep) {
+ bool first = true;
+ for (set<string>::const_iterator i = s.begin(); i != s.end(); ++i) {
+ if (first)
+ first = false;
+ else
+ ss << sep;
+ ss << *i;
+ }
+}
- }
+void _debugHelp(stringstream& ss, const vector<string>& v, const string& sep) {
+ set<string> s(v.begin(), v.end());
+ _debugHelp(ss, s, sep);
+}
+}
- string FTSQuery::toString() const {
- stringstream ss;
- ss << "FTSQuery\n";
+string FTSQuery::toString() const {
+ stringstream ss;
+ ss << "FTSQuery\n";
- ss << " terms: ";
- _debugHelp( ss, getPositiveTerms(), ", " );
- ss << "\n";
+ ss << " terms: ";
+ _debugHelp(ss, getPositiveTerms(), ", ");
+ ss << "\n";
- ss << " negated terms: ";
- _debugHelp( ss, getNegatedTerms(), ", " );
- ss << "\n";
+ ss << " negated terms: ";
+ _debugHelp(ss, getNegatedTerms(), ", ");
+ ss << "\n";
- ss << " phrases: ";
- _debugHelp( ss, getPositivePhr(), ", " );
- ss << "\n";
+ ss << " phrases: ";
+ _debugHelp(ss, getPositivePhr(), ", ");
+ ss << "\n";
- ss << " negated phrases: ";
- _debugHelp( ss, getNegatedPhr(), ", " );
- ss << "\n";
+ ss << " negated phrases: ";
+ _debugHelp(ss, getNegatedPhr(), ", ");
+ ss << "\n";
- return ss.str();
- }
+ return ss.str();
+}
- string FTSQuery::debugString() const {
- stringstream ss;
+string FTSQuery::debugString() const {
+ stringstream ss;
- _debugHelp( ss, getPositiveTerms(), "|" );
- ss << "||";
+ _debugHelp(ss, getPositiveTerms(), "|");
+ ss << "||";
- _debugHelp( ss, getNegatedTerms(), "|" );
- ss << "||";
+ _debugHelp(ss, getNegatedTerms(), "|");
+ ss << "||";
- _debugHelp( ss, getPositivePhr(), "|" );
- ss << "||";
+ _debugHelp(ss, getPositivePhr(), "|");
+ ss << "||";
- _debugHelp( ss, getNegatedPhr(), "|" );
+ _debugHelp(ss, getNegatedPhr(), "|");
- return ss.str();
- }
+ return ss.str();
+}
- BSONObj FTSQuery::toBSON() const {
- BSONObjBuilder bob;
- bob.append( "terms", getPositiveTerms() );
- bob.append( "negatedTerms", getNegatedTerms() );
- bob.append( "phrases", getPositivePhr() );
- bob.append( "negatedPhrases", getNegatedPhr() );
- return bob.obj();
- }
- }
+BSONObj FTSQuery::toBSON() const {
+ BSONObjBuilder bob;
+ bob.append("terms", getPositiveTerms());
+ bob.append("negatedTerms", getNegatedTerms());
+ bob.append("phrases", getPositivePhr());
+ bob.append("negatedPhrases", getNegatedPhr());
+ return bob.obj();
+}
+}
}
diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h
index 88ca4ce64d0..10e0cd2faaf 100644
--- a/src/mongo/db/fts/fts_query.h
+++ b/src/mongo/db/fts/fts_query.h
@@ -40,68 +40,77 @@
namespace mongo {
- namespace fts {
-
- class FTSQuery {
-
- public:
- // Initializes an FTSQuery. Note that the parsing of "language" depends on the text
- // index version, since a query which doesn't specify a language and is against a
- // version 1 text index with a version 1 default language string needs to be parsed as
- // version 1 (see fts_language.cpp for a list of language strings specific to version
- // 1).
- Status parse(const std::string& query, StringData language, bool caseSensitive,
- TextIndexVersion textIndexVersion);
-
- const std::set<std::string>& getPositiveTerms() const { return _positiveTerms; }
- const std::set<std::string>& getNegatedTerms() const { return _negatedTerms; }
- const std::vector<std::string>& getPositivePhr() const { return _positivePhrases; }
- const std::vector<std::string>& getNegatedPhr() const { return _negatedPhrases; }
-
- const std::set<std::string>& getTermsForBounds() const {
- return _termsForBounds;
- }
+namespace fts {
+
+class FTSQuery {
+public:
+ // Initializes an FTSQuery. Note that the parsing of "language" depends on the text
+ // index version, since a query which doesn't specify a language and is against a
+ // version 1 text index with a version 1 default language string needs to be parsed as
+ // version 1 (see fts_language.cpp for a list of language strings specific to version
+ // 1).
+ Status parse(const std::string& query,
+ StringData language,
+ bool caseSensitive,
+ TextIndexVersion textIndexVersion);
+
+ const std::set<std::string>& getPositiveTerms() const {
+ return _positiveTerms;
+ }
+ const std::set<std::string>& getNegatedTerms() const {
+ return _negatedTerms;
+ }
+ const std::vector<std::string>& getPositivePhr() const {
+ return _positivePhrases;
+ }
+ const std::vector<std::string>& getNegatedPhr() const {
+ return _negatedPhrases;
+ }
- const FTSLanguage& getLanguage() const { return *_language; }
- bool getCaseSensitive() const { return _caseSensitive; }
+ const std::set<std::string>& getTermsForBounds() const {
+ return _termsForBounds;
+ }
- std::string toString() const;
+ const FTSLanguage& getLanguage() const {
+ return *_language;
+ }
+ bool getCaseSensitive() const {
+ return _caseSensitive;
+ }
- std::string debugString() const;
+ std::string toString() const;
- BSONObj toBSON() const;
+ std::string debugString() const;
- /**
- * Lowercases "str" if _caseSensitive is set, else returns a copy of "str" unchanged.
- */
- std::string normalizeString( StringData str ) const;
+ BSONObj toBSON() const;
- static const bool caseSensitiveDefault;
+ /**
+ * Lowercases "str" if _caseSensitive is set, else returns a copy of "str" unchanged.
+ */
+ std::string normalizeString(StringData str) const;
- private:
- void _addTerms( FTSTokenizer* tokenizer,
- const std::string& tokens,
- bool negated );
+ static const bool caseSensitiveDefault;
- const FTSLanguage* _language;
- bool _caseSensitive;
+private:
+ void _addTerms(FTSTokenizer* tokenizer, const std::string& tokens, bool negated);
- // Positive terms.
- std::set<std::string> _positiveTerms;
+ const FTSLanguage* _language;
+ bool _caseSensitive;
- // Negated terms.
- std::set<std::string> _negatedTerms;
+ // Positive terms.
+ std::set<std::string> _positiveTerms;
- // Positive phrases.
- std::vector<std::string> _positivePhrases;
+ // Negated terms.
+ std::set<std::string> _negatedTerms;
- // Negated phrases.
- std::vector<std::string> _negatedPhrases;
+ // Positive phrases.
+ std::vector<std::string> _positivePhrases;
- // Terms for bounds.
- std::set<std::string> _termsForBounds;
- };
+ // Negated phrases.
+ std::vector<std::string> _negatedPhrases;
- }
+ // Terms for bounds.
+ std::set<std::string> _termsForBounds;
+};
+}
}
-
diff --git a/src/mongo/db/fts/fts_query_parser.cpp b/src/mongo/db/fts/fts_query_parser.cpp
index 5d73e69cb1e..6b2381c3366 100644
--- a/src/mongo/db/fts/fts_query_parser.cpp
+++ b/src/mongo/db/fts/fts_query_parser.cpp
@@ -34,77 +34,73 @@
namespace mongo {
- namespace fts {
+namespace fts {
- FTSQueryParser::FTSQueryParser( StringData str )
- : _pos(0), _raw( str ) {
- skipWhitespace();
- _previousWhiteSpace = true;
- }
-
- bool FTSQueryParser::more() const {
- return _pos < _raw.size();
- }
-
- QueryToken FTSQueryParser::next() {
- if ( _pos >= _raw.size() )
- return QueryToken( QueryToken::INVALID, "", 0, false );
+FTSQueryParser::FTSQueryParser(StringData str) : _pos(0), _raw(str) {
+ skipWhitespace();
+ _previousWhiteSpace = true;
+}
- unsigned start = _pos++;
- QueryToken::Type type = getType( _raw[start] );
+bool FTSQueryParser::more() const {
+ return _pos < _raw.size();
+}
- // Query Parser should never land on whitespace
- if ( type == QueryToken::WHITESPACE ) {
- invariant( false );
- }
+QueryToken FTSQueryParser::next() {
+ if (_pos >= _raw.size())
+ return QueryToken(QueryToken::INVALID, "", 0, false);
- if ( type == QueryToken::TEXT ) {
- while ( _pos < _raw.size() && getType( _raw[_pos] ) == type ) {
- _pos++;
- }
- }
+ unsigned start = _pos++;
+ QueryToken::Type type = getType(_raw[start]);
- StringData ret = _raw.substr( start, _pos - start );
- bool old = _previousWhiteSpace;
- _previousWhiteSpace = skipWhitespace();
+ // Query Parser should never land on whitespace
+ if (type == QueryToken::WHITESPACE) {
+ invariant(false);
+ }
- return QueryToken( type, ret, start, old );
+ if (type == QueryToken::TEXT) {
+ while (_pos < _raw.size() && getType(_raw[_pos]) == type) {
+ _pos++;
}
+ }
- bool FTSQueryParser::skipWhitespace() {
- unsigned start = _pos;
+ StringData ret = _raw.substr(start, _pos - start);
+ bool old = _previousWhiteSpace;
+ _previousWhiteSpace = skipWhitespace();
- while ( _pos < _raw.size() && getType( _raw[_pos] ) == QueryToken::WHITESPACE ) {
- _pos++;
- }
+ return QueryToken(type, ret, start, old);
+}
- return _pos > start;
- }
+bool FTSQueryParser::skipWhitespace() {
+ unsigned start = _pos;
+ while (_pos < _raw.size() && getType(_raw[_pos]) == QueryToken::WHITESPACE) {
+ _pos++;
+ }
- QueryToken::Type FTSQueryParser::getType( char c ) const {
- switch ( c ) {
- // Unicode TR29 defines these as Word Boundaries
- case '\n': // U+000A - LF
- case '\v': // U+000B - Veritical Tab
- case '\f': // U+000C - Form Feed
- case '\r': // U+000D - CR
- // Unicode TR29 remarks this could be used MidNum for Word Boundaries
- // but we treat this as a token separator
- case ' ': // U+0020 - Space
- return QueryToken::WHITESPACE;
- // Unicode TR29 has a particular note about the complexity of hyphens.
- // Since we use them for negation, we are sensitive to them, and we simply drop
- // them otherwise from words
- case '-':
- case '"':
- return QueryToken::DELIMITER;
- default:
- return QueryToken::TEXT;
- }
+ return _pos > start;
+}
- }
+QueryToken::Type FTSQueryParser::getType(char c) const {
+ switch (c) {
+ // Unicode TR29 defines these as Word Boundaries
+ case '\n': // U+000A - LF
+ case '\v': // U+000B - Veritical Tab
+ case '\f': // U+000C - Form Feed
+ case '\r': // U+000D - CR
+ // Unicode TR29 remarks this could be used MidNum for Word Boundaries
+ // but we treat this as a token separator
+ case ' ': // U+0020 - Space
+ return QueryToken::WHITESPACE;
+ // Unicode TR29 has a particular note about the complexity of hyphens.
+ // Since we use them for negation, we are sensitive to them, and we simply drop
+ // them otherwise from words
+ case '-':
+ case '"':
+ return QueryToken::DELIMITER;
+ default:
+ return QueryToken::TEXT;
}
-
+}
+}
}
diff --git a/src/mongo/db/fts/fts_query_parser.h b/src/mongo/db/fts/fts_query_parser.h
index 32804fd63fd..b5e8c53207f 100644
--- a/src/mongo/db/fts/fts_query_parser.h
+++ b/src/mongo/db/fts/fts_query_parser.h
@@ -34,57 +34,54 @@
namespace mongo {
- namespace fts {
+namespace fts {
- struct QueryToken {
- enum Type { WHITESPACE, DELIMITER, TEXT, INVALID };
- QueryToken( Type type, StringData data, unsigned offset, bool previousWhiteSpace )
- : type( type ),
- data( data ),
- offset( offset ),
- previousWhiteSpace( previousWhiteSpace ) {}
+struct QueryToken {
+ enum Type { WHITESPACE, DELIMITER, TEXT, INVALID };
+ QueryToken(Type type, StringData data, unsigned offset, bool previousWhiteSpace)
+ : type(type), data(data), offset(offset), previousWhiteSpace(previousWhiteSpace) {}
- bool ok() const { return type != INVALID; }
-
- Type type;
- StringData data;
- unsigned offset;
- bool previousWhiteSpace;
- };
+ bool ok() const {
+ return type != INVALID;
+ }
- /**
- * The pseudo EXBNF for the query parsing language is:
- *
- * SEARCH STRING = TOKEN_LIST ( ' ' TOKEN_LIST )*
- *
- * TOKEN_LIST = SEARCH_TOKEN
- * |'-' SEARCH_TOKEN
- * | QUOTED_SEARCH_TOKEN
- * |'-' QUOTED_SEARCH_TOKEN
- *
- * QUOTED_SEARCH_TOKEN = '“' SEARCH_TOKEN+ '"'
- *
- * SEARCH_TOKEN = CHARACTER_EXCLUDING_SPECIAL_CHARS
- *
- * SPECIAL_CHARS = '-' | ' ' | '"'
- */
- class FTSQueryParser {
- MONGO_DISALLOW_COPYING( FTSQueryParser );
- public:
+ Type type;
+ StringData data;
+ unsigned offset;
+ bool previousWhiteSpace;
+};
- FTSQueryParser(StringData str);
- bool more() const;
- QueryToken next();
+/**
+ * The pseudo EBNF for the query parsing language is:
+ *
+ * SEARCH STRING = TOKEN_LIST ( ' ' TOKEN_LIST )*
+ *
+ * TOKEN_LIST = SEARCH_TOKEN
+ * |'-' SEARCH_TOKEN
+ * | QUOTED_SEARCH_TOKEN
+ * |'-' QUOTED_SEARCH_TOKEN
+ *
+ * QUOTED_SEARCH_TOKEN = '"' SEARCH_TOKEN+ '"'
+ *
+ * SEARCH_TOKEN = CHARACTER_EXCLUDING_SPECIAL_CHARS
+ *
+ * SPECIAL_CHARS = '-' | ' ' | '"'
+ */
+class FTSQueryParser {
+ MONGO_DISALLOW_COPYING(FTSQueryParser);
- private:
- QueryToken::Type getType( char c ) const;
- bool skipWhitespace();
+public:
+ FTSQueryParser(StringData str);
+ bool more() const;
+ QueryToken next();
- unsigned _pos;
- bool _previousWhiteSpace;
- const StringData _raw;
- };
+private:
+ QueryToken::Type getType(char c) const;
+ bool skipWhitespace();
- }
+ unsigned _pos;
+ bool _previousWhiteSpace;
+ const StringData _raw;
+};
+}
}
-
diff --git a/src/mongo/db/fts/fts_query_test.cpp b/src/mongo/db/fts/fts_query_test.cpp
index b090f23a660..a4a841c7f16 100644
--- a/src/mongo/db/fts/fts_query_test.cpp
+++ b/src/mongo/db/fts/fts_query_test.cpp
@@ -33,242 +33,222 @@
#include "mongo/unittest/unittest.h"
namespace mongo {
- namespace fts {
-
- TEST( FTSQuery, Basic1 ) {
- FTSQuery q;
- ASSERT( q.parse( "this is fun", "english", false, TEXT_INDEX_VERSION_2 ).isOK() );
-
- ASSERT_EQUALS( false, q.getCaseSensitive() );
- ASSERT_EQUALS( 1U, q.getPositiveTerms().size() );
- ASSERT_EQUALS( "fun", *q.getPositiveTerms().begin() );
- ASSERT_EQUALS( 0U, q.getNegatedTerms().size() );
- ASSERT_EQUALS( 0U, q.getPositivePhr().size() );
- ASSERT_EQUALS( 0U, q.getNegatedPhr().size() );
- ASSERT_TRUE( q.getTermsForBounds() == q.getPositiveTerms() );
- }
-
- TEST( FTSQuery, ParsePunctuation ) {
- FTSQuery q;
- ASSERT( q.parse( "hello.world", "english", false, TEXT_INDEX_VERSION_2 ).isOK() );
-
- ASSERT_EQUALS( false, q.getCaseSensitive() );
- ASSERT_EQUALS( 2U, q.getPositiveTerms().size() );
- ASSERT_EQUALS( "hello", *q.getPositiveTerms().begin() );
- ASSERT_EQUALS( "world", *(--q.getPositiveTerms().end()) );
- ASSERT_EQUALS( 0U, q.getNegatedTerms().size() );
- ASSERT_EQUALS( 0U, q.getPositivePhr().size() );
- ASSERT_EQUALS( 0U, q.getNegatedPhr().size() );
- ASSERT_TRUE( q.getTermsForBounds() == q.getPositiveTerms() );
- }
-
- TEST( FTSQuery, Neg1 ) {
- FTSQuery q;
- ASSERT( q.parse( "this is -really fun", "english", false, TEXT_INDEX_VERSION_2 ).isOK() );
-
- ASSERT_EQUALS( 1U, q.getPositiveTerms().size() );
- ASSERT_EQUALS( "fun", *q.getPositiveTerms().begin() );
- ASSERT_EQUALS( 1U, q.getNegatedTerms().size() );
- ASSERT_EQUALS( "realli", *q.getNegatedTerms().begin() );
- ASSERT_TRUE( q.getTermsForBounds() == q.getPositiveTerms() );
- }
-
- TEST( FTSQuery, Phrase1 ) {
- FTSQuery q;
- ASSERT( q.parse( "doing a \"phrase test\" for fun", "english", false,
- TEXT_INDEX_VERSION_2 ).isOK() );
-
- ASSERT_EQUALS( 3U, q.getPositiveTerms().size() );
- ASSERT_EQUALS( 0U, q.getNegatedTerms().size() );
- ASSERT_EQUALS( 1U, q.getPositivePhr().size() );
- ASSERT_EQUALS( 0U, q.getNegatedPhr().size() );
- ASSERT_TRUE( q.getTermsForBounds() == q.getPositiveTerms() );
-
- ASSERT_EQUALS( "phrase test", q.getPositivePhr()[0] );
- ASSERT_EQUALS( "fun|phrase|test||||phrase test||", q.debugString() );
- }
-
- TEST( FTSQuery, Phrase2 ) {
- FTSQuery q;
- ASSERT( q.parse( "doing a \"phrase-test\" for fun", "english", false,
- TEXT_INDEX_VERSION_2 ).isOK() );
- ASSERT_EQUALS( 1U, q.getPositivePhr().size() );
- ASSERT_EQUALS( "phrase-test", q.getPositivePhr()[0] );
- }
-
- TEST( FTSQuery, NegPhrase1 ) {
- FTSQuery q;
- ASSERT( q.parse( "doing a -\"phrase test\" for fun", "english", false,
- TEXT_INDEX_VERSION_2 ).isOK() );
- ASSERT_EQUALS( "fun||||||phrase test", q.debugString() );
- }
-
- TEST( FTSQuery, CaseSensitiveOption ) {
- FTSQuery q;
- ASSERT( q.parse( "this is fun", "english", true, TEXT_INDEX_VERSION_2 ).isOK() );
- ASSERT_EQUALS( true, q.getCaseSensitive() );
- }
-
- TEST( FTSQuery, CaseSensitivePositiveTerms ) {
- FTSQuery q;
- ASSERT( q.parse( "This is Positively fun", "english", true,
- TEXT_INDEX_VERSION_2 ).isOK() );
-
- ASSERT_EQUALS( 2U, q.getTermsForBounds().size() );
- ASSERT_EQUALS( 1, std::count( q.getTermsForBounds().begin(),
- q.getTermsForBounds().end(),
- "posit" ) );
- ASSERT_EQUALS( 1, std::count( q.getTermsForBounds().begin(),
- q.getTermsForBounds().end(),
- "fun" ) );
- ASSERT_EQUALS( 2U, q.getPositiveTerms().size() );
- ASSERT_EQUALS( 1, std::count( q.getPositiveTerms().begin(),
- q.getPositiveTerms().end(),
- "Posit" ) );
- ASSERT_EQUALS( 1, std::count( q.getPositiveTerms().begin(),
- q.getPositiveTerms().end(),
- "fun" ) );
- ASSERT_EQUALS( 0U, q.getNegatedTerms().size() );
- ASSERT_EQUALS( 0U, q.getPositivePhr().size() );
- ASSERT_EQUALS( 0U, q.getNegatedPhr().size() );
- }
-
- TEST( FTSQuery, CaseSensitiveNegativeTerms ) {
- FTSQuery q;
- ASSERT( q.parse( "-This -is -Negatively -miserable", "english", true,
- TEXT_INDEX_VERSION_2 ).isOK() );
-
- ASSERT_EQUALS( 0U, q.getPositiveTerms().size() );
- ASSERT_EQUALS( 0U, q.getTermsForBounds().size() );
- ASSERT_EQUALS( 2U, q.getNegatedTerms().size() );
- ASSERT_EQUALS( 1, std::count( q.getNegatedTerms().begin(),
- q.getNegatedTerms().end(),
- "Negat" ) );
- ASSERT_EQUALS( 1, std::count( q.getNegatedTerms().begin(),
- q.getNegatedTerms().end(),
- "miser" ) );
- ASSERT_EQUALS( 0U, q.getPositivePhr().size() );
- ASSERT_EQUALS( 0U, q.getNegatedPhr().size() );
- }
-
- TEST( FTSQuery, CaseSensitivePositivePhrases ) {
- FTSQuery q;
- ASSERT( q.parse( "doing a \"Phrase Test\" for fun", "english", true,
- TEXT_INDEX_VERSION_2 ).isOK() );
-
- ASSERT_EQUALS( 1U, q.getPositivePhr().size() );
- ASSERT_EQUALS( 0U, q.getNegatedPhr().size() );
- ASSERT_EQUALS( "Phrase Test", q.getPositivePhr()[0] );
- }
-
- TEST( FTSQuery, CaseSensitiveNegativePhrases ) {
- FTSQuery q;
- ASSERT( q.parse( "doing a -\"Phrase Test\" for fun", "english", true,
- TEXT_INDEX_VERSION_2 ).isOK() );
-
- ASSERT_EQUALS( 0U, q.getPositivePhr().size() );
- ASSERT_EQUALS( 1U, q.getNegatedPhr().size() );
- ASSERT_EQUALS( "Phrase Test", q.getNegatedPhr()[0] );
- }
-
- TEST( FTSQuery, Mix1 ) {
- FTSQuery q;
- ASSERT( q.parse( "\"industry\" -Melbourne -Physics", "english", false,
- TEXT_INDEX_VERSION_2 ).isOK() );
- ASSERT_EQUALS( "industri||melbourn|physic||industry||", q.debugString() );
- }
-
- TEST( FTSQuery, NegPhrase2) {
- FTSQuery q1, q2, q3;
- ASSERT( q1.parse( "foo \"bar\"", "english", false, TEXT_INDEX_VERSION_2 ).isOK() );
- ASSERT( q2.parse( "foo \"-bar\"", "english", false, TEXT_INDEX_VERSION_2 ).isOK() );
- ASSERT( q3.parse( "foo \" -bar\"", "english", false, TEXT_INDEX_VERSION_2 ).isOK() );
-
- ASSERT_EQUALS( 2U, q1.getPositiveTerms().size() );
- ASSERT_EQUALS( 2U, q2.getPositiveTerms().size() );
- ASSERT_EQUALS( 2U, q3.getPositiveTerms().size() );
-
- ASSERT_EQUALS( 0U, q1.getNegatedTerms().size() );
- ASSERT_EQUALS( 0U, q2.getNegatedTerms().size() );
- ASSERT_EQUALS( 0U, q3.getNegatedTerms().size() );
-
- ASSERT_EQUALS( 1U, q1.getPositivePhr().size() );
- ASSERT_EQUALS( 1U, q2.getPositivePhr().size() );
- ASSERT_EQUALS( 1U, q3.getPositivePhr().size() );
-
- ASSERT_EQUALS( 0U, q1.getNegatedPhr().size() );
- ASSERT_EQUALS( 0U, q2.getNegatedPhr().size() );
- ASSERT_EQUALS( 0U, q3.getNegatedPhr().size() );
- }
-
- TEST( FTSQuery, NegPhrase3) {
- FTSQuery q1, q2, q3;
- ASSERT( q1.parse( "foo -\"bar\"", "english", false, TEXT_INDEX_VERSION_2 ).isOK() );
- ASSERT( q2.parse( "foo -\"-bar\"", "english", false, TEXT_INDEX_VERSION_2 ).isOK() );
- ASSERT( q3.parse( "foo -\" -bar\"", "english", false, TEXT_INDEX_VERSION_2 ).isOK() );
-
- ASSERT_EQUALS( 1U, q1.getPositiveTerms().size() );
- ASSERT_EQUALS( 1U, q2.getPositiveTerms().size() );
- ASSERT_EQUALS( 1U, q3.getPositiveTerms().size() );
-
- ASSERT_EQUALS( 0U, q1.getNegatedTerms().size() );
- ASSERT_EQUALS( 0U, q2.getNegatedTerms().size() );
- ASSERT_EQUALS( 0U, q3.getNegatedTerms().size() );
-
- ASSERT_EQUALS( 0U, q1.getPositivePhr().size() );
- ASSERT_EQUALS( 0U, q2.getPositivePhr().size() );
- ASSERT_EQUALS( 0U, q3.getPositivePhr().size() );
-
- ASSERT_EQUALS( 1U, q1.getNegatedPhr().size() );
- ASSERT_EQUALS( 1U, q2.getNegatedPhr().size() );
- ASSERT_EQUALS( 1U, q3.getNegatedPhr().size() );
- }
-
- // Test textIndexVersion:1 query with language "english". This invokes the standard English
- // stemmer and stopword list.
- TEST( FTSQuery, TextIndexVersion1LanguageEnglish ) {
- FTSQuery q;
- ASSERT( q.parse( "the running", "english", false, TEXT_INDEX_VERSION_1 ).isOK() );
- ASSERT_EQUALS( 1U, q.getPositiveTerms().size() );
- ASSERT_EQUALS( "run", *q.getPositiveTerms().begin() );
- ASSERT_EQUALS( 0U, q.getNegatedTerms().size() );
- ASSERT_EQUALS( 0U, q.getPositivePhr().size() );
- ASSERT_EQUALS( 0U, q.getNegatedPhr().size() );
- }
-
- // Test textIndexVersion:1 query with language "eng". "eng" uses the English stemmer, and
- // no stopword list.
- TEST( FTSQuery, TextIndexVersion1LanguageEng ) {
- FTSQuery q;
- ASSERT( q.parse( "the running", "eng", false, TEXT_INDEX_VERSION_1 ).isOK() );
- ASSERT_EQUALS( 2U, q.getPositiveTerms().size() );
- ASSERT_EQUALS( 1, std::count( q.getPositiveTerms().begin(),
- q.getPositiveTerms().end(),
- "the" ) );
- ASSERT_EQUALS( 1, std::count( q.getPositiveTerms().begin(),
- q.getPositiveTerms().end(),
- "run" ) );
- ASSERT_EQUALS( 0U, q.getNegatedTerms().size() );
- ASSERT_EQUALS( 0U, q.getPositivePhr().size() );
- ASSERT_EQUALS( 0U, q.getNegatedPhr().size() );
- }
-
- // Test textIndexVersion:1 query with language "invalid". No stemming will be performed,
- // and no stopword list will be used.
- TEST( FTSQuery, TextIndexVersion1LanguageInvalid ) {
- FTSQuery q;
- ASSERT( q.parse( "the running", "invalid", false, TEXT_INDEX_VERSION_1 ).isOK() );
- ASSERT_EQUALS( 2U, q.getPositiveTerms().size() );
- ASSERT_EQUALS( 1, std::count( q.getPositiveTerms().begin(),
- q.getPositiveTerms().end(),
- "the" ) );
- ASSERT_EQUALS( 1, std::count( q.getPositiveTerms().begin(),
- q.getPositiveTerms().end(),
- "running" ) );
- ASSERT_EQUALS( 0U, q.getNegatedTerms().size() );
- ASSERT_EQUALS( 0U, q.getPositivePhr().size() );
- ASSERT_EQUALS( 0U, q.getNegatedPhr().size() );
- }
-
- }
+namespace fts {
+
+TEST(FTSQuery, Basic1) {
+ FTSQuery q;
+ ASSERT(q.parse("this is fun", "english", false, TEXT_INDEX_VERSION_2).isOK());
+
+ ASSERT_EQUALS(false, q.getCaseSensitive());
+ ASSERT_EQUALS(1U, q.getPositiveTerms().size());
+ ASSERT_EQUALS("fun", *q.getPositiveTerms().begin());
+ ASSERT_EQUALS(0U, q.getNegatedTerms().size());
+ ASSERT_EQUALS(0U, q.getPositivePhr().size());
+ ASSERT_EQUALS(0U, q.getNegatedPhr().size());
+ ASSERT_TRUE(q.getTermsForBounds() == q.getPositiveTerms());
+}
+
+TEST(FTSQuery, ParsePunctuation) {
+ FTSQuery q;
+ ASSERT(q.parse("hello.world", "english", false, TEXT_INDEX_VERSION_2).isOK());
+
+ ASSERT_EQUALS(false, q.getCaseSensitive());
+ ASSERT_EQUALS(2U, q.getPositiveTerms().size());
+ ASSERT_EQUALS("hello", *q.getPositiveTerms().begin());
+ ASSERT_EQUALS("world", *(--q.getPositiveTerms().end()));
+ ASSERT_EQUALS(0U, q.getNegatedTerms().size());
+ ASSERT_EQUALS(0U, q.getPositivePhr().size());
+ ASSERT_EQUALS(0U, q.getNegatedPhr().size());
+ ASSERT_TRUE(q.getTermsForBounds() == q.getPositiveTerms());
+}
+
+TEST(FTSQuery, Neg1) {
+ FTSQuery q;
+ ASSERT(q.parse("this is -really fun", "english", false, TEXT_INDEX_VERSION_2).isOK());
+
+ ASSERT_EQUALS(1U, q.getPositiveTerms().size());
+ ASSERT_EQUALS("fun", *q.getPositiveTerms().begin());
+ ASSERT_EQUALS(1U, q.getNegatedTerms().size());
+ ASSERT_EQUALS("realli", *q.getNegatedTerms().begin());
+ ASSERT_TRUE(q.getTermsForBounds() == q.getPositiveTerms());
+}
+
+TEST(FTSQuery, Phrase1) {
+ FTSQuery q;
+ ASSERT(
+ q.parse("doing a \"phrase test\" for fun", "english", false, TEXT_INDEX_VERSION_2).isOK());
+
+ ASSERT_EQUALS(3U, q.getPositiveTerms().size());
+ ASSERT_EQUALS(0U, q.getNegatedTerms().size());
+ ASSERT_EQUALS(1U, q.getPositivePhr().size());
+ ASSERT_EQUALS(0U, q.getNegatedPhr().size());
+ ASSERT_TRUE(q.getTermsForBounds() == q.getPositiveTerms());
+
+ ASSERT_EQUALS("phrase test", q.getPositivePhr()[0]);
+ ASSERT_EQUALS("fun|phrase|test||||phrase test||", q.debugString());
+}
+
+TEST(FTSQuery, Phrase2) {
+ FTSQuery q;
+ ASSERT(
+ q.parse("doing a \"phrase-test\" for fun", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT_EQUALS(1U, q.getPositivePhr().size());
+ ASSERT_EQUALS("phrase-test", q.getPositivePhr()[0]);
+}
+
+TEST(FTSQuery, NegPhrase1) {
+ FTSQuery q;
+ ASSERT(
+ q.parse("doing a -\"phrase test\" for fun", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT_EQUALS("fun||||||phrase test", q.debugString());
+}
+
+TEST(FTSQuery, CaseSensitiveOption) {
+ FTSQuery q;
+ ASSERT(q.parse("this is fun", "english", true, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT_EQUALS(true, q.getCaseSensitive());
+}
+
+TEST(FTSQuery, CaseSensitivePositiveTerms) {
+ FTSQuery q;
+ ASSERT(q.parse("This is Positively fun", "english", true, TEXT_INDEX_VERSION_2).isOK());
+
+ ASSERT_EQUALS(2U, q.getTermsForBounds().size());
+ ASSERT_EQUALS(1,
+ std::count(q.getTermsForBounds().begin(), q.getTermsForBounds().end(), "posit"));
+ ASSERT_EQUALS(1, std::count(q.getTermsForBounds().begin(), q.getTermsForBounds().end(), "fun"));
+ ASSERT_EQUALS(2U, q.getPositiveTerms().size());
+ ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "Posit"));
+ ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "fun"));
+ ASSERT_EQUALS(0U, q.getNegatedTerms().size());
+ ASSERT_EQUALS(0U, q.getPositivePhr().size());
+ ASSERT_EQUALS(0U, q.getNegatedPhr().size());
+}
+
+TEST(FTSQuery, CaseSensitiveNegativeTerms) {
+ FTSQuery q;
+ ASSERT(
+ q.parse("-This -is -Negatively -miserable", "english", true, TEXT_INDEX_VERSION_2).isOK());
+
+ ASSERT_EQUALS(0U, q.getPositiveTerms().size());
+ ASSERT_EQUALS(0U, q.getTermsForBounds().size());
+ ASSERT_EQUALS(2U, q.getNegatedTerms().size());
+ ASSERT_EQUALS(1, std::count(q.getNegatedTerms().begin(), q.getNegatedTerms().end(), "Negat"));
+ ASSERT_EQUALS(1, std::count(q.getNegatedTerms().begin(), q.getNegatedTerms().end(), "miser"));
+ ASSERT_EQUALS(0U, q.getPositivePhr().size());
+ ASSERT_EQUALS(0U, q.getNegatedPhr().size());
+}
+
+TEST(FTSQuery, CaseSensitivePositivePhrases) {
+ FTSQuery q;
+ ASSERT(
+ q.parse("doing a \"Phrase Test\" for fun", "english", true, TEXT_INDEX_VERSION_2).isOK());
+
+ ASSERT_EQUALS(1U, q.getPositivePhr().size());
+ ASSERT_EQUALS(0U, q.getNegatedPhr().size());
+ ASSERT_EQUALS("Phrase Test", q.getPositivePhr()[0]);
+}
+
+TEST(FTSQuery, CaseSensitiveNegativePhrases) {
+ FTSQuery q;
+ ASSERT(
+ q.parse("doing a -\"Phrase Test\" for fun", "english", true, TEXT_INDEX_VERSION_2).isOK());
+
+ ASSERT_EQUALS(0U, q.getPositivePhr().size());
+ ASSERT_EQUALS(1U, q.getNegatedPhr().size());
+ ASSERT_EQUALS("Phrase Test", q.getNegatedPhr()[0]);
+}
+
+TEST(FTSQuery, Mix1) {
+ FTSQuery q;
+ ASSERT(
+ q.parse("\"industry\" -Melbourne -Physics", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT_EQUALS("industri||melbourn|physic||industry||", q.debugString());
+}
+
+TEST(FTSQuery, NegPhrase2) {
+ FTSQuery q1, q2, q3;
+ ASSERT(q1.parse("foo \"bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q2.parse("foo \"-bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q3.parse("foo \" -bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK());
+
+ ASSERT_EQUALS(2U, q1.getPositiveTerms().size());
+ ASSERT_EQUALS(2U, q2.getPositiveTerms().size());
+ ASSERT_EQUALS(2U, q3.getPositiveTerms().size());
+
+ ASSERT_EQUALS(0U, q1.getNegatedTerms().size());
+ ASSERT_EQUALS(0U, q2.getNegatedTerms().size());
+ ASSERT_EQUALS(0U, q3.getNegatedTerms().size());
+
+ ASSERT_EQUALS(1U, q1.getPositivePhr().size());
+ ASSERT_EQUALS(1U, q2.getPositivePhr().size());
+ ASSERT_EQUALS(1U, q3.getPositivePhr().size());
+
+ ASSERT_EQUALS(0U, q1.getNegatedPhr().size());
+ ASSERT_EQUALS(0U, q2.getNegatedPhr().size());
+ ASSERT_EQUALS(0U, q3.getNegatedPhr().size());
+}
+
+TEST(FTSQuery, NegPhrase3) {
+ FTSQuery q1, q2, q3;
+ ASSERT(q1.parse("foo -\"bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q2.parse("foo -\"-bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q3.parse("foo -\" -bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK());
+
+ ASSERT_EQUALS(1U, q1.getPositiveTerms().size());
+ ASSERT_EQUALS(1U, q2.getPositiveTerms().size());
+ ASSERT_EQUALS(1U, q3.getPositiveTerms().size());
+
+ ASSERT_EQUALS(0U, q1.getNegatedTerms().size());
+ ASSERT_EQUALS(0U, q2.getNegatedTerms().size());
+ ASSERT_EQUALS(0U, q3.getNegatedTerms().size());
+
+ ASSERT_EQUALS(0U, q1.getPositivePhr().size());
+ ASSERT_EQUALS(0U, q2.getPositivePhr().size());
+ ASSERT_EQUALS(0U, q3.getPositivePhr().size());
+
+ ASSERT_EQUALS(1U, q1.getNegatedPhr().size());
+ ASSERT_EQUALS(1U, q2.getNegatedPhr().size());
+ ASSERT_EQUALS(1U, q3.getNegatedPhr().size());
+}
+
+// Test textIndexVersion:1 query with language "english". This invokes the standard English
+// stemmer and stopword list.
+TEST(FTSQuery, TextIndexVersion1LanguageEnglish) {
+ FTSQuery q;
+ ASSERT(q.parse("the running", "english", false, TEXT_INDEX_VERSION_1).isOK());
+ ASSERT_EQUALS(1U, q.getPositiveTerms().size());
+ ASSERT_EQUALS("run", *q.getPositiveTerms().begin());
+ ASSERT_EQUALS(0U, q.getNegatedTerms().size());
+ ASSERT_EQUALS(0U, q.getPositivePhr().size());
+ ASSERT_EQUALS(0U, q.getNegatedPhr().size());
+}
+
+// Test textIndexVersion:1 query with language "eng". "eng" uses the English stemmer, and
+// no stopword list.
+TEST(FTSQuery, TextIndexVersion1LanguageEng) {
+ FTSQuery q;
+ ASSERT(q.parse("the running", "eng", false, TEXT_INDEX_VERSION_1).isOK());
+ ASSERT_EQUALS(2U, q.getPositiveTerms().size());
+ ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "the"));
+ ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "run"));
+ ASSERT_EQUALS(0U, q.getNegatedTerms().size());
+ ASSERT_EQUALS(0U, q.getPositivePhr().size());
+ ASSERT_EQUALS(0U, q.getNegatedPhr().size());
+}
+
+// Test textIndexVersion:1 query with language "invalid". No stemming will be performed,
+// and no stopword list will be used.
+TEST(FTSQuery, TextIndexVersion1LanguageInvalid) {
+ FTSQuery q;
+ ASSERT(q.parse("the running", "invalid", false, TEXT_INDEX_VERSION_1).isOK());
+ ASSERT_EQUALS(2U, q.getPositiveTerms().size());
+ ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "the"));
+ ASSERT_EQUALS(1,
+ std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "running"));
+ ASSERT_EQUALS(0U, q.getNegatedTerms().size());
+ ASSERT_EQUALS(0U, q.getPositivePhr().size());
+ ASSERT_EQUALS(0U, q.getNegatedPhr().size());
+}
+}
}
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index 274d9a6d6ba..eb7e018b522 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -40,457 +40,408 @@
namespace mongo {
- namespace fts {
-
- using std::map;
- using std::string;
- using namespace mongoutils;
-
- const double DEFAULT_WEIGHT = 1;
- const double MAX_WEIGHT = 1000000000;
- const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000;
-
- namespace {
- // Default language. Used for new indexes.
- const std::string moduleDefaultLanguage( "english" );
-
- /** Validate the given language override string. */
- bool validateOverride( const string& override ) {
- // The override field can't be empty, can't be prefixed with a dollar sign, and
- // can't contain a dot.
- return !override.empty() &&
- override[0] != '$' &&
- override.find('.') == std::string::npos;
- }
- }
-
- FTSSpec::FTSSpec( const BSONObj& indexInfo ) {
- // indexInfo is a text index spec. Text index specs pass through fixSpec() before
- // being saved to the system.indexes collection. fixSpec() enforces a schema, such that
- // required fields must exist and be of the correct type (e.g. weights,
- // textIndexVersion).
- massert( 16739, "found invalid spec for text index",
- indexInfo["weights"].isABSONObj() );
- BSONElement textIndexVersionElt = indexInfo["textIndexVersion"];
- massert( 17367,
- "found invalid spec for text index, expected number for textIndexVersion",
- textIndexVersionElt.isNumber() );
-
- // We currently support TEXT_INDEX_VERSION_1 (deprecated) and TEXT_INDEX_VERSION_2.
- // Reject all other values.
- massert( 17364,
- str::stream() << "attempt to use unsupported textIndexVersion " <<
- textIndexVersionElt.numberInt() << "; versions supported: " <<
- TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1,
- textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 ||
- textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_1 );
-
- _textIndexVersion = ( textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 ) ?
- TEXT_INDEX_VERSION_2 : TEXT_INDEX_VERSION_1;
-
- // Initialize _defaultLanguage. Note that the FTSLanguage constructor requires
- // textIndexVersion, since language parsing is version-specific.
- auto indexLanguage = indexInfo["default_language"].String();
- auto swl = FTSLanguage::make(indexLanguage , _textIndexVersion );
-
- // This can fail if the user originally created the text index under an instance of
- // MongoDB that supports different languages then the current instance
- // TODO: consder propagating the index ns to here to improve the error message
- uassert(28682,
- str::stream() << "Unrecognized language " << indexLanguage <<
- " found for text index. Verify mongod was started with the"
- " correct options.",
- swl.getStatus().isOK());
- _defaultLanguage = swl.getValue();
-
- _languageOverrideField = indexInfo["language_override"].valuestrsafe();
-
- _wildcard = false;
-
- // in this block we fill in the _weights map
- {
- BSONObjIterator i( indexInfo["weights"].Obj() );
- while ( i.more() ) {
- BSONElement e = i.next();
- verify( e.isNumber() );
-
- if ( WILDCARD == e.fieldName() ) {
- _wildcard = true;
- }
- else {
- double num = e.number();
- _weights[ e.fieldName() ] = num;
- verify( num > 0 && num < MAX_WORD_WEIGHT );
- }
- }
- verify( _wildcard || _weights.size() );
- }
-
- // extra information
- {
- BSONObj keyPattern = indexInfo["key"].Obj();
- verify( keyPattern.nFields() >= 2 );
- BSONObjIterator i( keyPattern );
+namespace fts {
- bool passedFTS = false;
+using std::map;
+using std::string;
+using namespace mongoutils;
- while ( i.more() ) {
- BSONElement e = i.next();
- if ( str::equals( e.fieldName(), "_fts" ) ||
- str::equals( e.fieldName(), "_ftsx" ) ) {
- passedFTS = true;
- continue;
- }
+const double DEFAULT_WEIGHT = 1;
+const double MAX_WEIGHT = 1000000000;
+const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000;
- if ( passedFTS )
- _extraAfter.push_back( e.fieldName() );
- else
- _extraBefore.push_back( e.fieldName() );
- }
+namespace {
+// Default language. Used for new indexes.
+const std::string moduleDefaultLanguage("english");
- }
- }
+/** Validate the given language override string. */
+bool validateOverride(const string& override) {
+ // The override field can't be empty, can't be prefixed with a dollar sign, and
+ // can't contain a dot.
+ return !override.empty() && override[0] != '$' && override.find('.') == std::string::npos;
+}
+}
- const FTSLanguage* FTSSpec::_getLanguageToUseV2( const BSONObj& userDoc,
- const FTSLanguage* currentLanguage ) const {
- BSONElement e = userDoc[_languageOverrideField];
- if ( e.eoo() ) {
- return currentLanguage;
+FTSSpec::FTSSpec(const BSONObj& indexInfo) {
+ // indexInfo is a text index spec. Text index specs pass through fixSpec() before
+ // being saved to the system.indexes collection. fixSpec() enforces a schema, such that
+ // required fields must exist and be of the correct type (e.g. weights,
+ // textIndexVersion).
+ massert(16739, "found invalid spec for text index", indexInfo["weights"].isABSONObj());
+ BSONElement textIndexVersionElt = indexInfo["textIndexVersion"];
+ massert(17367,
+ "found invalid spec for text index, expected number for textIndexVersion",
+ textIndexVersionElt.isNumber());
+
+ // We currently support TEXT_INDEX_VERSION_1 (deprecated) and TEXT_INDEX_VERSION_2.
+ // Reject all other values.
+ massert(17364,
+ str::stream() << "attempt to use unsupported textIndexVersion "
+ << textIndexVersionElt.numberInt() << "; versions supported: "
+ << TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1,
+ textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 ||
+ textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_1);
+
+ _textIndexVersion = (textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2)
+ ? TEXT_INDEX_VERSION_2
+ : TEXT_INDEX_VERSION_1;
+
+ // Initialize _defaultLanguage. Note that the FTSLanguage constructor requires
+ // textIndexVersion, since language parsing is version-specific.
+ auto indexLanguage = indexInfo["default_language"].String();
+ auto swl = FTSLanguage::make(indexLanguage, _textIndexVersion);
+
+ // This can fail if the user originally created the text index under an instance of
+ // MongoDB that supports different languages then the current instance
+ // TODO: consder propagating the index ns to here to improve the error message
+ uassert(28682,
+ str::stream() << "Unrecognized language " << indexLanguage
+ << " found for text index. Verify mongod was started with the"
+ " correct options.",
+ swl.getStatus().isOK());
+ _defaultLanguage = swl.getValue();
+
+ _languageOverrideField = indexInfo["language_override"].valuestrsafe();
+
+ _wildcard = false;
+
+ // in this block we fill in the _weights map
+ {
+ BSONObjIterator i(indexInfo["weights"].Obj());
+ while (i.more()) {
+ BSONElement e = i.next();
+ verify(e.isNumber());
+
+ if (WILDCARD == e.fieldName()) {
+ _wildcard = true;
+ } else {
+ double num = e.number();
+ _weights[e.fieldName()] = num;
+ verify(num > 0 && num < MAX_WORD_WEIGHT);
}
- uassert( 17261,
- "found language override field in document with non-string type",
- e.type() == mongo::String );
- StatusWithFTSLanguage swl = FTSLanguage::make( e.String(), TEXT_INDEX_VERSION_2 );
- uassert( 17262,
- "language override unsupported: " + e.String(),
- swl.getStatus().isOK() );
- return swl.getValue();
}
+ verify(_wildcard || _weights.size());
+ }
- void FTSSpec::scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const {
- if ( _textIndexVersion == TEXT_INDEX_VERSION_1 ) {
- return _scoreDocumentV1( obj, term_freqs );
- }
+ // extra information
+ {
+ BSONObj keyPattern = indexInfo["key"].Obj();
+ verify(keyPattern.nFields() >= 2);
+ BSONObjIterator i(keyPattern);
- FTSElementIterator it( *this, obj );
+ bool passedFTS = false;
- while ( it.more() ) {
- FTSIteratorValue val = it.next();
- std::unique_ptr<FTSTokenizer> tokenizer(val._language->createTokenizer());
- _scoreStringV2( tokenizer.get(), val._text, term_freqs, val._weight );
+ while (i.more()) {
+ BSONElement e = i.next();
+ if (str::equals(e.fieldName(), "_fts") || str::equals(e.fieldName(), "_ftsx")) {
+ passedFTS = true;
+ continue;
}
+
+ if (passedFTS)
+ _extraAfter.push_back(e.fieldName());
+ else
+ _extraBefore.push_back(e.fieldName());
}
+ }
+}
- void FTSSpec::_scoreStringV2( FTSTokenizer* tokenizer,
- StringData raw,
- TermFrequencyMap* docScores,
- double weight ) const {
+const FTSLanguage* FTSSpec::_getLanguageToUseV2(const BSONObj& userDoc,
+ const FTSLanguage* currentLanguage) const {
+ BSONElement e = userDoc[_languageOverrideField];
+ if (e.eoo()) {
+ return currentLanguage;
+ }
+ uassert(17261,
+ "found language override field in document with non-string type",
+ e.type() == mongo::String);
+ StatusWithFTSLanguage swl = FTSLanguage::make(e.String(), TEXT_INDEX_VERSION_2);
+ uassert(17262, "language override unsupported: " + e.String(), swl.getStatus().isOK());
+ return swl.getValue();
+}
- ScoreHelperMap terms;
+void FTSSpec::scoreDocument(const BSONObj& obj, TermFrequencyMap* term_freqs) const {
+ if (_textIndexVersion == TEXT_INDEX_VERSION_1) {
+ return _scoreDocumentV1(obj, term_freqs);
+ }
- unsigned numTokens = 0;
+ FTSElementIterator it(*this, obj);
- tokenizer->reset(raw.rawData(), FTSTokenizer::FilterStopWords );
+ while (it.more()) {
+ FTSIteratorValue val = it.next();
+ std::unique_ptr<FTSTokenizer> tokenizer(val._language->createTokenizer());
+ _scoreStringV2(tokenizer.get(), val._text, term_freqs, val._weight);
+ }
+}
- while (tokenizer->moveNext()) {
- string term = tokenizer->get().toString();
+void FTSSpec::_scoreStringV2(FTSTokenizer* tokenizer,
+ StringData raw,
+ TermFrequencyMap* docScores,
+ double weight) const {
+ ScoreHelperMap terms;
- ScoreHelperStruct& data = terms[term];
+ unsigned numTokens = 0;
- if ( data.exp ) {
- data.exp *= 2;
- }
- else {
- data.exp = 1;
- }
- data.count += 1;
- data.freq += ( 1 / data.exp );
- numTokens++;
- }
+ tokenizer->reset(raw.rawData(), FTSTokenizer::FilterStopWords);
- for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) {
+ while (tokenizer->moveNext()) {
+ string term = tokenizer->get().toString();
- const string& term = i->first;
- const ScoreHelperStruct& data = i->second;
+ ScoreHelperStruct& data = terms[term];
- // in order to adjust weights as a function of term count as it
- // relates to total field length. ie. is this the only word or
- // a frequently occuring term? or does it only show up once in
- // a long block of text?
+ if (data.exp) {
+ data.exp *= 2;
+ } else {
+ data.exp = 1;
+ }
+ data.count += 1;
+ data.freq += (1 / data.exp);
+ numTokens++;
+ }
- double coeff = ( 0.5 * data.count / numTokens ) + 0.5;
+ for (ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i) {
+ const string& term = i->first;
+ const ScoreHelperStruct& data = i->second;
- // if term is identical to the raw form of the
- // field (untokenized) give it a small boost.
- double adjustment = 1;
- if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) )
- adjustment += 0.1;
+ // in order to adjust weights as a function of term count as it
+ // relates to total field length. ie. is this the only word or
+ // a frequently occuring term? or does it only show up once in
+ // a long block of text?
- double& score = (*docScores)[term];
- score += ( weight * data.freq * coeff * adjustment );
- verify( score <= MAX_WEIGHT );
- }
- }
+ double coeff = (0.5 * data.count / numTokens) + 0.5;
- Status FTSSpec::getIndexPrefix( const BSONObj& query, BSONObj* out ) const {
- if ( numExtraBefore() == 0 ) {
- *out = BSONObj();
- return Status::OK();
- }
+ // if term is identical to the raw form of the
+ // field (untokenized) give it a small boost.
+ double adjustment = 1;
+ if (raw.size() == term.length() && raw.equalCaseInsensitive(term))
+ adjustment += 0.1;
- BSONObjBuilder b;
- for ( unsigned i = 0; i < numExtraBefore(); i++ ) {
- BSONElement e = query.getFieldDotted(extraBefore(i));
- if ( e.eoo() )
- return Status( ErrorCodes::BadValue,
- str::stream()
- << "need have an equality filter on: "
- << extraBefore(i) );
-
- if ( e.isABSONObj() && e.Obj().firstElement().getGtLtOp( -1 ) != -1 )
- return Status( ErrorCodes::BadValue,
- str::stream()
- << "need have an equality filter on: "
- << extraBefore(i) );
-
- b.append( e );
- }
- *out = b.obj();
- return Status::OK();
- }
+ double& score = (*docScores)[term];
+ score += (weight * data.freq * coeff * adjustment);
+ verify(score <= MAX_WEIGHT);
+ }
+}
- namespace {
- void _addFTSStuff( BSONObjBuilder* b ) {
- b->append( "_fts", INDEX_NAME );
- b->append( "_ftsx", 1 );
- }
+Status FTSSpec::getIndexPrefix(const BSONObj& query, BSONObj* out) const {
+ if (numExtraBefore() == 0) {
+ *out = BSONObj();
+ return Status::OK();
+ }
- void verifyFieldNameNotReserved( StringData s ) {
- uassert( 17289,
- "text index with reserved fields _fts/_ftsx not allowed",
- s != "_fts" && s != "_ftsx" );
- }
- }
+ BSONObjBuilder b;
+ for (unsigned i = 0; i < numExtraBefore(); i++) {
+ BSONElement e = query.getFieldDotted(extraBefore(i));
+ if (e.eoo())
+ return Status(ErrorCodes::BadValue,
+ str::stream() << "need have an equality filter on: " << extraBefore(i));
- BSONObj FTSSpec::fixSpec( const BSONObj& spec ) {
- if ( spec["textIndexVersion"].numberInt() == TEXT_INDEX_VERSION_1 ) {
- return _fixSpecV1( spec );
- }
+ if (e.isABSONObj() && e.Obj().firstElement().getGtLtOp(-1) != -1)
+ return Status(ErrorCodes::BadValue,
+ str::stream() << "need have an equality filter on: " << extraBefore(i));
- map<string,int> m;
-
- BSONObj keyPattern;
- {
- BSONObjBuilder b;
-
- // Populate m and keyPattern.
- {
- bool addedFtsStuff = false;
- BSONObjIterator i( spec["key"].Obj() );
- while ( i.more() ) {
- BSONElement e = i.next();
- if ( str::equals( e.fieldName(), "_fts" ) ) {
- uassert( 17271,
- "expecting _fts:\"text\"",
- INDEX_NAME == e.valuestrsafe() );
- addedFtsStuff = true;
- b.append( e );
- }
- else if ( str::equals( e.fieldName(), "_ftsx" ) ) {
- uassert( 17272, "expecting _ftsx:1", e.numberInt() == 1 );
- b.append( e );
- }
- else if ( e.type() == String && INDEX_NAME == e.valuestr() ) {
-
- if ( !addedFtsStuff ) {
- _addFTSStuff( &b );
- addedFtsStuff = true;
- }
-
- m[e.fieldName()] = 1;
- }
- else {
- uassert( 17273,
- "expected value 1 or -1 for non-text key in compound index",
- e.numberInt() == 1 || e.numberInt() == -1 );
- b.append( e );
- }
- }
- verify( addedFtsStuff );
- }
- keyPattern = b.obj();
-
- // Verify that index key is in the correct format: extraBefore fields, then text
- // fields, then extraAfter fields.
- {
- BSONObjIterator i( spec["key"].Obj() );
- verify( i.more() );
- BSONElement e = i.next();
-
- // extraBefore fields
- while ( String != e.type() ) {
- verifyFieldNameNotReserved( e.fieldNameStringData() );
- verify( i.more() );
- e = i.next();
- }
+ b.append(e);
+ }
+ *out = b.obj();
+ return Status::OK();
+}
- // text fields
- bool alreadyFixed = str::equals( e.fieldName(), "_fts" );
- if ( alreadyFixed ) {
- uassert( 17288, "expected _ftsx after _fts", i.more() );
- e = i.next();
- uassert( 17274,
- "expected _ftsx after _fts",
- str::equals( e.fieldName(), "_ftsx" ) );
- e = i.next();
- }
- else {
- do {
- verifyFieldNameNotReserved( e.fieldNameStringData() );
- e = i.next();
- } while ( !e.eoo() && e.type() == String );
- }
+namespace {
+void _addFTSStuff(BSONObjBuilder* b) {
+ b->append("_fts", INDEX_NAME);
+ b->append("_ftsx", 1);
+}
- // extraAfterFields
- while ( !e.eoo() ) {
- uassert( 17389,
- "'text' fields in index must all be adjacent",
- e.type() != String );
- verifyFieldNameNotReserved( e.fieldNameStringData() );
- e = i.next();
- }
- }
+void verifyFieldNameNotReserved(StringData s) {
+ uassert(17289,
+ "text index with reserved fields _fts/_ftsx not allowed",
+ s != "_fts" && s != "_ftsx");
+}
+}
- }
+BSONObj FTSSpec::fixSpec(const BSONObj& spec) {
+ if (spec["textIndexVersion"].numberInt() == TEXT_INDEX_VERSION_1) {
+ return _fixSpecV1(spec);
+ }
- if ( spec["weights"].type() == Object ) {
- BSONObjIterator i( spec["weights"].Obj() );
- while ( i.more() ) {
- BSONElement e = i.next();
- uassert( 17283,
- "weight for text index needs numeric type",
- e.isNumber() );
- m[e.fieldName()] = e.numberInt();
- }
- }
- else if ( spec["weights"].str() == WILDCARD ) {
- m[WILDCARD] = 1;
- }
- else if ( !spec["weights"].eoo() ) {
- uasserted( 17284, "text index option 'weights' must be an object" );
- }
+ map<string, int> m;
- BSONObj weights;
- {
- BSONObjBuilder b;
- for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) {
- uassert( 16674, "score for word too high",
- i->second > 0 && i->second < MAX_WORD_WEIGHT );
-
- // Verify weight refers to a valid field.
- if ( i->first != "$**" ) {
- FieldRef keyField( i->first );
- uassert( 17294,
- "weight cannot be on an empty field",
- keyField.numParts() != 0 );
- for ( size_t partNum = 0; partNum < keyField.numParts(); partNum++ ) {
- StringData part = keyField.getPart(partNum);
- uassert( 17291,
- "weight cannot have empty path component",
- !part.empty() );
- uassert( 17292,
- "weight cannot have path component with $ prefix",
- !part.startsWith( "$" ) );
- }
+ BSONObj keyPattern;
+ {
+ BSONObjBuilder b;
+
+ // Populate m and keyPattern.
+ {
+ bool addedFtsStuff = false;
+ BSONObjIterator i(spec["key"].Obj());
+ while (i.more()) {
+ BSONElement e = i.next();
+ if (str::equals(e.fieldName(), "_fts")) {
+ uassert(17271, "expecting _fts:\"text\"", INDEX_NAME == e.valuestrsafe());
+ addedFtsStuff = true;
+ b.append(e);
+ } else if (str::equals(e.fieldName(), "_ftsx")) {
+ uassert(17272, "expecting _ftsx:1", e.numberInt() == 1);
+ b.append(e);
+ } else if (e.type() == String && INDEX_NAME == e.valuestr()) {
+ if (!addedFtsStuff) {
+ _addFTSStuff(&b);
+ addedFtsStuff = true;
}
- b.append( i->first, i->second );
+ m[e.fieldName()] = 1;
+ } else {
+ uassert(17273,
+ "expected value 1 or -1 for non-text key in compound index",
+ e.numberInt() == 1 || e.numberInt() == -1);
+ b.append(e);
}
- weights = b.obj();
- }
-
- BSONElement default_language_elt = spec["default_language"];
- string default_language( default_language_elt.str() );
- if ( default_language_elt.eoo() ) {
- default_language = moduleDefaultLanguage;
}
- else {
- uassert( 17263,
- "default_language needs a string type",
- default_language_elt.type() == String );
+ verify(addedFtsStuff);
+ }
+ keyPattern = b.obj();
+
+ // Verify that index key is in the correct format: extraBefore fields, then text
+ // fields, then extraAfter fields.
+ {
+ BSONObjIterator i(spec["key"].Obj());
+ verify(i.more());
+ BSONElement e = i.next();
+
+ // extraBefore fields
+ while (String != e.type()) {
+ verifyFieldNameNotReserved(e.fieldNameStringData());
+ verify(i.more());
+ e = i.next();
}
- uassert( 17264,
- "default_language is not valid",
- FTSLanguage::make( default_language,
- TEXT_INDEX_VERSION_2 ).getStatus().isOK() );
-
- BSONElement language_override_elt = spec["language_override"];
- string language_override( language_override_elt.str() );
- if ( language_override_elt.eoo() ) {
- language_override = "language";
+
+ // text fields
+ bool alreadyFixed = str::equals(e.fieldName(), "_fts");
+ if (alreadyFixed) {
+ uassert(17288, "expected _ftsx after _fts", i.more());
+ e = i.next();
+ uassert(17274, "expected _ftsx after _fts", str::equals(e.fieldName(), "_ftsx"));
+ e = i.next();
+ } else {
+ do {
+ verifyFieldNameNotReserved(e.fieldNameStringData());
+ e = i.next();
+ } while (!e.eoo() && e.type() == String);
}
- else {
- uassert( 17136,
- "language_override is not valid",
- language_override_elt.type() == String
- && validateOverride( language_override ) );
+
+ // extraAfterFields
+ while (!e.eoo()) {
+ uassert(17389, "'text' fields in index must all be adjacent", e.type() != String);
+ verifyFieldNameNotReserved(e.fieldNameStringData());
+ e = i.next();
}
+ }
+ }
- int version = -1;
- int textIndexVersion = TEXT_INDEX_VERSION_2;
+ if (spec["weights"].type() == Object) {
+ BSONObjIterator i(spec["weights"].Obj());
+ while (i.more()) {
+ BSONElement e = i.next();
+ uassert(17283, "weight for text index needs numeric type", e.isNumber());
+ m[e.fieldName()] = e.numberInt();
+ }
+ } else if (spec["weights"].str() == WILDCARD) {
+ m[WILDCARD] = 1;
+ } else if (!spec["weights"].eoo()) {
+ uasserted(17284, "text index option 'weights' must be an object");
+ }
- BSONObjBuilder b;
- BSONObjIterator i( spec );
- while ( i.more() ) {
- BSONElement e = i.next();
- if ( str::equals( e.fieldName(), "key" ) ) {
- b.append( "key", keyPattern );
- }
- else if ( str::equals( e.fieldName(), "weights" ) ) {
- b.append( "weights", weights );
- weights = BSONObj();
- }
- else if ( str::equals( e.fieldName(), "default_language" ) ) {
- b.append( "default_language", default_language);
- default_language = "";
- }
- else if ( str::equals( e.fieldName(), "language_override" ) ) {
- b.append( "language_override", language_override);
- language_override = "";
- }
- else if ( str::equals( e.fieldName(), "v" ) ) {
- version = e.numberInt();
- }
- else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) {
- uassert( 17293,
- "text index option 'textIndexVersion' must be a number",
- e.isNumber() );
- textIndexVersion = e.numberInt();
- uassert( 16730,
- str::stream() << "bad textIndexVersion: " << textIndexVersion,
- textIndexVersion == TEXT_INDEX_VERSION_2 );
- }
- else {
- b.append( e );
+ BSONObj weights;
+ {
+ BSONObjBuilder b;
+ for (map<string, int>::iterator i = m.begin(); i != m.end(); ++i) {
+ uassert(16674, "score for word too high", i->second > 0 && i->second < MAX_WORD_WEIGHT);
+
+ // Verify weight refers to a valid field.
+ if (i->first != "$**") {
+ FieldRef keyField(i->first);
+ uassert(17294, "weight cannot be on an empty field", keyField.numParts() != 0);
+ for (size_t partNum = 0; partNum < keyField.numParts(); partNum++) {
+ StringData part = keyField.getPart(partNum);
+ uassert(17291, "weight cannot have empty path component", !part.empty());
+ uassert(17292,
+ "weight cannot have path component with $ prefix",
+ !part.startsWith("$"));
}
}
- if ( !weights.isEmpty() ) {
- b.append( "weights", weights );
- }
- if ( !default_language.empty() ) {
- b.append( "default_language", default_language);
- }
- if ( !language_override.empty() ) {
- b.append( "language_override", language_override);
- }
- if ( version >= 0 ) {
- b.append( "v", version );
- }
- b.append( "textIndexVersion", textIndexVersion );
+ b.append(i->first, i->second);
+ }
+ weights = b.obj();
+ }
+
+ BSONElement default_language_elt = spec["default_language"];
+ string default_language(default_language_elt.str());
+ if (default_language_elt.eoo()) {
+ default_language = moduleDefaultLanguage;
+ } else {
+ uassert(
+ 17263, "default_language needs a string type", default_language_elt.type() == String);
+ }
+ uassert(17264,
+ "default_language is not valid",
+ FTSLanguage::make(default_language, TEXT_INDEX_VERSION_2).getStatus().isOK());
+
+ BSONElement language_override_elt = spec["language_override"];
+ string language_override(language_override_elt.str());
+ if (language_override_elt.eoo()) {
+ language_override = "language";
+ } else {
+ uassert(17136,
+ "language_override is not valid",
+ language_override_elt.type() == String && validateOverride(language_override));
+ }
- return b.obj();
+ int version = -1;
+ int textIndexVersion = TEXT_INDEX_VERSION_2;
+
+ BSONObjBuilder b;
+ BSONObjIterator i(spec);
+ while (i.more()) {
+ BSONElement e = i.next();
+ if (str::equals(e.fieldName(), "key")) {
+ b.append("key", keyPattern);
+ } else if (str::equals(e.fieldName(), "weights")) {
+ b.append("weights", weights);
+ weights = BSONObj();
+ } else if (str::equals(e.fieldName(), "default_language")) {
+ b.append("default_language", default_language);
+ default_language = "";
+ } else if (str::equals(e.fieldName(), "language_override")) {
+ b.append("language_override", language_override);
+ language_override = "";
+ } else if (str::equals(e.fieldName(), "v")) {
+ version = e.numberInt();
+ } else if (str::equals(e.fieldName(), "textIndexVersion")) {
+ uassert(17293, "text index option 'textIndexVersion' must be a number", e.isNumber());
+ textIndexVersion = e.numberInt();
+ uassert(16730,
+ str::stream() << "bad textIndexVersion: " << textIndexVersion,
+ textIndexVersion == TEXT_INDEX_VERSION_2);
+ } else {
+ b.append(e);
}
+ }
+ if (!weights.isEmpty()) {
+ b.append("weights", weights);
+ }
+ if (!default_language.empty()) {
+ b.append("default_language", default_language);
}
+ if (!language_override.empty()) {
+ b.append("language_override", language_override);
+ }
+ if (version >= 0) {
+ b.append("v", version);
+ }
+ b.append("textIndexVersion", textIndexVersion);
+
+ return b.obj();
+}
+}
}
diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h
index 0f17d825dcc..d5cc0b46472 100644
--- a/src/mongo/db/fts/fts_spec.h
+++ b/src/mongo/db/fts/fts_spec.h
@@ -43,136 +43,146 @@
namespace mongo {
- namespace fts {
-
- extern const double MAX_WEIGHT;
- extern const double MAX_WORD_WEIGHT;
- extern const double DEFAULT_WEIGHT;
-
- typedef std::map<std::string,double> Weights; // TODO cool map
- typedef unordered_map<std::string,double> TermFrequencyMap;
-
- struct ScoreHelperStruct {
- ScoreHelperStruct()
- : freq(0), count(0), exp(0){
- }
- double freq;
- double count;
- double exp;
- };
- typedef unordered_map<std::string,ScoreHelperStruct> ScoreHelperMap;
-
- class FTSSpec {
-
- struct Tools {
- Tools( const FTSLanguage& _language,
- const Stemmer* _stemmer,
- const StopWords* _stopwords )
- : language( _language )
- , stemmer( _stemmer )
- , stopwords( _stopwords ) {}
-
- const FTSLanguage& language;
- const Stemmer* stemmer;
- const StopWords* stopwords;
- };
-
- public:
- FTSSpec( const BSONObj& indexInfo );
-
- bool wildcard() const { return _wildcard; }
- const FTSLanguage& defaultLanguage() const { return *_defaultLanguage; }
- const std::string& languageOverrideField() const { return _languageOverrideField; }
-
- size_t numExtraBefore() const { return _extraBefore.size(); }
- const std::string& extraBefore( unsigned i ) const { return _extraBefore[i]; }
-
- size_t numExtraAfter() const { return _extraAfter.size(); }
- const std::string& extraAfter( unsigned i ) const { return _extraAfter[i]; }
-
- /**
- * Calculates term/score pairs for a BSONObj as applied to this spec.
- * @arg obj document to traverse; can be a subdocument or array
- * @arg term_freqs output parameter to store (term,score) results
- */
- void scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const;
-
- /**
- * given a query, pulls out the pieces (in order) that go in the index first
- */
- Status getIndexPrefix( const BSONObj& filter, BSONObj* out ) const;
-
- const Weights& weights() const { return _weights; }
- static BSONObj fixSpec( const BSONObj& spec );
-
- /**
- * Returns text index version.
- */
- TextIndexVersion getTextIndexVersion() const { return _textIndexVersion; }
-
- private:
- //
- // Helper methods. Invoked for TEXT_INDEX_VERSION_2 spec objects only.
- //
-
- /**
- * Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses
- * 'raw' using 'tools', and weights term scores based on 'weight'.
- */
- void _scoreStringV2( FTSTokenizer* tokenizer,
- StringData raw,
- TermFrequencyMap* term_freqs,
- double weight ) const;
-
- public:
- /**
- * Get the language override for the given BSON doc. If no language override is
- * specified, returns currentLanguage.
- */
- const FTSLanguage* _getLanguageToUseV2( const BSONObj& userDoc,
- const FTSLanguage* currentLanguage ) const;
-
- private:
- //
- // Deprecated helper methods. Invoked for TEXT_INDEX_VERSION_1 spec objects only.
- //
-
- void _scoreStringV1( const Tools& tools,
- StringData raw,
- TermFrequencyMap* docScores,
- double weight ) const;
-
- bool _weightV1( StringData field, double* out ) const;
-
- void _scoreRecurseV1( const Tools& tools,
- const BSONObj& obj,
- TermFrequencyMap* term_freqs ) const;
-
- void _scoreDocumentV1( const BSONObj& obj, TermFrequencyMap* term_freqs ) const;
-
- const FTSLanguage& _getLanguageToUseV1( const BSONObj& userDoc ) const;
-
- static BSONObj _fixSpecV1( const BSONObj& spec );
-
- //
- // Instance variables.
- //
-
- TextIndexVersion _textIndexVersion;
+namespace fts {
+
+extern const double MAX_WEIGHT;
+extern const double MAX_WORD_WEIGHT;
+extern const double DEFAULT_WEIGHT;
+
+typedef std::map<std::string, double> Weights;  // TODO: consider a hash map if field-weight lookup becomes hot
+typedef unordered_map<std::string, double> TermFrequencyMap;
+
+struct ScoreHelperStruct {
+ ScoreHelperStruct() : freq(0), count(0), exp(0) {}
+ double freq;
+ double count;
+ double exp;
+};
+typedef unordered_map<std::string, ScoreHelperStruct> ScoreHelperMap;
+
+class FTSSpec {
+ struct Tools {
+ Tools(const FTSLanguage& _language, const Stemmer* _stemmer, const StopWords* _stopwords)
+ : language(_language), stemmer(_stemmer), stopwords(_stopwords) {}
+
+ const FTSLanguage& language;
+ const Stemmer* stemmer;
+ const StopWords* stopwords;
+ };
+
+public:
+ FTSSpec(const BSONObj& indexInfo);
+
+ bool wildcard() const {
+ return _wildcard;
+ }
+ const FTSLanguage& defaultLanguage() const {
+ return *_defaultLanguage;
+ }
+ const std::string& languageOverrideField() const {
+ return _languageOverrideField;
+ }
+
+ size_t numExtraBefore() const {
+ return _extraBefore.size();
+ }
+ const std::string& extraBefore(unsigned i) const {
+ return _extraBefore[i];
+ }
+
+ size_t numExtraAfter() const {
+ return _extraAfter.size();
+ }
+ const std::string& extraAfter(unsigned i) const {
+ return _extraAfter[i];
+ }
- const FTSLanguage* _defaultLanguage;
- std::string _languageOverrideField;
- bool _wildcard;
+ /**
+ * Calculates term/score pairs for a BSONObj as applied to this spec.
+ * @arg obj document to traverse; can be a subdocument or array
+ * @arg term_freqs output parameter to store (term,score) results
+ */
+ void scoreDocument(const BSONObj& obj, TermFrequencyMap* term_freqs) const;
- // mapping : fieldname -> weight
- Weights _weights;
-
- // Prefix compound key - used to partition search index
- std::vector<std::string> _extraBefore;
+ /**
+ * given a query, pulls out the pieces (in order) that go in the index first
+ */
+ Status getIndexPrefix(const BSONObj& filter, BSONObj* out) const;
- // Suffix compound key - used for covering index behavior
- std::vector<std::string> _extraAfter;
- };
+ const Weights& weights() const {
+ return _weights;
+ }
+ static BSONObj fixSpec(const BSONObj& spec);
+ /**
+ * Returns text index version.
+ */
+ TextIndexVersion getTextIndexVersion() const {
+ return _textIndexVersion;
}
+
+private:
+ //
+ // Helper methods. Invoked for TEXT_INDEX_VERSION_2 spec objects only.
+ //
+
+ /**
+ * Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses
+ * 'raw' using 'tools', and weights term scores based on 'weight'.
+ */
+ void _scoreStringV2(FTSTokenizer* tokenizer,
+ StringData raw,
+ TermFrequencyMap* term_freqs,
+ double weight) const;
+
+public:
+ /**
+ * Get the language override for the given BSON doc. If no language override is
+ * specified, returns currentLanguage.
+ */
+ const FTSLanguage* _getLanguageToUseV2(const BSONObj& userDoc,
+ const FTSLanguage* currentLanguage) const;
+
+private:
+ //
+ // Deprecated helper methods. Invoked for TEXT_INDEX_VERSION_1 spec objects only.
+ //
+
+ void _scoreStringV1(const Tools& tools,
+ StringData raw,
+ TermFrequencyMap* docScores,
+ double weight) const;
+
+ bool _weightV1(StringData field, double* out) const;
+
+ void _scoreRecurseV1(const Tools& tools,
+ const BSONObj& obj,
+ TermFrequencyMap* term_freqs) const;
+
+ void _scoreDocumentV1(const BSONObj& obj, TermFrequencyMap* term_freqs) const;
+
+ const FTSLanguage& _getLanguageToUseV1(const BSONObj& userDoc) const;
+
+ static BSONObj _fixSpecV1(const BSONObj& spec);
+
+ //
+ // Instance variables.
+ //
+
+ TextIndexVersion _textIndexVersion;
+
+ const FTSLanguage* _defaultLanguage;
+ std::string _languageOverrideField;
+ bool _wildcard;
+
+ // mapping : fieldname -> weight
+ Weights _weights;
+
+ // Prefix compound key - used to partition search index
+ std::vector<std::string> _extraBefore;
+
+ // Suffix compound key - used for covering index behavior
+ std::vector<std::string> _extraAfter;
+};
+}
}
diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp
index a2dc1dc2489..4a161c8614a 100644
--- a/src/mongo/db/fts/fts_spec_legacy.cpp
+++ b/src/mongo/db/fts/fts_spec_legacy.cpp
@@ -33,290 +33,268 @@
namespace mongo {
- namespace fts {
+namespace fts {
- //
- // This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1
- // text indexes.
- //
+//
+// This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1
+// text indexes.
+//
- using std::map;
- using std::string;
- using namespace mongoutils;
+using std::map;
+using std::string;
+using namespace mongoutils;
- namespace {
- void _addFTSStuff( BSONObjBuilder* b ) {
- b->append( "_fts", INDEX_NAME );
- b->append( "_ftsx", 1 );
- }
- }
+namespace {
+void _addFTSStuff(BSONObjBuilder* b) {
+ b->append("_fts", INDEX_NAME);
+ b->append("_ftsx", 1);
+}
+}
- const FTSLanguage& FTSSpec::_getLanguageToUseV1( const BSONObj& userDoc ) const {
- BSONElement e = userDoc[_languageOverrideField];
- if ( e.type() == String ) {
- const char * x = e.valuestrsafe();
- if ( strlen( x ) > 0 ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( x, TEXT_INDEX_VERSION_1 );
- dassert( swl.isOK() ); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail.
- return *swl.getValue();
- }
- }
- return *_defaultLanguage;
+const FTSLanguage& FTSSpec::_getLanguageToUseV1(const BSONObj& userDoc) const {
+ BSONElement e = userDoc[_languageOverrideField];
+ if (e.type() == String) {
+ const char* x = e.valuestrsafe();
+ if (strlen(x) > 0) {
+ StatusWithFTSLanguage swl = FTSLanguage::make(x, TEXT_INDEX_VERSION_1);
+ dassert(swl.isOK()); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail.
+ return *swl.getValue();
}
+ }
+ return *_defaultLanguage;
+}
- void FTSSpec::_scoreStringV1( const Tools& tools,
- StringData raw,
- TermFrequencyMap* docScores,
- double weight ) const {
-
- ScoreHelperMap terms;
+void FTSSpec::_scoreStringV1(const Tools& tools,
+ StringData raw,
+ TermFrequencyMap* docScores,
+ double weight) const {
+ ScoreHelperMap terms;
- unsigned numTokens = 0;
+ unsigned numTokens = 0;
- Tokenizer i( &tools.language, raw );
- while ( i.more() ) {
- Token t = i.next();
- if ( t.type != Token::TEXT )
- continue;
+ Tokenizer i(&tools.language, raw);
+ while (i.more()) {
+ Token t = i.next();
+ if (t.type != Token::TEXT)
+ continue;
- string term = tolowerString( t.data );
- if ( tools.stopwords->isStopWord( term ) )
- continue;
- term = tools.stemmer->stem( term );
+ string term = tolowerString(t.data);
+ if (tools.stopwords->isStopWord(term))
+ continue;
+ term = tools.stemmer->stem(term);
- ScoreHelperStruct& data = terms[term];
+ ScoreHelperStruct& data = terms[term];
- if ( data.exp )
- data.exp *= 2;
- else
- data.exp = 1;
- data.count += 1;
- data.freq += ( 1 / data.exp );
+ if (data.exp)
+ data.exp *= 2;
+ else
+ data.exp = 1;
+ data.count += 1;
+ data.freq += (1 / data.exp);
- numTokens++;
- }
+ numTokens++;
+ }
- for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) {
+ for (ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i) {
+ const string& term = i->first;
+ const ScoreHelperStruct& data = i->second;
- const string& term = i->first;
- const ScoreHelperStruct& data = i->second;
+ // in order to adjust weights as a function of term count as it
+ // relates to total field length, i.e. is this the only word or
+ // a frequently occurring term? or does it only show up once in
+ // a long block of text?
- // in order to adjust weights as a function of term count as it
- // relates to total field length. ie. is this the only word or
- // a frequently occuring term? or does it only show up once in
- // a long block of text?
+ double coeff = (0.5 * data.count / numTokens) + 0.5;
- double coeff = ( 0.5 * data.count / numTokens ) + 0.5;
+ // if term is identical to the raw form of the
+ // field (untokenized) give it a small boost.
+ double adjustment = 1;
+ if (raw.size() == term.length() && raw.equalCaseInsensitive(term))
+ adjustment += 0.1;
- // if term is identical to the raw form of the
- // field (untokenized) give it a small boost.
- double adjustment = 1;
- if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) )
- adjustment += 0.1;
+ double& score = (*docScores)[term];
+ score += (weight * data.freq * coeff * adjustment);
+ verify(score <= MAX_WEIGHT);
+ }
+}
- double& score = (*docScores)[term];
- score += ( weight * data.freq * coeff * adjustment );
- verify( score <= MAX_WEIGHT );
- }
- }
+bool FTSSpec::_weightV1(StringData field, double* out) const {
+ Weights::const_iterator i = _weights.find(field.toString());
+ if (i == _weights.end())
+ return false;
+ *out = i->second;
+ return true;
+}
- bool FTSSpec::_weightV1( StringData field, double* out ) const {
- Weights::const_iterator i = _weights.find( field.toString() );
- if ( i == _weights.end() )
- return false;
- *out = i->second;
- return true;
+/*
+ * Recurses over all fields of an obj (document in collection)
+ * and fills term,score map term_freqs
+ * @param tools, tools (language, stemmer, stopwords) used to tokenize and filter terms
+ * @param obj, object being parsed
+ * @param term_freqs, map <term,score> to be filled up
+ */
+void FTSSpec::_scoreRecurseV1(const Tools& tools,
+ const BSONObj& obj,
+ TermFrequencyMap* term_freqs) const {
+ BSONObjIterator j(obj);
+ while (j.more()) {
+ BSONElement x = j.next();
+
+ if (languageOverrideField() == x.fieldName())
+ continue;
+
+ if (x.type() == String) {
+ double w = 1;
+ _weightV1(x.fieldName(), &w);
+ _scoreStringV1(tools, x.valuestr(), term_freqs, w);
+ } else if (x.isABSONObj()) {
+ _scoreRecurseV1(tools, x.Obj(), term_freqs);
}
+ }
+}
- /*
- * Recurses over all fields of an obj (document in collection)
- * and fills term,score map term_freqs
- * @param tokenizer, tokenizer to tokenize a string into terms
- * @param obj, object being parsed
- * term_freqs, map <term,score> to be filled up
- */
- void FTSSpec::_scoreRecurseV1( const Tools& tools,
- const BSONObj& obj,
- TermFrequencyMap* term_freqs ) const {
- BSONObjIterator j( obj );
- while ( j.more() ) {
- BSONElement x = j.next();
+void FTSSpec::_scoreDocumentV1(const BSONObj& obj, TermFrequencyMap* term_freqs) const {
+ const FTSLanguage& language = _getLanguageToUseV1(obj);
- if ( languageOverrideField() == x.fieldName() )
- continue;
+ Stemmer stemmer(&language);
+ Tools tools(language, &stemmer, StopWords::getStopWords(&language));
- if (x.type() == String) {
- double w = 1;
- _weightV1( x.fieldName(), &w );
- _scoreStringV1(tools, x.valuestr(), term_freqs, w);
- }
- else if ( x.isABSONObj() ) {
- _scoreRecurseV1( tools, x.Obj(), term_freqs);
- }
+ if (wildcard()) {
+ // if * is specified for weight, we can recurse over all fields.
+ _scoreRecurseV1(tools, obj, term_freqs);
+ return;
+ }
+ // otherwise, we need to remember the different weights for each field
+ // and act accordingly (in other words, call _score)
+ for (Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++) {
+ const char* leftOverName = i->first.c_str();
+ // name of field
+ BSONElement e = obj.getFieldDottedOrArray(leftOverName);
+ // weight associated to name of field
+ double weight = i->second;
+
+ if (e.eoo()) {
+ // do nothing
+ } else if (e.type() == Array) {
+ BSONObjIterator j(e.Obj());
+ while (j.more()) {
+ BSONElement x = j.next();
+ if (leftOverName[0] && x.isABSONObj())
+ x = x.Obj().getFieldDotted(leftOverName);
+ if (x.type() == String)
+ _scoreStringV1(tools, x.valuestr(), term_freqs, weight);
}
+ } else if (e.type() == String) {
+ _scoreStringV1(tools, e.valuestr(), term_freqs, weight);
}
+ }
+}
- void FTSSpec::_scoreDocumentV1( const BSONObj& obj,
- TermFrequencyMap* term_freqs ) const {
-
- const FTSLanguage& language = _getLanguageToUseV1( obj );
-
- Stemmer stemmer(&language);
- Tools tools(language, &stemmer, StopWords::getStopWords( &language ));
-
- if ( wildcard() ) {
- // if * is specified for weight, we can recurse over all fields.
- _scoreRecurseV1(tools, obj, term_freqs);
- return;
- }
-
- // otherwise, we need to remember the different weights for each field
- // and act accordingly (in other words, call _score)
- for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++ ) {
- const char * leftOverName = i->first.c_str();
- // name of field
- BSONElement e = obj.getFieldDottedOrArray(leftOverName);
- // weight associated to name of field
- double weight = i->second;
-
- if ( e.eoo() ) {
- // do nothing
- }
- else if ( e.type() == Array ) {
- BSONObjIterator j( e.Obj() );
- while ( j.more() ) {
- BSONElement x = j.next();
- if ( leftOverName[0] && x.isABSONObj() )
- x = x.Obj().getFieldDotted( leftOverName );
- if ( x.type() == String )
- _scoreStringV1( tools, x.valuestr(), term_freqs, weight );
- }
- }
- else if ( e.type() == String ) {
- _scoreStringV1( tools, e.valuestr(), term_freqs, weight );
+BSONObj FTSSpec::_fixSpecV1(const BSONObj& spec) {
+ map<string, int> m;
+
+ BSONObj keyPattern;
+ {
+ BSONObjBuilder b;
+ bool addedFtsStuff = false;
+
+ BSONObjIterator i(spec["key"].Obj());
+ while (i.more()) {
+ BSONElement e = i.next();
+ if (str::equals(e.fieldName(), "_fts") || str::equals(e.fieldName(), "_ftsx")) {
+ addedFtsStuff = true;
+ b.append(e);
+ } else if (e.type() == String &&
+ (str::equals("fts", e.valuestr()) || str::equals("text", e.valuestr()))) {
+ if (!addedFtsStuff) {
+ _addFTSStuff(&b);
+ addedFtsStuff = true;
}
+ m[e.fieldName()] = 1;
+ } else {
+ b.append(e);
}
}
- BSONObj FTSSpec::_fixSpecV1( const BSONObj& spec ) {
- map<string,int> m;
-
- BSONObj keyPattern;
- {
- BSONObjBuilder b;
- bool addedFtsStuff = false;
-
- BSONObjIterator i( spec["key"].Obj() );
- while ( i.more() ) {
- BSONElement e = i.next();
- if ( str::equals( e.fieldName(), "_fts" ) ||
- str::equals( e.fieldName(), "_ftsx" ) ) {
- addedFtsStuff = true;
- b.append( e );
- }
- else if ( e.type() == String &&
- ( str::equals( "fts", e.valuestr() ) ||
- str::equals( "text", e.valuestr() ) ) ) {
-
- if ( !addedFtsStuff ) {
- _addFTSStuff( &b );
- addedFtsStuff = true;
- }
-
- m[e.fieldName()] = 1;
- }
- else {
- b.append( e );
- }
- }
-
- if ( !addedFtsStuff )
- _addFTSStuff( &b );
-
- keyPattern = b.obj();
- }
-
- if ( spec["weights"].isABSONObj() ) {
- BSONObjIterator i( spec["weights"].Obj() );
- while ( i.more() ) {
- BSONElement e = i.next();
- m[e.fieldName()] = e.numberInt();
- }
- }
- else if ( spec["weights"].str() == WILDCARD ) {
- m[WILDCARD] = 1;
- }
-
- BSONObj weights;
- {
- BSONObjBuilder b;
- for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) {
- uassert( 17365, "score for word too high",
- i->second > 0 && i->second < MAX_WORD_WEIGHT );
- b.append( i->first, i->second );
- }
- weights = b.obj();
- }
+ if (!addedFtsStuff)
+ _addFTSStuff(&b);
- string default_language(spec.getStringField("default_language"));
- if ( default_language.empty() )
- default_language = "english";
+ keyPattern = b.obj();
+ }
- string language_override(spec.getStringField("language_override"));
- if ( language_override.empty() )
- language_override = "language";
+ if (spec["weights"].isABSONObj()) {
+ BSONObjIterator i(spec["weights"].Obj());
+ while (i.more()) {
+ BSONElement e = i.next();
+ m[e.fieldName()] = e.numberInt();
+ }
+ } else if (spec["weights"].str() == WILDCARD) {
+ m[WILDCARD] = 1;
+ }
- int version = -1;
- int textIndexVersion = 1;
+ BSONObj weights;
+ {
+ BSONObjBuilder b;
+ for (map<string, int>::iterator i = m.begin(); i != m.end(); ++i) {
+ uassert(17365, "score for word too high", i->second > 0 && i->second < MAX_WORD_WEIGHT);
+ b.append(i->first, i->second);
+ }
+ weights = b.obj();
+ }
- BSONObjBuilder b;
- BSONObjIterator i( spec );
- while ( i.more() ) {
- BSONElement e = i.next();
- if ( str::equals( e.fieldName(), "key" ) ) {
- b.append( "key", keyPattern );
- }
- else if ( str::equals( e.fieldName(), "weights" ) ) {
- b.append( "weights", weights );
- weights = BSONObj();
- }
- else if ( str::equals( e.fieldName(), "default_language" ) ) {
- b.append( "default_language", default_language);
- default_language = "";
- }
- else if ( str::equals( e.fieldName(), "language_override" ) ) {
- b.append( "language_override", language_override);
- language_override = "";
- }
- else if ( str::equals( e.fieldName(), "v" ) ) {
- version = e.numberInt();
- }
- else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) {
- textIndexVersion = e.numberInt();
- uassert( 17366,
- str::stream() << "bad textIndexVersion: " << textIndexVersion,
- textIndexVersion == 1 );
- }
- else {
- b.append( e );
- }
- }
+ string default_language(spec.getStringField("default_language"));
+ if (default_language.empty())
+ default_language = "english";
+
+ string language_override(spec.getStringField("language_override"));
+ if (language_override.empty())
+ language_override = "language";
+
+ int version = -1;
+ int textIndexVersion = 1;
+
+ BSONObjBuilder b;
+ BSONObjIterator i(spec);
+ while (i.more()) {
+ BSONElement e = i.next();
+ if (str::equals(e.fieldName(), "key")) {
+ b.append("key", keyPattern);
+ } else if (str::equals(e.fieldName(), "weights")) {
+ b.append("weights", weights);
+ weights = BSONObj();
+ } else if (str::equals(e.fieldName(), "default_language")) {
+ b.append("default_language", default_language);
+ default_language = "";
+ } else if (str::equals(e.fieldName(), "language_override")) {
+ b.append("language_override", language_override);
+ language_override = "";
+ } else if (str::equals(e.fieldName(), "v")) {
+ version = e.numberInt();
+ } else if (str::equals(e.fieldName(), "textIndexVersion")) {
+ textIndexVersion = e.numberInt();
+ uassert(17366,
+ str::stream() << "bad textIndexVersion: " << textIndexVersion,
+ textIndexVersion == 1);
+ } else {
+ b.append(e);
+ }
+ }
- if ( !weights.isEmpty() )
- b.append( "weights", weights );
- if ( !default_language.empty() )
- b.append( "default_language", default_language);
- if ( !language_override.empty() )
- b.append( "language_override", language_override);
+ if (!weights.isEmpty())
+ b.append("weights", weights);
+ if (!default_language.empty())
+ b.append("default_language", default_language);
+ if (!language_override.empty())
+ b.append("language_override", language_override);
- if ( version >= 0 )
- b.append( "v", version );
+ if (version >= 0)
+ b.append("v", version);
- b.append( "textIndexVersion", textIndexVersion );
+ b.append("textIndexVersion", textIndexVersion);
- return b.obj();
- }
- }
+ return b.obj();
+}
+}
}
diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp
index 832279eb18d..c9f628a2b28 100644
--- a/src/mongo/db/fts/fts_spec_test.cpp
+++ b/src/mongo/db/fts/fts_spec_test.cpp
@@ -36,541 +36,558 @@
namespace mongo {
- using std::set;
- using std::string;
-
- namespace fts {
-
- /**
- * Assert that fixSpec() accepts the provided text index spec.
- */
- void assertFixSuccess( const std::string& s ) {
- BSONObj user = fromjson( s );
-
- try {
- // fixSpec() should not throw on a valid spec.
- BSONObj fixed = FTSSpec::fixSpec( user );
-
- // fixSpec() on an already-fixed spec shouldn't change it.
- BSONObj fixed2 = FTSSpec::fixSpec( fixed );
- ASSERT_EQUALS( fixed, fixed2 );
- }
- catch ( UserException& ) {
- ASSERT( false );
- }
- }
-
- /**
- * Assert that fixSpec() rejects the provided text index spec.
- */
- void assertFixFailure( const std::string& s ) {
- BSONObj user = fromjson( s );
-
- try {
- // fixSpec() on an invalid spec should uassert.
- BSONObj fixed = FTSSpec::fixSpec( user );
- }
- catch ( UserException& ) {
- return;
- }
- ASSERT( false );
- }
-
- TEST( FTSSpec, FixNormalKey1 ) {
- assertFixSuccess("{key: {a: 'text'}}");
- assertFixSuccess("{key: {a: 'text', b: 'text'}}");
- assertFixSuccess("{key: {a: 'text', b: 'text', c: 'text'}}");
-
- assertFixFailure("{key: {_fts: 'text'}}"); // not allowed to index reserved field
- assertFixFailure("{key: {_ftsx: 'text'}}");
- }
-
- TEST( FTSSpec, FixCompoundKey1 ) {
- assertFixSuccess("{key: {a: 'text', b: 1.0}}");
- assertFixSuccess("{key: {a: 'text', b: NumberInt(1)}}");
- assertFixSuccess("{key: {a: 'text', b: NumberLong(1)}}");
- assertFixSuccess("{key: {a: 'text', b: -1.0}}");
- assertFixSuccess("{key: {a: 'text', b: NumberInt(-1)}}");
- assertFixSuccess("{key: {a: 'text', b: NumberLong(-1)}}");
- assertFixSuccess("{key: {a: 1.0, b: 'text'}}");
- assertFixSuccess("{key: {a: NumberInt(1), b: 'text'}}");
- assertFixSuccess("{key: {a: NumberLong(1), b: 'text'}}");
- assertFixSuccess("{key: {a: -1, b: 'text'}}");
- assertFixSuccess("{key: {a: 1, b: 1, c: 'text'}}");
- assertFixSuccess("{key: {a: 1, b: -1, c: 'text'}}");
- assertFixSuccess("{key: {a: -1, b: 1, c: 'text'}}");
- assertFixSuccess("{key: {a: 1, b: 'text', c: 1}}");
- assertFixSuccess("{key: {a: 'text', b: 1, c: 1}}");
- assertFixSuccess("{key: {a: 'text', b: 1, c: -1}}");
- assertFixSuccess("{key: {a: 'text', b: 'text', c: 1}}");
- assertFixSuccess("{key: {a: 1, b: 'text', c: 'text'}}");
-
- assertFixFailure("{key: {a: 'text', b: 0}}");
- assertFixFailure("{key: {a: 'text', b: '2d'}}"); // not allowed to mix special indexes
- assertFixFailure("{key: {a: 'text', b: '1'}}");
- assertFixFailure("{key: {a: 'text', _fts: 1}}");
- assertFixFailure("{key: {a: 'text', _fts: 'text'}}");
- assertFixFailure("{key: {a: 'text', _ftsx: 1}}");
- assertFixFailure("{key: {a: 'text', _ftsx: 'text'}}");
- assertFixFailure("{key: {_fts: 1, a: 'text'}}");
- assertFixFailure("{key: {_fts: 'text', a: 'text'}}");
- assertFixFailure("{key: {_ftsx: 1, a: 'text'}}");
- assertFixFailure("{key: {_ftsx: 'text', a: 'text'}}");
- assertFixFailure("{key: {a: 'text', b: 1, c: 'text'}}"); // 'text' must all be adjacent
- assertFixFailure("{key: {a: 'text', b: 1, c: 'text', d: 1}}");
- assertFixFailure("{key: {a: 1, b: 'text', c: 1, d: 'text', e: 1}}");
- }
-
- TEST( FTSSpec, FixDefaultLanguage1 ) {
- assertFixSuccess("{key: {a: 'text'}, default_language: 'english'}");
- assertFixSuccess("{key: {a: 'text'}, default_language: 'engLISH'}");
- assertFixSuccess("{key: {a: 'text'}, default_language: 'en'}");
- assertFixSuccess("{key: {a: 'text'}, default_language: 'eN'}");
- assertFixSuccess("{key: {a: 'text'}, default_language: 'spanish'}");
- assertFixSuccess("{key: {a: 'text'}, default_language: 'none'}");
-
- assertFixFailure("{key: {a: 'text'}, default_language: 'engrish'}");
- assertFixFailure("{key: {a: 'text'}, default_language: ' english'}");
- assertFixFailure("{key: {a: 'text'}, default_language: ''}");
- }
-
- TEST( FTSSpec, FixWeights1 ) {
- assertFixSuccess("{key: {a: 'text'}, weights: {}}");
- assertFixSuccess("{key: {a: 'text'}, weights: {a: 1.0}}");
- assertFixSuccess("{key: {a: 'text'}, weights: {a: NumberInt(1)}}");
- assertFixSuccess("{key: {a: 'text'}, weights: {a: NumberLong(1)}}");
- assertFixSuccess("{key: {a: 'text'}, weights: {a: 99999}}");
- assertFixSuccess("{key: {'$**': 'text'}, weights: {'a.b': 2}}");
- assertFixSuccess("{key: {'$**': 'text'}, weights: {a: 2, b: 2}}");
- assertFixSuccess("{key: {'$**': 'text'}, weights: {'$**': 2}}");
-
- assertFixFailure("{key: {a: 'text'}, weights: 0}");
- assertFixFailure("{key: {a: 'text'}, weights: []}");
- assertFixFailure("{key: {a: 'text'}, weights: 'x'}");
- assertFixFailure("{key: {a: 'text'}, weights: {a: 0}}");
- assertFixFailure("{key: {a: 'text'}, weights: {a: -1}}");
- assertFixFailure("{key: {a: 'text'}, weights: {a: 100000}}"); // above max weight
- assertFixFailure("{key: {a: 'text'}, weights: {a: '1'}}");
- assertFixFailure("{key: {a: 'text'}, weights: {'': 1}}"); // "invalid" path
- assertFixFailure("{key: {a: 'text'}, weights: {'a.': 1}}");
- assertFixFailure("{key: {a: 'text'}, weights: {'.a': 1}}");
- assertFixFailure("{key: {a: 'text'}, weights: {'a..a': 1}}");
- assertFixFailure("{key: {a: 'text'}, weights: {$a: 1}}");
- assertFixFailure("{key: {a: 'text'}, weights: {'a.$a': 1}}");
- assertFixFailure("{key: {a: 'text'}, weights: {'a.$**': 1}}");
- }
-
- TEST( FTSSpec, FixLanguageOverride1 ) {
- assertFixSuccess("{key: {a: 'text'}, language_override: 'foo'}");
- assertFixSuccess("{key: {a: 'text'}, language_override: 'foo$bar'}");
-
- assertFixFailure("{key: {a: 'text'}, language_override: 'foo.bar'}"); // can't have '.'
- assertFixFailure("{key: {a: 'text'}, language_override: ''}");
- assertFixFailure("{key: {a: 'text'}, language_override: '$foo'}");
- }
-
- TEST( FTSSpec, FixTextIndexVersion1 ) {
- assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 1.0}}");
- assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(1)}}");
- assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(1)}}");
- assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 2.0}}");
- assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(2)}}");
- assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(2)}}");
-
- assertFixFailure("{key: {a: 'text'}, textIndexVersion: 3}");
- assertFixFailure("{key: {a: 'text'}, textIndexVersion: '2'}");
- assertFixFailure("{key: {a: 'text'}, textIndexVersion: {}}");
- }
-
- TEST( FTSSpec, ScoreSingleField1 ) {
- BSONObj user = BSON( "key" << BSON( "title" << "text" <<
- "text" << "text" ) <<
- "weights" << BSON( "title" << 10 ) );
-
- FTSSpec spec( FTSSpec::fixSpec( user ) );
-
- TermFrequencyMap m;
- spec.scoreDocument( BSON( "title" << "cat sat run" ), &m );
- ASSERT_EQUALS( 3U, m.size() );
- ASSERT_EQUALS( m["cat"], m["sat"] );
- ASSERT_EQUALS( m["cat"], m["run"] );
- ASSERT( m["cat"] > 0 );
- }
-
- TEST( FTSSpec, ScoreMultipleField1 ) {
- BSONObj user = BSON( "key" << BSON( "title" << "text" <<
- "text" << "text" ) <<
- "weights" << BSON( "title" << 10 ) );
-
- FTSSpec spec( FTSSpec::fixSpec( user ) );
-
- TermFrequencyMap m;
- spec.scoreDocument( BSON( "title" << "cat sat run" << "text" << "cat book" ), &m );
-
- ASSERT_EQUALS( 4U, m.size() );
- ASSERT_EQUALS( m["sat"], m["run"] );
- ASSERT( m["sat"] > 0 );
-
- ASSERT( m["cat"] > m["sat"] );
- ASSERT( m["cat"] > m["book"] );
- ASSERT( m["book"] > 0 );
- ASSERT( m["book"] < m["sat"] );
- }
-
- TEST( FTSSpec, ScoreMultipleField2 ) {
- // Test where one indexed field is a parent component of another indexed field.
- BSONObj user = BSON( "key" << BSON( "a" << "text" << "a.b" << "text" ) );
-
- FTSSpec spec( FTSSpec::fixSpec( user ) );
-
- TermFrequencyMap m;
- spec.scoreDocument( BSON( "a" << BSON( "b" << "term" ) ), &m );
- ASSERT_EQUALS( 1U, m.size() );
- }
-
- TEST( FTSSpec, ScoreRepeatWord ) {
- BSONObj user = BSON( "key" << BSON( "title" << "text" <<
- "text" << "text" ) <<
- "weights" << BSON( "title" << 10 ) );
-
- FTSSpec spec( FTSSpec::fixSpec( user ) );
-
- TermFrequencyMap m;
- spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ), &m );
- ASSERT_EQUALS( 3U, m.size() );
- ASSERT( m["cat"] > 0 );
- ASSERT( m["sat"] > m["cat"] );
- ASSERT( m["run"] > m["sat"] );
-
- }
-
- TEST( FTSSpec, Extra1 ) {
- BSONObj user = BSON( "key" << BSON( "data" << "text" ) );
- FTSSpec spec( FTSSpec::fixSpec( user ) );
- ASSERT_EQUALS( 0U, spec.numExtraBefore() );
- ASSERT_EQUALS( 0U, spec.numExtraAfter() );
- }
-
- TEST( FTSSpec, Extra2 ) {
- BSONObj user = BSON( "key" << BSON( "data" << "text" << "x" << 1 ) );
- BSONObj fixed = FTSSpec::fixSpec( user );
- FTSSpec spec( fixed );
- ASSERT_EQUALS( 0U, spec.numExtraBefore() );
- ASSERT_EQUALS( 1U, spec.numExtraAfter() );
- ASSERT_EQUALS( StringData("x"), spec.extraAfter(0) );
-
- BSONObj fixed2 = FTSSpec::fixSpec( fixed );
- ASSERT_EQUALS( fixed, fixed2 );
- }
-
- TEST( FTSSpec, Extra3 ) {
- BSONObj user = BSON( "key" << BSON( "x" << 1 << "data" << "text" ) );
- BSONObj fixed = FTSSpec::fixSpec( user );
-
- ASSERT_EQUALS( BSON( "x" << 1 <<
- "_fts" << "text" <<
- "_ftsx" << 1 ),
- fixed["key"].Obj() );
- ASSERT_EQUALS( BSON( "data" << 1 ),
- fixed["weights"].Obj() );
-
- BSONObj fixed2 = FTSSpec::fixSpec( fixed );
- ASSERT_EQUALS( fixed, fixed2 );
-
- FTSSpec spec( fixed );
- ASSERT_EQUALS( 1U, spec.numExtraBefore() );
- ASSERT_EQUALS( StringData("x"), spec.extraBefore(0) );
- ASSERT_EQUALS( 0U, spec.numExtraAfter() );
-
- BSONObj prefix;
-
- ASSERT( spec.getIndexPrefix( BSON( "x" << 2 ), &prefix ).isOK() );
- ASSERT_EQUALS( BSON( "x" << 2 ), prefix );
-
- ASSERT( spec.getIndexPrefix( BSON( "x" << 3 << "y" << 4 ), &prefix ).isOK() );
- ASSERT_EQUALS( BSON( "x" << 3 ), prefix );
-
- ASSERT( !spec.getIndexPrefix( BSON( "x" << BSON( "$gt" << 5 ) ), &prefix ).isOK() );
- ASSERT( !spec.getIndexPrefix( BSON( "y" << 4 ), &prefix ).isOK() );
- ASSERT( !spec.getIndexPrefix( BSONObj(), &prefix ).isOK() );
- }
-
- // Test for correct behavior when encountering nested arrays (both directly nested and
- // indirectly nested).
-
- TEST( FTSSpec, NestedArraysPos1 ) {
- BSONObj user = BSON( "key" << BSON( "a.b" << "text" ) );
- FTSSpec spec( FTSSpec::fixSpec( user ) );
-
- // The following document matches {"a.b": {$type: 2}}, so "term" should be indexed.
- BSONObj obj = fromjson("{a: [{b: ['term']}]}"); // indirectly nested arrays
- TermFrequencyMap m;
- spec.scoreDocument( obj, &m );
- ASSERT_EQUALS( 1U, m.size() );
- }
-
- TEST( FTSSpec, NestedArraysPos2 ) {
- BSONObj user = BSON( "key" << BSON( "$**" << "text" ) );
- FTSSpec spec( FTSSpec::fixSpec( user ) );
-
- // The wildcard spec implies a full recursive traversal, so "term" should be indexed.
- BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays
- TermFrequencyMap m;
- spec.scoreDocument( obj, &m );
- ASSERT_EQUALS( 1U, m.size() );
- }
-
- TEST( FTSSpec, NestedArraysNeg1 ) {
- BSONObj user = BSON( "key" << BSON( "a.b" << "text" ) );
- FTSSpec spec( FTSSpec::fixSpec( user ) );
-
- // The following document does not match {"a.b": {$type: 2}}, so "term" should not be
- // indexed.
- BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays
- TermFrequencyMap m;
- spec.scoreDocument( obj, &m );
- ASSERT_EQUALS( 0U, m.size() );
- }
-
- // Multi-language test_1: test independent stemming per sub-document
- TEST( FTSSpec, NestedLanguages_PerArrayItemStemming ) {
- BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "text" ) );
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- TermFrequencyMap tfm;
-
- BSONObj obj = fromjson(
- "{ a :"
- " { b :"
- " [ { c : \"walked\", language : \"english\" },"
- " { c : \"camminato\", language : \"italian\" },"
- " { c : \"ging\", language : \"german\" } ]"
- " }"
- " }" );
-
- spec.scoreDocument( obj, &tfm );
-
- set<string> hits;
- hits.insert("walk");
- hits.insert("cammin");
- hits.insert("ging");
-
- for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) {
- string term = i->first;
- ASSERT_EQUALS( 1U, hits.count( term ) );
- }
-
- }
-
- // Multi-language test_2: test nested stemming per sub-document
- TEST( FTSSpec, NestedLanguages_PerSubdocStemming ) {
- BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "text" ) );
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- TermFrequencyMap tfm;
-
- BSONObj obj = fromjson(
- "{ language : \"english\","
- " a :"
- " { language : \"danish\","
- " b :"
- " [ { c : \"foredrag\" },"
- " { c : \"foredragsholder\" },"
- " { c : \"lector\" } ]"
- " }"
- "}" );
-
- spec.scoreDocument( obj, &tfm );
-
- set<string> hits;
- hits.insert("foredrag");
- hits.insert("foredragshold");
- hits.insert("lector");
-
- for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) {
- string term = i->first;
- ASSERT_EQUALS( 1U, hits.count( term ) );
- }
-
- }
-
- // Multi-language test_3: test nested arrays
- TEST( FTSSpec, NestedLanguages_NestedArrays ) {
- BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "text" ) );
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- TermFrequencyMap tfm;
-
- BSONObj obj = fromjson(
- "{ language : \"english\","
- " a : ["
- " { language : \"danish\","
- " b :"
- " [ { c : [\"foredrag\"] },"
- " { c : [\"foredragsholder\"] },"
- " { c : [\"lector\"] } ]"
- " } ]"
- "}" );
-
- spec.scoreDocument( obj, &tfm );
-
- set<string> hits;
- hits.insert("foredrag");
- hits.insert("foredragshold");
- hits.insert("lector");
-
- for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) {
- string term = i->first;
- ASSERT_EQUALS( 1U, hits.count( term ) );
- }
-
- }
-
- // Multi-language test_4: test pruning
- TEST( FTSSpec, NestedLanguages_PathPruning ) {
- BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "text" ) );
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- TermFrequencyMap tfm;
-
- BSONObj obj = fromjson(
- "{ language : \"english\","
- " a : "
- " { language : \"danish\","
- " bc : \"foo\","
- " b : { d: \"bar\" },"
- " b :"
- " [ { c : \"foredrag\" },"
- " { c : \"foredragsholder\" },"
- " { c : \"lector\" } ]"
- " }"
- "}" );
-
- spec.scoreDocument( obj, &tfm );
-
- set<string> hits;
- hits.insert("foredrag");
- hits.insert("foredragshold");
- hits.insert("lector");
-
- for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) {
- string term = i->first;
- ASSERT_EQUALS( 1U, hits.count( term ) );
- }
-
- }
-
- // Multi-language test_5: test wildcard spec
- TEST( FTSSpec, NestedLanguages_Wildcard ) {
- BSONObj indexSpec = BSON( "key" << BSON( "$**" << "text" ) );
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- TermFrequencyMap tfm;
-
- BSONObj obj = fromjson(
- "{ language : \"english\","
- " b : \"walking\","
- " c : { e: \"walked\" },"
- " d : "
- " { language : \"danish\","
- " e :"
- " [ { f : \"foredrag\" },"
- " { f : \"foredragsholder\" },"
- " { f : \"lector\" } ]"
- " }"
- "}" );
-
- spec.scoreDocument( obj, &tfm );
-
- set<string> hits;
- hits.insert("foredrag");
- hits.insert("foredragshold");
- hits.insert("lector");
- hits.insert("walk");
-
- for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) {
- string term = i->first;
- ASSERT_EQUALS( 1U, hits.count( term ) );
- }
-
- }
-
- // Multi-language test_6: test wildcard spec with override
- TEST( FTSSpec, NestedLanguages_WildcardOverride ) {
- BSONObj indexSpec = BSON( "key" << BSON( "$**" << "text" ) <<
- "weights" << BSON( "d.e.f" << 20 ) );
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- TermFrequencyMap tfm;
-
- BSONObj obj = fromjson(
- "{ language : \"english\","
- " b : \"walking\","
- " c : { e: \"walked\" },"
- " d : "
- " { language : \"danish\","
- " e :"
- " [ { f : \"foredrag\" },"
- " { f : \"foredragsholder\" },"
- " { f : \"lector\" } ]"
- " }"
- "}" );
-
- spec.scoreDocument( obj, &tfm );
-
- set<string> hits;
- hits.insert("foredrag");
- hits.insert("foredragshold");
- hits.insert("lector");
- hits.insert("walk");
-
- for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) {
- string term = i->first;
- ASSERT_EQUALS( 1U, hits.count( term ) );
- }
-
- }
-
- /** Test differences across textIndexVersion values in handling of nested arrays. */
- TEST( FTSSpec, TextIndexLegacyNestedArrays ) {
- BSONObj obj = fromjson( "{a: [{b: ['hello']}]}" );
-
- // textIndexVersion=1 FTSSpec objects do not index nested arrays.
- {
- BSONObj indexSpec = fromjson( "{key: {'a.b': 'text'}, textIndexVersion: 1}" );
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- TermFrequencyMap tfm;
- spec.scoreDocument( obj, &tfm );
- ASSERT_EQUALS( tfm.size(), 0U );
- }
-
- // textIndexVersion=2 FTSSpec objects do index nested arrays.
- {
- BSONObj indexSpec = fromjson( "{key: {'a.b': 'text'}, textIndexVersion: 2}" );
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- TermFrequencyMap tfm;
- spec.scoreDocument( obj, &tfm );
- ASSERT_EQUALS( tfm.size(), 1U );
- }
- }
-
- /** Test differences across textIndexVersion values in handling of language annotations. */
- TEST( FTSSpec, TextIndexLegacyLanguageRecognition) {
- BSONObj obj = fromjson( "{a: 'the', language: 'EN'}" );
-
- // textIndexVersion=1 FTSSpec objects treat two-letter language annotations as "none"
- // for purposes of stopword processing.
- {
- BSONObj indexSpec = fromjson( "{key: {'a': 'text'}, textIndexVersion: 1}" );
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- TermFrequencyMap tfm;
- spec.scoreDocument( obj, &tfm );
- ASSERT_EQUALS( tfm.size(), 1U ); // "the" not recognized as stopword
- }
-
- // textIndexVersion=2 FTSSpec objects recognize two-letter codes.
- {
- BSONObj indexSpec = fromjson( "{key: {'a': 'text'}, textIndexVersion: 2}" );
- FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
- TermFrequencyMap tfm;
- spec.scoreDocument( obj, &tfm );
- ASSERT_EQUALS( tfm.size(), 0U ); // "the" recognized as stopword
- }
- }
+using std::set;
+using std::string;
+namespace fts {
+
+/**
+ * Assert that fixSpec() accepts the provided text index spec.
+ */
+void assertFixSuccess(const std::string& s) {
+ BSONObj user = fromjson(s);
+
+ try {
+ // fixSpec() should not throw on a valid spec.
+ BSONObj fixed = FTSSpec::fixSpec(user);
+
+ // fixSpec() on an already-fixed spec shouldn't change it.
+ BSONObj fixed2 = FTSSpec::fixSpec(fixed);
+ ASSERT_EQUALS(fixed, fixed2);
+ } catch (UserException&) {
+ ASSERT(false);
+ }
+}
+
+/**
+ * Assert that fixSpec() rejects the provided text index spec.
+ */
+void assertFixFailure(const std::string& s) {
+ BSONObj user = fromjson(s);
+
+ try {
+ // fixSpec() on an invalid spec should uassert.
+ BSONObj fixed = FTSSpec::fixSpec(user);
+ } catch (UserException&) {
+ return;
+ }
+ ASSERT(false);
+}
+
+TEST(FTSSpec, FixNormalKey1) {
+ assertFixSuccess("{key: {a: 'text'}}");
+ assertFixSuccess("{key: {a: 'text', b: 'text'}}");
+ assertFixSuccess("{key: {a: 'text', b: 'text', c: 'text'}}");
+
+ assertFixFailure("{key: {_fts: 'text'}}"); // not allowed to index reserved field
+ assertFixFailure("{key: {_ftsx: 'text'}}");
+}
+
+TEST(FTSSpec, FixCompoundKey1) {
+ assertFixSuccess("{key: {a: 'text', b: 1.0}}");
+ assertFixSuccess("{key: {a: 'text', b: NumberInt(1)}}");
+ assertFixSuccess("{key: {a: 'text', b: NumberLong(1)}}");
+ assertFixSuccess("{key: {a: 'text', b: -1.0}}");
+ assertFixSuccess("{key: {a: 'text', b: NumberInt(-1)}}");
+ assertFixSuccess("{key: {a: 'text', b: NumberLong(-1)}}");
+ assertFixSuccess("{key: {a: 1.0, b: 'text'}}");
+ assertFixSuccess("{key: {a: NumberInt(1), b: 'text'}}");
+ assertFixSuccess("{key: {a: NumberLong(1), b: 'text'}}");
+ assertFixSuccess("{key: {a: -1, b: 'text'}}");
+ assertFixSuccess("{key: {a: 1, b: 1, c: 'text'}}");
+ assertFixSuccess("{key: {a: 1, b: -1, c: 'text'}}");
+ assertFixSuccess("{key: {a: -1, b: 1, c: 'text'}}");
+ assertFixSuccess("{key: {a: 1, b: 'text', c: 1}}");
+ assertFixSuccess("{key: {a: 'text', b: 1, c: 1}}");
+ assertFixSuccess("{key: {a: 'text', b: 1, c: -1}}");
+ assertFixSuccess("{key: {a: 'text', b: 'text', c: 1}}");
+ assertFixSuccess("{key: {a: 1, b: 'text', c: 'text'}}");
+
+ assertFixFailure("{key: {a: 'text', b: 0}}");
+ assertFixFailure("{key: {a: 'text', b: '2d'}}"); // not allowed to mix special indexes
+ assertFixFailure("{key: {a: 'text', b: '1'}}");
+ assertFixFailure("{key: {a: 'text', _fts: 1}}");
+ assertFixFailure("{key: {a: 'text', _fts: 'text'}}");
+ assertFixFailure("{key: {a: 'text', _ftsx: 1}}");
+ assertFixFailure("{key: {a: 'text', _ftsx: 'text'}}");
+ assertFixFailure("{key: {_fts: 1, a: 'text'}}");
+ assertFixFailure("{key: {_fts: 'text', a: 'text'}}");
+ assertFixFailure("{key: {_ftsx: 1, a: 'text'}}");
+ assertFixFailure("{key: {_ftsx: 'text', a: 'text'}}");
+ assertFixFailure("{key: {a: 'text', b: 1, c: 'text'}}"); // 'text' must all be adjacent
+ assertFixFailure("{key: {a: 'text', b: 1, c: 'text', d: 1}}");
+ assertFixFailure("{key: {a: 1, b: 'text', c: 1, d: 'text', e: 1}}");
+}
+
+TEST(FTSSpec, FixDefaultLanguage1) {
+ assertFixSuccess("{key: {a: 'text'}, default_language: 'english'}");
+ assertFixSuccess("{key: {a: 'text'}, default_language: 'engLISH'}");
+ assertFixSuccess("{key: {a: 'text'}, default_language: 'en'}");
+ assertFixSuccess("{key: {a: 'text'}, default_language: 'eN'}");
+ assertFixSuccess("{key: {a: 'text'}, default_language: 'spanish'}");
+ assertFixSuccess("{key: {a: 'text'}, default_language: 'none'}");
+
+ assertFixFailure("{key: {a: 'text'}, default_language: 'engrish'}");
+ assertFixFailure("{key: {a: 'text'}, default_language: ' english'}");
+ assertFixFailure("{key: {a: 'text'}, default_language: ''}");
+}
+
+TEST(FTSSpec, FixWeights1) {
+ assertFixSuccess("{key: {a: 'text'}, weights: {}}");
+ assertFixSuccess("{key: {a: 'text'}, weights: {a: 1.0}}");
+ assertFixSuccess("{key: {a: 'text'}, weights: {a: NumberInt(1)}}");
+ assertFixSuccess("{key: {a: 'text'}, weights: {a: NumberLong(1)}}");
+ assertFixSuccess("{key: {a: 'text'}, weights: {a: 99999}}");
+ assertFixSuccess("{key: {'$**': 'text'}, weights: {'a.b': 2}}");
+ assertFixSuccess("{key: {'$**': 'text'}, weights: {a: 2, b: 2}}");
+ assertFixSuccess("{key: {'$**': 'text'}, weights: {'$**': 2}}");
+
+ assertFixFailure("{key: {a: 'text'}, weights: 0}");
+ assertFixFailure("{key: {a: 'text'}, weights: []}");
+ assertFixFailure("{key: {a: 'text'}, weights: 'x'}");
+ assertFixFailure("{key: {a: 'text'}, weights: {a: 0}}");
+ assertFixFailure("{key: {a: 'text'}, weights: {a: -1}}");
+ assertFixFailure("{key: {a: 'text'}, weights: {a: 100000}}"); // above max weight
+ assertFixFailure("{key: {a: 'text'}, weights: {a: '1'}}");
+ assertFixFailure("{key: {a: 'text'}, weights: {'': 1}}"); // "invalid" path
+ assertFixFailure("{key: {a: 'text'}, weights: {'a.': 1}}");
+ assertFixFailure("{key: {a: 'text'}, weights: {'.a': 1}}");
+ assertFixFailure("{key: {a: 'text'}, weights: {'a..a': 1}}");
+ assertFixFailure("{key: {a: 'text'}, weights: {$a: 1}}");
+ assertFixFailure("{key: {a: 'text'}, weights: {'a.$a': 1}}");
+ assertFixFailure("{key: {a: 'text'}, weights: {'a.$**': 1}}");
+}
+
+TEST(FTSSpec, FixLanguageOverride1) {
+ assertFixSuccess("{key: {a: 'text'}, language_override: 'foo'}");
+ assertFixSuccess("{key: {a: 'text'}, language_override: 'foo$bar'}");
+
+ assertFixFailure("{key: {a: 'text'}, language_override: 'foo.bar'}"); // can't have '.'
+ assertFixFailure("{key: {a: 'text'}, language_override: ''}");
+ assertFixFailure("{key: {a: 'text'}, language_override: '$foo'}");
+}
+
+TEST(FTSSpec, FixTextIndexVersion1) {
+ assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 1.0}}");
+ assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(1)}}");
+ assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(1)}}");
+ assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 2.0}}");
+ assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(2)}}");
+ assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(2)}}");
+
+ assertFixFailure("{key: {a: 'text'}, textIndexVersion: 3}");
+ assertFixFailure("{key: {a: 'text'}, textIndexVersion: '2'}");
+ assertFixFailure("{key: {a: 'text'}, textIndexVersion: {}}");
+}
+
+TEST(FTSSpec, ScoreSingleField1) {
+ BSONObj user = BSON("key" << BSON("title"
+ << "text"
+ << "text"
+ << "text") << "weights" << BSON("title" << 10));
+
+ FTSSpec spec(FTSSpec::fixSpec(user));
+
+ TermFrequencyMap m;
+ spec.scoreDocument(BSON("title"
+ << "cat sat run"),
+ &m);
+ ASSERT_EQUALS(3U, m.size());
+ ASSERT_EQUALS(m["cat"], m["sat"]);
+ ASSERT_EQUALS(m["cat"], m["run"]);
+ ASSERT(m["cat"] > 0);
+}
+
+TEST(FTSSpec, ScoreMultipleField1) {
+ BSONObj user = BSON("key" << BSON("title"
+ << "text"
+ << "text"
+ << "text") << "weights" << BSON("title" << 10));
+
+ FTSSpec spec(FTSSpec::fixSpec(user));
+
+ TermFrequencyMap m;
+ spec.scoreDocument(BSON("title"
+ << "cat sat run"
+ << "text"
+ << "cat book"),
+ &m);
+
+ ASSERT_EQUALS(4U, m.size());
+ ASSERT_EQUALS(m["sat"], m["run"]);
+ ASSERT(m["sat"] > 0);
+
+ ASSERT(m["cat"] > m["sat"]);
+ ASSERT(m["cat"] > m["book"]);
+ ASSERT(m["book"] > 0);
+ ASSERT(m["book"] < m["sat"]);
+}
+
+TEST(FTSSpec, ScoreMultipleField2) {
+ // Test where one indexed field is a parent component of another indexed field.
+ BSONObj user = BSON("key" << BSON("a"
+ << "text"
+ << "a.b"
+ << "text"));
+
+ FTSSpec spec(FTSSpec::fixSpec(user));
+
+ TermFrequencyMap m;
+ spec.scoreDocument(BSON("a" << BSON("b"
+ << "term")),
+ &m);
+ ASSERT_EQUALS(1U, m.size());
+}
+
+TEST(FTSSpec, ScoreRepeatWord) {
+ BSONObj user = BSON("key" << BSON("title"
+ << "text"
+ << "text"
+ << "text") << "weights" << BSON("title" << 10));
+
+ FTSSpec spec(FTSSpec::fixSpec(user));
+
+ TermFrequencyMap m;
+ spec.scoreDocument(BSON("title"
+ << "cat sat sat run run run"),
+ &m);
+ ASSERT_EQUALS(3U, m.size());
+ ASSERT(m["cat"] > 0);
+ ASSERT(m["sat"] > m["cat"]);
+ ASSERT(m["run"] > m["sat"]);
+}
+
+TEST(FTSSpec, Extra1) {
+ BSONObj user = BSON("key" << BSON("data"
+ << "text"));
+ FTSSpec spec(FTSSpec::fixSpec(user));
+ ASSERT_EQUALS(0U, spec.numExtraBefore());
+ ASSERT_EQUALS(0U, spec.numExtraAfter());
+}
+
+TEST(FTSSpec, Extra2) {
+ BSONObj user = BSON("key" << BSON("data"
+ << "text"
+ << "x" << 1));
+ BSONObj fixed = FTSSpec::fixSpec(user);
+ FTSSpec spec(fixed);
+ ASSERT_EQUALS(0U, spec.numExtraBefore());
+ ASSERT_EQUALS(1U, spec.numExtraAfter());
+ ASSERT_EQUALS(StringData("x"), spec.extraAfter(0));
+
+ BSONObj fixed2 = FTSSpec::fixSpec(fixed);
+ ASSERT_EQUALS(fixed, fixed2);
+}
+
+TEST(FTSSpec, Extra3) {
+ BSONObj user = BSON("key" << BSON("x" << 1 << "data"
+ << "text"));
+ BSONObj fixed = FTSSpec::fixSpec(user);
+
+ ASSERT_EQUALS(BSON("x" << 1 << "_fts"
+ << "text"
+ << "_ftsx" << 1),
+ fixed["key"].Obj());
+ ASSERT_EQUALS(BSON("data" << 1), fixed["weights"].Obj());
+
+ BSONObj fixed2 = FTSSpec::fixSpec(fixed);
+ ASSERT_EQUALS(fixed, fixed2);
+
+ FTSSpec spec(fixed);
+ ASSERT_EQUALS(1U, spec.numExtraBefore());
+ ASSERT_EQUALS(StringData("x"), spec.extraBefore(0));
+ ASSERT_EQUALS(0U, spec.numExtraAfter());
+
+ BSONObj prefix;
+
+ ASSERT(spec.getIndexPrefix(BSON("x" << 2), &prefix).isOK());
+ ASSERT_EQUALS(BSON("x" << 2), prefix);
+
+ ASSERT(spec.getIndexPrefix(BSON("x" << 3 << "y" << 4), &prefix).isOK());
+ ASSERT_EQUALS(BSON("x" << 3), prefix);
+
+ ASSERT(!spec.getIndexPrefix(BSON("x" << BSON("$gt" << 5)), &prefix).isOK());
+ ASSERT(!spec.getIndexPrefix(BSON("y" << 4), &prefix).isOK());
+ ASSERT(!spec.getIndexPrefix(BSONObj(), &prefix).isOK());
+}
+
+// Test for correct behavior when encountering nested arrays (both directly nested and
+// indirectly nested).
+
+TEST(FTSSpec, NestedArraysPos1) {
+ BSONObj user = BSON("key" << BSON("a.b"
+ << "text"));
+ FTSSpec spec(FTSSpec::fixSpec(user));
+
+ // The following document matches {"a.b": {$type: 2}}, so "term" should be indexed.
+ BSONObj obj = fromjson("{a: [{b: ['term']}]}"); // indirectly nested arrays
+ TermFrequencyMap m;
+ spec.scoreDocument(obj, &m);
+ ASSERT_EQUALS(1U, m.size());
+}
+
+TEST(FTSSpec, NestedArraysPos2) {
+ BSONObj user = BSON("key" << BSON("$**"
+ << "text"));
+ FTSSpec spec(FTSSpec::fixSpec(user));
+
+ // The wildcard spec implies a full recursive traversal, so "term" should be indexed.
+ BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays
+ TermFrequencyMap m;
+ spec.scoreDocument(obj, &m);
+ ASSERT_EQUALS(1U, m.size());
+}
+
+TEST(FTSSpec, NestedArraysNeg1) {
+ BSONObj user = BSON("key" << BSON("a.b"
+ << "text"));
+ FTSSpec spec(FTSSpec::fixSpec(user));
+
+ // The following document does not match {"a.b": {$type: 2}}, so "term" should not be
+ // indexed.
+ BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays
+ TermFrequencyMap m;
+ spec.scoreDocument(obj, &m);
+ ASSERT_EQUALS(0U, m.size());
+}
+
+// Multi-language test_1: test independent stemming per sub-document
+TEST(FTSSpec, NestedLanguages_PerArrayItemStemming) {
+ BSONObj indexSpec = BSON("key" << BSON("a.b.c"
+ << "text"));
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ TermFrequencyMap tfm;
+
+ BSONObj obj = fromjson(
+ "{ a :"
+ " { b :"
+ " [ { c : \"walked\", language : \"english\" },"
+ " { c : \"camminato\", language : \"italian\" },"
+ " { c : \"ging\", language : \"german\" } ]"
+ " }"
+ " }");
+
+ spec.scoreDocument(obj, &tfm);
+
+ set<string> hits;
+ hits.insert("walk");
+ hits.insert("cammin");
+ hits.insert("ging");
+
+ for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) {
+ string term = i->first;
+ ASSERT_EQUALS(1U, hits.count(term));
+ }
+}
+
+// Multi-language test_2: test nested stemming per sub-document
+TEST(FTSSpec, NestedLanguages_PerSubdocStemming) {
+ BSONObj indexSpec = BSON("key" << BSON("a.b.c"
+ << "text"));
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ TermFrequencyMap tfm;
+
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " a :"
+ " { language : \"danish\","
+ " b :"
+ " [ { c : \"foredrag\" },"
+ " { c : \"foredragsholder\" },"
+ " { c : \"lector\" } ]"
+ " }"
+ "}");
+
+ spec.scoreDocument(obj, &tfm);
+
+ set<string> hits;
+ hits.insert("foredrag");
+ hits.insert("foredragshold");
+ hits.insert("lector");
+
+ for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) {
+ string term = i->first;
+ ASSERT_EQUALS(1U, hits.count(term));
}
}
+
+// Multi-language test_3: test nested arrays
+TEST(FTSSpec, NestedLanguages_NestedArrays) {
+ BSONObj indexSpec = BSON("key" << BSON("a.b.c"
+ << "text"));
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ TermFrequencyMap tfm;
+
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " a : ["
+ " { language : \"danish\","
+ " b :"
+ " [ { c : [\"foredrag\"] },"
+ " { c : [\"foredragsholder\"] },"
+ " { c : [\"lector\"] } ]"
+ " } ]"
+ "}");
+
+ spec.scoreDocument(obj, &tfm);
+
+ set<string> hits;
+ hits.insert("foredrag");
+ hits.insert("foredragshold");
+ hits.insert("lector");
+
+ for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) {
+ string term = i->first;
+ ASSERT_EQUALS(1U, hits.count(term));
+ }
+}
+
+// Multi-language test_4: test pruning
+TEST(FTSSpec, NestedLanguages_PathPruning) {
+ BSONObj indexSpec = BSON("key" << BSON("a.b.c"
+ << "text"));
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ TermFrequencyMap tfm;
+
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " a : "
+ " { language : \"danish\","
+ " bc : \"foo\","
+ " b : { d: \"bar\" },"
+ " b :"
+ " [ { c : \"foredrag\" },"
+ " { c : \"foredragsholder\" },"
+ " { c : \"lector\" } ]"
+ " }"
+ "}");
+
+ spec.scoreDocument(obj, &tfm);
+
+ set<string> hits;
+ hits.insert("foredrag");
+ hits.insert("foredragshold");
+ hits.insert("lector");
+
+ for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) {
+ string term = i->first;
+ ASSERT_EQUALS(1U, hits.count(term));
+ }
+}
+
+// Multi-language test_5: test wildcard spec
+TEST(FTSSpec, NestedLanguages_Wildcard) {
+ BSONObj indexSpec = BSON("key" << BSON("$**"
+ << "text"));
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ TermFrequencyMap tfm;
+
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " b : \"walking\","
+ " c : { e: \"walked\" },"
+ " d : "
+ " { language : \"danish\","
+ " e :"
+ " [ { f : \"foredrag\" },"
+ " { f : \"foredragsholder\" },"
+ " { f : \"lector\" } ]"
+ " }"
+ "}");
+
+ spec.scoreDocument(obj, &tfm);
+
+ set<string> hits;
+ hits.insert("foredrag");
+ hits.insert("foredragshold");
+ hits.insert("lector");
+ hits.insert("walk");
+
+ for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) {
+ string term = i->first;
+ ASSERT_EQUALS(1U, hits.count(term));
+ }
+}
+
+// Multi-language test_6: test wildcard spec with override
+TEST(FTSSpec, NestedLanguages_WildcardOverride) {
+ BSONObj indexSpec = BSON("key" << BSON("$**"
+ << "text") << "weights" << BSON("d.e.f" << 20));
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ TermFrequencyMap tfm;
+
+ BSONObj obj = fromjson(
+ "{ language : \"english\","
+ " b : \"walking\","
+ " c : { e: \"walked\" },"
+ " d : "
+ " { language : \"danish\","
+ " e :"
+ " [ { f : \"foredrag\" },"
+ " { f : \"foredragsholder\" },"
+ " { f : \"lector\" } ]"
+ " }"
+ "}");
+
+ spec.scoreDocument(obj, &tfm);
+
+ set<string> hits;
+ hits.insert("foredrag");
+ hits.insert("foredragshold");
+ hits.insert("lector");
+ hits.insert("walk");
+
+ for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) {
+ string term = i->first;
+ ASSERT_EQUALS(1U, hits.count(term));
+ }
+}
+
+/** Test differences across textIndexVersion values in handling of nested arrays. */
+TEST(FTSSpec, TextIndexLegacyNestedArrays) {
+ BSONObj obj = fromjson("{a: [{b: ['hello']}]}");
+
+ // textIndexVersion=1 FTSSpec objects do not index nested arrays.
+ {
+ BSONObj indexSpec = fromjson("{key: {'a.b': 'text'}, textIndexVersion: 1}");
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ TermFrequencyMap tfm;
+ spec.scoreDocument(obj, &tfm);
+ ASSERT_EQUALS(tfm.size(), 0U);
+ }
+
+ // textIndexVersion=2 FTSSpec objects do index nested arrays.
+ {
+ BSONObj indexSpec = fromjson("{key: {'a.b': 'text'}, textIndexVersion: 2}");
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ TermFrequencyMap tfm;
+ spec.scoreDocument(obj, &tfm);
+ ASSERT_EQUALS(tfm.size(), 1U);
+ }
+}
+
+/** Test differences across textIndexVersion values in handling of language annotations. */
+TEST(FTSSpec, TextIndexLegacyLanguageRecognition) {
+ BSONObj obj = fromjson("{a: 'the', language: 'EN'}");
+
+ // textIndexVersion=1 FTSSpec objects treat two-letter language annotations as "none"
+ // for purposes of stopword processing.
+ {
+ BSONObj indexSpec = fromjson("{key: {'a': 'text'}, textIndexVersion: 1}");
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ TermFrequencyMap tfm;
+ spec.scoreDocument(obj, &tfm);
+ ASSERT_EQUALS(tfm.size(), 1U); // "the" not recognized as stopword
+ }
+
+ // textIndexVersion=2 FTSSpec objects recognize two-letter codes.
+ {
+ BSONObj indexSpec = fromjson("{key: {'a': 'text'}, textIndexVersion: 2}");
+ FTSSpec spec(FTSSpec::fixSpec(indexSpec));
+ TermFrequencyMap tfm;
+ spec.scoreDocument(obj, &tfm);
+ ASSERT_EQUALS(tfm.size(), 0U); // "the" recognized as stopword
+ }
+}
+}
+}
diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h
index 2b345d89266..40cdbde2cb8 100644
--- a/src/mongo/db/fts/fts_tokenizer.h
+++ b/src/mongo/db/fts/fts_tokenizer.h
@@ -35,58 +35,58 @@
namespace mongo {
namespace fts {
- class FTSLanguage;
- class StopWords;
+class FTSLanguage;
+class StopWords;
+
+/**
+ * FTSTokenizer
+ * An iterator of "documents" where a document contains space-delimited words.
+ * For each word returns a stem or lemma version of a word optimized for full text indexing.
+ * Supports various options to control how tokens are generated.
+ */
+class FTSTokenizer {
+public:
+ virtual ~FTSTokenizer() = default;
/**
- * FTSTokenizer
- * A iterator of "documents" where a document contains space delimited words.
- * For each word returns a stem or lemma version of a word optimized for full text indexing.
- * Supports various options to control how tokens are generated.
+ * Options for generating tokens
*/
- class FTSTokenizer {
- public:
- virtual ~FTSTokenizer() = default;
-
- /**
- * Options for generating tokens
- */
- enum Options {
- /**
- * Default means lower cased, and stop words are not filtered.
- */
- None = 0,
-
- /**
- * Do not lower case terms.
- */
- GenerateCaseSensitiveTokens = 1 << 0,
-
- /**
- * Filter out stop words from return tokens.
- */
- FilterStopWords = 1 << 1,
- };
-
+ enum Options {
/**
- * Process a new document, and discards any previous results.
- * May be called multiple times on an instance of an iterator.
+ * Default means lower cased, and stop words are not filtered.
*/
- virtual void reset(StringData document, Options options) = 0;
+ None = 0,
/**
- * Moves to the next token in the iterator.
- * Returns false when the iterator reaches end of the document.
+ * Do not lower case terms.
*/
- virtual bool moveNext() = 0;
+ GenerateCaseSensitiveTokens = 1 << 0,
/**
- * Returns stemmed form, normalized, and lowercased depending on the parameter
- * to the reset method.
- * Returned StringData is valid until next call to moveNext().
+ * Filter out stop words from returned tokens.
*/
- virtual StringData get() const = 0;
+ FilterStopWords = 1 << 1,
};
-} // namespace fts
-} // namespace mongo
+ /**
+ * Process a new document, and discards any previous results.
+ * May be called multiple times on an instance of an iterator.
+ */
+ virtual void reset(StringData document, Options options) = 0;
+
+ /**
+ * Moves to the next token in the iterator.
+ * Returns false when the iterator reaches end of the document.
+ */
+ virtual bool moveNext() = 0;
+
+ /**
+ * Returns stemmed form, normalized, and lowercased depending on the parameter
+ * to the reset method.
+ * Returned StringData is valid until next call to moveNext().
+ */
+ virtual StringData get() const = 0;
+};
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_util.cpp b/src/mongo/db/fts/fts_util.cpp
index f2bd4e50905..85420fc66ad 100644
--- a/src/mongo/db/fts/fts_util.cpp
+++ b/src/mongo/db/fts/fts_util.cpp
@@ -32,11 +32,9 @@
namespace mongo {
- namespace fts {
+namespace fts {
- const std::string INDEX_NAME = "text";
- const std::string WILDCARD = "$**";
-
- }
+const std::string INDEX_NAME = "text";
+const std::string WILDCARD = "$**";
+}
}
-
diff --git a/src/mongo/db/fts/fts_util.h b/src/mongo/db/fts/fts_util.h
index 7cde2bbe985..a1377162443 100644
--- a/src/mongo/db/fts/fts_util.h
+++ b/src/mongo/db/fts/fts_util.h
@@ -36,16 +36,14 @@
namespace mongo {
- namespace fts {
+namespace fts {
- extern const std::string WILDCARD;
- extern const std::string INDEX_NAME;
+extern const std::string WILDCARD;
+extern const std::string INDEX_NAME;
- enum TextIndexVersion {
- TEXT_INDEX_VERSION_1 = 1, // Legacy index format. Deprecated.
- TEXT_INDEX_VERSION_2 = 2 // Current index format.
- };
-
- }
+enum TextIndexVersion {
+ TEXT_INDEX_VERSION_1 = 1, // Legacy index format. Deprecated.
+ TEXT_INDEX_VERSION_2 = 2 // Current index format.
+};
+}
}
-
diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp
index 9353fccf297..07d17c050eb 100644
--- a/src/mongo/db/fts/stemmer.cpp
+++ b/src/mongo/db/fts/stemmer.cpp
@@ -36,39 +36,36 @@
namespace mongo {
- namespace fts {
+namespace fts {
- using std::string;
+using std::string;
- Stemmer::Stemmer( const FTSLanguage* language ) {
- _stemmer = NULL;
- if ( language->str() != "none" )
- _stemmer = sb_stemmer_new(language->str().c_str(), "UTF_8");
- }
-
- Stemmer::~Stemmer() {
- if ( _stemmer ) {
- sb_stemmer_delete(_stemmer);
- _stemmer = NULL;
- }
- }
-
- string Stemmer::stem( StringData word ) const {
- if ( !_stemmer )
- return word.toString();
+Stemmer::Stemmer(const FTSLanguage* language) {
+ _stemmer = NULL;
+ if (language->str() != "none")
+ _stemmer = sb_stemmer_new(language->str().c_str(), "UTF_8");
+}
- const sb_symbol* sb_sym = sb_stemmer_stem( _stemmer,
- (const sb_symbol*)word.rawData(),
- word.size() );
+Stemmer::~Stemmer() {
+ if (_stemmer) {
+ sb_stemmer_delete(_stemmer);
+ _stemmer = NULL;
+ }
+}
- if ( sb_sym == NULL ) {
- // out of memory
- invariant( false );
- }
+string Stemmer::stem(StringData word) const {
+ if (!_stemmer)
+ return word.toString();
- return string( (const char*)(sb_sym), sb_stemmer_length( _stemmer ) );
- }
+ const sb_symbol* sb_sym =
+ sb_stemmer_stem(_stemmer, (const sb_symbol*)word.rawData(), word.size());
+ if (sb_sym == NULL) {
+ // out of memory
+ invariant(false);
}
+ return string((const char*)(sb_sym), sb_stemmer_length(_stemmer));
+}
+}
}
diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h
index d6d76e64218..59261bfb6a0 100644
--- a/src/mongo/db/fts/stemmer.h
+++ b/src/mongo/db/fts/stemmer.h
@@ -39,23 +39,24 @@
namespace mongo {
- namespace fts {
-
- /**
- * maintains case
- * but works
- * running/Running -> run/Run
- */
- class Stemmer {
- MONGO_DISALLOW_COPYING( Stemmer );
- public:
- Stemmer( const FTSLanguage* language );
- ~Stemmer();
-
- std::string stem( StringData word ) const;
- private:
- struct sb_stemmer* _stemmer;
- };
- }
-}
+namespace fts {
+/**
+ * maintains case
+ * but works
+ * running/Running -> run/Run
+ */
+class Stemmer {
+ MONGO_DISALLOW_COPYING(Stemmer);
+
+public:
+ Stemmer(const FTSLanguage* language);
+ ~Stemmer();
+
+ std::string stem(StringData word) const;
+
+private:
+ struct sb_stemmer* _stemmer;
+};
+}
+}
diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp
index bef556bf2ad..d40d25e8348 100644
--- a/src/mongo/db/fts/stemmer_test.cpp
+++ b/src/mongo/db/fts/stemmer_test.cpp
@@ -35,19 +35,18 @@
#include "mongo/db/fts/stemmer.h"
namespace mongo {
- namespace fts {
+namespace fts {
- TEST( English, Stemmer1 ) {
- Stemmer s( &languageEnglishV2 );
- ASSERT_EQUALS( "run", s.stem( "running" ) );
- ASSERT_EQUALS( "Run", s.stem( "Running" ) );
- }
-
- TEST( English, Caps ) {
- Stemmer s( &languagePorterV1 );
- ASSERT_EQUALS( "unit", s.stem( "united" ) );
- ASSERT_EQUALS( "Unite", s.stem( "United" ) );
- }
+TEST(English, Stemmer1) {
+ Stemmer s(&languageEnglishV2);
+ ASSERT_EQUALS("run", s.stem("running"));
+ ASSERT_EQUALS("Run", s.stem("Running"));
+}
- }
+TEST(English, Caps) {
+ Stemmer s(&languagePorterV1);
+ ASSERT_EQUALS("unit", s.stem("united"));
+ ASSERT_EQUALS("Unite", s.stem("United"));
+}
+}
}
diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp
index 421bfae63db..0a44eaf25ff 100644
--- a/src/mongo/db/fts/stop_words.cpp
+++ b/src/mongo/db/fts/stop_words.cpp
@@ -38,43 +38,38 @@
namespace mongo {
- namespace fts {
+namespace fts {
- void loadStopWordMap( StringMap< std::set< std::string > >* m );
-
- namespace {
- StringMap< std::shared_ptr<StopWords> > StopWordsMap;
- StopWords empty;
- }
+void loadStopWordMap(StringMap<std::set<std::string>>* m);
+namespace {
+StringMap<std::shared_ptr<StopWords>> StopWordsMap;
+StopWords empty;
+}
- StopWords::StopWords(){
- }
- StopWords::StopWords( const std::set<std::string>& words ) {
- for ( std::set<std::string>::const_iterator i = words.begin(); i != words.end(); ++i )
- _words.insert( *i );
- }
+StopWords::StopWords() {}
- const StopWords* StopWords::getStopWords( const FTSLanguage* language ) {
- auto i = StopWordsMap.find( language->str() );
- if ( i == StopWordsMap.end() )
- return &empty;
- return i->second.get();
- }
+StopWords::StopWords(const std::set<std::string>& words) {
+ for (std::set<std::string>::const_iterator i = words.begin(); i != words.end(); ++i)
+ _words.insert(*i);
+}
+const StopWords* StopWords::getStopWords(const FTSLanguage* language) {
+ auto i = StopWordsMap.find(language->str());
+ if (i == StopWordsMap.end())
+ return &empty;
+ return i->second.get();
+}
- MONGO_INITIALIZER(StopWords)(InitializerContext* context) {
- StringMap< std::set< std::string > > raw;
- loadStopWordMap( &raw );
- for ( StringMap< std::set< std::string > >::const_iterator i = raw.begin();
- i != raw.end();
- ++i ) {
- StopWordsMap[i->first].reset(new StopWords( i->second ));
- }
- return Status::OK();
- }
+MONGO_INITIALIZER(StopWords)(InitializerContext* context) {
+ StringMap<std::set<std::string>> raw;
+ loadStopWordMap(&raw);
+ for (StringMap<std::set<std::string>>::const_iterator i = raw.begin(); i != raw.end(); ++i) {
+ StopWordsMap[i->first].reset(new StopWords(i->second));
}
-
+ return Status::OK();
+}
+}
}
diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h
index d989b4dcd32..eebc11c012a 100644
--- a/src/mongo/db/fts/stop_words.h
+++ b/src/mongo/db/fts/stop_words.h
@@ -39,25 +39,27 @@
namespace mongo {
- namespace fts {
+namespace fts {
- class StopWords {
- MONGO_DISALLOW_COPYING( StopWords );
- public:
- StopWords();
- StopWords( const std::set<std::string>& words );
+class StopWords {
+ MONGO_DISALLOW_COPYING(StopWords);
- bool isStopWord( const std::string& word ) const {
- return _words.count( word ) > 0;
- }
+public:
+ StopWords();
+ StopWords(const std::set<std::string>& words);
- size_t numStopWords() const { return _words.size(); }
-
- static const StopWords* getStopWords( const FTSLanguage* language );
- private:
- unordered_set<std::string> _words;
- };
+ bool isStopWord(const std::string& word) const {
+ return _words.count(word) > 0;
+ }
+ size_t numStopWords() const {
+ return _words.size();
}
-}
+ static const StopWords* getStopWords(const FTSLanguage* language);
+
+private:
+ unordered_set<std::string> _words;
+};
+}
+}
diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp
index 248c4d93407..5834503dd4a 100644
--- a/src/mongo/db/fts/stop_words_test.cpp
+++ b/src/mongo/db/fts/stop_words_test.cpp
@@ -33,13 +33,12 @@
#include "mongo/unittest/unittest.h"
namespace mongo {
- namespace fts {
+namespace fts {
- TEST( English, Basic1 ) {
- const StopWords* englishStopWords = StopWords::getStopWords( &languageEnglishV2 );
- ASSERT( englishStopWords->isStopWord( "the" ) );
- ASSERT( !englishStopWords->isStopWord( "computer" ) );
- }
-
- }
+TEST(English, Basic1) {
+ const StopWords* englishStopWords = StopWords::getStopWords(&languageEnglishV2);
+ ASSERT(englishStopWords->isStopWord("the"));
+ ASSERT(!englishStopWords->isStopWord("computer"));
+}
+}
}
diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp
index 01037a85c8d..e1f595b9a4a 100644
--- a/src/mongo/db/fts/tokenizer.cpp
+++ b/src/mongo/db/fts/tokenizer.cpp
@@ -36,105 +36,103 @@
namespace mongo {
- namespace fts {
-
- Tokenizer::Tokenizer(const FTSLanguage* language, StringData str)
- : _pos(0), _raw( str ) {
- _english = ( language->str() == "english" );
- _skipWhitespace();
- }
-
- bool Tokenizer::more() const {
- return _pos < _raw.size();
- }
-
- Token Tokenizer::next() {
- if ( _pos >= _raw.size() )
- return Token(Token::INVALID, "", 0);
-
- unsigned start = _pos++;
- Token::Type type = _type( _raw[start] );
- if ( type == Token::WHITESPACE ) invariant( false );
-
- if ( type == Token::TEXT )
- while ( _pos < _raw.size() && _type( _raw[_pos] ) == type )
- _pos++;
-
- StringData ret = _raw.substr( start, _pos - start );
- _skipWhitespace();
- return Token( type, ret, start );
- }
-
-
- bool Tokenizer::_skipWhitespace() {
- unsigned start = _pos;
- while ( _pos < _raw.size() && _type( _raw[_pos] ) == Token::WHITESPACE )
- _pos++;
- return _pos > start;
- }
-
-
- Token::Type Tokenizer::_type( char c ) const {
- switch ( c ) {
- case ' ':
- case '\f':
- case '\v':
- case '\t':
- case '\r':
- case '\n':
- return Token::WHITESPACE;
- case '\'':
- if ( _english )
- return Token::TEXT;
- else
- return Token::WHITESPACE;
-
- case '~':
- case '`':
-
- case '!':
- case '@':
- case '#':
- case '$':
- case '%':
- case '^':
- case '&':
- case '*':
- case '(':
- case ')':
-
- case '-':
-
- case '=':
- case '+':
-
- case '[':
- case ']':
- case '{':
- case '}':
- case '|':
- case '\\':
-
- case ';':
- case ':':
-
- case '"':
-
- case '<':
- case '>':
-
- case ',':
- case '.':
-
- case '/':
- case '?':
-
- return Token::DELIMITER;
- default:
+namespace fts {
+
+Tokenizer::Tokenizer(const FTSLanguage* language, StringData str) : _pos(0), _raw(str) {
+ _english = (language->str() == "english");
+ _skipWhitespace();
+}
+
+bool Tokenizer::more() const {
+ return _pos < _raw.size();
+}
+
+Token Tokenizer::next() {
+ if (_pos >= _raw.size())
+ return Token(Token::INVALID, "", 0);
+
+ unsigned start = _pos++;
+ Token::Type type = _type(_raw[start]);
+ if (type == Token::WHITESPACE)
+ invariant(false);
+
+ if (type == Token::TEXT)
+ while (_pos < _raw.size() && _type(_raw[_pos]) == type)
+ _pos++;
+
+ StringData ret = _raw.substr(start, _pos - start);
+ _skipWhitespace();
+ return Token(type, ret, start);
+}
+
+
+bool Tokenizer::_skipWhitespace() {
+ unsigned start = _pos;
+ while (_pos < _raw.size() && _type(_raw[_pos]) == Token::WHITESPACE)
+ _pos++;
+ return _pos > start;
+}
+
+
+Token::Type Tokenizer::_type(char c) const {
+ switch (c) {
+ case ' ':
+ case '\f':
+ case '\v':
+ case '\t':
+ case '\r':
+ case '\n':
+ return Token::WHITESPACE;
+ case '\'':
+ if (_english)
return Token::TEXT;
- }
- }
+ else
+ return Token::WHITESPACE;
- }
+ case '~':
+ case '`':
+
+ case '!':
+ case '@':
+ case '#':
+ case '$':
+ case '%':
+ case '^':
+ case '&':
+ case '*':
+ case '(':
+ case ')':
+
+ case '-':
+
+ case '=':
+ case '+':
+
+ case '[':
+ case ']':
+ case '{':
+ case '}':
+ case '|':
+ case '\\':
+
+ case ';':
+ case ':':
+ case '"':
+
+ case '<':
+ case '>':
+
+ case ',':
+ case '.':
+
+ case '/':
+ case '?':
+
+ return Token::DELIMITER;
+ default:
+ return Token::TEXT;
+ }
+}
+}
}
diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h
index 503816cc434..f1184a455f2 100644
--- a/src/mongo/db/fts/tokenizer.h
+++ b/src/mongo/db/fts/tokenizer.h
@@ -38,41 +38,37 @@
namespace mongo {
- namespace fts {
+namespace fts {
- struct Token {
- enum Type { WHITESPACE, DELIMITER, TEXT, INVALID };
- Token( Type type, StringData data, unsigned offset)
- : type( type ),
- data( data ),
- offset( offset )
- {}
+struct Token {
+ enum Type { WHITESPACE, DELIMITER, TEXT, INVALID };
+ Token(Type type, StringData data, unsigned offset) : type(type), data(data), offset(offset) {}
- bool ok() const { return type != INVALID; }
-
- Type type;
- StringData data;
- unsigned offset;
- };
+ bool ok() const {
+ return type != INVALID;
+ }
- class Tokenizer {
- MONGO_DISALLOW_COPYING( Tokenizer );
- public:
+ Type type;
+ StringData data;
+ unsigned offset;
+};
- Tokenizer( const FTSLanguage* language, StringData str);
+class Tokenizer {
+ MONGO_DISALLOW_COPYING(Tokenizer);
- bool more() const;
- Token next();
+public:
+ Tokenizer(const FTSLanguage* language, StringData str);
- private:
- Token::Type _type( char c ) const;
- bool _skipWhitespace();
+ bool more() const;
+ Token next();
- unsigned _pos;
- const StringData _raw;
- bool _english;
- };
+private:
+ Token::Type _type(char c) const;
+ bool _skipWhitespace();
- }
+ unsigned _pos;
+ const StringData _raw;
+ bool _english;
+};
+}
}
-
diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp
index d370c9f6c0b..143e3b372ce 100644
--- a/src/mongo/db/fts/tokenizer_test.cpp
+++ b/src/mongo/db/fts/tokenizer_test.cpp
@@ -33,91 +33,88 @@
#include "mongo/unittest/unittest.h"
namespace mongo {
- namespace fts {
+namespace fts {
- TEST( Tokenizer, Empty1 ) {
- Tokenizer i( &languageEnglishV2, "" );
- ASSERT( !i.more() );
- }
-
- TEST( Tokenizer, Basic1 ) {
- Tokenizer i( &languageEnglishV2, "blue red green" );
+TEST(Tokenizer, Empty1) {
+ Tokenizer i(&languageEnglishV2, "");
+ ASSERT(!i.more());
+}
- ASSERT( i.more() );
- ASSERT_EQUALS( i.next().data.toString(), "blue" );
+TEST(Tokenizer, Basic1) {
+ Tokenizer i(&languageEnglishV2, "blue red green");
- ASSERT( i.more() );
- ASSERT_EQUALS( i.next().data.toString(), "red" );
+ ASSERT(i.more());
+ ASSERT_EQUALS(i.next().data.toString(), "blue");
- ASSERT( i.more() );
- ASSERT_EQUALS( i.next().data.toString(), "green" );
+ ASSERT(i.more());
+ ASSERT_EQUALS(i.next().data.toString(), "red");
- ASSERT( !i.more() );
- }
+ ASSERT(i.more());
+ ASSERT_EQUALS(i.next().data.toString(), "green");
- TEST( Tokenizer, Basic2 ) {
- Tokenizer i( &languageEnglishV2, "blue-red" );
+ ASSERT(!i.more());
+}
- Token a = i.next();
- Token b = i.next();
- Token c = i.next();
- Token d = i.next();
+TEST(Tokenizer, Basic2) {
+ Tokenizer i(&languageEnglishV2, "blue-red");
- ASSERT_EQUALS( Token::TEXT, a.type );
- ASSERT_EQUALS( Token::DELIMITER, b.type );
- ASSERT_EQUALS( Token::TEXT, c.type );
- ASSERT_EQUALS( Token::INVALID, d.type );
+ Token a = i.next();
+ Token b = i.next();
+ Token c = i.next();
+ Token d = i.next();
- ASSERT_EQUALS( "blue", a.data.toString() );
- ASSERT_EQUALS( "-", b.data.toString() );
- ASSERT_EQUALS( "red", c.data.toString() );
- }
+ ASSERT_EQUALS(Token::TEXT, a.type);
+ ASSERT_EQUALS(Token::DELIMITER, b.type);
+ ASSERT_EQUALS(Token::TEXT, c.type);
+ ASSERT_EQUALS(Token::INVALID, d.type);
- TEST( Tokenizer, Basic3 ) {
- Tokenizer i( &languageEnglishV2, "blue -red" );
+ ASSERT_EQUALS("blue", a.data.toString());
+ ASSERT_EQUALS("-", b.data.toString());
+ ASSERT_EQUALS("red", c.data.toString());
+}
- Token a = i.next();
- Token b = i.next();
- Token c = i.next();
- Token d = i.next();
+TEST(Tokenizer, Basic3) {
+ Tokenizer i(&languageEnglishV2, "blue -red");
- ASSERT_EQUALS( Token::TEXT, a.type );
- ASSERT_EQUALS( Token::DELIMITER, b.type );
- ASSERT_EQUALS( Token::TEXT, c.type );
- ASSERT_EQUALS( Token::INVALID, d.type );
+ Token a = i.next();
+ Token b = i.next();
+ Token c = i.next();
+ Token d = i.next();
- ASSERT_EQUALS( "blue", a.data.toString() );
- ASSERT_EQUALS( "-", b.data.toString() );
- ASSERT_EQUALS( "red", c.data.toString() );
+ ASSERT_EQUALS(Token::TEXT, a.type);
+ ASSERT_EQUALS(Token::DELIMITER, b.type);
+ ASSERT_EQUALS(Token::TEXT, c.type);
+ ASSERT_EQUALS(Token::INVALID, d.type);
- ASSERT_EQUALS( 0U, a.offset );
- ASSERT_EQUALS( 5U, b.offset );
- ASSERT_EQUALS( 6U, c.offset );
- }
+ ASSERT_EQUALS("blue", a.data.toString());
+ ASSERT_EQUALS("-", b.data.toString());
+ ASSERT_EQUALS("red", c.data.toString());
- TEST( Tokenizer, Quote1English ) {
- Tokenizer i( &languageEnglishV2, "eliot's car" );
+ ASSERT_EQUALS(0U, a.offset);
+ ASSERT_EQUALS(5U, b.offset);
+ ASSERT_EQUALS(6U, c.offset);
+}
- Token a = i.next();
- Token b = i.next();
+TEST(Tokenizer, Quote1English) {
+ Tokenizer i(&languageEnglishV2, "eliot's car");
- ASSERT_EQUALS( "eliot's", a.data.toString() );
- ASSERT_EQUALS( "car", b.data.toString() );
- }
+ Token a = i.next();
+ Token b = i.next();
- TEST( Tokenizer, Quote1French ) {
- Tokenizer i( &languageFrenchV2, "eliot's car" );
+ ASSERT_EQUALS("eliot's", a.data.toString());
+ ASSERT_EQUALS("car", b.data.toString());
+}
- Token a = i.next();
- Token b = i.next();
- Token c = i.next();
+TEST(Tokenizer, Quote1French) {
+ Tokenizer i(&languageFrenchV2, "eliot's car");
- ASSERT_EQUALS( "eliot", a.data.toString() );
- ASSERT_EQUALS( "s", b.data.toString() );
- ASSERT_EQUALS( "car", c.data.toString() );
- }
+ Token a = i.next();
+ Token b = i.next();
+ Token c = i.next();
- }
+ ASSERT_EQUALS("eliot", a.data.toString());
+ ASSERT_EQUALS("s", b.data.toString());
+ ASSERT_EQUALS("car", c.data.toString());
+}
+}
}
-
-