diff options
Diffstat (limited to 'src/mongo/db/fts')
37 files changed, 3928 insertions, 4044 deletions
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp index 2d5cc493123..9fc41923d40 100644 --- a/src/mongo/db/fts/fts_basic_tokenizer.cpp +++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp @@ -42,56 +42,54 @@ namespace mongo { namespace fts { - using std::string; - - BasicFTSTokenizer::BasicFTSTokenizer(const FTSLanguage* language) - : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) { - } - - void BasicFTSTokenizer::reset(StringData document, Options options) { - _options = options; - _document = document.toString(); - _tokenizer = stdx::make_unique<Tokenizer>(_language, _document); - } - - bool BasicFTSTokenizer::moveNext() { - while (true) { - bool hasMore = _tokenizer->more(); - if (!hasMore) { - _stem = ""; - return false; - } - - Token token = _tokenizer->next(); +using std::string; + +BasicFTSTokenizer::BasicFTSTokenizer(const FTSLanguage* language) + : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) {} + +void BasicFTSTokenizer::reset(StringData document, Options options) { + _options = options; + _document = document.toString(); + _tokenizer = stdx::make_unique<Tokenizer>(_language, _document); +} + +bool BasicFTSTokenizer::moveNext() { + while (true) { + bool hasMore = _tokenizer->more(); + if (!hasMore) { + _stem = ""; + return false; + } - // Do not return delimiters - if (token.type != Token::TEXT) { - continue; - } + Token token = _tokenizer->next(); - string word = token.data.toString(); + // Do not return delimiters + if (token.type != Token::TEXT) { + continue; + } - word = tolowerString(token.data); + string word = token.data.toString(); - // Stop words are case-sensitive so we need them to be lower cased to check - // against the stop word list - if ((_options & FTSTokenizer::FilterStopWords) && - _stopWords->isStopWord(word)) { - continue; - } + word = tolowerString(token.data); - if (_options & 
FTSTokenizer::GenerateCaseSensitiveTokens) { - word = token.data.toString(); - } + // Stop words are case-sensitive so we need them to be lower cased to check + // against the stop word list + if ((_options & FTSTokenizer::FilterStopWords) && _stopWords->isStopWord(word)) { + continue; + } - _stem = _stemmer.stem(word); - return true; + if (_options & FTSTokenizer::GenerateCaseSensitiveTokens) { + word = token.data.toString(); } - } - StringData BasicFTSTokenizer::get() const { - return _stem; + _stem = _stemmer.stem(word); + return true; } +} + +StringData BasicFTSTokenizer::get() const { + return _stem; +} -} // namespace fts -} // namespace mongo +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_basic_tokenizer.h b/src/mongo/db/fts/fts_basic_tokenizer.h index 45b3ad8e074..221de72bb8c 100644 --- a/src/mongo/db/fts/fts_basic_tokenizer.h +++ b/src/mongo/db/fts/fts_basic_tokenizer.h @@ -37,44 +37,45 @@ namespace mongo { namespace fts { - class FTSLanguage; - class StopWords; +class FTSLanguage; +class StopWords; - /** - * BasicFTSTokenizer - * A iterator of "documents" where a document contains ASCII space (U+0020) delimited words. - * Uses - * - Tokenizer for tokenizing words via ASCII space (ie, U+0020 space). - * - tolower from the C standard libary to lower letters, ie, it only supports lower casing - * - ASCII letters (U+0000 - U+007F) - * - Stemmer (ie, Snowball Stemmer) to stem words. - * - Embeded stop word lists for each language in StopWord class - * - * For each word returns a stem version of a word optimized for full text indexing. - * Optionally supports returning case sensitive search terms. - */ - class BasicFTSTokenizer : public FTSTokenizer { - MONGO_DISALLOW_COPYING(BasicFTSTokenizer); - public: - BasicFTSTokenizer(const FTSLanguage* language); +/** + * BasicFTSTokenizer + * A iterator of "documents" where a document contains ASCII space (U+0020) delimited words. 
+ * Uses + * - Tokenizer for tokenizing words via ASCII space (ie, U+0020 space). + * - tolower from the C standard libary to lower letters, ie, it only supports lower casing + * - ASCII letters (U+0000 - U+007F) + * - Stemmer (ie, Snowball Stemmer) to stem words. + * - Embeded stop word lists for each language in StopWord class + * + * For each word returns a stem version of a word optimized for full text indexing. + * Optionally supports returning case sensitive search terms. + */ +class BasicFTSTokenizer : public FTSTokenizer { + MONGO_DISALLOW_COPYING(BasicFTSTokenizer); + +public: + BasicFTSTokenizer(const FTSLanguage* language); - void reset(StringData document, Options options) final; + void reset(StringData document, Options options) final; - bool moveNext() final; + bool moveNext() final; - StringData get() const final; + StringData get() const final; - private: - const FTSLanguage* const _language; - const Stemmer _stemmer; - const StopWords* const _stopWords; +private: + const FTSLanguage* const _language; + const Stemmer _stemmer; + const StopWords* const _stopWords; - std::string _document; - std::unique_ptr<Tokenizer> _tokenizer; - Options _options; + std::string _document; + std::unique_ptr<Tokenizer> _tokenizer; + Options _options; - std::string _stem; - }; + std::string _stem; +}; -} // namespace fts -} // namespace mongo +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp index 384be225f28..5feab67face 100644 --- a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp +++ b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp @@ -33,53 +33,51 @@ namespace mongo { namespace fts { - std::vector<std::string> tokenizeString(const char* str, const char* language) { - StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_2); - ASSERT_OK(swl); +std::vector<std::string> tokenizeString(const char* str, const char* language) { + StatusWithFTSLanguage swl = 
FTSLanguage::make(language, TEXT_INDEX_VERSION_2); + ASSERT_OK(swl); - std::unique_ptr<FTSTokenizer> tokenizer(swl.getValue()->createTokenizer()); + std::unique_ptr<FTSTokenizer> tokenizer(swl.getValue()->createTokenizer()); - tokenizer->reset(str, FTSTokenizer::None); + tokenizer->reset(str, FTSTokenizer::None); - std::vector<std::string> terms; + std::vector<std::string> terms; - while (tokenizer->moveNext()) { - terms.push_back(tokenizer->get().toString()); - } - - return terms; + while (tokenizer->moveNext()) { + terms.push_back(tokenizer->get().toString()); } - // Ensure punctuation is filtered out of the indexed document - // and the 's is not separated - TEST(FtsBasicTokenizer, English) { - std::vector<std::string> terms = tokenizeString("Do you see Mark's dog running?", - "english"); + return terms; +} - ASSERT_EQUALS(6U, terms.size()); - ASSERT_EQUALS("do", terms[0]); - ASSERT_EQUALS("you", terms[1]); - ASSERT_EQUALS("see", terms[2]); - ASSERT_EQUALS("mark", terms[3]); - ASSERT_EQUALS("dog", terms[4]); - ASSERT_EQUALS("run", terms[5]); - } +// Ensure punctuation is filtered out of the indexed document +// and the 's is not separated +TEST(FtsBasicTokenizer, English) { + std::vector<std::string> terms = tokenizeString("Do you see Mark's dog running?", "english"); - // Ensure punctuation is filtered out of the indexed document - // and the 's is separated - TEST(FtsBasicTokenizer, French) { - std::vector<std::string> terms = tokenizeString("Do you see Mark's dog running?", - "french"); + ASSERT_EQUALS(6U, terms.size()); + ASSERT_EQUALS("do", terms[0]); + ASSERT_EQUALS("you", terms[1]); + ASSERT_EQUALS("see", terms[2]); + ASSERT_EQUALS("mark", terms[3]); + ASSERT_EQUALS("dog", terms[4]); + ASSERT_EQUALS("run", terms[5]); +} - ASSERT_EQUALS(7U, terms.size()); - ASSERT_EQUALS("do", terms[0]); - ASSERT_EQUALS("you", terms[1]); - ASSERT_EQUALS("se", terms[2]); - ASSERT_EQUALS("mark", terms[3]); - ASSERT_EQUALS("s", terms[4]); - ASSERT_EQUALS("dog", terms[5]); - 
ASSERT_EQUALS("running", terms[6]); - } +// Ensure punctuation is filtered out of the indexed document +// and the 's is separated +TEST(FtsBasicTokenizer, French) { + std::vector<std::string> terms = tokenizeString("Do you see Mark's dog running?", "french"); + + ASSERT_EQUALS(7U, terms.size()); + ASSERT_EQUALS("do", terms[0]); + ASSERT_EQUALS("you", terms[1]); + ASSERT_EQUALS("se", terms[2]); + ASSERT_EQUALS("mark", terms[3]); + ASSERT_EQUALS("s", terms[4]); + ASSERT_EQUALS("dog", terms[5]); + ASSERT_EQUALS("running", terms[6]); +} -} // namespace fts -} // namespace mongo +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_element_iterator.cpp b/src/mongo/db/fts/fts_element_iterator.cpp index f57e1097c14..4df642dc66a 100644 --- a/src/mongo/db/fts/fts_element_iterator.cpp +++ b/src/mongo/db/fts/fts_element_iterator.cpp @@ -37,152 +37,149 @@ namespace mongo { - namespace fts { +namespace fts { + +using std::string; + +extern const double DEFAULT_WEIGHT; +extern const double MAX_WEIGHT; + +std::ostream& operator<<(std::ostream& os, FTSElementIterator::FTSIteratorFrame& frame) { + BSONObjIterator it = frame._it; + return os << "FTSIteratorFrame[" + " element=" << (*it).toString() << ", _language=" << frame._language->str() + << ", _parentPath=" << frame._parentPath << ", _isArray=" << frame._isArray << "]"; +} + +FTSElementIterator::FTSElementIterator(const FTSSpec& spec, const BSONObj& obj) + : _frame(obj, spec, &spec.defaultLanguage(), "", false), + _spec(spec), + _currentValue(advance()) {} + +namespace { +/** Check for exact match or path prefix match. 
*/ +inline bool _matchPrefix(const string& dottedName, const string& weight) { + if (weight == dottedName) { + return true; + } + return mongoutils::str::startsWith(weight, dottedName + '.'); +} +} + +bool FTSElementIterator::more() { + //_currentValue = advance(); + return _currentValue.valid(); +} + +FTSIteratorValue FTSElementIterator::next() { + FTSIteratorValue result = _currentValue; + _currentValue = advance(); + return result; +} - using std::string; - - extern const double DEFAULT_WEIGHT; - extern const double MAX_WEIGHT; - - std::ostream& operator<<( std::ostream& os, FTSElementIterator::FTSIteratorFrame& frame ) { - BSONObjIterator it = frame._it; - return os << "FTSIteratorFrame[" - " element=" << (*it).toString() << - ", _language=" << frame._language->str() << - ", _parentPath=" << frame._parentPath << - ", _isArray=" << frame._isArray << "]"; - } - - FTSElementIterator::FTSElementIterator( const FTSSpec& spec, const BSONObj& obj ) - : _frame( obj, spec, &spec.defaultLanguage(), "", false ), - _spec( spec ), - _currentValue( advance() ) - { } - - namespace { - /** Check for exact match or path prefix match. */ - inline bool _matchPrefix( const string& dottedName, const string& weight ) { - if ( weight == dottedName ) { - return true; - } - return mongoutils::str::startsWith( weight, dottedName + '.' 
); - } - } - - bool FTSElementIterator::more() { - //_currentValue = advance(); - return _currentValue.valid(); +/** + * Helper method: + * if (current object iterator not exhausted) return true; + * while (frame stack not empty) { + * resume object iterator popped from stack; + * if (resumed iterator not exhausted) return true; + * } + * return false; + */ +bool FTSElementIterator::moreFrames() { + if (_frame._it.more()) + return true; + while (!_frameStack.empty()) { + _frame = _frameStack.top(); + _frameStack.pop(); + if (_frame._it.more()) { + return true; } - - FTSIteratorValue FTSElementIterator::next() { - FTSIteratorValue result = _currentValue; - _currentValue = advance(); - return result; + } + return false; +} + +FTSIteratorValue FTSElementIterator::advance() { + while (moreFrames()) { + BSONElement elem = _frame._it.next(); + string fieldName = elem.fieldName(); + + // Skip "language" specifier fields if wildcard. + if (_spec.wildcard() && _spec.languageOverrideField() == fieldName) { + continue; } - /** - * Helper method: - * if (current object iterator not exhausted) return true; - * while (frame stack not empty) { - * resume object iterator popped from stack; - * if (resumed iterator not exhausted) return true; - * } - * return false; - */ - bool FTSElementIterator::moreFrames() { - if (_frame._it.more()) return true; - while (!_frameStack.empty()) { - _frame = _frameStack.top(); - _frameStack.pop(); - if (_frame._it.more()) { - return true; - } + // Compose the dotted name of the current field: + // 1. parent path empty (top level): use the current field name + // 2. parent path non-empty and obj is an array: use the parent path + // 3. parent path non-empty and obj is a sub-doc: append field name to parent path + string dottedName = (_frame._parentPath.empty() ? fieldName : _frame._isArray + ? _frame._parentPath + : _frame._parentPath + '.' + fieldName); + + // Find lower bound of dottedName in _weights. 
lower_bound leaves us at the first + // weight that could possibly match or be a prefix of dottedName. And if this + // element fails to match, then no subsequent weight can match, since the weights + // are lexicographically ordered. + Weights::const_iterator i = + _spec.weights().lower_bound(elem.type() == Object ? dottedName + '.' : dottedName); + + // possibleWeightMatch is set if the weight map contains either a match or some item + // lexicographically larger than fieldName. This boolean acts as a guard on + // dereferences of iterator 'i'. + bool possibleWeightMatch = (i != _spec.weights().end()); + + // Optimize away two cases, when not wildcard: + // 1. lower_bound seeks to end(): no prefix match possible + // 2. lower_bound seeks to a name which is not a prefix + if (!_spec.wildcard()) { + if (!possibleWeightMatch) { + continue; + } else if (!_matchPrefix(dottedName, i->first)) { + continue; } - return false; } - FTSIteratorValue FTSElementIterator::advance() { - while ( moreFrames() ) { - - BSONElement elem = _frame._it.next(); - string fieldName = elem.fieldName(); + // Is the current field an exact match on a weight? + bool exactMatch = (possibleWeightMatch && i->first == dottedName); + double weight = (possibleWeightMatch ? i->second : DEFAULT_WEIGHT); - // Skip "language" specifier fields if wildcard. - if ( _spec.wildcard() && _spec.languageOverrideField() == fieldName ) { - continue; + switch (elem.type()) { + case String: + // Only index strings on exact match or wildcard. + if (exactMatch || _spec.wildcard()) { + return FTSIteratorValue(elem.valuestr(), _frame._language, weight); } - - // Compose the dotted name of the current field: - // 1. parent path empty (top level): use the current field name - // 2. parent path non-empty and obj is an array: use the parent path - // 3. parent path non-empty and obj is a sub-doc: append field name to parent path - string dottedName = ( _frame._parentPath.empty() ? fieldName - : _frame._isArray ? 
_frame._parentPath - : _frame._parentPath + '.' + fieldName ); - - // Find lower bound of dottedName in _weights. lower_bound leaves us at the first - // weight that could possibly match or be a prefix of dottedName. And if this - // element fails to match, then no subsequent weight can match, since the weights - // are lexicographically ordered. - Weights::const_iterator i = _spec.weights().lower_bound( elem.type() == Object - ? dottedName + '.' - : dottedName ); - - // possibleWeightMatch is set if the weight map contains either a match or some item - // lexicographically larger than fieldName. This boolean acts as a guard on - // dereferences of iterator 'i'. - bool possibleWeightMatch = ( i != _spec.weights().end() ); - - // Optimize away two cases, when not wildcard: - // 1. lower_bound seeks to end(): no prefix match possible - // 2. lower_bound seeks to a name which is not a prefix - if ( !_spec.wildcard() ) { - if ( !possibleWeightMatch ) { - continue; - } - else if ( !_matchPrefix( dottedName, i->first ) ) { - continue; - } + break; + + case Object: + // Only descend into a sub-document on proper prefix or wildcard. Note that + // !exactMatch is a sufficient test for proper prefix match, because of + // if ( !matchPrefix( dottedName, i->first ) ) continue; + // block above. + if (!exactMatch || _spec.wildcard()) { + _frameStack.push(_frame); + _frame = + FTSIteratorFrame(elem.Obj(), _spec, _frame._language, dottedName, false); } - - // Is the current field an exact match on a weight? - bool exactMatch = ( possibleWeightMatch && i->first == dottedName ); - double weight = ( possibleWeightMatch ? i->second : DEFAULT_WEIGHT ); - - switch ( elem.type() ) { - case String: - // Only index strings on exact match or wildcard. - if ( exactMatch || _spec.wildcard() ) { - return FTSIteratorValue( elem.valuestr(), _frame._language, weight ); - } - break; - - case Object: - // Only descend into a sub-document on proper prefix or wildcard. 
Note that - // !exactMatch is a sufficient test for proper prefix match, because of - // if ( !matchPrefix( dottedName, i->first ) ) continue; - // block above. - if ( !exactMatch || _spec.wildcard() ) { - _frameStack.push( _frame ); - _frame = FTSIteratorFrame( elem.Obj(), _spec, _frame._language, dottedName, false ); - } - break; - - case Array: - // Only descend into arrays from non-array parents or on wildcard. - if ( !_frame._isArray || _spec.wildcard() ) { - _frameStack.push( _frame ); - _frame = FTSIteratorFrame( elem.Obj(), _spec, _frame._language, dottedName, true ); - } - break; - - default: - // Skip over all other BSON types. - break; + break; + + case Array: + // Only descend into arrays from non-array parents or on wildcard. + if (!_frame._isArray || _spec.wildcard()) { + _frameStack.push(_frame); + _frame = + FTSIteratorFrame(elem.Obj(), _spec, _frame._language, dottedName, true); } - } - return FTSIteratorValue(); // valid()==false + break; + + default: + // Skip over all other BSON types. 
+ break; } + } + return FTSIteratorValue(); // valid()==false +} - } // namespace fts -} // namespace mongo +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_element_iterator.h b/src/mongo/db/fts/fts_element_iterator.h index 17f72fff7f9..21e1e446627 100644 --- a/src/mongo/db/fts/fts_element_iterator.h +++ b/src/mongo/db/fts/fts_element_iterator.h @@ -40,133 +40,121 @@ namespace mongo { - namespace fts { - - /** - * Encapsulates data fields returned by FTSElementIterator - */ - struct FTSIteratorValue { - - FTSIteratorValue( const char* text, - const FTSLanguage* language, - double weight ) - : _text(text), - _language(language), - _weight(weight), - _valid(true) - {} - - FTSIteratorValue() - : _text(NULL), - _language(), - _weight(0.0), - _valid(false) - {} - - bool valid() const { return _valid; } - - const char* _text; - const FTSLanguage* _language; - double _weight; - bool _valid; - }; - - /** - * Iterator pattern for walking through text-indexed fields of a - * BSON document. - * - * Example usage: - * FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - * FTSElementIterator it( spec, obj ); - * while ( it.more() ) { - * FTSIteratorValue val = it.next(); - * std::cout << val._text << '[' << val._language.str() - * << ',' << val._weight << ']' << std::endl; - * } - * - */ - class FTSElementIterator { - public: - /** - * Iterator constructor - * - * Note: Caller must ensure that the constructed FTSElementIterator - * does >not< outlive either spec or obj. - * - * @arg spec text index specifier - * @arg obj document that the iterator will traverse - */ - FTSElementIterator( const FTSSpec& spec, const BSONObj& obj); - - /** - * Iterator interface: returns false iff there are no further text-indexable fields. - */ - bool more(); - - /** - * Iterator interface: advances to the next text-indexable field. - */ - FTSIteratorValue next(); - - /** - * Iterator frame needed for iterative implementation of - * recursive sub-documents. 
- */ - struct FTSIteratorFrame { - FTSIteratorFrame( const BSONObj& obj, - const FTSSpec& spec, - const FTSLanguage* parentLanguage, - const std::string& parentPath, - bool isArray ) - : _it( obj ), - _language( spec._getLanguageToUseV2( obj, parentLanguage ) ), - _parentPath( parentPath ), - _isArray( isArray ) - {} - - friend std::ostream& operator<<(std::ostream&, FTSIteratorFrame&); - - BSONObjIterator _it; - const FTSLanguage* _language; - std::string _parentPath; - bool _isArray; - }; - - private: - /** - * Helper method: - * returns false iff all FTSIteratorFrames on _frameStack are exhausted. - */ - bool moreFrames(); - - /** - * Helper method: - * advances to the next text-indexable field, possibly pushing frames as - * needed for recursive sub-documents. - */ - FTSIteratorValue advance(); - - /** - * Stack used by iterative implementation of recursive sub-document traversal. - */ - std::stack<FTSIteratorFrame> _frameStack; - - /** - * Current frame, not yet pushed to stack. - */ - FTSIteratorFrame _frame; - - /** - * Constructor input parameter: text index specification. - */ - const FTSSpec& _spec; - - /** - * Current iterator return value, computed by 'more()', returned by 'next()'. - */ - FTSIteratorValue _currentValue; - }; - - } // namespace fts -} // namespace mongo +namespace fts { +/** + * Encapsulates data fields returned by FTSElementIterator + */ +struct FTSIteratorValue { + FTSIteratorValue(const char* text, const FTSLanguage* language, double weight) + : _text(text), _language(language), _weight(weight), _valid(true) {} + + FTSIteratorValue() : _text(NULL), _language(), _weight(0.0), _valid(false) {} + + bool valid() const { + return _valid; + } + + const char* _text; + const FTSLanguage* _language; + double _weight; + bool _valid; +}; + +/** + * Iterator pattern for walking through text-indexed fields of a + * BSON document. 
+ * + * Example usage: + * FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); + * FTSElementIterator it( spec, obj ); + * while ( it.more() ) { + * FTSIteratorValue val = it.next(); + * std::cout << val._text << '[' << val._language.str() + * << ',' << val._weight << ']' << std::endl; + * } + * + */ +class FTSElementIterator { +public: + /** + * Iterator constructor + * + * Note: Caller must ensure that the constructed FTSElementIterator + * does >not< outlive either spec or obj. + * + * @arg spec text index specifier + * @arg obj document that the iterator will traverse + */ + FTSElementIterator(const FTSSpec& spec, const BSONObj& obj); + + /** + * Iterator interface: returns false iff there are no further text-indexable fields. + */ + bool more(); + + /** + * Iterator interface: advances to the next text-indexable field. + */ + FTSIteratorValue next(); + + /** + * Iterator frame needed for iterative implementation of + * recursive sub-documents. + */ + struct FTSIteratorFrame { + FTSIteratorFrame(const BSONObj& obj, + const FTSSpec& spec, + const FTSLanguage* parentLanguage, + const std::string& parentPath, + bool isArray) + : _it(obj), + _language(spec._getLanguageToUseV2(obj, parentLanguage)), + _parentPath(parentPath), + _isArray(isArray) {} + + friend std::ostream& operator<<(std::ostream&, FTSIteratorFrame&); + + BSONObjIterator _it; + const FTSLanguage* _language; + std::string _parentPath; + bool _isArray; + }; + +private: + /** + * Helper method: + * returns false iff all FTSIteratorFrames on _frameStack are exhausted. + */ + bool moreFrames(); + + /** + * Helper method: + * advances to the next text-indexable field, possibly pushing frames as + * needed for recursive sub-documents. + */ + FTSIteratorValue advance(); + + /** + * Stack used by iterative implementation of recursive sub-document traversal. + */ + std::stack<FTSIteratorFrame> _frameStack; + + /** + * Current frame, not yet pushed to stack. 
+ */ + FTSIteratorFrame _frame; + + /** + * Constructor input parameter: text index specification. + */ + const FTSSpec& _spec; + + /** + * Current iterator return value, computed by 'more()', returned by 'next()'. + */ + FTSIteratorValue _currentValue; +}; + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_element_iterator_test.cpp b/src/mongo/db/fts/fts_element_iterator_test.cpp index 6d5694c5990..2a16c14b5a5 100644 --- a/src/mongo/db/fts/fts_element_iterator_test.cpp +++ b/src/mongo/db/fts/fts_element_iterator_test.cpp @@ -34,279 +34,267 @@ #include "mongo/unittest/unittest.h" namespace mongo { - namespace fts { - - using std::string; - - TEST( FTSElementIterator, Test1 ) { - - BSONObj obj = fromjson( - "{ b : \"walking\"," - " c : { e: \"walked\" }," - " d : \"walker\"" - " }" ); - - BSONObj indexSpec = fromjson( - "{ key : { a : \"text\" }, weights : { b : 10, d : 5 } }" ); - - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - Weights::const_iterator itt = spec.weights().begin(); - ASSERT( itt != spec.weights().end() ); - ASSERT_EQUALS( "a", itt->first ); - ASSERT_EQUALS( 1, itt->second ); - ++itt; - ASSERT( itt != spec.weights().end() ); - ASSERT_EQUALS( "b", itt->first ); - ASSERT_EQUALS( 10, itt->second ); - ++itt; - ASSERT( itt != spec.weights().end() ); - ASSERT_EQUALS( "d", itt->first ); - ASSERT_EQUALS( 5, itt->second ); - ++itt; - - FTSElementIterator it( spec, obj ); - - ASSERT( it.more() ); - FTSIteratorValue val = it.next(); - ASSERT_EQUALS( "walking", string(val._text) ); - ASSERT_EQUALS( "english", val._language->str() ); - ASSERT_EQUALS( 10, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "walker", string(val._text) ); - ASSERT_EQUALS( "english", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - } - - // Multi-language : test - TEST( FTSElementIterator, Test2 ) { - - BSONObj obj = fromjson( - "{ a :" - " { b :" - " [ { c : \"walked\", language : \"english\" }," - " { c : 
\"camminato\", language : \"italian\" }," - " { c : \"ging\", language : \"german\" } ]" - " }," - " d : \"Feliz Año Nuevo!\"," - " language : \"spanish\"" - " }" ); - - BSONObj indexSpec = fromjson( - "{ key : { \"a.b.c\" : \"text\", d : \"text\" } }" ); - - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - - FTSElementIterator it( spec, obj ); - - ASSERT( it.more() ); - FTSIteratorValue val = it.next(); - ASSERT_EQUALS( "walked", string(val._text) ); - ASSERT_EQUALS( "english", val._language->str() ); - ASSERT_EQUALS( 1, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "camminato", string(val._text) ); - ASSERT_EQUALS( "italian", val._language->str() ); - ASSERT_EQUALS( 1, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "ging", string(val._text) ); - ASSERT_EQUALS( "german", val._language->str() ); - ASSERT_EQUALS( 1, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "Feliz Año Nuevo!", string(val._text) ); - ASSERT_EQUALS( "spanish", val._language->str() ); - ASSERT_EQUALS( 1, val._weight ); - } - - // Multi-language : test nested stemming per sub-document - TEST( FTSElementIterator, Test3 ) { - - BSONObj obj = fromjson( - "{ language : \"english\"," - " a :" - " { language : \"danish\"," - " b :" - " [ { c : \"foredrag\" }," - " { c : \"foredragsholder\" }," - " { c : \"lector\" } ]" - " }" - "}" ); - - BSONObj indexSpec = fromjson( - "{ key : { a : \"text\", \"a.b.c\" : \"text\" }, weights : { \"a.b.c\" : 5 } }" ); - - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - Weights::const_iterator itt = spec.weights().begin(); - ASSERT( itt != spec.weights().end() ); - ASSERT_EQUALS( "a", itt->first ); - ASSERT_EQUALS( 1, itt->second ); - ++itt; - ASSERT( itt != spec.weights().end() ); - ASSERT_EQUALS( "a.b.c", itt->first ); - ASSERT_EQUALS( 5, itt->second ); - - FTSElementIterator it( spec, obj ); - - ASSERT( it.more() ); - FTSIteratorValue val = it.next(); - ASSERT_EQUALS( "foredrag", 
string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "foredragsholder", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "lector", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - } - - // Multi-language : test nested arrays - TEST( FTSElementIterator, Test4 ) { - - BSONObj obj = fromjson( - "{ language : \"english\"," - " a : [" - " { language : \"danish\"," - " b :" - " [ { c : [\"foredrag\"] }," - " { c : [\"foredragsholder\"] }," - " { c : [\"lector\"] } ]" - " } ]" - "}" ); - - BSONObj indexSpec = fromjson( - "{ key : { \"a.b.c\" : \"text\" }, weights : { \"a.b.c\" : 5 } }" ); - - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - FTSElementIterator it( spec, obj ); - - ASSERT( it.more() ); - FTSIteratorValue val = it.next(); - ASSERT_EQUALS( "foredrag", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "foredragsholder", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "lector", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - } - - // Multi-language : test wildcard spec - TEST( FTSElementIterator, Test5 ) { - - BSONObj obj = fromjson( - "{ language : \"english\"," - " b : \"these boots were made for walking\"," - " c : { e: \"I walked half way to the market before seeing the sunrise\" }," - " d : " - " { language : \"danish\"," - " e :" - " [ { f : \"foredrag\", g : 12 }," - " { f : \"foredragsholder\", g : 13 }," - " { f : \"lector\", g : 14 } ]" - " }" - "}" ); - - BSONObj 
indexSpec = fromjson( - "{ key : { a : \"text\" }, weights : { b : 20, c : 10, \"d.e.f\" : 5 } }" ); - - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - FTSElementIterator it( spec, obj ); - - ASSERT( it.more() ); - FTSIteratorValue val = it.next(); - ASSERT_EQUALS( "these boots were made for walking", string(val._text) ); - ASSERT_EQUALS( "english", val._language->str() ); - ASSERT_EQUALS( 20, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "foredrag", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "foredragsholder", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "lector", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - } - - // Multi-language : test wildcard spec - TEST( FTSElementIterator, Test6 ) { - - BSONObj obj = fromjson( - "{ language : \"english\"," - " b : \"these boots were made for walking\"," - " c : { e: \"I walked half way to the market before seeing the sunrise\" }," - " d : " - " { language : \"danish\"," - " e :" - " [ { f : \"foredrag\", g : 12 }," - " { f : \"foredragsholder\", g : 13 }," - " { f : \"lector\", g : 14 } ]" - " }" - "}" ); - - BSONObj indexSpec = fromjson( - "{ key : { a : \"text\" }, weights : { b : 20, c : 10, \"d.e.f\" : 5 } }" ); - - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - FTSElementIterator it( spec, obj ); - - ASSERT( it.more() ); - FTSIteratorValue val = it.next(); - ASSERT_EQUALS( "these boots were made for walking", string(val._text) ); - ASSERT_EQUALS( "english", val._language->str() ); - ASSERT_EQUALS( 20, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "foredrag", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, 
val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "foredragsholder", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "lector", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - } - } +namespace fts { + +using std::string; + +TEST(FTSElementIterator, Test1) { + BSONObj obj = fromjson( + "{ b : \"walking\"," + " c : { e: \"walked\" }," + " d : \"walker\"" + " }"); + + BSONObj indexSpec = fromjson("{ key : { a : \"text\" }, weights : { b : 10, d : 5 } }"); + + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + Weights::const_iterator itt = spec.weights().begin(); + ASSERT(itt != spec.weights().end()); + ASSERT_EQUALS("a", itt->first); + ASSERT_EQUALS(1, itt->second); + ++itt; + ASSERT(itt != spec.weights().end()); + ASSERT_EQUALS("b", itt->first); + ASSERT_EQUALS(10, itt->second); + ++itt; + ASSERT(itt != spec.weights().end()); + ASSERT_EQUALS("d", itt->first); + ASSERT_EQUALS(5, itt->second); + ++itt; + + FTSElementIterator it(spec, obj); + + ASSERT(it.more()); + FTSIteratorValue val = it.next(); + ASSERT_EQUALS("walking", string(val._text)); + ASSERT_EQUALS("english", val._language->str()); + ASSERT_EQUALS(10, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("walker", string(val._text)); + ASSERT_EQUALS("english", val._language->str()); + ASSERT_EQUALS(5, val._weight); } +// Multi-language : test +TEST(FTSElementIterator, Test2) { + BSONObj obj = fromjson( + "{ a :" + " { b :" + " [ { c : \"walked\", language : \"english\" }," + " { c : \"camminato\", language : \"italian\" }," + " { c : \"ging\", language : \"german\" } ]" + " }," + " d : \"Feliz Año Nuevo!\"," + " language : \"spanish\"" + " }"); + + BSONObj indexSpec = fromjson("{ key : { \"a.b.c\" : \"text\", d : \"text\" } }"); + + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + + 
FTSElementIterator it(spec, obj); + + ASSERT(it.more()); + FTSIteratorValue val = it.next(); + ASSERT_EQUALS("walked", string(val._text)); + ASSERT_EQUALS("english", val._language->str()); + ASSERT_EQUALS(1, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("camminato", string(val._text)); + ASSERT_EQUALS("italian", val._language->str()); + ASSERT_EQUALS(1, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("ging", string(val._text)); + ASSERT_EQUALS("german", val._language->str()); + ASSERT_EQUALS(1, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("Feliz Año Nuevo!", string(val._text)); + ASSERT_EQUALS("spanish", val._language->str()); + ASSERT_EQUALS(1, val._weight); +} + +// Multi-language : test nested stemming per sub-document +TEST(FTSElementIterator, Test3) { + BSONObj obj = fromjson( + "{ language : \"english\"," + " a :" + " { language : \"danish\"," + " b :" + " [ { c : \"foredrag\" }," + " { c : \"foredragsholder\" }," + " { c : \"lector\" } ]" + " }" + "}"); + + BSONObj indexSpec = + fromjson("{ key : { a : \"text\", \"a.b.c\" : \"text\" }, weights : { \"a.b.c\" : 5 } }"); + + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + Weights::const_iterator itt = spec.weights().begin(); + ASSERT(itt != spec.weights().end()); + ASSERT_EQUALS("a", itt->first); + ASSERT_EQUALS(1, itt->second); + ++itt; + ASSERT(itt != spec.weights().end()); + ASSERT_EQUALS("a.b.c", itt->first); + ASSERT_EQUALS(5, itt->second); + + FTSElementIterator it(spec, obj); + + ASSERT(it.more()); + FTSIteratorValue val = it.next(); + ASSERT_EQUALS("foredrag", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("foredragsholder", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("lector", string(val._text)); + 
ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); +} + +// Multi-language : test nested arrays +TEST(FTSElementIterator, Test4) { + BSONObj obj = fromjson( + "{ language : \"english\"," + " a : [" + " { language : \"danish\"," + " b :" + " [ { c : [\"foredrag\"] }," + " { c : [\"foredragsholder\"] }," + " { c : [\"lector\"] } ]" + " } ]" + "}"); + + BSONObj indexSpec = fromjson("{ key : { \"a.b.c\" : \"text\" }, weights : { \"a.b.c\" : 5 } }"); + + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + FTSElementIterator it(spec, obj); + + ASSERT(it.more()); + FTSIteratorValue val = it.next(); + ASSERT_EQUALS("foredrag", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("foredragsholder", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("lector", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); +} + +// Multi-language : test wildcard spec +TEST(FTSElementIterator, Test5) { + BSONObj obj = fromjson( + "{ language : \"english\"," + " b : \"these boots were made for walking\"," + " c : { e: \"I walked half way to the market before seeing the sunrise\" }," + " d : " + " { language : \"danish\"," + " e :" + " [ { f : \"foredrag\", g : 12 }," + " { f : \"foredragsholder\", g : 13 }," + " { f : \"lector\", g : 14 } ]" + " }" + "}"); + + BSONObj indexSpec = + fromjson("{ key : { a : \"text\" }, weights : { b : 20, c : 10, \"d.e.f\" : 5 } }"); + + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + FTSElementIterator it(spec, obj); + + ASSERT(it.more()); + FTSIteratorValue val = it.next(); + ASSERT_EQUALS("these boots were made for walking", string(val._text)); + ASSERT_EQUALS("english", val._language->str()); + ASSERT_EQUALS(20, val._weight); + + ASSERT(it.more()); + val = it.next(); + 
ASSERT_EQUALS("foredrag", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("foredragsholder", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("lector", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); +} + +// Multi-language : test wildcard spec +TEST(FTSElementIterator, Test6) { + BSONObj obj = fromjson( + "{ language : \"english\"," + " b : \"these boots were made for walking\"," + " c : { e: \"I walked half way to the market before seeing the sunrise\" }," + " d : " + " { language : \"danish\"," + " e :" + " [ { f : \"foredrag\", g : 12 }," + " { f : \"foredragsholder\", g : 13 }," + " { f : \"lector\", g : 14 } ]" + " }" + "}"); + + BSONObj indexSpec = + fromjson("{ key : { a : \"text\" }, weights : { b : 20, c : 10, \"d.e.f\" : 5 } }"); + + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + FTSElementIterator it(spec, obj); + + ASSERT(it.more()); + FTSIteratorValue val = it.next(); + ASSERT_EQUALS("these boots were made for walking", string(val._text)); + ASSERT_EQUALS("english", val._language->str()); + ASSERT_EQUALS(20, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("foredrag", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("foredragsholder", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("lector", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); +} +} +} diff --git a/src/mongo/db/fts/fts_enabled.cpp b/src/mongo/db/fts/fts_enabled.cpp index b8e071bd62a..fb261194db1 100644 --- 
a/src/mongo/db/fts/fts_enabled.cpp +++ b/src/mongo/db/fts/fts_enabled.cpp @@ -35,46 +35,42 @@ #include "mongo/util/log.h" namespace mongo { - namespace fts { - namespace { +namespace fts { +namespace { - bool dummyEnabledFlag = true; // Unused, needed for server parameter. +bool dummyEnabledFlag = true; // Unused, needed for server parameter. - /** - * Declaration for the "textSearchEnabled" server parameter, which is now deprecated. - * Note that: - * - setting to true performs a no-op and logs a deprecation message. - * - setting to false will fail. - */ - class ExportedTextSearchEnabledParameter : public ExportedServerParameter<bool> { - public: - ExportedTextSearchEnabledParameter() : - ExportedServerParameter<bool>( ServerParameterSet::getGlobal(), - "textSearchEnabled", - &dummyEnabledFlag, - true, - true ) {} - - virtual Status validate( const bool& potentialNewValue ) { - if ( !potentialNewValue ) { - return Status( ErrorCodes::BadValue, - "textSearchEnabled cannot be set to false"); - } - - log() << "Attempted to set textSearchEnabled server parameter."; - log() << "Text search is enabled by default and cannot be disabled."; - log() << "The following are now deprecated and will be removed in a future " - << "release:"; - log() << "- the \"textSearchEnabled\" server parameter (setting it has no " - << "effect)"; - log() << "- the \"text\" command (has been replaced by the $text query " - "operator)"; +/** + * Declaration for the "textSearchEnabled" server parameter, which is now deprecated. + * Note that: + * - setting to true performs a no-op and logs a deprecation message. + * - setting to false will fail. 
+ */ +class ExportedTextSearchEnabledParameter : public ExportedServerParameter<bool> { +public: + ExportedTextSearchEnabledParameter() + : ExportedServerParameter<bool>( + ServerParameterSet::getGlobal(), "textSearchEnabled", &dummyEnabledFlag, true, true) { + } - return Status::OK(); - } + virtual Status validate(const bool& potentialNewValue) { + if (!potentialNewValue) { + return Status(ErrorCodes::BadValue, "textSearchEnabled cannot be set to false"); + } - } exportedTextSearchEnabledParam; + log() << "Attempted to set textSearchEnabled server parameter."; + log() << "Text search is enabled by default and cannot be disabled."; + log() << "The following are now deprecated and will be removed in a future " + << "release:"; + log() << "- the \"textSearchEnabled\" server parameter (setting it has no " + << "effect)"; + log() << "- the \"text\" command (has been replaced by the $text query " + "operator)"; - } + return Status::OK(); } + +} exportedTextSearchEnabledParam; +} +} } diff --git a/src/mongo/db/fts/fts_index_format.cpp b/src/mongo/db/fts/fts_index_format.cpp index fc0e703b84d..f7110d80858 100644 --- a/src/mongo/db/fts/fts_index_format.cpp +++ b/src/mongo/db/fts/fts_index_format.cpp @@ -40,178 +40,168 @@ namespace mongo { - namespace fts { - - using std::string; - using std::vector; - - namespace { - BSONObj nullObj; - BSONElement nullElt; - - // New in textIndexVersion 2. - // If the term is longer than 32 characters, it may - // result in the generated key being too large - // for the index. In that case, we generate a 64-character key - // from the concatenation of the first 32 characters - // and the hex string of the murmur3 hash value of the entire - // term value. - const size_t termKeyPrefixLength = 32U; - // 128-bit hash value expressed in hex = 32 characters - const size_t termKeySuffixLength = 32U; - const size_t termKeyLength = termKeyPrefixLength + termKeySuffixLength; - - /** - * Returns size of buffer required to store term in index key. 
- * In version 1, terms are stored verbatim in key. - * In version 2, terms longer than 32 characters are hashed and combined - * with a prefix. - */ - int guessTermSize( const std::string& term, TextIndexVersion textIndexVersion ) { - if ( TEXT_INDEX_VERSION_1 == textIndexVersion ) { - return term.size(); - } - else { - invariant( TEXT_INDEX_VERSION_2 == textIndexVersion ); - if ( term.size() <= termKeyPrefixLength ) { - return term.size(); - } - return termKeyLength; - } - } - } +namespace fts { + +using std::string; +using std::vector; + +namespace { +BSONObj nullObj; +BSONElement nullElt; + +// New in textIndexVersion 2. +// If the term is longer than 32 characters, it may +// result in the generated key being too large +// for the index. In that case, we generate a 64-character key +// from the concatenation of the first 32 characters +// and the hex string of the murmur3 hash value of the entire +// term value. +const size_t termKeyPrefixLength = 32U; +// 128-bit hash value expressed in hex = 32 characters +const size_t termKeySuffixLength = 32U; +const size_t termKeyLength = termKeyPrefixLength + termKeySuffixLength; - MONGO_INITIALIZER( FTSIndexFormat )( InitializerContext* context ) { - BSONObjBuilder b; - b.appendNull( "" ); - nullObj = b.obj(); - nullElt = nullObj.firstElement(); - return Status::OK(); +/** + * Returns size of buffer required to store term in index key. + * In version 1, terms are stored verbatim in key. + * In version 2, terms longer than 32 characters are hashed and combined + * with a prefix. 
+ */ +int guessTermSize(const std::string& term, TextIndexVersion textIndexVersion) { + if (TEXT_INDEX_VERSION_1 == textIndexVersion) { + return term.size(); + } else { + invariant(TEXT_INDEX_VERSION_2 == textIndexVersion); + if (term.size() <= termKeyPrefixLength) { + return term.size(); } + return termKeyLength; + } +} +} - void FTSIndexFormat::getKeys( const FTSSpec& spec, - const BSONObj& obj, - BSONObjSet* keys ) { - - int extraSize = 0; - vector<BSONElement> extrasBefore; - vector<BSONElement> extrasAfter; - - // compute the non FTS key elements - for ( unsigned i = 0; i < spec.numExtraBefore(); i++ ) { - BSONElement e = obj.getFieldDotted(spec.extraBefore(i)); - if ( e.eoo() ) - e = nullElt; - uassert( 16675, "cannot have a multi-key as a prefix to a text index", - e.type() != Array ); - extrasBefore.push_back(e); - extraSize += e.size(); - } - for ( unsigned i = 0; i < spec.numExtraAfter(); i++ ) { - BSONElement e = obj.getFieldDotted(spec.extraAfter(i)); - if ( e.eoo() ) - e = nullElt; - extrasAfter.push_back(e); - extraSize += e.size(); - } - - - TermFrequencyMap term_freqs; - spec.scoreDocument( obj, &term_freqs ); - - // create index keys from raw scores - // only 1 per string - - uassert( 16732, - mongoutils::str::stream() << "too many unique keys for a single document to" - << " have a text index, max is " << term_freqs.size() << obj["_id"], - term_freqs.size() <= 400000 ); - - long long keyBSONSize = 0; - const int MaxKeyBSONSizeMB = 4; - - for ( TermFrequencyMap::const_iterator i = term_freqs.begin(); i != term_freqs.end(); ++i ) { - - const string& term = i->first; - double weight = i->second; - - // guess the total size of the btree entry based on the size of the weight, term tuple - int guess = - 5 /* bson overhead */ + - 10 /* weight */ + - 8 /* term overhead */ + - /* term size (could be truncated/hashed) */ - guessTermSize( term, spec.getTextIndexVersion() ) + - extraSize; - - BSONObjBuilder b(guess); // builds a BSON object with guess length. 
- for ( unsigned k = 0; k < extrasBefore.size(); k++ ) { - b.appendAs( extrasBefore[k], "" ); - } - _appendIndexKey( b, weight, term, spec.getTextIndexVersion() ); - for ( unsigned k = 0; k < extrasAfter.size(); k++ ) { - b.appendAs( extrasAfter[k], "" ); - } - BSONObj res = b.obj(); - - verify( guess >= res.objsize() ); - - keys->insert( res ); - keyBSONSize += res.objsize(); - - uassert( 16733, - mongoutils::str::stream() - << "trying to index text where term list is too big, max is " - << MaxKeyBSONSizeMB << "mb " << obj["_id"], - keyBSONSize <= ( MaxKeyBSONSizeMB * 1024 * 1024 ) ); - - } - } +MONGO_INITIALIZER(FTSIndexFormat)(InitializerContext* context) { + BSONObjBuilder b; + b.appendNull(""); + nullObj = b.obj(); + nullElt = nullObj.firstElement(); + return Status::OK(); +} + +void FTSIndexFormat::getKeys(const FTSSpec& spec, const BSONObj& obj, BSONObjSet* keys) { + int extraSize = 0; + vector<BSONElement> extrasBefore; + vector<BSONElement> extrasAfter; + + // compute the non FTS key elements + for (unsigned i = 0; i < spec.numExtraBefore(); i++) { + BSONElement e = obj.getFieldDotted(spec.extraBefore(i)); + if (e.eoo()) + e = nullElt; + uassert(16675, "cannot have a multi-key as a prefix to a text index", e.type() != Array); + extrasBefore.push_back(e); + extraSize += e.size(); + } + for (unsigned i = 0; i < spec.numExtraAfter(); i++) { + BSONElement e = obj.getFieldDotted(spec.extraAfter(i)); + if (e.eoo()) + e = nullElt; + extrasAfter.push_back(e); + extraSize += e.size(); + } + + + TermFrequencyMap term_freqs; + spec.scoreDocument(obj, &term_freqs); + + // create index keys from raw scores + // only 1 per string + + uassert(16732, + mongoutils::str::stream() << "too many unique keys for a single document to" + << " have a text index, max is " << term_freqs.size() + << obj["_id"], + term_freqs.size() <= 400000); + + long long keyBSONSize = 0; + const int MaxKeyBSONSizeMB = 4; - BSONObj FTSIndexFormat::getIndexKey( double weight, - const string& term, - 
const BSONObj& indexPrefix, - TextIndexVersion textIndexVersion ) { - BSONObjBuilder b; + for (TermFrequencyMap::const_iterator i = term_freqs.begin(); i != term_freqs.end(); ++i) { + const string& term = i->first; + double weight = i->second; - BSONObjIterator i( indexPrefix ); - while ( i.more() ) { - b.appendAs( i.next(), "" ); - } + // guess the total size of the btree entry based on the size of the weight, term tuple + int guess = 5 /* bson overhead */ + 10 /* weight */ + 8 /* term overhead */ + + /* term size (could be truncated/hashed) */ + guessTermSize(term, spec.getTextIndexVersion()) + extraSize; - _appendIndexKey( b, weight, term, textIndexVersion ); - return b.obj(); + BSONObjBuilder b(guess); // builds a BSON object with guess length. + for (unsigned k = 0; k < extrasBefore.size(); k++) { + b.appendAs(extrasBefore[k], ""); } + _appendIndexKey(b, weight, term, spec.getTextIndexVersion()); + for (unsigned k = 0; k < extrasAfter.size(); k++) { + b.appendAs(extrasAfter[k], ""); + } + BSONObj res = b.obj(); + + verify(guess >= res.objsize()); + + keys->insert(res); + keyBSONSize += res.objsize(); + + uassert(16733, + mongoutils::str::stream() + << "trying to index text where term list is too big, max is " + << MaxKeyBSONSizeMB << "mb " << obj["_id"], + keyBSONSize <= (MaxKeyBSONSizeMB * 1024 * 1024)); + } +} + +BSONObj FTSIndexFormat::getIndexKey(double weight, + const string& term, + const BSONObj& indexPrefix, + TextIndexVersion textIndexVersion) { + BSONObjBuilder b; - void FTSIndexFormat::_appendIndexKey( BSONObjBuilder& b, double weight, const string& term, - TextIndexVersion textIndexVersion ) { - verify( weight >= 0 && weight <= MAX_WEIGHT ); // FTSmaxweight = defined in fts_header - // Terms are added to index key verbatim. - if ( TEXT_INDEX_VERSION_1 == textIndexVersion ) { - b.append( "", term ); - b.append( "", weight ); - } - // See comments at the top of file for termKeyPrefixLength. 
- // Apply hash for text index version 2 to long terms (longer than 32 characters). - else { - invariant( TEXT_INDEX_VERSION_2 == textIndexVersion ); - if ( term.size() <= termKeyPrefixLength ) { - b.append( "", term ); - } - else { - union { - uint64_t hash[2]; - char data[16]; - } t; - uint32_t seed = 0; - MurmurHash3_x64_128( term.data(), term.size(), seed, t.hash ); - string keySuffix = mongo::toHexLower( t.data, sizeof( t.data ) ); - invariant( termKeySuffixLength == keySuffix.size() ); - b.append( "", term.substr( 0, termKeyPrefixLength ) + - keySuffix ); - } - b.append( "", weight ); - } + BSONObjIterator i(indexPrefix); + while (i.more()) { + b.appendAs(i.next(), ""); + } + + _appendIndexKey(b, weight, term, textIndexVersion); + return b.obj(); +} + +void FTSIndexFormat::_appendIndexKey(BSONObjBuilder& b, + double weight, + const string& term, + TextIndexVersion textIndexVersion) { + verify(weight >= 0 && weight <= MAX_WEIGHT); // FTSmaxweight = defined in fts_header + // Terms are added to index key verbatim. + if (TEXT_INDEX_VERSION_1 == textIndexVersion) { + b.append("", term); + b.append("", weight); + } + // See comments at the top of file for termKeyPrefixLength. + // Apply hash for text index version 2 to long terms (longer than 32 characters). 
+ else { + invariant(TEXT_INDEX_VERSION_2 == textIndexVersion); + if (term.size() <= termKeyPrefixLength) { + b.append("", term); + } else { + union { + uint64_t hash[2]; + char data[16]; + } t; + uint32_t seed = 0; + MurmurHash3_x64_128(term.data(), term.size(), seed, t.hash); + string keySuffix = mongo::toHexLower(t.data, sizeof(t.data)); + invariant(termKeySuffixLength == keySuffix.size()); + b.append("", term.substr(0, termKeyPrefixLength) + keySuffix); } + b.append("", weight); } } +} +} diff --git a/src/mongo/db/fts/fts_index_format.h b/src/mongo/db/fts/fts_index_format.h index 579afb2d673..82be9ad03f5 100644 --- a/src/mongo/db/fts/fts_index_format.h +++ b/src/mongo/db/fts/fts_index_format.h @@ -37,40 +37,38 @@ namespace mongo { - namespace fts { +namespace fts { - class FTSSpec; +class FTSSpec; - class FTSIndexFormat { - public: +class FTSIndexFormat { +public: + static void getKeys(const FTSSpec& spec, const BSONObj& document, BSONObjSet* keys); - static void getKeys( const FTSSpec& spec, - const BSONObj& document, - BSONObjSet* keys ); + /** + * Helper method to get return entry from the FTSIndex as a BSONObj + * @param weight, the weight of the term in the entry + * @param term, the std::string term in the entry + * @param indexPrefix, the fields that go in the index first + * @param textIndexVersion, index version. affects key format. + */ + static BSONObj getIndexKey(double weight, + const std::string& term, + const BSONObj& indexPrefix, + TextIndexVersion textIndexVersion); - /** - * Helper method to get return entry from the FTSIndex as a BSONObj - * @param weight, the weight of the term in the entry - * @param term, the std::string term in the entry - * @param indexPrefix, the fields that go in the index first - * @param textIndexVersion, index version. affects key format. 
- */ - static BSONObj getIndexKey( double weight, - const std::string& term, - const BSONObj& indexPrefix, - TextIndexVersion textIndexVersion ); - - private: - /** - * Helper method to get return entry from the FTSIndex as a BSONObj - * @param b, reference to the BSONOBjBuilder - * @param weight, the weight of the term in the entry - * @param term, the std::string term in the entry - * @param textIndexVersion, index version. affects key format. - */ - static void _appendIndexKey( BSONObjBuilder& b, double weight, const std::string& term, - TextIndexVersion textIndexVersion ); - }; - - } +private: + /** + * Helper method to get return entry from the FTSIndex as a BSONObj + * @param b, reference to the BSONOBjBuilder + * @param weight, the weight of the term in the entry + * @param term, the std::string term in the entry + * @param textIndexVersion, index version. affects key format. + */ + static void _appendIndexKey(BSONObjBuilder& b, + double weight, + const std::string& term, + TextIndexVersion textIndexVersion); +}; +} } diff --git a/src/mongo/db/fts/fts_index_format_test.cpp b/src/mongo/db/fts/fts_index_format_test.cpp index a15d014e98c..f7c8a5fa432 100644 --- a/src/mongo/db/fts/fts_index_format_test.cpp +++ b/src/mongo/db/fts/fts_index_format_test.cpp @@ -42,165 +42,184 @@ namespace mongo { - namespace fts { - - using std::string; - - TEST( FTSIndexFormat, Simple1 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) ) ) ); - BSONObjSet keys; - FTSIndexFormat::getKeys( spec, BSON( "data" << "cat sat" ), &keys ); - - ASSERT_EQUALS( 2U, keys.size() ); - for ( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) { - BSONObj key = *i; - ASSERT_EQUALS( 2, key.nFields() ); - ASSERT_EQUALS( String, key.firstElement().type() ); - } - } - - TEST( FTSIndexFormat, ExtraBack1 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" << - "x" << 1 ) ) ) ); - BSONObjSet keys; - FTSIndexFormat::getKeys( spec, BSON( 
"data" << "cat" << "x" << 5 ), &keys ); - - ASSERT_EQUALS( 1U, keys.size() ); - BSONObj key = *(keys.begin()); - ASSERT_EQUALS( 3, key.nFields() ); - BSONObjIterator i( key ); - ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); - ASSERT( i.next().numberDouble() > 0 ); - ASSERT_EQUALS( 5, i.next().numberInt() ); - } +namespace fts { + +using std::string; + +TEST(FTSIndexFormat, Simple1) { + FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("data" + << "text")))); + BSONObjSet keys; + FTSIndexFormat::getKeys(spec, + BSON("data" + << "cat sat"), + &keys); + + ASSERT_EQUALS(2U, keys.size()); + for (BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i) { + BSONObj key = *i; + ASSERT_EQUALS(2, key.nFields()); + ASSERT_EQUALS(String, key.firstElement().type()); + } +} - /* - TEST( FTSIndexFormat, ExtraBackArray1 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" << - "x.y" << 1 ) ) ) ); - BSONObjSet keys; - FTSIndexFormat::getKeys( spec, - BSON( "data" << "cat" << - "x" << BSON_ARRAY( BSON( "y" << 1 ) << - BSON( "y" << 2 ) ) ), - &keys ); - - ASSERT_EQUALS( 1U, keys.size() ); - BSONObj key = *(keys.begin()); - log() << "e: " << key << endl; - ASSERT_EQUALS( 3, key.nFields() ); - BSONObjIterator i( key ); - ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); - ASSERT( i.next().numberDouble() > 0 ); - ASSERT_EQUALS( 5, i.next().numberInt() ); - } - */ - - TEST( FTSIndexFormat, ExtraFront1 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << 1 << - "data" << "text" ) ) ) ); - BSONObjSet keys; - FTSIndexFormat::getKeys( spec, BSON( "data" << "cat" << "x" << 5 ), &keys ); - - ASSERT_EQUALS( 1U, keys.size() ); - BSONObj key = *(keys.begin()); - ASSERT_EQUALS( 3, key.nFields() ); - BSONObjIterator i( key ); - ASSERT_EQUALS( 5, i.next().numberInt() ); - ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); - ASSERT( i.next().numberDouble() > 0 ); - } +TEST(FTSIndexFormat, ExtraBack1) { + FTSSpec 
spec(FTSSpec::fixSpec(BSON("key" << BSON("data" + << "text" + << "x" << 1)))); + BSONObjSet keys; + FTSIndexFormat::getKeys(spec, + BSON("data" + << "cat" + << "x" << 5), + &keys); + + ASSERT_EQUALS(1U, keys.size()); + BSONObj key = *(keys.begin()); + ASSERT_EQUALS(3, key.nFields()); + BSONObjIterator i(key); + ASSERT_EQUALS(StringData("cat"), i.next().valuestr()); + ASSERT(i.next().numberDouble() > 0); + ASSERT_EQUALS(5, i.next().numberInt()); +} - TEST( FTSIndexFormat, StopWords1 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) ) ) ); +/* +TEST( FTSIndexFormat, ExtraBackArray1 ) { + FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" << + "x.y" << 1 ) ) ) ); + BSONObjSet keys; + FTSIndexFormat::getKeys( spec, + BSON( "data" << "cat" << + "x" << BSON_ARRAY( BSON( "y" << 1 ) << + BSON( "y" << 2 ) ) ), + &keys ); + + ASSERT_EQUALS( 1U, keys.size() ); + BSONObj key = *(keys.begin()); + log() << "e: " << key << endl; + ASSERT_EQUALS( 3, key.nFields() ); + BSONObjIterator i( key ); + ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); + ASSERT( i.next().numberDouble() > 0 ); + ASSERT_EQUALS( 5, i.next().numberInt() ); +} +*/ - BSONObjSet keys1; - FTSIndexFormat::getKeys( spec, BSON( "data" << "computer" ), &keys1 ); - ASSERT_EQUALS( 1U, keys1.size() ); +TEST(FTSIndexFormat, ExtraFront1) { + FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("x" << 1 << "data" + << "text")))); + BSONObjSet keys; + FTSIndexFormat::getKeys(spec, + BSON("data" + << "cat" + << "x" << 5), + &keys); + + ASSERT_EQUALS(1U, keys.size()); + BSONObj key = *(keys.begin()); + ASSERT_EQUALS(3, key.nFields()); + BSONObjIterator i(key); + ASSERT_EQUALS(5, i.next().numberInt()); + ASSERT_EQUALS(StringData("cat"), i.next().valuestr()); + ASSERT(i.next().numberDouble() > 0); +} - BSONObjSet keys2; - FTSIndexFormat::getKeys( spec, BSON( "data" << "any computer" ), &keys2 ); - ASSERT_EQUALS( 1U, keys2.size() ); - } +TEST(FTSIndexFormat, StopWords1) { + 
FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("data" + << "text")))); + + BSONObjSet keys1; + FTSIndexFormat::getKeys(spec, + BSON("data" + << "computer"), + &keys1); + ASSERT_EQUALS(1U, keys1.size()); + + BSONObjSet keys2; + FTSIndexFormat::getKeys(spec, + BSON("data" + << "any computer"), + &keys2); + ASSERT_EQUALS(1U, keys2.size()); +} - /** - * Helper function to compare keys returned in getKeys() result - * with expected values. - */ - void assertEqualsIndexKeys( std::set<std::string>& expectedKeys, const BSONObjSet& keys ) { - ASSERT_EQUALS( expectedKeys.size(), keys.size() ); - for ( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) { - BSONObj key = *i; - ASSERT_EQUALS( 2, key.nFields() ); - ASSERT_EQUALS( String, key.firstElement().type() ); - string s = key.firstElement().String(); - std::set<string>::const_iterator j = expectedKeys.find(s); - if (j == expectedKeys.end()) { - mongoutils::str::stream ss; - ss << "unexpected key " << s << " in FTSIndexFormat::getKeys result. " - << "expected keys:"; - for (std::set<string>::const_iterator k = expectedKeys.begin(); - k != expectedKeys.end(); ++k) { - ss << "\n " << *k; - } - FAIL(ss); - } +/** + * Helper function to compare keys returned in getKeys() result + * with expected values. + */ +void assertEqualsIndexKeys(std::set<std::string>& expectedKeys, const BSONObjSet& keys) { + ASSERT_EQUALS(expectedKeys.size(), keys.size()); + for (BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i) { + BSONObj key = *i; + ASSERT_EQUALS(2, key.nFields()); + ASSERT_EQUALS(String, key.firstElement().type()); + string s = key.firstElement().String(); + std::set<string>::const_iterator j = expectedKeys.find(s); + if (j == expectedKeys.end()) { + mongoutils::str::stream ss; + ss << "unexpected key " << s << " in FTSIndexFormat::getKeys result. 
" + << "expected keys:"; + for (std::set<string>::const_iterator k = expectedKeys.begin(); k != expectedKeys.end(); + ++k) { + ss << "\n " << *k; } + FAIL(ss); } + } +} - /** - * Tests keys for long terms using text index version 1. - * Terms that are too long are not truncated in version 1. - */ - TEST( FTSIndexFormat, LongWordsTextIndexVersion1 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) << - "textIndexVersion" << 1 ) ) ); - BSONObjSet keys; - string longPrefix( 1024U, 'a' ); - // "aaa...aaacat" - string longWordCat = longPrefix + "cat"; - // "aaa...aaasat" - string longWordSat = longPrefix + "sat"; - string text = mongoutils::str::stream() << longWordCat << " " << longWordSat; - FTSIndexFormat::getKeys( spec, BSON( "data" << text ), &keys ); - - // Hard-coded expected computed keys for future-proofing. - std::set<string> expectedKeys; - // cat - expectedKeys.insert( longWordCat ); - // sat - expectedKeys.insert( longWordSat ); - - assertEqualsIndexKeys( expectedKeys, keys); - } - - /** - * Tests keys for long terms using text index version 2. - * In version 2, long terms (longer than 32 characters) - * are hashed with murmur3 and appended to the first 32 - * characters of the term to form the index key. - */ - TEST( FTSIndexFormat, LongWordTextIndexVersion2 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) << - "textIndexVersion" << 2 ) ) ); - BSONObjSet keys; - string longPrefix( 1024U, 'a' ); - // "aaa...aaacat" - string longWordCat = longPrefix + "cat"; - // "aaa...aaasat" - string longWordSat = longPrefix + "sat"; - string text = mongoutils::str::stream() << longWordCat << " " << longWordSat; - FTSIndexFormat::getKeys( spec, BSON( "data" << text ), &keys ); - - // Hard-coded expected computed keys for future-proofing. 
- std::set<string> expectedKeys; - // cat - expectedKeys.insert( "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab8e78455d827ebb87cbe87f392bf45f6" ); - // sat - expectedKeys.insert( "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaf2d6f58bb3b81b97e611ae7ccac6dea7" ); - - assertEqualsIndexKeys( expectedKeys, keys); - } +/** + * Tests keys for long terms using text index version 1. + * Terms that are too long are not truncated in version 1. + */ +TEST(FTSIndexFormat, LongWordsTextIndexVersion1) { + FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("data" + << "text") << "textIndexVersion" << 1))); + BSONObjSet keys; + string longPrefix(1024U, 'a'); + // "aaa...aaacat" + string longWordCat = longPrefix + "cat"; + // "aaa...aaasat" + string longWordSat = longPrefix + "sat"; + string text = mongoutils::str::stream() << longWordCat << " " << longWordSat; + FTSIndexFormat::getKeys(spec, BSON("data" << text), &keys); + + // Hard-coded expected computed keys for future-proofing. + std::set<string> expectedKeys; + // cat + expectedKeys.insert(longWordCat); + // sat + expectedKeys.insert(longWordSat); + + assertEqualsIndexKeys(expectedKeys, keys); +} - } +/** + * Tests keys for long terms using text index version 2. + * In version 2, long terms (longer than 32 characters) + * are hashed with murmur3 and appended to the first 32 + * characters of the term to form the index key. + */ +TEST(FTSIndexFormat, LongWordTextIndexVersion2) { + FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("data" + << "text") << "textIndexVersion" << 2))); + BSONObjSet keys; + string longPrefix(1024U, 'a'); + // "aaa...aaacat" + string longWordCat = longPrefix + "cat"; + // "aaa...aaasat" + string longWordSat = longPrefix + "sat"; + string text = mongoutils::str::stream() << longWordCat << " " << longWordSat; + FTSIndexFormat::getKeys(spec, BSON("data" << text), &keys); + + // Hard-coded expected computed keys for future-proofing. 
+ std::set<string> expectedKeys; + // cat + expectedKeys.insert("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab8e78455d827ebb87cbe87f392bf45f6"); + // sat + expectedKeys.insert("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaf2d6f58bb3b81b97e611ae7ccac6dea7"); + + assertEqualsIndexKeys(expectedKeys, keys); +} +} } diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp index 4b1966d6458..7a0c64ab1cf 100644 --- a/src/mongo/db/fts/fts_language.cpp +++ b/src/mongo/db/fts/fts_language.cpp @@ -31,7 +31,7 @@ #include "mongo/db/fts/fts_language.h" #include <string> - + #include "mongo/base/init.h" #include "mongo/db/fts/fts_basic_tokenizer.h" #include "mongo/stdx/memory.h" @@ -42,225 +42,220 @@ namespace mongo { - namespace fts { +namespace fts { - namespace { +namespace { - /** - * Case-insensitive StringData comparator. - */ - struct LanguageStringCompare { - /** Returns true if lhs < rhs. */ - bool operator()( std::string lhs, std::string rhs ) const { - size_t minSize = std::min( lhs.size(), rhs.size() ); +/** + * Case-insensitive StringData comparator. + */ +struct LanguageStringCompare { + /** Returns true if lhs < rhs. */ + bool operator()(std::string lhs, std::string rhs) const { + size_t minSize = std::min(lhs.size(), rhs.size()); - for ( size_t x = 0; x < minSize; x++ ) { - char a = tolower( lhs[x] ); - char b = tolower( rhs[x] ); - if ( a < b ) { - return true; - } - if ( a > b ) { - return false; - } - } + for (size_t x = 0; x < minSize; x++) { + char a = tolower(lhs[x]); + char b = tolower(rhs[x]); + if (a < b) { + return true; + } + if (a > b) { + return false; + } + } - return lhs.size() < rhs.size(); - } - }; + return lhs.size() < rhs.size(); + } +}; - // Lookup table from user language string (case-insensitive) to FTSLanguage. Populated - // by initializers in group FTSAllLanguagesRegistered and initializer - // FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes only. 
- typedef std::map<std::string, const FTSLanguage*, LanguageStringCompare> LanguageMapV2; - LanguageMapV2 languageMapV2; +// Lookup table from user language string (case-insensitive) to FTSLanguage. Populated +// by initializers in group FTSAllLanguagesRegistered and initializer +// FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes only. +typedef std::map<std::string, const FTSLanguage*, LanguageStringCompare> LanguageMapV2; +LanguageMapV2 languageMapV2; - // Like languageMapV2, but for use with TEXT_INDEX_VERSION_1 text indexes. - // Case-sensitive by lookup key. - typedef std::map<StringData, const FTSLanguage*> LanguageMapV1; - LanguageMapV1 languageMapV1; - } +// Like languageMapV2, but for use with TEXT_INDEX_VERSION_1 text indexes. +// Case-sensitive by lookup key. +typedef std::map<StringData, const FTSLanguage*> LanguageMapV1; +LanguageMapV1 languageMapV1; +} - std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const { - return stdx::make_unique<BasicFTSTokenizer>(this); - } +std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const { + return stdx::make_unique<BasicFTSTokenizer>(this); +} - MONGO_INITIALIZER_GROUP( FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, - MONGO_NO_DEPENDENTS ); +MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS); - // - // Register supported languages' canonical names for TEXT_INDEX_VERSION_2. - // +// +// Register supported languages' canonical names for TEXT_INDEX_VERSION_2. 
+// - MONGO_FTS_LANGUAGE_DECLARE( languageNoneV2, "none", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDanishV2, "danish", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDutchV2, "dutch", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageEnglishV2, "english", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFinnishV2, "finnish", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFrenchV2, "french", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageGermanV2, "german", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageHungarianV2, "hungarian", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageItalianV2, "italian", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageNorwegianV2, "norwegian", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languagePortugueseV2, "portuguese", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRomanianV2, "romanian", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRussianV2, "russian", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSpanishV2, "spanish", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSwedishV2, "swedish", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageTurkishV2, "turkish", TEXT_INDEX_VERSION_2 ); +MONGO_FTS_LANGUAGE_DECLARE(languageNoneV2, "none", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageDanishV2, "danish", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageDutchV2, "dutch", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageEnglishV2, "english", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageFinnishV2, "finnish", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageFrenchV2, "french", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageGermanV2, "german", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageHungarianV2, "hungarian", TEXT_INDEX_VERSION_2); 
+MONGO_FTS_LANGUAGE_DECLARE(languageItalianV2, "italian", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageNorwegianV2, "norwegian", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languagePortugueseV2, "portuguese", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageRomanianV2, "romanian", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageRussianV2, "russian", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageSpanishV2, "spanish", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageSwedishV2, "swedish", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageTurkishV2, "turkish", TEXT_INDEX_VERSION_2); - // - // Register all Snowball language modules for TEXT_INDEX_VERSION_1. Note that only the full - // names are recognized by the StopWords class (as such, the language string "dan" in - // TEXT_INDEX_VERSION_1 will generate the Danish stemmer and the empty stopword list). - // +// +// Register all Snowball language modules for TEXT_INDEX_VERSION_1. Note that only the full +// names are recognized by the StopWords class (as such, the language string "dan" in +// TEXT_INDEX_VERSION_1 will generate the Danish stemmer and the empty stopword list). 
+// - MONGO_FTS_LANGUAGE_DECLARE( languageNoneV1, "none", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDaV1, "da", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDanV1, "dan", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDanishV1, "danish", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDeV1, "de", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDeuV1, "deu", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDutV1, "dut", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDutchV1, "dutch", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageEnV1, "en", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageEngV1, "eng", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageEnglishV1, "english", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageEsV1, "es", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageEslV1, "esl", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFiV1, "fi", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFinV1, "fin", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFinnishV1, "finnish", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFrV1, "fr", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFraV1, "fra", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFreV1, "fre", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFrenchV1, "french", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageGerV1, "ger", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageGermanV1, "german", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageHuV1, "hu", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageHunV1, "hun", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageHungarianV1, "hungarian", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageItV1, "it", 
TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageItaV1, "ita", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageItalianV1, "italian", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageNlV1, "nl", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageNldV1, "nld", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageNoV1, "no", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageNorV1, "nor", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageNorwegianV1, "norwegian", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languagePorV1, "por", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languagePorterV1, "porter", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languagePortugueseV1, "portuguese", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languagePtV1, "pt", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRoV1, "ro", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRomanianV1, "romanian", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRonV1, "ron", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRuV1, "ru", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRumV1, "rum", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRusV1, "rus", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRussianV1, "russian", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSpaV1, "spa", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSpanishV1, "spanish", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSvV1, "sv", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSweV1, "swe", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSwedishV1, "swedish", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageTrV1, "tr", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageTurV1, "tur", TEXT_INDEX_VERSION_1 ); - 
MONGO_FTS_LANGUAGE_DECLARE( languageTurkishV1, "turkish", TEXT_INDEX_VERSION_1 ); +MONGO_FTS_LANGUAGE_DECLARE(languageNoneV1, "none", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDaV1, "da", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDanV1, "dan", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDanishV1, "danish", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDeV1, "de", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDeuV1, "deu", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDutV1, "dut", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDutchV1, "dutch", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageEnV1, "en", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageEngV1, "eng", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageEnglishV1, "english", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageEsV1, "es", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageEslV1, "esl", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFiV1, "fi", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFinV1, "fin", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFinnishV1, "finnish", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFrV1, "fr", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFraV1, "fra", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFreV1, "fre", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFrenchV1, "french", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageGerV1, "ger", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageGermanV1, "german", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageHuV1, "hu", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageHunV1, "hun", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageHungarianV1, "hungarian", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageItV1, "it", 
TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageItaV1, "ita", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageItalianV1, "italian", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageNlV1, "nl", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageNldV1, "nld", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageNoV1, "no", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageNorV1, "nor", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageNorwegianV1, "norwegian", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languagePorV1, "por", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languagePorterV1, "porter", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languagePortugueseV1, "portuguese", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languagePtV1, "pt", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRoV1, "ro", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRomanianV1, "romanian", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRonV1, "ron", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRuV1, "ru", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRumV1, "rum", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRusV1, "rus", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRussianV1, "russian", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageSpaV1, "spa", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageSpanishV1, "spanish", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageSvV1, "sv", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageSweV1, "swe", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageSwedishV1, "swedish", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageTrV1, "tr", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageTurV1, "tur", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageTurkishV1, "turkish", TEXT_INDEX_VERSION_1); - 
MONGO_INITIALIZER_WITH_PREREQUISITES( FTSRegisterLanguageAliases, - ( "FTSAllLanguagesRegistered" ) ) - ( InitializerContext* context ) { - // Register language aliases for TEXT_INDEX_VERSION_2. - FTSLanguage::registerLanguageAlias( &languageDanishV2, "da", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageDutchV2, "nl", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageEnglishV2, "en", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageFinnishV2, "fi", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageFrenchV2, "fr", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageGermanV2, "de", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageHungarianV2, "hu", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageItalianV2, "it", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageNorwegianV2, "nb", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languagePortugueseV2, "pt", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageRomanianV2, "ro", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageRussianV2, "ru", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageSpanishV2, "es", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageSwedishV2, "sv", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageTurkishV2, "tr", TEXT_INDEX_VERSION_2 ); - return Status::OK(); - } - - // static - void FTSLanguage::registerLanguage( StringData languageName, - TextIndexVersion textIndexVersion, - FTSLanguage* language ) { - verify( !languageName.empty() ); - language->_canonicalName = languageName.toString(); - switch ( textIndexVersion ) { - case TEXT_INDEX_VERSION_2: - languageMapV2[ languageName.toString() ] = language; - return; - case TEXT_INDEX_VERSION_1: - verify( languageMapV1.find( languageName ) == languageMapV1.end() 
); - languageMapV1[ languageName ] = language; - return; - } - verify( false ); - } +MONGO_INITIALIZER_WITH_PREREQUISITES(FTSRegisterLanguageAliases, ("FTSAllLanguagesRegistered")) +(InitializerContext* context) { + // Register language aliases for TEXT_INDEX_VERSION_2. + FTSLanguage::registerLanguageAlias(&languageDanishV2, "da", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageDutchV2, "nl", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageEnglishV2, "en", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageFinnishV2, "fi", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageFrenchV2, "fr", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageGermanV2, "de", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageHungarianV2, "hu", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageItalianV2, "it", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageNorwegianV2, "nb", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languagePortugueseV2, "pt", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageRomanianV2, "ro", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageRussianV2, "ru", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageSpanishV2, "es", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageSwedishV2, "sv", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageTurkishV2, "tr", TEXT_INDEX_VERSION_2); + return Status::OK(); +} - // static - void FTSLanguage::registerLanguageAlias( const FTSLanguage* language, - StringData alias, - TextIndexVersion textIndexVersion ) { - switch ( textIndexVersion ) { - case TEXT_INDEX_VERSION_2: - languageMapV2[ alias.toString() ] = language; - return; - case TEXT_INDEX_VERSION_1: - verify( languageMapV1.find( alias ) == languageMapV1.end() ); - languageMapV1[ alias ] = language; - return; - } - 
verify( false ); - } +// static +void FTSLanguage::registerLanguage(StringData languageName, + TextIndexVersion textIndexVersion, + FTSLanguage* language) { + verify(!languageName.empty()); + language->_canonicalName = languageName.toString(); + switch (textIndexVersion) { + case TEXT_INDEX_VERSION_2: + languageMapV2[languageName.toString()] = language; + return; + case TEXT_INDEX_VERSION_1: + verify(languageMapV1.find(languageName) == languageMapV1.end()); + languageMapV1[languageName] = language; + return; + } + verify(false); +} - FTSLanguage::FTSLanguage() : _canonicalName() { - } +// static +void FTSLanguage::registerLanguageAlias(const FTSLanguage* language, + StringData alias, + TextIndexVersion textIndexVersion) { + switch (textIndexVersion) { + case TEXT_INDEX_VERSION_2: + languageMapV2[alias.toString()] = language; + return; + case TEXT_INDEX_VERSION_1: + verify(languageMapV1.find(alias) == languageMapV1.end()); + languageMapV1[alias] = language; + return; + } + verify(false); +} - const std::string& FTSLanguage::str() const { - verify( !_canonicalName.empty() ); - return _canonicalName; - } +FTSLanguage::FTSLanguage() : _canonicalName() {} - // static - StatusWithFTSLanguage FTSLanguage::make( StringData langName, - TextIndexVersion textIndexVersion ) { - switch ( textIndexVersion ) { - case TEXT_INDEX_VERSION_2: { - LanguageMapV2::const_iterator it = languageMapV2.find( langName.toString() ); - if ( it == languageMapV2.end() ) { - // TEXT_INDEX_VERSION_2 rejects unrecognized language strings. 
- Status status = Status( ErrorCodes::BadValue, - mongoutils::str::stream() << - "unsupported language: \"" << langName << - "\"" ); - return StatusWithFTSLanguage( status ); - } +const std::string& FTSLanguage::str() const { + verify(!_canonicalName.empty()); + return _canonicalName; +} - return StatusWithFTSLanguage( it->second ); - } - case TEXT_INDEX_VERSION_1: { - LanguageMapV1::const_iterator it = languageMapV1.find( langName ); - if ( it == languageMapV1.end() ) { - // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none". - return StatusWithFTSLanguage( &languageNoneV1 ); - } - return StatusWithFTSLanguage( it->second ); - } +// static +StatusWithFTSLanguage FTSLanguage::make(StringData langName, TextIndexVersion textIndexVersion) { + switch (textIndexVersion) { + case TEXT_INDEX_VERSION_2: { + LanguageMapV2::const_iterator it = languageMapV2.find(langName.toString()); + if (it == languageMapV2.end()) { + // TEXT_INDEX_VERSION_2 rejects unrecognized language strings. + Status status = Status(ErrorCodes::BadValue, + mongoutils::str::stream() << "unsupported language: \"" + << langName << "\""); + return StatusWithFTSLanguage(status); } - verify( false ); - return StatusWithFTSLanguage( Status::OK() ); + return StatusWithFTSLanguage(it->second); + } + case TEXT_INDEX_VERSION_1: { + LanguageMapV1::const_iterator it = languageMapV1.find(langName); + if (it == languageMapV1.end()) { + // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none". 
+ return StatusWithFTSLanguage(&languageNoneV1); + } + return StatusWithFTSLanguage(it->second); } } + + verify(false); + return StatusWithFTSLanguage(Status::OK()); +} +} } diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h index ce45e0b812a..facdb8c9ce0 100644 --- a/src/mongo/db/fts/fts_language.h +++ b/src/mongo/db/fts/fts_language.h @@ -37,108 +37,107 @@ namespace mongo { - namespace fts { - - class FTSTokenizer; - - #define MONGO_FTS_LANGUAGE_DECLARE( language, name, version ) \ - BasicFTSLanguage language; \ - MONGO_INITIALIZER_GENERAL( language, MONGO_NO_PREREQUISITES, \ - ( "FTSAllLanguagesRegistered" ) ) \ - ( ::mongo::InitializerContext* context ) { \ - FTSLanguage::registerLanguage( name, version, &language ); \ - return Status::OK(); \ - } - - /** - * A FTSLanguage represents a language for a text-indexed document or a text search. - * FTSLanguage objects are not copyable. - * - * Recommended usage: - * - * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_2 ); - * if ( !swl.getStatus().isOK() ) { - * // Error. - * } - * else { - * const FTSLanguage* language = swl.getValue(); - * // Use language. - * } - */ - class FTSLanguage { - // Use make() instead of copying. - MONGO_DISALLOW_COPYING( FTSLanguage ); - public: - /** Create an uninitialized language. */ - FTSLanguage(); - - virtual ~FTSLanguage() {} - - /** - * Returns the language as a std::string in canonical form (lowercased English name). It is - * an error to call str() on an uninitialized language. - */ - const std::string& str() const; - - /** - * Returns a new FTSTokenizer instance for this language. - * Lifetime is scoped to FTSLanguage (which are currently all process lifetime) - */ - virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0; - - /** - * Register std::string 'languageName' as a new language with text index version - * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'. 
- * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language - * string. - */ - static void registerLanguage( StringData languageName, - TextIndexVersion textIndexVersion, - FTSLanguage *languageOut ); - - /** - * Register 'alias' as an alias for 'language' with text index version - * 'textIndexVersion'. Subsequent calls to FTSLanguage::make() will recognize the - * newly-registered alias. - */ - static void registerLanguageAlias( const FTSLanguage* language, - StringData alias, - TextIndexVersion textIndexVersion ); - - /** - * Return the FTSLanguage associated with the given language string. Returns an error - * Status if an invalid language std::string is passed. - * - * For textIndexVersion=TEXT_INDEX_VERSION_2, language strings are - * case-insensitive, and need to be in one of the two following forms: - * - English name, like "spanish". - * - Two-letter code, like "es". - * - * For textIndexVersion=TEXT_INDEX_VERSION_1, no validation or normalization of - * language strings is performed. This is necessary to preserve indexing behavior for - * documents with language strings like "en": for compatibility, text data in these - * documents needs to be processed with the English stemmer and the empty stopword list - * (since "en" is recognized by Snowball but not the stopword processing logic). - */ - static StatusWith<const FTSLanguage*> make( StringData langName, - TextIndexVersion textIndexVersion ); - - private: - // std::string representation of language in canonical form. 
- std::string _canonicalName; - }; - - typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage; - - - class BasicFTSLanguage : public FTSLanguage { - public: - std::unique_ptr<FTSTokenizer> createTokenizer() const override; - }; - - extern BasicFTSLanguage languagePorterV1; - extern BasicFTSLanguage languageEnglishV2; - extern BasicFTSLanguage languageFrenchV2; +namespace fts { +class FTSTokenizer; + +#define MONGO_FTS_LANGUAGE_DECLARE(language, name, version) \ + BasicFTSLanguage language; \ + MONGO_INITIALIZER_GENERAL(language, MONGO_NO_PREREQUISITES, ("FTSAllLanguagesRegistered")) \ + (::mongo::InitializerContext * context) { \ + FTSLanguage::registerLanguage(name, version, &language); \ + return Status::OK(); \ } + +/** + * A FTSLanguage represents a language for a text-indexed document or a text search. + * FTSLanguage objects are not copyable. + * + * Recommended usage: + * + * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_2 ); + * if ( !swl.getStatus().isOK() ) { + * // Error. + * } + * else { + * const FTSLanguage* language = swl.getValue(); + * // Use language. + * } + */ +class FTSLanguage { + // Use make() instead of copying. + MONGO_DISALLOW_COPYING(FTSLanguage); + +public: + /** Create an uninitialized language. */ + FTSLanguage(); + + virtual ~FTSLanguage() {} + + /** + * Returns the language as a std::string in canonical form (lowercased English name). It is + * an error to call str() on an uninitialized language. + */ + const std::string& str() const; + + /** + * Returns a new FTSTokenizer instance for this language. + * Lifetime is scoped to FTSLanguage (which are currently all process lifetime) + */ + virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0; + + /** + * Register std::string 'languageName' as a new language with text index version + * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'. 
+ * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language + * string. + */ + static void registerLanguage(StringData languageName, + TextIndexVersion textIndexVersion, + FTSLanguage* languageOut); + + /** + * Register 'alias' as an alias for 'language' with text index version + * 'textIndexVersion'. Subsequent calls to FTSLanguage::make() will recognize the + * newly-registered alias. + */ + static void registerLanguageAlias(const FTSLanguage* language, + StringData alias, + TextIndexVersion textIndexVersion); + + /** + * Return the FTSLanguage associated with the given language string. Returns an error + * Status if an invalid language std::string is passed. + * + * For textIndexVersion=TEXT_INDEX_VERSION_2, language strings are + * case-insensitive, and need to be in one of the two following forms: + * - English name, like "spanish". + * - Two-letter code, like "es". + * + * For textIndexVersion=TEXT_INDEX_VERSION_1, no validation or normalization of + * language strings is performed. This is necessary to preserve indexing behavior for + * documents with language strings like "en": for compatibility, text data in these + * documents needs to be processed with the English stemmer and the empty stopword list + * (since "en" is recognized by Snowball but not the stopword processing logic). + */ + static StatusWith<const FTSLanguage*> make(StringData langName, + TextIndexVersion textIndexVersion); + +private: + // std::string representation of language in canonical form. 
+ std::string _canonicalName; +}; + +typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage; + + +class BasicFTSLanguage : public FTSLanguage { +public: + std::unique_ptr<FTSTokenizer> createTokenizer() const override; +}; + +extern BasicFTSLanguage languagePorterV1; +extern BasicFTSLanguage languageEnglishV2; +extern BasicFTSLanguage languageFrenchV2; +} } diff --git a/src/mongo/db/fts/fts_language_test.cpp b/src/mongo/db/fts/fts_language_test.cpp index 0fb46ef2df7..c24f02ff7fd 100644 --- a/src/mongo/db/fts/fts_language_test.cpp +++ b/src/mongo/db/fts/fts_language_test.cpp @@ -35,103 +35,102 @@ namespace mongo { - namespace fts { - - // Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2. - - TEST( FTSLanguageV2, ExactLanguage ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "spanish", TEXT_INDEX_VERSION_2 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); - } - - TEST( FTSLanguageV2, ExactCode ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "es", TEXT_INDEX_VERSION_2 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); - } - - TEST( FTSLanguageV2, UpperCaseLanguage ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "SPANISH", TEXT_INDEX_VERSION_2 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); - } - - TEST( FTSLanguageV2, UpperCaseCode ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "ES", TEXT_INDEX_VERSION_2 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); - } - - TEST( FTSLanguageV2, NoneLanguage ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "none", TEXT_INDEX_VERSION_2 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "none" ); - } - - // Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2. 
- - TEST( FTSLanguageV2, Unknown ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "spanglish", TEXT_INDEX_VERSION_2 ); - ASSERT( !swl.getStatus().isOK() ); - } - - TEST( FTSLanguageV2, Empty ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "", TEXT_INDEX_VERSION_2 ); - ASSERT( !swl.getStatus().isOK() ); - } - - // Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1. - - TEST( FTSLanguageV1, ExactLanguage ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "spanish", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); - } - - TEST( FTSLanguageV1, DeprecatedLanguage ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "porter", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "porter" ); - } - - TEST( FTSLanguageV1, StemmerOnlyLanguage1 ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "en" ); - } - - TEST( FTSLanguageV1, StemmerOnlyLanguage2 ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "eng", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "eng" ); - } - - TEST( FTSLanguageV1, NoneLanguage ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "none", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "none" ); - } - - // Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1. 
- - TEST( FTSLanguageV1, CaseSensitive ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "SPANISH", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "none" ); - } - - TEST( FTSLanguageV1, Unknown ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "asdf", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "none" ); - } - - TEST( FTSLanguageV1, Empty ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "none" ); - } - - } +namespace fts { + +// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2. + +TEST(FTSLanguageV2, ExactLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_2); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV2, ExactCode) { + StatusWithFTSLanguage swl = FTSLanguage::make("es", TEXT_INDEX_VERSION_2); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV2, UpperCaseLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_2); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV2, UpperCaseCode) { + StatusWithFTSLanguage swl = FTSLanguage::make("ES", TEXT_INDEX_VERSION_2); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV2, NoneLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_2); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "none"); +} + +// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2. 
+ +TEST(FTSLanguageV2, Unknown) { + StatusWithFTSLanguage swl = FTSLanguage::make("spanglish", TEXT_INDEX_VERSION_2); + ASSERT(!swl.getStatus().isOK()); +} + +TEST(FTSLanguageV2, Empty) { + StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_2); + ASSERT(!swl.getStatus().isOK()); +} + +// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1. + +TEST(FTSLanguageV1, ExactLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV1, DeprecatedLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("porter", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "porter"); +} + +TEST(FTSLanguageV1, StemmerOnlyLanguage1) { + StatusWithFTSLanguage swl = FTSLanguage::make("en", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "en"); +} + +TEST(FTSLanguageV1, StemmerOnlyLanguage2) { + StatusWithFTSLanguage swl = FTSLanguage::make("eng", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "eng"); +} + +TEST(FTSLanguageV1, NoneLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "none"); +} + +// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1. 
+ +TEST(FTSLanguageV1, CaseSensitive) { + StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "none"); +} + +TEST(FTSLanguageV1, Unknown) { + StatusWithFTSLanguage swl = FTSLanguage::make("asdf", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "none"); +} + +TEST(FTSLanguageV1, Empty) { + StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "none"); +} +} } diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp index c2aa234cd51..544ef93cf36 100644 --- a/src/mongo/db/fts/fts_matcher.cpp +++ b/src/mongo/db/fts/fts_matcher.cpp @@ -37,144 +37,138 @@ namespace mongo { - namespace fts { - - using std::string; - - /** - * Does the string 'phrase' occur in the string 'haystack'? Match is case-insensitive if - * 'caseSensitive' is false; otherwise, an exact substring match is performed. - */ - static bool phraseMatches( const string& phrase, - const string& haystack, - bool caseSensitive ) { - if ( caseSensitive ) { - return haystack.find( phrase ) != string::npos; - } - return strcasestr( haystack.c_str(), phrase.c_str() ) != NULL; - } - - FTSMatcher::FTSMatcher( const FTSQuery& query, const FTSSpec& spec ) - : _query( query ), - _spec( spec ) { - } +namespace fts { - bool FTSMatcher::matches( const BSONObj& obj ) const { - if ( canSkipPositiveTermCheck() ) { - // We can assume that 'obj' has at least one positive term, and dassert as a sanity - // check. 
- dassert( hasPositiveTerm( obj ) ); - } - else { - if ( !hasPositiveTerm( obj ) ) { - return false; - } - } - - if ( hasNegativeTerm( obj ) ) { - return false; - } - - if ( !positivePhrasesMatch( obj ) ) { - return false; - } - - return negativePhrasesMatch( obj ); - } +using std::string; - bool FTSMatcher::hasPositiveTerm( const BSONObj& obj ) const { - FTSElementIterator it( _spec, obj ); +/** + * Does the string 'phrase' occur in the string 'haystack'? Match is case-insensitive if + * 'caseSensitive' is false; otherwise, an exact substring match is performed. + */ +static bool phraseMatches(const string& phrase, const string& haystack, bool caseSensitive) { + if (caseSensitive) { + return haystack.find(phrase) != string::npos; + } + return strcasestr(haystack.c_str(), phrase.c_str()) != NULL; +} - while ( it.more() ) { - FTSIteratorValue val = it.next(); - if ( _hasPositiveTerm_string( val._language, val._text ) ) { - return true; - } - } +FTSMatcher::FTSMatcher(const FTSQuery& query, const FTSSpec& spec) : _query(query), _spec(spec) {} +bool FTSMatcher::matches(const BSONObj& obj) const { + if (canSkipPositiveTermCheck()) { + // We can assume that 'obj' has at least one positive term, and dassert as a sanity + // check. + dassert(hasPositiveTerm(obj)); + } else { + if (!hasPositiveTerm(obj)) { return false; } + } - bool FTSMatcher::_hasPositiveTerm_string( const FTSLanguage* language, - const string& raw ) const { - std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); + if (hasNegativeTerm(obj)) { + return false; + } - tokenizer->reset(raw.c_str(), _query.getCaseSensitive() ? 
- FTSTokenizer::GenerateCaseSensitiveTokens : FTSTokenizer::None); + if (!positivePhrasesMatch(obj)) { + return false; + } - while (tokenizer->moveNext()) { - string word = tokenizer->get().toString(); - if (_query.getPositiveTerms().count(word) > 0) { - return true; - } - } - return false; + return negativePhrasesMatch(obj); +} + +bool FTSMatcher::hasPositiveTerm(const BSONObj& obj) const { + FTSElementIterator it(_spec, obj); + + while (it.more()) { + FTSIteratorValue val = it.next(); + if (_hasPositiveTerm_string(val._language, val._text)) { + return true; } + } - bool FTSMatcher::hasNegativeTerm( const BSONObj& obj ) const { - if ( _query.getNegatedTerms().size() == 0 ) { - return false; - } + return false; +} - FTSElementIterator it( _spec, obj ); +bool FTSMatcher::_hasPositiveTerm_string(const FTSLanguage* language, const string& raw) const { + std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); - while ( it.more() ) { - FTSIteratorValue val = it.next(); - if ( _hasNegativeTerm_string( val._language, val._text ) ) { - return true; - } - } + tokenizer->reset(raw.c_str(), + _query.getCaseSensitive() ? FTSTokenizer::GenerateCaseSensitiveTokens + : FTSTokenizer::None); - return false; + while (tokenizer->moveNext()) { + string word = tokenizer->get().toString(); + if (_query.getPositiveTerms().count(word) > 0) { + return true; } + } + return false; +} - bool FTSMatcher::_hasNegativeTerm_string( const FTSLanguage* language, - const string& raw ) const { - std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); +bool FTSMatcher::hasNegativeTerm(const BSONObj& obj) const { + if (_query.getNegatedTerms().size() == 0) { + return false; + } - tokenizer->reset(raw.c_str(), _query.getCaseSensitive() ? 
- FTSTokenizer::GenerateCaseSensitiveTokens : FTSTokenizer::None); + FTSElementIterator it(_spec, obj); - while (tokenizer->moveNext()) { - string word = tokenizer->get().toString(); - if ( _query.getNegatedTerms().count( word ) > 0 ) { - return true; - } - } - return false; + while (it.more()) { + FTSIteratorValue val = it.next(); + if (_hasNegativeTerm_string(val._language, val._text)) { + return true; } + } - bool FTSMatcher::positivePhrasesMatch( const BSONObj& obj ) const { - for ( size_t i = 0; i < _query.getPositivePhr().size(); i++ ) { - if ( !_phraseMatch( _query.getPositivePhr()[i], obj ) ) { - return false; - } - } + return false; +} - return true; - } +bool FTSMatcher::_hasNegativeTerm_string(const FTSLanguage* language, const string& raw) const { + std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); - bool FTSMatcher::negativePhrasesMatch( const BSONObj& obj ) const { - for ( size_t i = 0; i < _query.getNegatedPhr().size(); i++ ) { - if ( _phraseMatch( _query.getNegatedPhr()[i], obj ) ) { - return false; - } - } + tokenizer->reset(raw.c_str(), + _query.getCaseSensitive() ? 
FTSTokenizer::GenerateCaseSensitiveTokens + : FTSTokenizer::None); + while (tokenizer->moveNext()) { + string word = tokenizer->get().toString(); + if (_query.getNegatedTerms().count(word) > 0) { return true; } + } + return false; +} - bool FTSMatcher::_phraseMatch( const string& phrase, const BSONObj& obj ) const { - FTSElementIterator it( _spec, obj ); +bool FTSMatcher::positivePhrasesMatch(const BSONObj& obj) const { + for (size_t i = 0; i < _query.getPositivePhr().size(); i++) { + if (!_phraseMatch(_query.getPositivePhr()[i], obj)) { + return false; + } + } - while ( it.more() ) { - FTSIteratorValue val = it.next(); - if ( phraseMatches( phrase, val._text, _query.getCaseSensitive() ) ) { - return true; - } - } + return true; +} +bool FTSMatcher::negativePhrasesMatch(const BSONObj& obj) const { + for (size_t i = 0; i < _query.getNegatedPhr().size(); i++) { + if (_phraseMatch(_query.getNegatedPhr()[i], obj)) { return false; } } + + return true; +} + +bool FTSMatcher::_phraseMatch(const string& phrase, const BSONObj& obj) const { + FTSElementIterator it(_spec, obj); + + while (it.more()) { + FTSIteratorValue val = it.next(); + if (phraseMatches(phrase, val._text, _query.getCaseSensitive())) { + return true; + } + } + + return false; +} +} } diff --git a/src/mongo/db/fts/fts_matcher.h b/src/mongo/db/fts/fts_matcher.h index 058dcc7bcb6..00fe8291c4d 100644 --- a/src/mongo/db/fts/fts_matcher.h +++ b/src/mongo/db/fts/fts_matcher.h @@ -36,74 +36,74 @@ namespace mongo { - namespace fts { - - class FTSMatcher { - MONGO_DISALLOW_COPYING( FTSMatcher ); - public: - FTSMatcher( const FTSQuery& query, const FTSSpec& spec ); - - /** - * Returns whether 'obj' matches the query. An object is considered to match the query - * if all four of the following conditions hold: - * 1) The object contains at least one positive term. - * 2) The object contains zero negative terms. - * 3) The object contains all positive phrases. - * 4) The object contains zero negative phrases. 
- */ - bool matches( const BSONObj& obj ) const; - - /** - * Returns whether 'obj' contains at least one positive term. - */ - bool hasPositiveTerm( const BSONObj& obj ) const; - - /** - * Returns whether 'obj' contains at least one negative term. - */ - bool hasNegativeTerm( const BSONObj& obj ) const; - - /** - * Returns whether 'obj' contains all positive phrases. - */ - bool positivePhrasesMatch( const BSONObj& obj ) const; - - /** - * Returns whether 'obj' contains zero negative phrases. - */ - bool negativePhrasesMatch( const BSONObj& obj ) const; - - private: - /** - * For matching, can we skip the positive term check? This is done as optimization when - * we have a-priori knowledge that all documents being matched pass the positive term - * check. - */ - bool canSkipPositiveTermCheck() const { return !_query.getCaseSensitive(); } - - /** - * Returns whether the string 'raw' contains any positive terms from the query. - * 'language' specifies the language for 'raw'. - */ - bool _hasPositiveTerm_string( const FTSLanguage* language, - const std::string& raw ) const; - - /** - * Returns whether the string 'raw' contains any negative terms from the query. - * 'language' specifies the language for 'raw'. - */ - bool _hasNegativeTerm_string( const FTSLanguage* language, - const std::string& raw ) const; - - /** - * Returns whether 'obj' contains the exact string 'phrase' in any indexed fields. - */ - bool _phraseMatch( const std::string& phrase, const BSONObj& obj ) const; - - // TODO These should be unowned pointers instead of owned copies. - const FTSQuery _query; - const FTSSpec _spec; - }; - +namespace fts { + +class FTSMatcher { + MONGO_DISALLOW_COPYING(FTSMatcher); + +public: + FTSMatcher(const FTSQuery& query, const FTSSpec& spec); + + /** + * Returns whether 'obj' matches the query. An object is considered to match the query + * if all four of the following conditions hold: + * 1) The object contains at least one positive term. 
+ * 2) The object contains zero negative terms. + * 3) The object contains all positive phrases. + * 4) The object contains zero negative phrases. + */ + bool matches(const BSONObj& obj) const; + + /** + * Returns whether 'obj' contains at least one positive term. + */ + bool hasPositiveTerm(const BSONObj& obj) const; + + /** + * Returns whether 'obj' contains at least one negative term. + */ + bool hasNegativeTerm(const BSONObj& obj) const; + + /** + * Returns whether 'obj' contains all positive phrases. + */ + bool positivePhrasesMatch(const BSONObj& obj) const; + + /** + * Returns whether 'obj' contains zero negative phrases. + */ + bool negativePhrasesMatch(const BSONObj& obj) const; + +private: + /** + * For matching, can we skip the positive term check? This is done as optimization when + * we have a-priori knowledge that all documents being matched pass the positive term + * check. + */ + bool canSkipPositiveTermCheck() const { + return !_query.getCaseSensitive(); } + + /** + * Returns whether the string 'raw' contains any positive terms from the query. + * 'language' specifies the language for 'raw'. + */ + bool _hasPositiveTerm_string(const FTSLanguage* language, const std::string& raw) const; + + /** + * Returns whether the string 'raw' contains any negative terms from the query. + * 'language' specifies the language for 'raw'. + */ + bool _hasNegativeTerm_string(const FTSLanguage* language, const std::string& raw) const; + + /** + * Returns whether 'obj' contains the exact string 'phrase' in any indexed fields. + */ + bool _phraseMatch(const std::string& phrase, const BSONObj& obj) const; + + // TODO These should be unowned pointers instead of owned copies. 
+ const FTSQuery _query; + const FTSSpec _spec; +}; +} } diff --git a/src/mongo/db/fts/fts_matcher_test.cpp b/src/mongo/db/fts/fts_matcher_test.cpp index 0ea0fbe9e7e..13eb74609dc 100644 --- a/src/mongo/db/fts/fts_matcher_test.cpp +++ b/src/mongo/db/fts/fts_matcher_test.cpp @@ -34,187 +34,204 @@ #include "mongo/unittest/unittest.h" namespace mongo { - namespace fts { - - TEST( FTSMatcher, NegWild1 ) { - FTSQuery q; - ASSERT_OK( q.parse( "foo -bar", "english", false, TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "text" ) ) ) ) ); - - ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) ); - ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) ); - } - - // Regression test for SERVER-11994. - TEST( FTSMatcher, NegWild2 ) { - FTSQuery q; - ASSERT_OK( q.parse( "pizza -restaurant", "english", false, TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "text" ) ) ) ) ); - - ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "pizza restaurant" ) ) ) ); - ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "PIZZA RESTAURANT" ) ) ) ); - } - - TEST( FTSMatcher, Phrase1 ) { - FTSQuery q; - ASSERT_OK( q.parse( "foo \"table top\"", "english", false, TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "text" ) ) ) ) ); - - ASSERT( m.positivePhrasesMatch( BSON( "x" << "table top" ) ) ); - ASSERT( m.positivePhrasesMatch( BSON( "x" << " asd table top asd" ) ) ); - ASSERT( !m.positivePhrasesMatch( BSON( "x" << "tablz top" ) ) ); - ASSERT( !m.positivePhrasesMatch( BSON( "x" << " asd tablz top asd" ) ) ); - - ASSERT( m.positivePhrasesMatch( BSON( "x" << "table top" ) ) ); - ASSERT( !m.positivePhrasesMatch( BSON( "x" << "table a top" ) ) ); - - } - - TEST( FTSMatcher, Phrase2 ) { - FTSQuery q; - ASSERT_OK( q.parse( "foo \"table top\"", "english", false, TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - 
FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) ); - ASSERT( m.positivePhrasesMatch( BSON( "x" << BSON_ARRAY( "table top" ) ) ) ); - } - - // Test that the matcher parses the document with the document language, not the search - // language. - TEST( FTSMatcher, ParsesUsingDocLanguage ) { - FTSQuery q; - ASSERT_OK( q.parse( "-glad", "none", false, TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) ); - - // Even though the search language is "none", the document {x: "gladly"} should be - // parsed using the English stemmer, and as such should match the negated term "glad". - ASSERT( m.hasNegativeTerm( BSON( "x" << "gladly" ) ) ); - } - - // Test the matcher does not filter out stop words from positive terms - TEST( FTSMatcher, MatcherDoesNotFilterStopWordsNeg ) { - FTSQuery q; - ASSERT_OK( q.parse( "-the", "none", false, TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) ); - - ASSERT( m.hasNegativeTerm( BSON( "x" << "the" ) ) ); - } - - // Test the matcher does not filter out stop words from negative terms - TEST( FTSMatcher, MatcherDoesNotFilterStopWordsPos ) { - FTSQuery q; - ASSERT_OK( q.parse( "the", "none", false, TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) ); - - ASSERT( m.hasPositiveTerm( BSON( "x" << "the" ) ) ); - } - - // Returns whether a document indexed with text data 'doc' contains any positive terms from - // case-sensitive text query 'search'. 
- static bool docHasPositiveTermWithCase( const std::string& doc, - const std::string& search ) { - FTSQuery q; - ASSERT_OK( q.parse( search, "english", true, TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) ); - - return m.hasPositiveTerm( BSON( "x" << doc ) ); - } - - TEST( FTSMatcher, HasPositiveTermCaseSensitive ) { - ASSERT_TRUE( docHasPositiveTermWithCase( "hello world", "hello" ) ); - ASSERT_TRUE( docHasPositiveTermWithCase( "Hello World", "Hello" ) ); - ASSERT_TRUE( docHasPositiveTermWithCase( "Hello World", "World Hello" ) ); - ASSERT_TRUE( docHasPositiveTermWithCase( "Hello World", "World GoodBye" ) ); - ASSERT_TRUE( docHasPositiveTermWithCase( "John Runs", "Runs" ) ); - ASSERT_TRUE( docHasPositiveTermWithCase( "John Runs", "Running" ) ); - ASSERT_TRUE( docHasPositiveTermWithCase( "John Runs", "Run" ) ); - - ASSERT_FALSE( docHasPositiveTermWithCase( "John Runs", "run" ) ); - ASSERT_FALSE( docHasPositiveTermWithCase( "Hello World", "HELLO" ) ); - ASSERT_FALSE( docHasPositiveTermWithCase( "hello world", "Hello" ) ); - ASSERT_FALSE( docHasPositiveTermWithCase( "Hello World", "hello" ) ); - } - - // Returns whether a document indexed with text data 'doc' contains any negative terms from - // case-sensitive text query 'search'. 
- static bool docHasNegativeTermWithCase( const std::string& doc, - const std::string& search ) { - FTSQuery q; - ASSERT_OK( q.parse( search, "english", true, TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) ); - - return m.hasNegativeTerm( BSON( "x" << doc ) ); - } - - TEST( FTSMatcher, HasNegativeTermCaseSensitive ) { - ASSERT_TRUE( docHasNegativeTermWithCase( "hello world", "hello -world" ) ); - ASSERT_TRUE( docHasNegativeTermWithCase( "Hello World", "Hello -World" ) ); - ASSERT_TRUE( docHasNegativeTermWithCase( "Hello World", "-World -Hello" ) ); - ASSERT_TRUE( docHasNegativeTermWithCase( "Hello World", "-Goodbye -World" ) ); - ASSERT_TRUE( docHasNegativeTermWithCase( "John Runs", "-Runs" ) ); - ASSERT_TRUE( docHasNegativeTermWithCase( "John Runs", "-Running" ) ); - ASSERT_TRUE( docHasNegativeTermWithCase( "John Runs", "-Run" ) ); - - ASSERT_FALSE( docHasNegativeTermWithCase( "John Runs", "-run" ) ); - ASSERT_FALSE( docHasNegativeTermWithCase( "Hello World", "Hello -WORLD" ) ); - ASSERT_FALSE( docHasNegativeTermWithCase( "hello world", "hello -World" ) ); - ASSERT_FALSE( docHasNegativeTermWithCase( "Hello World", "Hello -world" ) ); - } - - // Returns whether a document indexed with text data 'doc' contains all positive phrases - // from case-sensitive text query 'search'. 
- static bool docPositivePhrasesMatchWithCase( const std::string& doc, - const std::string& search ) { - FTSQuery q; - ASSERT_OK( q.parse( search, "english", true, TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) ); - - return m.positivePhrasesMatch( BSON( "x" << doc ) ); - } - - TEST( FTSMatcher, PositivePhrasesMatchWithCase ) { - ASSERT_TRUE( docPositivePhrasesMatchWithCase( "John Runs", "\"John Runs\"" ) ); - ASSERT_TRUE( docPositivePhrasesMatchWithCase( "John Runs", "\"John Run\"" ) ); - ASSERT_TRUE( docPositivePhrasesMatchWithCase( "John Runs", "\"John\" \"Run\"" ) ); - ASSERT_TRUE( docPositivePhrasesMatchWithCase( "John Runs", "\"n R\"" ) ); - - ASSERT_FALSE( docPositivePhrasesMatchWithCase( "John Runs", "\"john runs\"" ) ); - ASSERT_FALSE( docPositivePhrasesMatchWithCase( "john runs", "\"John Runs\"" ) ); - ASSERT_FALSE( docPositivePhrasesMatchWithCase( "John Runs", "\"John\" \"Running\"" ) ); - } - - // Returns whether a document indexed with text data 'doc' contains zero negative phrases - // from case-sensitive text query 'search'. 
- static bool docNegativePhrasesMatchWithCase( const std::string& doc, - const std::string& search ) { - FTSQuery q; - ASSERT_OK( q.parse( search, "english", true, TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) ); - - return m.negativePhrasesMatch( BSON( "x" << doc ) ); - } - - TEST( FTSMatcher, NegativePhrasesMatchWithCase ) { - ASSERT_TRUE( docNegativePhrasesMatchWithCase( "John Runs", "-\"john runs\"" ) ); - ASSERT_TRUE( docNegativePhrasesMatchWithCase( "john runs", "-\"John Runs\"" ) ); - ASSERT_TRUE( docNegativePhrasesMatchWithCase( "john runs", "-\"John\" -\"Runs\"" ) ); - - ASSERT_FALSE( docNegativePhrasesMatchWithCase( "John Runs", "-\"John Runs\"" ) ); - ASSERT_FALSE( docNegativePhrasesMatchWithCase( "John Runs", "-\"John Run\"" ) ); - ASSERT_FALSE( docNegativePhrasesMatchWithCase( "John Runs", "-\"John\" -\"Run\"" ) ); - ASSERT_FALSE( docNegativePhrasesMatchWithCase( "John Runs", "-\"n R\"" ) ); - ASSERT_FALSE( docNegativePhrasesMatchWithCase( "John Runs", - "-\"John\" -\"Running\"" ) ); - } - - } +namespace fts { + +TEST(FTSMatcher, NegWild1) { + FTSQuery q; + ASSERT_OK(q.parse("foo -bar", "english", false, TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**" + << "text"))))); + + ASSERT(m.hasNegativeTerm(BSON("x" << BSON("y" + << "bar")))); + ASSERT(m.hasNegativeTerm(BSON("x" << BSON("y" + << "bar")))); +} + +// Regression test for SERVER-11994. 
+TEST(FTSMatcher, NegWild2) { + FTSQuery q; + ASSERT_OK(q.parse("pizza -restaurant", "english", false, TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**" + << "text"))))); + + ASSERT(m.hasNegativeTerm(BSON("x" << BSON("y" + << "pizza restaurant")))); + ASSERT(m.hasNegativeTerm(BSON("x" << BSON("y" + << "PIZZA RESTAURANT")))); +} + +TEST(FTSMatcher, Phrase1) { + FTSQuery q; + ASSERT_OK(q.parse("foo \"table top\"", "english", false, TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**" + << "text"))))); + + ASSERT(m.positivePhrasesMatch(BSON("x" + << "table top"))); + ASSERT(m.positivePhrasesMatch(BSON("x" + << " asd table top asd"))); + ASSERT(!m.positivePhrasesMatch(BSON("x" + << "tablz top"))); + ASSERT(!m.positivePhrasesMatch(BSON("x" + << " asd tablz top asd"))); + + ASSERT(m.positivePhrasesMatch(BSON("x" + << "table top"))); + ASSERT(!m.positivePhrasesMatch(BSON("x" + << "table a top"))); +} + +TEST(FTSMatcher, Phrase2) { + FTSQuery q; + ASSERT_OK(q.parse("foo \"table top\"", "english", false, TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" + << "text"))))); + ASSERT(m.positivePhrasesMatch(BSON("x" << BSON_ARRAY("table top")))); +} + +// Test that the matcher parses the document with the document language, not the search +// language. +TEST(FTSMatcher, ParsesUsingDocLanguage) { + FTSQuery q; + ASSERT_OK(q.parse("-glad", "none", false, TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" + << "text"))))); + + // Even though the search language is "none", the document {x: "gladly"} should be + // parsed using the English stemmer, and as such should match the negated term "glad". 
+ ASSERT(m.hasNegativeTerm(BSON("x" + << "gladly"))); +} + +// Test the matcher does not filter out stop words from positive terms +TEST(FTSMatcher, MatcherDoesNotFilterStopWordsNeg) { + FTSQuery q; + ASSERT_OK(q.parse("-the", "none", false, TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" + << "text"))))); + + ASSERT(m.hasNegativeTerm(BSON("x" + << "the"))); +} + +// Test the matcher does not filter out stop words from negative terms +TEST(FTSMatcher, MatcherDoesNotFilterStopWordsPos) { + FTSQuery q; + ASSERT_OK(q.parse("the", "none", false, TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" + << "text"))))); + + ASSERT(m.hasPositiveTerm(BSON("x" + << "the"))); +} + +// Returns whether a document indexed with text data 'doc' contains any positive terms from +// case-sensitive text query 'search'. +static bool docHasPositiveTermWithCase(const std::string& doc, const std::string& search) { + FTSQuery q; + ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" + << "text"))))); + + return m.hasPositiveTerm(BSON("x" << doc)); +} + +TEST(FTSMatcher, HasPositiveTermCaseSensitive) { + ASSERT_TRUE(docHasPositiveTermWithCase("hello world", "hello")); + ASSERT_TRUE(docHasPositiveTermWithCase("Hello World", "Hello")); + ASSERT_TRUE(docHasPositiveTermWithCase("Hello World", "World Hello")); + ASSERT_TRUE(docHasPositiveTermWithCase("Hello World", "World GoodBye")); + ASSERT_TRUE(docHasPositiveTermWithCase("John Runs", "Runs")); + ASSERT_TRUE(docHasPositiveTermWithCase("John Runs", "Running")); + ASSERT_TRUE(docHasPositiveTermWithCase("John Runs", "Run")); + + ASSERT_FALSE(docHasPositiveTermWithCase("John Runs", "run")); + ASSERT_FALSE(docHasPositiveTermWithCase("Hello World", "HELLO")); + ASSERT_FALSE(docHasPositiveTermWithCase("hello world", "Hello")); + ASSERT_FALSE(docHasPositiveTermWithCase("Hello World", 
"hello")); +} + +// Returns whether a document indexed with text data 'doc' contains any negative terms from +// case-sensitive text query 'search'. +static bool docHasNegativeTermWithCase(const std::string& doc, const std::string& search) { + FTSQuery q; + ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" + << "text"))))); + + return m.hasNegativeTerm(BSON("x" << doc)); +} + +TEST(FTSMatcher, HasNegativeTermCaseSensitive) { + ASSERT_TRUE(docHasNegativeTermWithCase("hello world", "hello -world")); + ASSERT_TRUE(docHasNegativeTermWithCase("Hello World", "Hello -World")); + ASSERT_TRUE(docHasNegativeTermWithCase("Hello World", "-World -Hello")); + ASSERT_TRUE(docHasNegativeTermWithCase("Hello World", "-Goodbye -World")); + ASSERT_TRUE(docHasNegativeTermWithCase("John Runs", "-Runs")); + ASSERT_TRUE(docHasNegativeTermWithCase("John Runs", "-Running")); + ASSERT_TRUE(docHasNegativeTermWithCase("John Runs", "-Run")); + + ASSERT_FALSE(docHasNegativeTermWithCase("John Runs", "-run")); + ASSERT_FALSE(docHasNegativeTermWithCase("Hello World", "Hello -WORLD")); + ASSERT_FALSE(docHasNegativeTermWithCase("hello world", "hello -World")); + ASSERT_FALSE(docHasNegativeTermWithCase("Hello World", "Hello -world")); +} + +// Returns whether a document indexed with text data 'doc' contains all positive phrases +// from case-sensitive text query 'search'. 
+static bool docPositivePhrasesMatchWithCase(const std::string& doc, const std::string& search) { + FTSQuery q; + ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" + << "text"))))); + + return m.positivePhrasesMatch(BSON("x" << doc)); +} + +TEST(FTSMatcher, PositivePhrasesMatchWithCase) { + ASSERT_TRUE(docPositivePhrasesMatchWithCase("John Runs", "\"John Runs\"")); + ASSERT_TRUE(docPositivePhrasesMatchWithCase("John Runs", "\"John Run\"")); + ASSERT_TRUE(docPositivePhrasesMatchWithCase("John Runs", "\"John\" \"Run\"")); + ASSERT_TRUE(docPositivePhrasesMatchWithCase("John Runs", "\"n R\"")); + + ASSERT_FALSE(docPositivePhrasesMatchWithCase("John Runs", "\"john runs\"")); + ASSERT_FALSE(docPositivePhrasesMatchWithCase("john runs", "\"John Runs\"")); + ASSERT_FALSE(docPositivePhrasesMatchWithCase("John Runs", "\"John\" \"Running\"")); +} + +// Returns whether a document indexed with text data 'doc' contains zero negative phrases +// from case-sensitive text query 'search'. 
+static bool docNegativePhrasesMatchWithCase(const std::string& doc, const std::string& search) { + FTSQuery q; + ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" + << "text"))))); + + return m.negativePhrasesMatch(BSON("x" << doc)); +} + +TEST(FTSMatcher, NegativePhrasesMatchWithCase) { + ASSERT_TRUE(docNegativePhrasesMatchWithCase("John Runs", "-\"john runs\"")); + ASSERT_TRUE(docNegativePhrasesMatchWithCase("john runs", "-\"John Runs\"")); + ASSERT_TRUE(docNegativePhrasesMatchWithCase("john runs", "-\"John\" -\"Runs\"")); + + ASSERT_FALSE(docNegativePhrasesMatchWithCase("John Runs", "-\"John Runs\"")); + ASSERT_FALSE(docNegativePhrasesMatchWithCase("John Runs", "-\"John Run\"")); + ASSERT_FALSE(docNegativePhrasesMatchWithCase("John Runs", "-\"John\" -\"Run\"")); + ASSERT_FALSE(docNegativePhrasesMatchWithCase("John Runs", "-\"n R\"")); + ASSERT_FALSE(docNegativePhrasesMatchWithCase("John Runs", "-\"John\" -\"Running\"")); +} +} } diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp index bbaac9b2f1e..8dec8e29204 100644 --- a/src/mongo/db/fts/fts_query.cpp +++ b/src/mongo/db/fts/fts_query.cpp @@ -40,219 +40,208 @@ namespace mongo { - namespace fts { +namespace fts { - using namespace mongoutils; +using namespace mongoutils; - using std::set; - using std::string; - using std::stringstream; - using std::vector; +using std::set; +using std::string; +using std::stringstream; +using std::vector; - const bool FTSQuery::caseSensitiveDefault = false; +const bool FTSQuery::caseSensitiveDefault = false; - Status FTSQuery::parse(const string& query, StringData language, bool caseSensitive, - TextIndexVersion textIndexVersion) { - StatusWithFTSLanguage swl = FTSLanguage::make( language, textIndexVersion ); - if ( !swl.getStatus().isOK() ) { - return swl.getStatus(); - } - _language = swl.getValue(); - _caseSensitive = caseSensitive; - - // Build a space delimited list 
of words to have the FtsTokenizer tokenize - string positiveTermSentence; - string negativeTermSentence; - - bool inNegation = false; - bool inPhrase = false; - - unsigned quoteOffset = 0; - - FTSQueryParser i(query); - while ( i.more() ) { - QueryToken t = i.next(); - - if ( t.type == QueryToken::TEXT ) { - string s = t.data.toString(); - - if ( inPhrase && inNegation ) { - // don't add term - } - else { - if (inNegation) { - negativeTermSentence.append(s); - negativeTermSentence.push_back(' '); - } - else { - positiveTermSentence.append(s); - positiveTermSentence.push_back(' '); - } - } - - if ( inNegation && !inPhrase ) - inNegation = false; +Status FTSQuery::parse(const string& query, + StringData language, + bool caseSensitive, + TextIndexVersion textIndexVersion) { + StatusWithFTSLanguage swl = FTSLanguage::make(language, textIndexVersion); + if (!swl.getStatus().isOK()) { + return swl.getStatus(); + } + _language = swl.getValue(); + _caseSensitive = caseSensitive; + + // Build a space delimited list of words to have the FtsTokenizer tokenize + string positiveTermSentence; + string negativeTermSentence; + + bool inNegation = false; + bool inPhrase = false; + + unsigned quoteOffset = 0; + + FTSQueryParser i(query); + while (i.more()) { + QueryToken t = i.next(); + + if (t.type == QueryToken::TEXT) { + string s = t.data.toString(); + + if (inPhrase && inNegation) { + // don't add term + } else { + if (inNegation) { + negativeTermSentence.append(s); + negativeTermSentence.push_back(' '); + } else { + positiveTermSentence.append(s); + positiveTermSentence.push_back(' '); } - else if ( t.type == QueryToken::DELIMITER ) { - char c = t.data[0]; - if ( c == '-' ) { - if ( !inPhrase && t.previousWhiteSpace ) { - // phrases can be negated, and terms not in phrases can be negated. - // terms in phrases can not be negated. 
- inNegation = true; - } - } - else if ( c == '"' ) { - if ( inPhrase ) { - // end of a phrase - unsigned phraseStart = quoteOffset + 1; - unsigned phraseLength = t.offset - phraseStart; - StringData phrase = StringData( query ).substr( phraseStart, - phraseLength ); - if ( inNegation ) - _negatedPhrases.push_back( normalizeString( phrase ) ); - else - _positivePhrases.push_back( normalizeString( phrase ) ); - inNegation = false; - inPhrase = false; - } - else { - // start of a phrase - inPhrase = true; - quoteOffset = t.offset; - } - } + } + + if (inNegation && !inPhrase) + inNegation = false; + } else if (t.type == QueryToken::DELIMITER) { + char c = t.data[0]; + if (c == '-') { + if (!inPhrase && t.previousWhiteSpace) { + // phrases can be negated, and terms not in phrases can be negated. + // terms in phrases can not be negated. + inNegation = true; } - else { - invariant( false ); + } else if (c == '"') { + if (inPhrase) { + // end of a phrase + unsigned phraseStart = quoteOffset + 1; + unsigned phraseLength = t.offset - phraseStart; + StringData phrase = StringData(query).substr(phraseStart, phraseLength); + if (inNegation) + _negatedPhrases.push_back(normalizeString(phrase)); + else + _positivePhrases.push_back(normalizeString(phrase)); + inNegation = false; + inPhrase = false; + } else { + // start of a phrase + inPhrase = true; + quoteOffset = t.offset; } } - - std::unique_ptr<FTSTokenizer> tokenizer(_language->createTokenizer()); - - _addTerms(tokenizer.get(), positiveTermSentence, false); - _addTerms(tokenizer.get(), negativeTermSentence, true); - - return Status::OK(); + } else { + invariant(false); } + } - void FTSQuery::_addTerms( FTSTokenizer* tokenizer, - const string& sentence, - bool negated ) { - - tokenizer->reset(sentence.c_str(), FTSTokenizer::FilterStopWords); + std::unique_ptr<FTSTokenizer> tokenizer(_language->createTokenizer()); - auto& activeTerms = negated ? 
_negatedTerms : _positiveTerms; + _addTerms(tokenizer.get(), positiveTermSentence, false); + _addTerms(tokenizer.get(), negativeTermSentence, true); - // First, get all the terms for indexing, ie, lower cased words - // If we are case-insensitive, we can also used this for positive, and negative terms - // Some terms may be expanded into multiple words in some non-English languages - while (tokenizer->moveNext()) { + return Status::OK(); +} - string word = tokenizer->get().toString(); +void FTSQuery::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool negated) { + tokenizer->reset(sentence.c_str(), FTSTokenizer::FilterStopWords); - if (!negated) { - _termsForBounds.insert(word); - } + auto& activeTerms = negated ? _negatedTerms : _positiveTerms; - // Compute the string corresponding to 'token' that will be used for the matcher. - // For case-insensitive queries, this is the same string as 'boundsTerm' computed - // above. - if (!_caseSensitive) { - activeTerms.insert(word); - } - } + // First, get all the terms for indexing, ie, lower cased words + // If we are case-insensitive, we can also used this for positive, and negative terms + // Some terms may be expanded into multiple words in some non-English languages + while (tokenizer->moveNext()) { + string word = tokenizer->get().toString(); - if (!_caseSensitive) { - return; - } + if (!negated) { + _termsForBounds.insert(word); + } - tokenizer->reset(sentence.c_str(), static_cast<FTSTokenizer::Options>( - FTSTokenizer::FilterStopWords - | FTSTokenizer::GenerateCaseSensitiveTokens)); + // Compute the string corresponding to 'token' that will be used for the matcher. + // For case-insensitive queries, this is the same string as 'boundsTerm' computed + // above. 
+ if (!_caseSensitive) { + activeTerms.insert(word); + } + } - // If we want case-sensitivity, get the case-sensitive token - while (tokenizer->moveNext()) { + if (!_caseSensitive) { + return; + } - string word = tokenizer->get().toString(); + tokenizer->reset(sentence.c_str(), + static_cast<FTSTokenizer::Options>(FTSTokenizer::FilterStopWords | + FTSTokenizer::GenerateCaseSensitiveTokens)); - activeTerms.insert(word); - } - } + // If we want case-sensitivity, get the case-sensitive token + while (tokenizer->moveNext()) { + string word = tokenizer->get().toString(); - string FTSQuery::normalizeString(StringData str) const { - if (_caseSensitive) { - return str.toString(); - } - return tolowerString(str); - } + activeTerms.insert(word); + } +} - namespace { - void _debugHelp( stringstream& ss, const set<string>& s, const string& sep ) { - bool first = true; - for ( set<string>::const_iterator i = s.begin(); i != s.end(); ++i ) { - if ( first ) - first = false; - else - ss << sep; - ss << *i; - } - } +string FTSQuery::normalizeString(StringData str) const { + if (_caseSensitive) { + return str.toString(); + } + return tolowerString(str); +} - void _debugHelp( stringstream& ss, const vector<string>& v, const string& sep ) { - set<string> s( v.begin(), v.end() ); - _debugHelp( ss, s, sep ); - } +namespace { +void _debugHelp(stringstream& ss, const set<string>& s, const string& sep) { + bool first = true; + for (set<string>::const_iterator i = s.begin(); i != s.end(); ++i) { + if (first) + first = false; + else + ss << sep; + ss << *i; + } +} - } +void _debugHelp(stringstream& ss, const vector<string>& v, const string& sep) { + set<string> s(v.begin(), v.end()); + _debugHelp(ss, s, sep); +} +} - string FTSQuery::toString() const { - stringstream ss; - ss << "FTSQuery\n"; +string FTSQuery::toString() const { + stringstream ss; + ss << "FTSQuery\n"; - ss << " terms: "; - _debugHelp( ss, getPositiveTerms(), ", " ); - ss << "\n"; + ss << " terms: "; + _debugHelp(ss, 
getPositiveTerms(), ", "); + ss << "\n"; - ss << " negated terms: "; - _debugHelp( ss, getNegatedTerms(), ", " ); - ss << "\n"; + ss << " negated terms: "; + _debugHelp(ss, getNegatedTerms(), ", "); + ss << "\n"; - ss << " phrases: "; - _debugHelp( ss, getPositivePhr(), ", " ); - ss << "\n"; + ss << " phrases: "; + _debugHelp(ss, getPositivePhr(), ", "); + ss << "\n"; - ss << " negated phrases: "; - _debugHelp( ss, getNegatedPhr(), ", " ); - ss << "\n"; + ss << " negated phrases: "; + _debugHelp(ss, getNegatedPhr(), ", "); + ss << "\n"; - return ss.str(); - } + return ss.str(); +} - string FTSQuery::debugString() const { - stringstream ss; +string FTSQuery::debugString() const { + stringstream ss; - _debugHelp( ss, getPositiveTerms(), "|" ); - ss << "||"; + _debugHelp(ss, getPositiveTerms(), "|"); + ss << "||"; - _debugHelp( ss, getNegatedTerms(), "|" ); - ss << "||"; + _debugHelp(ss, getNegatedTerms(), "|"); + ss << "||"; - _debugHelp( ss, getPositivePhr(), "|" ); - ss << "||"; + _debugHelp(ss, getPositivePhr(), "|"); + ss << "||"; - _debugHelp( ss, getNegatedPhr(), "|" ); + _debugHelp(ss, getNegatedPhr(), "|"); - return ss.str(); - } + return ss.str(); +} - BSONObj FTSQuery::toBSON() const { - BSONObjBuilder bob; - bob.append( "terms", getPositiveTerms() ); - bob.append( "negatedTerms", getNegatedTerms() ); - bob.append( "phrases", getPositivePhr() ); - bob.append( "negatedPhrases", getNegatedPhr() ); - return bob.obj(); - } - } +BSONObj FTSQuery::toBSON() const { + BSONObjBuilder bob; + bob.append("terms", getPositiveTerms()); + bob.append("negatedTerms", getNegatedTerms()); + bob.append("phrases", getPositivePhr()); + bob.append("negatedPhrases", getNegatedPhr()); + return bob.obj(); +} +} } diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h index 88ca4ce64d0..10e0cd2faaf 100644 --- a/src/mongo/db/fts/fts_query.h +++ b/src/mongo/db/fts/fts_query.h @@ -40,68 +40,77 @@ namespace mongo { - namespace fts { - - class FTSQuery { - - public: - // 
Initializes an FTSQuery. Note that the parsing of "language" depends on the text - // index version, since a query which doesn't specify a language and is against a - // version 1 text index with a version 1 default language string needs to be parsed as - // version 1 (see fts_language.cpp for a list of language strings specific to version - // 1). - Status parse(const std::string& query, StringData language, bool caseSensitive, - TextIndexVersion textIndexVersion); - - const std::set<std::string>& getPositiveTerms() const { return _positiveTerms; } - const std::set<std::string>& getNegatedTerms() const { return _negatedTerms; } - const std::vector<std::string>& getPositivePhr() const { return _positivePhrases; } - const std::vector<std::string>& getNegatedPhr() const { return _negatedPhrases; } - - const std::set<std::string>& getTermsForBounds() const { - return _termsForBounds; - } +namespace fts { + +class FTSQuery { +public: + // Initializes an FTSQuery. Note that the parsing of "language" depends on the text + // index version, since a query which doesn't specify a language and is against a + // version 1 text index with a version 1 default language string needs to be parsed as + // version 1 (see fts_language.cpp for a list of language strings specific to version + // 1). 
+ Status parse(const std::string& query, + StringData language, + bool caseSensitive, + TextIndexVersion textIndexVersion); + + const std::set<std::string>& getPositiveTerms() const { + return _positiveTerms; + } + const std::set<std::string>& getNegatedTerms() const { + return _negatedTerms; + } + const std::vector<std::string>& getPositivePhr() const { + return _positivePhrases; + } + const std::vector<std::string>& getNegatedPhr() const { + return _negatedPhrases; + } - const FTSLanguage& getLanguage() const { return *_language; } - bool getCaseSensitive() const { return _caseSensitive; } + const std::set<std::string>& getTermsForBounds() const { + return _termsForBounds; + } - std::string toString() const; + const FTSLanguage& getLanguage() const { + return *_language; + } + bool getCaseSensitive() const { + return _caseSensitive; + } - std::string debugString() const; + std::string toString() const; - BSONObj toBSON() const; + std::string debugString() const; - /** - * Lowercases "str" if _caseSensitive is set, else returns a copy of "str" unchanged. - */ - std::string normalizeString( StringData str ) const; + BSONObj toBSON() const; - static const bool caseSensitiveDefault; + /** + * Lowercases "str" if _caseSensitive is set, else returns a copy of "str" unchanged. + */ + std::string normalizeString(StringData str) const; - private: - void _addTerms( FTSTokenizer* tokenizer, - const std::string& tokens, - bool negated ); + static const bool caseSensitiveDefault; - const FTSLanguage* _language; - bool _caseSensitive; +private: + void _addTerms(FTSTokenizer* tokenizer, const std::string& tokens, bool negated); - // Positive terms. - std::set<std::string> _positiveTerms; + const FTSLanguage* _language; + bool _caseSensitive; - // Negated terms. - std::set<std::string> _negatedTerms; + // Positive terms. + std::set<std::string> _positiveTerms; - // Positive phrases. - std::vector<std::string> _positivePhrases; + // Negated terms. 
+ std::set<std::string> _negatedTerms; - // Negated phrases. - std::vector<std::string> _negatedPhrases; + // Positive phrases. + std::vector<std::string> _positivePhrases; - // Terms for bounds. - std::set<std::string> _termsForBounds; - }; + // Negated phrases. + std::vector<std::string> _negatedPhrases; - } + // Terms for bounds. + std::set<std::string> _termsForBounds; +}; +} } - diff --git a/src/mongo/db/fts/fts_query_parser.cpp b/src/mongo/db/fts/fts_query_parser.cpp index 5d73e69cb1e..6b2381c3366 100644 --- a/src/mongo/db/fts/fts_query_parser.cpp +++ b/src/mongo/db/fts/fts_query_parser.cpp @@ -34,77 +34,73 @@ namespace mongo { - namespace fts { +namespace fts { - FTSQueryParser::FTSQueryParser( StringData str ) - : _pos(0), _raw( str ) { - skipWhitespace(); - _previousWhiteSpace = true; - } - - bool FTSQueryParser::more() const { - return _pos < _raw.size(); - } - - QueryToken FTSQueryParser::next() { - if ( _pos >= _raw.size() ) - return QueryToken( QueryToken::INVALID, "", 0, false ); +FTSQueryParser::FTSQueryParser(StringData str) : _pos(0), _raw(str) { + skipWhitespace(); + _previousWhiteSpace = true; +} - unsigned start = _pos++; - QueryToken::Type type = getType( _raw[start] ); +bool FTSQueryParser::more() const { + return _pos < _raw.size(); +} - // Query Parser should never land on whitespace - if ( type == QueryToken::WHITESPACE ) { - invariant( false ); - } +QueryToken FTSQueryParser::next() { + if (_pos >= _raw.size()) + return QueryToken(QueryToken::INVALID, "", 0, false); - if ( type == QueryToken::TEXT ) { - while ( _pos < _raw.size() && getType( _raw[_pos] ) == type ) { - _pos++; - } - } + unsigned start = _pos++; + QueryToken::Type type = getType(_raw[start]); - StringData ret = _raw.substr( start, _pos - start ); - bool old = _previousWhiteSpace; - _previousWhiteSpace = skipWhitespace(); + // Query Parser should never land on whitespace + if (type == QueryToken::WHITESPACE) { + invariant(false); + } - return QueryToken( type, ret, start, old 
); + if (type == QueryToken::TEXT) { + while (_pos < _raw.size() && getType(_raw[_pos]) == type) { + _pos++; } + } - bool FTSQueryParser::skipWhitespace() { - unsigned start = _pos; + StringData ret = _raw.substr(start, _pos - start); + bool old = _previousWhiteSpace; + _previousWhiteSpace = skipWhitespace(); - while ( _pos < _raw.size() && getType( _raw[_pos] ) == QueryToken::WHITESPACE ) { - _pos++; - } + return QueryToken(type, ret, start, old); +} - return _pos > start; - } +bool FTSQueryParser::skipWhitespace() { + unsigned start = _pos; + while (_pos < _raw.size() && getType(_raw[_pos]) == QueryToken::WHITESPACE) { + _pos++; + } - QueryToken::Type FTSQueryParser::getType( char c ) const { - switch ( c ) { - // Unicode TR29 defines these as Word Boundaries - case '\n': // U+000A - LF - case '\v': // U+000B - Veritical Tab - case '\f': // U+000C - Form Feed - case '\r': // U+000D - CR - // Unicode TR29 remarks this could be used MidNum for Word Boundaries - // but we treat this as a token separator - case ' ': // U+0020 - Space - return QueryToken::WHITESPACE; - // Unicode TR29 has a particular note about the complexity of hyphens. - // Since we use them for negation, we are sensitive to them, and we simply drop - // them otherwise from words - case '-': - case '"': - return QueryToken::DELIMITER; - default: - return QueryToken::TEXT; - } + return _pos > start; +} - } +QueryToken::Type FTSQueryParser::getType(char c) const { + switch (c) { + // Unicode TR29 defines these as Word Boundaries + case '\n': // U+000A - LF + case '\v': // U+000B - Veritical Tab + case '\f': // U+000C - Form Feed + case '\r': // U+000D - CR + // Unicode TR29 remarks this could be used MidNum for Word Boundaries + // but we treat this as a token separator + case ' ': // U+0020 - Space + return QueryToken::WHITESPACE; + // Unicode TR29 has a particular note about the complexity of hyphens. 
+ // Since we use them for negation, we are sensitive to them, and we simply drop + // them otherwise from words + case '-': + case '"': + return QueryToken::DELIMITER; + default: + return QueryToken::TEXT; } - +} +} } diff --git a/src/mongo/db/fts/fts_query_parser.h b/src/mongo/db/fts/fts_query_parser.h index 32804fd63fd..b5e8c53207f 100644 --- a/src/mongo/db/fts/fts_query_parser.h +++ b/src/mongo/db/fts/fts_query_parser.h @@ -34,57 +34,54 @@ namespace mongo { - namespace fts { +namespace fts { - struct QueryToken { - enum Type { WHITESPACE, DELIMITER, TEXT, INVALID }; - QueryToken( Type type, StringData data, unsigned offset, bool previousWhiteSpace ) - : type( type ), - data( data ), - offset( offset ), - previousWhiteSpace( previousWhiteSpace ) {} +struct QueryToken { + enum Type { WHITESPACE, DELIMITER, TEXT, INVALID }; + QueryToken(Type type, StringData data, unsigned offset, bool previousWhiteSpace) + : type(type), data(data), offset(offset), previousWhiteSpace(previousWhiteSpace) {} - bool ok() const { return type != INVALID; } - - Type type; - StringData data; - unsigned offset; - bool previousWhiteSpace; - }; + bool ok() const { + return type != INVALID; + } - /** - * The pseudo EXBNF for the query parsing language is: - * - * SEARCH STRING = TOKEN_LIST ( ' ' TOKEN_LIST )* - * - * TOKEN_LIST = SEARCH_TOKEN - * |'-' SEARCH_TOKEN - * | QUOTED_SEARCH_TOKEN - * |'-' QUOTED_SEARCH_TOKEN - * - * QUOTED_SEARCH_TOKEN = '“' SEARCH_TOKEN+ '"' - * - * SEARCH_TOKEN = CHARACTER_EXCLUDING_SPECIAL_CHARS - * - * SPECIAL_CHARS = '-' | ' ' | '"' - */ - class FTSQueryParser { - MONGO_DISALLOW_COPYING( FTSQueryParser ); - public: + Type type; + StringData data; + unsigned offset; + bool previousWhiteSpace; +}; - FTSQueryParser(StringData str); - bool more() const; - QueryToken next(); +/** + * The pseudo EXBNF for the query parsing language is: + * + * SEARCH STRING = TOKEN_LIST ( ' ' TOKEN_LIST )* + * + * TOKEN_LIST = SEARCH_TOKEN + * |'-' SEARCH_TOKEN + * | 
QUOTED_SEARCH_TOKEN + * |'-' QUOTED_SEARCH_TOKEN + * + * QUOTED_SEARCH_TOKEN = '“' SEARCH_TOKEN+ '"' + * + * SEARCH_TOKEN = CHARACTER_EXCLUDING_SPECIAL_CHARS + * + * SPECIAL_CHARS = '-' | ' ' | '"' + */ +class FTSQueryParser { + MONGO_DISALLOW_COPYING(FTSQueryParser); - private: - QueryToken::Type getType( char c ) const; - bool skipWhitespace(); +public: + FTSQueryParser(StringData str); + bool more() const; + QueryToken next(); - unsigned _pos; - bool _previousWhiteSpace; - const StringData _raw; - }; +private: + QueryToken::Type getType(char c) const; + bool skipWhitespace(); - } + unsigned _pos; + bool _previousWhiteSpace; + const StringData _raw; +}; +} } - diff --git a/src/mongo/db/fts/fts_query_test.cpp b/src/mongo/db/fts/fts_query_test.cpp index b090f23a660..a4a841c7f16 100644 --- a/src/mongo/db/fts/fts_query_test.cpp +++ b/src/mongo/db/fts/fts_query_test.cpp @@ -33,242 +33,222 @@ #include "mongo/unittest/unittest.h" namespace mongo { - namespace fts { - - TEST( FTSQuery, Basic1 ) { - FTSQuery q; - ASSERT( q.parse( "this is fun", "english", false, TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( false, q.getCaseSensitive() ); - ASSERT_EQUALS( 1U, q.getPositiveTerms().size() ); - ASSERT_EQUALS( "fun", *q.getPositiveTerms().begin() ); - ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q.getPositivePhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - ASSERT_TRUE( q.getTermsForBounds() == q.getPositiveTerms() ); - } - - TEST( FTSQuery, ParsePunctuation ) { - FTSQuery q; - ASSERT( q.parse( "hello.world", "english", false, TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( false, q.getCaseSensitive() ); - ASSERT_EQUALS( 2U, q.getPositiveTerms().size() ); - ASSERT_EQUALS( "hello", *q.getPositiveTerms().begin() ); - ASSERT_EQUALS( "world", *(--q.getPositiveTerms().end()) ); - ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q.getPositivePhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - 
ASSERT_TRUE( q.getTermsForBounds() == q.getPositiveTerms() ); - } - - TEST( FTSQuery, Neg1 ) { - FTSQuery q; - ASSERT( q.parse( "this is -really fun", "english", false, TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( 1U, q.getPositiveTerms().size() ); - ASSERT_EQUALS( "fun", *q.getPositiveTerms().begin() ); - ASSERT_EQUALS( 1U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( "realli", *q.getNegatedTerms().begin() ); - ASSERT_TRUE( q.getTermsForBounds() == q.getPositiveTerms() ); - } - - TEST( FTSQuery, Phrase1 ) { - FTSQuery q; - ASSERT( q.parse( "doing a \"phrase test\" for fun", "english", false, - TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( 3U, q.getPositiveTerms().size() ); - ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( 1U, q.getPositivePhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - ASSERT_TRUE( q.getTermsForBounds() == q.getPositiveTerms() ); - - ASSERT_EQUALS( "phrase test", q.getPositivePhr()[0] ); - ASSERT_EQUALS( "fun|phrase|test||||phrase test||", q.debugString() ); - } - - TEST( FTSQuery, Phrase2 ) { - FTSQuery q; - ASSERT( q.parse( "doing a \"phrase-test\" for fun", "english", false, - TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT_EQUALS( 1U, q.getPositivePhr().size() ); - ASSERT_EQUALS( "phrase-test", q.getPositivePhr()[0] ); - } - - TEST( FTSQuery, NegPhrase1 ) { - FTSQuery q; - ASSERT( q.parse( "doing a -\"phrase test\" for fun", "english", false, - TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT_EQUALS( "fun||||||phrase test", q.debugString() ); - } - - TEST( FTSQuery, CaseSensitiveOption ) { - FTSQuery q; - ASSERT( q.parse( "this is fun", "english", true, TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT_EQUALS( true, q.getCaseSensitive() ); - } - - TEST( FTSQuery, CaseSensitivePositiveTerms ) { - FTSQuery q; - ASSERT( q.parse( "This is Positively fun", "english", true, - TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( 2U, q.getTermsForBounds().size() ); - ASSERT_EQUALS( 1, std::count( 
q.getTermsForBounds().begin(), - q.getTermsForBounds().end(), - "posit" ) ); - ASSERT_EQUALS( 1, std::count( q.getTermsForBounds().begin(), - q.getTermsForBounds().end(), - "fun" ) ); - ASSERT_EQUALS( 2U, q.getPositiveTerms().size() ); - ASSERT_EQUALS( 1, std::count( q.getPositiveTerms().begin(), - q.getPositiveTerms().end(), - "Posit" ) ); - ASSERT_EQUALS( 1, std::count( q.getPositiveTerms().begin(), - q.getPositiveTerms().end(), - "fun" ) ); - ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q.getPositivePhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - } - - TEST( FTSQuery, CaseSensitiveNegativeTerms ) { - FTSQuery q; - ASSERT( q.parse( "-This -is -Negatively -miserable", "english", true, - TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( 0U, q.getPositiveTerms().size() ); - ASSERT_EQUALS( 0U, q.getTermsForBounds().size() ); - ASSERT_EQUALS( 2U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( 1, std::count( q.getNegatedTerms().begin(), - q.getNegatedTerms().end(), - "Negat" ) ); - ASSERT_EQUALS( 1, std::count( q.getNegatedTerms().begin(), - q.getNegatedTerms().end(), - "miser" ) ); - ASSERT_EQUALS( 0U, q.getPositivePhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - } - - TEST( FTSQuery, CaseSensitivePositivePhrases ) { - FTSQuery q; - ASSERT( q.parse( "doing a \"Phrase Test\" for fun", "english", true, - TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( 1U, q.getPositivePhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - ASSERT_EQUALS( "Phrase Test", q.getPositivePhr()[0] ); - } - - TEST( FTSQuery, CaseSensitiveNegativePhrases ) { - FTSQuery q; - ASSERT( q.parse( "doing a -\"Phrase Test\" for fun", "english", true, - TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( 0U, q.getPositivePhr().size() ); - ASSERT_EQUALS( 1U, q.getNegatedPhr().size() ); - ASSERT_EQUALS( "Phrase Test", q.getNegatedPhr()[0] ); - } - - TEST( FTSQuery, Mix1 ) { - FTSQuery q; - ASSERT( q.parse( "\"industry\" 
-Melbourne -Physics", "english", false, - TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT_EQUALS( "industri||melbourn|physic||industry||", q.debugString() ); - } - - TEST( FTSQuery, NegPhrase2) { - FTSQuery q1, q2, q3; - ASSERT( q1.parse( "foo \"bar\"", "english", false, TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT( q2.parse( "foo \"-bar\"", "english", false, TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT( q3.parse( "foo \" -bar\"", "english", false, TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( 2U, q1.getPositiveTerms().size() ); - ASSERT_EQUALS( 2U, q2.getPositiveTerms().size() ); - ASSERT_EQUALS( 2U, q3.getPositiveTerms().size() ); - - ASSERT_EQUALS( 0U, q1.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q2.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q3.getNegatedTerms().size() ); - - ASSERT_EQUALS( 1U, q1.getPositivePhr().size() ); - ASSERT_EQUALS( 1U, q2.getPositivePhr().size() ); - ASSERT_EQUALS( 1U, q3.getPositivePhr().size() ); - - ASSERT_EQUALS( 0U, q1.getNegatedPhr().size() ); - ASSERT_EQUALS( 0U, q2.getNegatedPhr().size() ); - ASSERT_EQUALS( 0U, q3.getNegatedPhr().size() ); - } - - TEST( FTSQuery, NegPhrase3) { - FTSQuery q1, q2, q3; - ASSERT( q1.parse( "foo -\"bar\"", "english", false, TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT( q2.parse( "foo -\"-bar\"", "english", false, TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT( q3.parse( "foo -\" -bar\"", "english", false, TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( 1U, q1.getPositiveTerms().size() ); - ASSERT_EQUALS( 1U, q2.getPositiveTerms().size() ); - ASSERT_EQUALS( 1U, q3.getPositiveTerms().size() ); - - ASSERT_EQUALS( 0U, q1.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q2.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q3.getNegatedTerms().size() ); - - ASSERT_EQUALS( 0U, q1.getPositivePhr().size() ); - ASSERT_EQUALS( 0U, q2.getPositivePhr().size() ); - ASSERT_EQUALS( 0U, q3.getPositivePhr().size() ); - - ASSERT_EQUALS( 1U, q1.getNegatedPhr().size() ); - ASSERT_EQUALS( 1U, q2.getNegatedPhr().size() 
); - ASSERT_EQUALS( 1U, q3.getNegatedPhr().size() ); - } - - // Test textIndexVersion:1 query with language "english". This invokes the standard English - // stemmer and stopword list. - TEST( FTSQuery, TextIndexVersion1LanguageEnglish ) { - FTSQuery q; - ASSERT( q.parse( "the running", "english", false, TEXT_INDEX_VERSION_1 ).isOK() ); - ASSERT_EQUALS( 1U, q.getPositiveTerms().size() ); - ASSERT_EQUALS( "run", *q.getPositiveTerms().begin() ); - ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q.getPositivePhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - } - - // Test textIndexVersion:1 query with language "eng". "eng" uses the English stemmer, and - // no stopword list. - TEST( FTSQuery, TextIndexVersion1LanguageEng ) { - FTSQuery q; - ASSERT( q.parse( "the running", "eng", false, TEXT_INDEX_VERSION_1 ).isOK() ); - ASSERT_EQUALS( 2U, q.getPositiveTerms().size() ); - ASSERT_EQUALS( 1, std::count( q.getPositiveTerms().begin(), - q.getPositiveTerms().end(), - "the" ) ); - ASSERT_EQUALS( 1, std::count( q.getPositiveTerms().begin(), - q.getPositiveTerms().end(), - "run" ) ); - ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q.getPositivePhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - } - - // Test textIndexVersion:1 query with language "invalid". No stemming will be performed, - // and no stopword list will be used. 
- TEST( FTSQuery, TextIndexVersion1LanguageInvalid ) { - FTSQuery q; - ASSERT( q.parse( "the running", "invalid", false, TEXT_INDEX_VERSION_1 ).isOK() ); - ASSERT_EQUALS( 2U, q.getPositiveTerms().size() ); - ASSERT_EQUALS( 1, std::count( q.getPositiveTerms().begin(), - q.getPositiveTerms().end(), - "the" ) ); - ASSERT_EQUALS( 1, std::count( q.getPositiveTerms().begin(), - q.getPositiveTerms().end(), - "running" ) ); - ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q.getPositivePhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - } - - } +namespace fts { + +TEST(FTSQuery, Basic1) { + FTSQuery q; + ASSERT(q.parse("this is fun", "english", false, TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(false, q.getCaseSensitive()); + ASSERT_EQUALS(1U, q.getPositiveTerms().size()); + ASSERT_EQUALS("fun", *q.getPositiveTerms().begin()); + ASSERT_EQUALS(0U, q.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q.getPositivePhr().size()); + ASSERT_EQUALS(0U, q.getNegatedPhr().size()); + ASSERT_TRUE(q.getTermsForBounds() == q.getPositiveTerms()); +} + +TEST(FTSQuery, ParsePunctuation) { + FTSQuery q; + ASSERT(q.parse("hello.world", "english", false, TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(false, q.getCaseSensitive()); + ASSERT_EQUALS(2U, q.getPositiveTerms().size()); + ASSERT_EQUALS("hello", *q.getPositiveTerms().begin()); + ASSERT_EQUALS("world", *(--q.getPositiveTerms().end())); + ASSERT_EQUALS(0U, q.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q.getPositivePhr().size()); + ASSERT_EQUALS(0U, q.getNegatedPhr().size()); + ASSERT_TRUE(q.getTermsForBounds() == q.getPositiveTerms()); +} + +TEST(FTSQuery, Neg1) { + FTSQuery q; + ASSERT(q.parse("this is -really fun", "english", false, TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(1U, q.getPositiveTerms().size()); + ASSERT_EQUALS("fun", *q.getPositiveTerms().begin()); + ASSERT_EQUALS(1U, q.getNegatedTerms().size()); + ASSERT_EQUALS("realli", *q.getNegatedTerms().begin()); + 
ASSERT_TRUE(q.getTermsForBounds() == q.getPositiveTerms()); +} + +TEST(FTSQuery, Phrase1) { + FTSQuery q; + ASSERT( + q.parse("doing a \"phrase test\" for fun", "english", false, TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(3U, q.getPositiveTerms().size()); + ASSERT_EQUALS(0U, q.getNegatedTerms().size()); + ASSERT_EQUALS(1U, q.getPositivePhr().size()); + ASSERT_EQUALS(0U, q.getNegatedPhr().size()); + ASSERT_TRUE(q.getTermsForBounds() == q.getPositiveTerms()); + + ASSERT_EQUALS("phrase test", q.getPositivePhr()[0]); + ASSERT_EQUALS("fun|phrase|test||||phrase test||", q.debugString()); +} + +TEST(FTSQuery, Phrase2) { + FTSQuery q; + ASSERT( + q.parse("doing a \"phrase-test\" for fun", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT_EQUALS(1U, q.getPositivePhr().size()); + ASSERT_EQUALS("phrase-test", q.getPositivePhr()[0]); +} + +TEST(FTSQuery, NegPhrase1) { + FTSQuery q; + ASSERT( + q.parse("doing a -\"phrase test\" for fun", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT_EQUALS("fun||||||phrase test", q.debugString()); +} + +TEST(FTSQuery, CaseSensitiveOption) { + FTSQuery q; + ASSERT(q.parse("this is fun", "english", true, TEXT_INDEX_VERSION_2).isOK()); + ASSERT_EQUALS(true, q.getCaseSensitive()); +} + +TEST(FTSQuery, CaseSensitivePositiveTerms) { + FTSQuery q; + ASSERT(q.parse("This is Positively fun", "english", true, TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(2U, q.getTermsForBounds().size()); + ASSERT_EQUALS(1, + std::count(q.getTermsForBounds().begin(), q.getTermsForBounds().end(), "posit")); + ASSERT_EQUALS(1, std::count(q.getTermsForBounds().begin(), q.getTermsForBounds().end(), "fun")); + ASSERT_EQUALS(2U, q.getPositiveTerms().size()); + ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "Posit")); + ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "fun")); + ASSERT_EQUALS(0U, q.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q.getPositivePhr().size()); + 
ASSERT_EQUALS(0U, q.getNegatedPhr().size()); +} + +TEST(FTSQuery, CaseSensitiveNegativeTerms) { + FTSQuery q; + ASSERT( + q.parse("-This -is -Negatively -miserable", "english", true, TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(0U, q.getPositiveTerms().size()); + ASSERT_EQUALS(0U, q.getTermsForBounds().size()); + ASSERT_EQUALS(2U, q.getNegatedTerms().size()); + ASSERT_EQUALS(1, std::count(q.getNegatedTerms().begin(), q.getNegatedTerms().end(), "Negat")); + ASSERT_EQUALS(1, std::count(q.getNegatedTerms().begin(), q.getNegatedTerms().end(), "miser")); + ASSERT_EQUALS(0U, q.getPositivePhr().size()); + ASSERT_EQUALS(0U, q.getNegatedPhr().size()); +} + +TEST(FTSQuery, CaseSensitivePositivePhrases) { + FTSQuery q; + ASSERT( + q.parse("doing a \"Phrase Test\" for fun", "english", true, TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(1U, q.getPositivePhr().size()); + ASSERT_EQUALS(0U, q.getNegatedPhr().size()); + ASSERT_EQUALS("Phrase Test", q.getPositivePhr()[0]); +} + +TEST(FTSQuery, CaseSensitiveNegativePhrases) { + FTSQuery q; + ASSERT( + q.parse("doing a -\"Phrase Test\" for fun", "english", true, TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(0U, q.getPositivePhr().size()); + ASSERT_EQUALS(1U, q.getNegatedPhr().size()); + ASSERT_EQUALS("Phrase Test", q.getNegatedPhr()[0]); +} + +TEST(FTSQuery, Mix1) { + FTSQuery q; + ASSERT( + q.parse("\"industry\" -Melbourne -Physics", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT_EQUALS("industri||melbourn|physic||industry||", q.debugString()); +} + +TEST(FTSQuery, NegPhrase2) { + FTSQuery q1, q2, q3; + ASSERT(q1.parse("foo \"bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q2.parse("foo \"-bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q3.parse("foo \" -bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(2U, q1.getPositiveTerms().size()); + ASSERT_EQUALS(2U, q2.getPositiveTerms().size()); + ASSERT_EQUALS(2U, q3.getPositiveTerms().size()); + + 
ASSERT_EQUALS(0U, q1.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q2.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q3.getNegatedTerms().size()); + + ASSERT_EQUALS(1U, q1.getPositivePhr().size()); + ASSERT_EQUALS(1U, q2.getPositivePhr().size()); + ASSERT_EQUALS(1U, q3.getPositivePhr().size()); + + ASSERT_EQUALS(0U, q1.getNegatedPhr().size()); + ASSERT_EQUALS(0U, q2.getNegatedPhr().size()); + ASSERT_EQUALS(0U, q3.getNegatedPhr().size()); +} + +TEST(FTSQuery, NegPhrase3) { + FTSQuery q1, q2, q3; + ASSERT(q1.parse("foo -\"bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q2.parse("foo -\"-bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q3.parse("foo -\" -bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(1U, q1.getPositiveTerms().size()); + ASSERT_EQUALS(1U, q2.getPositiveTerms().size()); + ASSERT_EQUALS(1U, q3.getPositiveTerms().size()); + + ASSERT_EQUALS(0U, q1.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q2.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q3.getNegatedTerms().size()); + + ASSERT_EQUALS(0U, q1.getPositivePhr().size()); + ASSERT_EQUALS(0U, q2.getPositivePhr().size()); + ASSERT_EQUALS(0U, q3.getPositivePhr().size()); + + ASSERT_EQUALS(1U, q1.getNegatedPhr().size()); + ASSERT_EQUALS(1U, q2.getNegatedPhr().size()); + ASSERT_EQUALS(1U, q3.getNegatedPhr().size()); +} + +// Test textIndexVersion:1 query with language "english". This invokes the standard English +// stemmer and stopword list. +TEST(FTSQuery, TextIndexVersion1LanguageEnglish) { + FTSQuery q; + ASSERT(q.parse("the running", "english", false, TEXT_INDEX_VERSION_1).isOK()); + ASSERT_EQUALS(1U, q.getPositiveTerms().size()); + ASSERT_EQUALS("run", *q.getPositiveTerms().begin()); + ASSERT_EQUALS(0U, q.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q.getPositivePhr().size()); + ASSERT_EQUALS(0U, q.getNegatedPhr().size()); +} + +// Test textIndexVersion:1 query with language "eng". "eng" uses the English stemmer, and +// no stopword list. 
+TEST(FTSQuery, TextIndexVersion1LanguageEng) { + FTSQuery q; + ASSERT(q.parse("the running", "eng", false, TEXT_INDEX_VERSION_1).isOK()); + ASSERT_EQUALS(2U, q.getPositiveTerms().size()); + ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "the")); + ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "run")); + ASSERT_EQUALS(0U, q.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q.getPositivePhr().size()); + ASSERT_EQUALS(0U, q.getNegatedPhr().size()); +} + +// Test textIndexVersion:1 query with language "invalid". No stemming will be performed, +// and no stopword list will be used. +TEST(FTSQuery, TextIndexVersion1LanguageInvalid) { + FTSQuery q; + ASSERT(q.parse("the running", "invalid", false, TEXT_INDEX_VERSION_1).isOK()); + ASSERT_EQUALS(2U, q.getPositiveTerms().size()); + ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "the")); + ASSERT_EQUALS(1, + std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "running")); + ASSERT_EQUALS(0U, q.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q.getPositivePhr().size()); + ASSERT_EQUALS(0U, q.getNegatedPhr().size()); +} +} } diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp index 274d9a6d6ba..eb7e018b522 100644 --- a/src/mongo/db/fts/fts_spec.cpp +++ b/src/mongo/db/fts/fts_spec.cpp @@ -40,457 +40,408 @@ namespace mongo { - namespace fts { - - using std::map; - using std::string; - using namespace mongoutils; - - const double DEFAULT_WEIGHT = 1; - const double MAX_WEIGHT = 1000000000; - const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000; - - namespace { - // Default language. Used for new indexes. - const std::string moduleDefaultLanguage( "english" ); - - /** Validate the given language override string. */ - bool validateOverride( const string& override ) { - // The override field can't be empty, can't be prefixed with a dollar sign, and - // can't contain a dot. 
- return !override.empty() && - override[0] != '$' && - override.find('.') == std::string::npos; - } - } - - FTSSpec::FTSSpec( const BSONObj& indexInfo ) { - // indexInfo is a text index spec. Text index specs pass through fixSpec() before - // being saved to the system.indexes collection. fixSpec() enforces a schema, such that - // required fields must exist and be of the correct type (e.g. weights, - // textIndexVersion). - massert( 16739, "found invalid spec for text index", - indexInfo["weights"].isABSONObj() ); - BSONElement textIndexVersionElt = indexInfo["textIndexVersion"]; - massert( 17367, - "found invalid spec for text index, expected number for textIndexVersion", - textIndexVersionElt.isNumber() ); - - // We currently support TEXT_INDEX_VERSION_1 (deprecated) and TEXT_INDEX_VERSION_2. - // Reject all other values. - massert( 17364, - str::stream() << "attempt to use unsupported textIndexVersion " << - textIndexVersionElt.numberInt() << "; versions supported: " << - TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1, - textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 || - textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_1 ); - - _textIndexVersion = ( textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 ) ? - TEXT_INDEX_VERSION_2 : TEXT_INDEX_VERSION_1; - - // Initialize _defaultLanguage. Note that the FTSLanguage constructor requires - // textIndexVersion, since language parsing is version-specific. - auto indexLanguage = indexInfo["default_language"].String(); - auto swl = FTSLanguage::make(indexLanguage , _textIndexVersion ); - - // This can fail if the user originally created the text index under an instance of - // MongoDB that supports different languages then the current instance - // TODO: consder propagating the index ns to here to improve the error message - uassert(28682, - str::stream() << "Unrecognized language " << indexLanguage << - " found for text index. 
Verify mongod was started with the" - " correct options.", - swl.getStatus().isOK()); - _defaultLanguage = swl.getValue(); - - _languageOverrideField = indexInfo["language_override"].valuestrsafe(); - - _wildcard = false; - - // in this block we fill in the _weights map - { - BSONObjIterator i( indexInfo["weights"].Obj() ); - while ( i.more() ) { - BSONElement e = i.next(); - verify( e.isNumber() ); - - if ( WILDCARD == e.fieldName() ) { - _wildcard = true; - } - else { - double num = e.number(); - _weights[ e.fieldName() ] = num; - verify( num > 0 && num < MAX_WORD_WEIGHT ); - } - } - verify( _wildcard || _weights.size() ); - } - - // extra information - { - BSONObj keyPattern = indexInfo["key"].Obj(); - verify( keyPattern.nFields() >= 2 ); - BSONObjIterator i( keyPattern ); +namespace fts { - bool passedFTS = false; +using std::map; +using std::string; +using namespace mongoutils; - while ( i.more() ) { - BSONElement e = i.next(); - if ( str::equals( e.fieldName(), "_fts" ) || - str::equals( e.fieldName(), "_ftsx" ) ) { - passedFTS = true; - continue; - } +const double DEFAULT_WEIGHT = 1; +const double MAX_WEIGHT = 1000000000; +const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000; - if ( passedFTS ) - _extraAfter.push_back( e.fieldName() ); - else - _extraBefore.push_back( e.fieldName() ); - } +namespace { +// Default language. Used for new indexes. +const std::string moduleDefaultLanguage("english"); - } - } +/** Validate the given language override string. */ +bool validateOverride(const string& override) { + // The override field can't be empty, can't be prefixed with a dollar sign, and + // can't contain a dot. 
+ return !override.empty() && override[0] != '$' && override.find('.') == std::string::npos; +} +} - const FTSLanguage* FTSSpec::_getLanguageToUseV2( const BSONObj& userDoc, - const FTSLanguage* currentLanguage ) const { - BSONElement e = userDoc[_languageOverrideField]; - if ( e.eoo() ) { - return currentLanguage; +FTSSpec::FTSSpec(const BSONObj& indexInfo) { + // indexInfo is a text index spec. Text index specs pass through fixSpec() before + // being saved to the system.indexes collection. fixSpec() enforces a schema, such that + // required fields must exist and be of the correct type (e.g. weights, + // textIndexVersion). + massert(16739, "found invalid spec for text index", indexInfo["weights"].isABSONObj()); + BSONElement textIndexVersionElt = indexInfo["textIndexVersion"]; + massert(17367, + "found invalid spec for text index, expected number for textIndexVersion", + textIndexVersionElt.isNumber()); + + // We currently support TEXT_INDEX_VERSION_1 (deprecated) and TEXT_INDEX_VERSION_2. + // Reject all other values. + massert(17364, + str::stream() << "attempt to use unsupported textIndexVersion " + << textIndexVersionElt.numberInt() << "; versions supported: " + << TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1, + textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 || + textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_1); + + _textIndexVersion = (textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2) + ? TEXT_INDEX_VERSION_2 + : TEXT_INDEX_VERSION_1; + + // Initialize _defaultLanguage. Note that the FTSLanguage constructor requires + // textIndexVersion, since language parsing is version-specific. 
+ auto indexLanguage = indexInfo["default_language"].String(); + auto swl = FTSLanguage::make(indexLanguage, _textIndexVersion); + + // This can fail if the user originally created the text index under an instance of + // MongoDB that supports different languages then the current instance + // TODO: consder propagating the index ns to here to improve the error message + uassert(28682, + str::stream() << "Unrecognized language " << indexLanguage + << " found for text index. Verify mongod was started with the" + " correct options.", + swl.getStatus().isOK()); + _defaultLanguage = swl.getValue(); + + _languageOverrideField = indexInfo["language_override"].valuestrsafe(); + + _wildcard = false; + + // in this block we fill in the _weights map + { + BSONObjIterator i(indexInfo["weights"].Obj()); + while (i.more()) { + BSONElement e = i.next(); + verify(e.isNumber()); + + if (WILDCARD == e.fieldName()) { + _wildcard = true; + } else { + double num = e.number(); + _weights[e.fieldName()] = num; + verify(num > 0 && num < MAX_WORD_WEIGHT); } - uassert( 17261, - "found language override field in document with non-string type", - e.type() == mongo::String ); - StatusWithFTSLanguage swl = FTSLanguage::make( e.String(), TEXT_INDEX_VERSION_2 ); - uassert( 17262, - "language override unsupported: " + e.String(), - swl.getStatus().isOK() ); - return swl.getValue(); } + verify(_wildcard || _weights.size()); + } - void FTSSpec::scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const { - if ( _textIndexVersion == TEXT_INDEX_VERSION_1 ) { - return _scoreDocumentV1( obj, term_freqs ); - } + // extra information + { + BSONObj keyPattern = indexInfo["key"].Obj(); + verify(keyPattern.nFields() >= 2); + BSONObjIterator i(keyPattern); - FTSElementIterator it( *this, obj ); + bool passedFTS = false; - while ( it.more() ) { - FTSIteratorValue val = it.next(); - std::unique_ptr<FTSTokenizer> tokenizer(val._language->createTokenizer()); - _scoreStringV2( tokenizer.get(), 
val._text, term_freqs, val._weight ); + while (i.more()) { + BSONElement e = i.next(); + if (str::equals(e.fieldName(), "_fts") || str::equals(e.fieldName(), "_ftsx")) { + passedFTS = true; + continue; } + + if (passedFTS) + _extraAfter.push_back(e.fieldName()); + else + _extraBefore.push_back(e.fieldName()); } + } +} - void FTSSpec::_scoreStringV2( FTSTokenizer* tokenizer, - StringData raw, - TermFrequencyMap* docScores, - double weight ) const { +const FTSLanguage* FTSSpec::_getLanguageToUseV2(const BSONObj& userDoc, + const FTSLanguage* currentLanguage) const { + BSONElement e = userDoc[_languageOverrideField]; + if (e.eoo()) { + return currentLanguage; + } + uassert(17261, + "found language override field in document with non-string type", + e.type() == mongo::String); + StatusWithFTSLanguage swl = FTSLanguage::make(e.String(), TEXT_INDEX_VERSION_2); + uassert(17262, "language override unsupported: " + e.String(), swl.getStatus().isOK()); + return swl.getValue(); +} - ScoreHelperMap terms; +void FTSSpec::scoreDocument(const BSONObj& obj, TermFrequencyMap* term_freqs) const { + if (_textIndexVersion == TEXT_INDEX_VERSION_1) { + return _scoreDocumentV1(obj, term_freqs); + } - unsigned numTokens = 0; + FTSElementIterator it(*this, obj); - tokenizer->reset(raw.rawData(), FTSTokenizer::FilterStopWords ); + while (it.more()) { + FTSIteratorValue val = it.next(); + std::unique_ptr<FTSTokenizer> tokenizer(val._language->createTokenizer()); + _scoreStringV2(tokenizer.get(), val._text, term_freqs, val._weight); + } +} - while (tokenizer->moveNext()) { - string term = tokenizer->get().toString(); +void FTSSpec::_scoreStringV2(FTSTokenizer* tokenizer, + StringData raw, + TermFrequencyMap* docScores, + double weight) const { + ScoreHelperMap terms; - ScoreHelperStruct& data = terms[term]; + unsigned numTokens = 0; - if ( data.exp ) { - data.exp *= 2; - } - else { - data.exp = 1; - } - data.count += 1; - data.freq += ( 1 / data.exp ); - numTokens++; - } + 
tokenizer->reset(raw.rawData(), FTSTokenizer::FilterStopWords); - for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) { + while (tokenizer->moveNext()) { + string term = tokenizer->get().toString(); - const string& term = i->first; - const ScoreHelperStruct& data = i->second; + ScoreHelperStruct& data = terms[term]; - // in order to adjust weights as a function of term count as it - // relates to total field length. ie. is this the only word or - // a frequently occuring term? or does it only show up once in - // a long block of text? + if (data.exp) { + data.exp *= 2; + } else { + data.exp = 1; + } + data.count += 1; + data.freq += (1 / data.exp); + numTokens++; + } - double coeff = ( 0.5 * data.count / numTokens ) + 0.5; + for (ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i) { + const string& term = i->first; + const ScoreHelperStruct& data = i->second; - // if term is identical to the raw form of the - // field (untokenized) give it a small boost. - double adjustment = 1; - if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) ) - adjustment += 0.1; + // in order to adjust weights as a function of term count as it + // relates to total field length. ie. is this the only word or + // a frequently occuring term? or does it only show up once in + // a long block of text? - double& score = (*docScores)[term]; - score += ( weight * data.freq * coeff * adjustment ); - verify( score <= MAX_WEIGHT ); - } - } + double coeff = (0.5 * data.count / numTokens) + 0.5; - Status FTSSpec::getIndexPrefix( const BSONObj& query, BSONObj* out ) const { - if ( numExtraBefore() == 0 ) { - *out = BSONObj(); - return Status::OK(); - } + // if term is identical to the raw form of the + // field (untokenized) give it a small boost. 
+ double adjustment = 1; + if (raw.size() == term.length() && raw.equalCaseInsensitive(term)) + adjustment += 0.1; - BSONObjBuilder b; - for ( unsigned i = 0; i < numExtraBefore(); i++ ) { - BSONElement e = query.getFieldDotted(extraBefore(i)); - if ( e.eoo() ) - return Status( ErrorCodes::BadValue, - str::stream() - << "need have an equality filter on: " - << extraBefore(i) ); - - if ( e.isABSONObj() && e.Obj().firstElement().getGtLtOp( -1 ) != -1 ) - return Status( ErrorCodes::BadValue, - str::stream() - << "need have an equality filter on: " - << extraBefore(i) ); - - b.append( e ); - } - *out = b.obj(); - return Status::OK(); - } + double& score = (*docScores)[term]; + score += (weight * data.freq * coeff * adjustment); + verify(score <= MAX_WEIGHT); + } +} - namespace { - void _addFTSStuff( BSONObjBuilder* b ) { - b->append( "_fts", INDEX_NAME ); - b->append( "_ftsx", 1 ); - } +Status FTSSpec::getIndexPrefix(const BSONObj& query, BSONObj* out) const { + if (numExtraBefore() == 0) { + *out = BSONObj(); + return Status::OK(); + } - void verifyFieldNameNotReserved( StringData s ) { - uassert( 17289, - "text index with reserved fields _fts/_ftsx not allowed", - s != "_fts" && s != "_ftsx" ); - } - } + BSONObjBuilder b; + for (unsigned i = 0; i < numExtraBefore(); i++) { + BSONElement e = query.getFieldDotted(extraBefore(i)); + if (e.eoo()) + return Status(ErrorCodes::BadValue, + str::stream() << "need have an equality filter on: " << extraBefore(i)); - BSONObj FTSSpec::fixSpec( const BSONObj& spec ) { - if ( spec["textIndexVersion"].numberInt() == TEXT_INDEX_VERSION_1 ) { - return _fixSpecV1( spec ); - } + if (e.isABSONObj() && e.Obj().firstElement().getGtLtOp(-1) != -1) + return Status(ErrorCodes::BadValue, + str::stream() << "need have an equality filter on: " << extraBefore(i)); - map<string,int> m; - - BSONObj keyPattern; - { - BSONObjBuilder b; - - // Populate m and keyPattern. 
- { - bool addedFtsStuff = false; - BSONObjIterator i( spec["key"].Obj() ); - while ( i.more() ) { - BSONElement e = i.next(); - if ( str::equals( e.fieldName(), "_fts" ) ) { - uassert( 17271, - "expecting _fts:\"text\"", - INDEX_NAME == e.valuestrsafe() ); - addedFtsStuff = true; - b.append( e ); - } - else if ( str::equals( e.fieldName(), "_ftsx" ) ) { - uassert( 17272, "expecting _ftsx:1", e.numberInt() == 1 ); - b.append( e ); - } - else if ( e.type() == String && INDEX_NAME == e.valuestr() ) { - - if ( !addedFtsStuff ) { - _addFTSStuff( &b ); - addedFtsStuff = true; - } - - m[e.fieldName()] = 1; - } - else { - uassert( 17273, - "expected value 1 or -1 for non-text key in compound index", - e.numberInt() == 1 || e.numberInt() == -1 ); - b.append( e ); - } - } - verify( addedFtsStuff ); - } - keyPattern = b.obj(); - - // Verify that index key is in the correct format: extraBefore fields, then text - // fields, then extraAfter fields. - { - BSONObjIterator i( spec["key"].Obj() ); - verify( i.more() ); - BSONElement e = i.next(); - - // extraBefore fields - while ( String != e.type() ) { - verifyFieldNameNotReserved( e.fieldNameStringData() ); - verify( i.more() ); - e = i.next(); - } + b.append(e); + } + *out = b.obj(); + return Status::OK(); +} - // text fields - bool alreadyFixed = str::equals( e.fieldName(), "_fts" ); - if ( alreadyFixed ) { - uassert( 17288, "expected _ftsx after _fts", i.more() ); - e = i.next(); - uassert( 17274, - "expected _ftsx after _fts", - str::equals( e.fieldName(), "_ftsx" ) ); - e = i.next(); - } - else { - do { - verifyFieldNameNotReserved( e.fieldNameStringData() ); - e = i.next(); - } while ( !e.eoo() && e.type() == String ); - } +namespace { +void _addFTSStuff(BSONObjBuilder* b) { + b->append("_fts", INDEX_NAME); + b->append("_ftsx", 1); +} - // extraAfterFields - while ( !e.eoo() ) { - uassert( 17389, - "'text' fields in index must all be adjacent", - e.type() != String ); - verifyFieldNameNotReserved( e.fieldNameStringData() 
); - e = i.next(); - } - } +void verifyFieldNameNotReserved(StringData s) { + uassert(17289, + "text index with reserved fields _fts/_ftsx not allowed", + s != "_fts" && s != "_ftsx"); +} +} - } +BSONObj FTSSpec::fixSpec(const BSONObj& spec) { + if (spec["textIndexVersion"].numberInt() == TEXT_INDEX_VERSION_1) { + return _fixSpecV1(spec); + } - if ( spec["weights"].type() == Object ) { - BSONObjIterator i( spec["weights"].Obj() ); - while ( i.more() ) { - BSONElement e = i.next(); - uassert( 17283, - "weight for text index needs numeric type", - e.isNumber() ); - m[e.fieldName()] = e.numberInt(); - } - } - else if ( spec["weights"].str() == WILDCARD ) { - m[WILDCARD] = 1; - } - else if ( !spec["weights"].eoo() ) { - uasserted( 17284, "text index option 'weights' must be an object" ); - } + map<string, int> m; - BSONObj weights; - { - BSONObjBuilder b; - for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) { - uassert( 16674, "score for word too high", - i->second > 0 && i->second < MAX_WORD_WEIGHT ); - - // Verify weight refers to a valid field. - if ( i->first != "$**" ) { - FieldRef keyField( i->first ); - uassert( 17294, - "weight cannot be on an empty field", - keyField.numParts() != 0 ); - for ( size_t partNum = 0; partNum < keyField.numParts(); partNum++ ) { - StringData part = keyField.getPart(partNum); - uassert( 17291, - "weight cannot have empty path component", - !part.empty() ); - uassert( 17292, - "weight cannot have path component with $ prefix", - !part.startsWith( "$" ) ); - } + BSONObj keyPattern; + { + BSONObjBuilder b; + + // Populate m and keyPattern. 
+ { + bool addedFtsStuff = false; + BSONObjIterator i(spec["key"].Obj()); + while (i.more()) { + BSONElement e = i.next(); + if (str::equals(e.fieldName(), "_fts")) { + uassert(17271, "expecting _fts:\"text\"", INDEX_NAME == e.valuestrsafe()); + addedFtsStuff = true; + b.append(e); + } else if (str::equals(e.fieldName(), "_ftsx")) { + uassert(17272, "expecting _ftsx:1", e.numberInt() == 1); + b.append(e); + } else if (e.type() == String && INDEX_NAME == e.valuestr()) { + if (!addedFtsStuff) { + _addFTSStuff(&b); + addedFtsStuff = true; } - b.append( i->first, i->second ); + m[e.fieldName()] = 1; + } else { + uassert(17273, + "expected value 1 or -1 for non-text key in compound index", + e.numberInt() == 1 || e.numberInt() == -1); + b.append(e); } - weights = b.obj(); - } - - BSONElement default_language_elt = spec["default_language"]; - string default_language( default_language_elt.str() ); - if ( default_language_elt.eoo() ) { - default_language = moduleDefaultLanguage; } - else { - uassert( 17263, - "default_language needs a string type", - default_language_elt.type() == String ); + verify(addedFtsStuff); + } + keyPattern = b.obj(); + + // Verify that index key is in the correct format: extraBefore fields, then text + // fields, then extraAfter fields. 
+ { + BSONObjIterator i(spec["key"].Obj()); + verify(i.more()); + BSONElement e = i.next(); + + // extraBefore fields + while (String != e.type()) { + verifyFieldNameNotReserved(e.fieldNameStringData()); + verify(i.more()); + e = i.next(); } - uassert( 17264, - "default_language is not valid", - FTSLanguage::make( default_language, - TEXT_INDEX_VERSION_2 ).getStatus().isOK() ); - - BSONElement language_override_elt = spec["language_override"]; - string language_override( language_override_elt.str() ); - if ( language_override_elt.eoo() ) { - language_override = "language"; + + // text fields + bool alreadyFixed = str::equals(e.fieldName(), "_fts"); + if (alreadyFixed) { + uassert(17288, "expected _ftsx after _fts", i.more()); + e = i.next(); + uassert(17274, "expected _ftsx after _fts", str::equals(e.fieldName(), "_ftsx")); + e = i.next(); + } else { + do { + verifyFieldNameNotReserved(e.fieldNameStringData()); + e = i.next(); + } while (!e.eoo() && e.type() == String); } - else { - uassert( 17136, - "language_override is not valid", - language_override_elt.type() == String - && validateOverride( language_override ) ); + + // extraAfterFields + while (!e.eoo()) { + uassert(17389, "'text' fields in index must all be adjacent", e.type() != String); + verifyFieldNameNotReserved(e.fieldNameStringData()); + e = i.next(); } + } + } - int version = -1; - int textIndexVersion = TEXT_INDEX_VERSION_2; + if (spec["weights"].type() == Object) { + BSONObjIterator i(spec["weights"].Obj()); + while (i.more()) { + BSONElement e = i.next(); + uassert(17283, "weight for text index needs numeric type", e.isNumber()); + m[e.fieldName()] = e.numberInt(); + } + } else if (spec["weights"].str() == WILDCARD) { + m[WILDCARD] = 1; + } else if (!spec["weights"].eoo()) { + uasserted(17284, "text index option 'weights' must be an object"); + } - BSONObjBuilder b; - BSONObjIterator i( spec ); - while ( i.more() ) { - BSONElement e = i.next(); - if ( str::equals( e.fieldName(), "key" ) ) { - 
b.append( "key", keyPattern ); - } - else if ( str::equals( e.fieldName(), "weights" ) ) { - b.append( "weights", weights ); - weights = BSONObj(); - } - else if ( str::equals( e.fieldName(), "default_language" ) ) { - b.append( "default_language", default_language); - default_language = ""; - } - else if ( str::equals( e.fieldName(), "language_override" ) ) { - b.append( "language_override", language_override); - language_override = ""; - } - else if ( str::equals( e.fieldName(), "v" ) ) { - version = e.numberInt(); - } - else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) { - uassert( 17293, - "text index option 'textIndexVersion' must be a number", - e.isNumber() ); - textIndexVersion = e.numberInt(); - uassert( 16730, - str::stream() << "bad textIndexVersion: " << textIndexVersion, - textIndexVersion == TEXT_INDEX_VERSION_2 ); - } - else { - b.append( e ); + BSONObj weights; + { + BSONObjBuilder b; + for (map<string, int>::iterator i = m.begin(); i != m.end(); ++i) { + uassert(16674, "score for word too high", i->second > 0 && i->second < MAX_WORD_WEIGHT); + + // Verify weight refers to a valid field. 
+ if (i->first != "$**") { + FieldRef keyField(i->first); + uassert(17294, "weight cannot be on an empty field", keyField.numParts() != 0); + for (size_t partNum = 0; partNum < keyField.numParts(); partNum++) { + StringData part = keyField.getPart(partNum); + uassert(17291, "weight cannot have empty path component", !part.empty()); + uassert(17292, + "weight cannot have path component with $ prefix", + !part.startsWith("$")); } } - if ( !weights.isEmpty() ) { - b.append( "weights", weights ); - } - if ( !default_language.empty() ) { - b.append( "default_language", default_language); - } - if ( !language_override.empty() ) { - b.append( "language_override", language_override); - } - if ( version >= 0 ) { - b.append( "v", version ); - } - b.append( "textIndexVersion", textIndexVersion ); + b.append(i->first, i->second); + } + weights = b.obj(); + } + + BSONElement default_language_elt = spec["default_language"]; + string default_language(default_language_elt.str()); + if (default_language_elt.eoo()) { + default_language = moduleDefaultLanguage; + } else { + uassert( + 17263, "default_language needs a string type", default_language_elt.type() == String); + } + uassert(17264, + "default_language is not valid", + FTSLanguage::make(default_language, TEXT_INDEX_VERSION_2).getStatus().isOK()); + + BSONElement language_override_elt = spec["language_override"]; + string language_override(language_override_elt.str()); + if (language_override_elt.eoo()) { + language_override = "language"; + } else { + uassert(17136, + "language_override is not valid", + language_override_elt.type() == String && validateOverride(language_override)); + } - return b.obj(); + int version = -1; + int textIndexVersion = TEXT_INDEX_VERSION_2; + + BSONObjBuilder b; + BSONObjIterator i(spec); + while (i.more()) { + BSONElement e = i.next(); + if (str::equals(e.fieldName(), "key")) { + b.append("key", keyPattern); + } else if (str::equals(e.fieldName(), "weights")) { + b.append("weights", weights); + 
weights = BSONObj(); + } else if (str::equals(e.fieldName(), "default_language")) { + b.append("default_language", default_language); + default_language = ""; + } else if (str::equals(e.fieldName(), "language_override")) { + b.append("language_override", language_override); + language_override = ""; + } else if (str::equals(e.fieldName(), "v")) { + version = e.numberInt(); + } else if (str::equals(e.fieldName(), "textIndexVersion")) { + uassert(17293, "text index option 'textIndexVersion' must be a number", e.isNumber()); + textIndexVersion = e.numberInt(); + uassert(16730, + str::stream() << "bad textIndexVersion: " << textIndexVersion, + textIndexVersion == TEXT_INDEX_VERSION_2); + } else { + b.append(e); } + } + if (!weights.isEmpty()) { + b.append("weights", weights); + } + if (!default_language.empty()) { + b.append("default_language", default_language); } + if (!language_override.empty()) { + b.append("language_override", language_override); + } + if (version >= 0) { + b.append("v", version); + } + b.append("textIndexVersion", textIndexVersion); + + return b.obj(); +} +} } diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h index 0f17d825dcc..d5cc0b46472 100644 --- a/src/mongo/db/fts/fts_spec.h +++ b/src/mongo/db/fts/fts_spec.h @@ -43,136 +43,146 @@ namespace mongo { - namespace fts { - - extern const double MAX_WEIGHT; - extern const double MAX_WORD_WEIGHT; - extern const double DEFAULT_WEIGHT; - - typedef std::map<std::string,double> Weights; // TODO cool map - typedef unordered_map<std::string,double> TermFrequencyMap; - - struct ScoreHelperStruct { - ScoreHelperStruct() - : freq(0), count(0), exp(0){ - } - double freq; - double count; - double exp; - }; - typedef unordered_map<std::string,ScoreHelperStruct> ScoreHelperMap; - - class FTSSpec { - - struct Tools { - Tools( const FTSLanguage& _language, - const Stemmer* _stemmer, - const StopWords* _stopwords ) - : language( _language ) - , stemmer( _stemmer ) - , stopwords( _stopwords ) {} 
- - const FTSLanguage& language; - const Stemmer* stemmer; - const StopWords* stopwords; - }; - - public: - FTSSpec( const BSONObj& indexInfo ); - - bool wildcard() const { return _wildcard; } - const FTSLanguage& defaultLanguage() const { return *_defaultLanguage; } - const std::string& languageOverrideField() const { return _languageOverrideField; } - - size_t numExtraBefore() const { return _extraBefore.size(); } - const std::string& extraBefore( unsigned i ) const { return _extraBefore[i]; } - - size_t numExtraAfter() const { return _extraAfter.size(); } - const std::string& extraAfter( unsigned i ) const { return _extraAfter[i]; } - - /** - * Calculates term/score pairs for a BSONObj as applied to this spec. - * @arg obj document to traverse; can be a subdocument or array - * @arg term_freqs output parameter to store (term,score) results - */ - void scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const; - - /** - * given a query, pulls out the pieces (in order) that go in the index first - */ - Status getIndexPrefix( const BSONObj& filter, BSONObj* out ) const; - - const Weights& weights() const { return _weights; } - static BSONObj fixSpec( const BSONObj& spec ); - - /** - * Returns text index version. - */ - TextIndexVersion getTextIndexVersion() const { return _textIndexVersion; } - - private: - // - // Helper methods. Invoked for TEXT_INDEX_VERSION_2 spec objects only. - // - - /** - * Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses - * 'raw' using 'tools', and weights term scores based on 'weight'. - */ - void _scoreStringV2( FTSTokenizer* tokenizer, - StringData raw, - TermFrequencyMap* term_freqs, - double weight ) const; - - public: - /** - * Get the language override for the given BSON doc. If no language override is - * specified, returns currentLanguage. 
- */ - const FTSLanguage* _getLanguageToUseV2( const BSONObj& userDoc, - const FTSLanguage* currentLanguage ) const; - - private: - // - // Deprecated helper methods. Invoked for TEXT_INDEX_VERSION_1 spec objects only. - // - - void _scoreStringV1( const Tools& tools, - StringData raw, - TermFrequencyMap* docScores, - double weight ) const; - - bool _weightV1( StringData field, double* out ) const; - - void _scoreRecurseV1( const Tools& tools, - const BSONObj& obj, - TermFrequencyMap* term_freqs ) const; - - void _scoreDocumentV1( const BSONObj& obj, TermFrequencyMap* term_freqs ) const; - - const FTSLanguage& _getLanguageToUseV1( const BSONObj& userDoc ) const; - - static BSONObj _fixSpecV1( const BSONObj& spec ); - - // - // Instance variables. - // - - TextIndexVersion _textIndexVersion; +namespace fts { + +extern const double MAX_WEIGHT; +extern const double MAX_WORD_WEIGHT; +extern const double DEFAULT_WEIGHT; + +typedef std::map<std::string, double> Weights; // TODO cool map +typedef unordered_map<std::string, double> TermFrequencyMap; + +struct ScoreHelperStruct { + ScoreHelperStruct() : freq(0), count(0), exp(0) {} + double freq; + double count; + double exp; +}; +typedef unordered_map<std::string, ScoreHelperStruct> ScoreHelperMap; + +class FTSSpec { + struct Tools { + Tools(const FTSLanguage& _language, const Stemmer* _stemmer, const StopWords* _stopwords) + : language(_language), stemmer(_stemmer), stopwords(_stopwords) {} + + const FTSLanguage& language; + const Stemmer* stemmer; + const StopWords* stopwords; + }; + +public: + FTSSpec(const BSONObj& indexInfo); + + bool wildcard() const { + return _wildcard; + } + const FTSLanguage& defaultLanguage() const { + return *_defaultLanguage; + } + const std::string& languageOverrideField() const { + return _languageOverrideField; + } + + size_t numExtraBefore() const { + return _extraBefore.size(); + } + const std::string& extraBefore(unsigned i) const { + return _extraBefore[i]; + } + + size_t 
numExtraAfter() const { + return _extraAfter.size(); + } + const std::string& extraAfter(unsigned i) const { + return _extraAfter[i]; + } - const FTSLanguage* _defaultLanguage; - std::string _languageOverrideField; - bool _wildcard; + /** + * Calculates term/score pairs for a BSONObj as applied to this spec. + * @arg obj document to traverse; can be a subdocument or array + * @arg term_freqs output parameter to store (term,score) results + */ + void scoreDocument(const BSONObj& obj, TermFrequencyMap* term_freqs) const; - // mapping : fieldname -> weight - Weights _weights; - - // Prefix compound key - used to partition search index - std::vector<std::string> _extraBefore; + /** + * given a query, pulls out the pieces (in order) that go in the index first + */ + Status getIndexPrefix(const BSONObj& filter, BSONObj* out) const; - // Suffix compound key - used for covering index behavior - std::vector<std::string> _extraAfter; - }; + const Weights& weights() const { + return _weights; + } + static BSONObj fixSpec(const BSONObj& spec); + /** + * Returns text index version. + */ + TextIndexVersion getTextIndexVersion() const { + return _textIndexVersion; } + +private: + // + // Helper methods. Invoked for TEXT_INDEX_VERSION_2 spec objects only. + // + + /** + * Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses + * 'raw' using 'tools', and weights term scores based on 'weight'. + */ + void _scoreStringV2(FTSTokenizer* tokenizer, + StringData raw, + TermFrequencyMap* term_freqs, + double weight) const; + +public: + /** + * Get the language override for the given BSON doc. If no language override is + * specified, returns currentLanguage. + */ + const FTSLanguage* _getLanguageToUseV2(const BSONObj& userDoc, + const FTSLanguage* currentLanguage) const; + +private: + // + // Deprecated helper methods. Invoked for TEXT_INDEX_VERSION_1 spec objects only. 
+ // + + void _scoreStringV1(const Tools& tools, + StringData raw, + TermFrequencyMap* docScores, + double weight) const; + + bool _weightV1(StringData field, double* out) const; + + void _scoreRecurseV1(const Tools& tools, + const BSONObj& obj, + TermFrequencyMap* term_freqs) const; + + void _scoreDocumentV1(const BSONObj& obj, TermFrequencyMap* term_freqs) const; + + const FTSLanguage& _getLanguageToUseV1(const BSONObj& userDoc) const; + + static BSONObj _fixSpecV1(const BSONObj& spec); + + // + // Instance variables. + // + + TextIndexVersion _textIndexVersion; + + const FTSLanguage* _defaultLanguage; + std::string _languageOverrideField; + bool _wildcard; + + // mapping : fieldname -> weight + Weights _weights; + + // Prefix compound key - used to partition search index + std::vector<std::string> _extraBefore; + + // Suffix compound key - used for covering index behavior + std::vector<std::string> _extraAfter; +}; +} } diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp index a2dc1dc2489..4a161c8614a 100644 --- a/src/mongo/db/fts/fts_spec_legacy.cpp +++ b/src/mongo/db/fts/fts_spec_legacy.cpp @@ -33,290 +33,268 @@ namespace mongo { - namespace fts { +namespace fts { - // - // This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1 - // text indexes. - // +// +// This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1 +// text indexes. 
+// - using std::map; - using std::string; - using namespace mongoutils; +using std::map; +using std::string; +using namespace mongoutils; - namespace { - void _addFTSStuff( BSONObjBuilder* b ) { - b->append( "_fts", INDEX_NAME ); - b->append( "_ftsx", 1 ); - } - } +namespace { +void _addFTSStuff(BSONObjBuilder* b) { + b->append("_fts", INDEX_NAME); + b->append("_ftsx", 1); +} +} - const FTSLanguage& FTSSpec::_getLanguageToUseV1( const BSONObj& userDoc ) const { - BSONElement e = userDoc[_languageOverrideField]; - if ( e.type() == String ) { - const char * x = e.valuestrsafe(); - if ( strlen( x ) > 0 ) { - StatusWithFTSLanguage swl = FTSLanguage::make( x, TEXT_INDEX_VERSION_1 ); - dassert( swl.isOK() ); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail. - return *swl.getValue(); - } - } - return *_defaultLanguage; +const FTSLanguage& FTSSpec::_getLanguageToUseV1(const BSONObj& userDoc) const { + BSONElement e = userDoc[_languageOverrideField]; + if (e.type() == String) { + const char* x = e.valuestrsafe(); + if (strlen(x) > 0) { + StatusWithFTSLanguage swl = FTSLanguage::make(x, TEXT_INDEX_VERSION_1); + dassert(swl.isOK()); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail. 
+ return *swl.getValue(); } + } + return *_defaultLanguage; +} - void FTSSpec::_scoreStringV1( const Tools& tools, - StringData raw, - TermFrequencyMap* docScores, - double weight ) const { - - ScoreHelperMap terms; +void FTSSpec::_scoreStringV1(const Tools& tools, + StringData raw, + TermFrequencyMap* docScores, + double weight) const { + ScoreHelperMap terms; - unsigned numTokens = 0; + unsigned numTokens = 0; - Tokenizer i( &tools.language, raw ); - while ( i.more() ) { - Token t = i.next(); - if ( t.type != Token::TEXT ) - continue; + Tokenizer i(&tools.language, raw); + while (i.more()) { + Token t = i.next(); + if (t.type != Token::TEXT) + continue; - string term = tolowerString( t.data ); - if ( tools.stopwords->isStopWord( term ) ) - continue; - term = tools.stemmer->stem( term ); + string term = tolowerString(t.data); + if (tools.stopwords->isStopWord(term)) + continue; + term = tools.stemmer->stem(term); - ScoreHelperStruct& data = terms[term]; + ScoreHelperStruct& data = terms[term]; - if ( data.exp ) - data.exp *= 2; - else - data.exp = 1; - data.count += 1; - data.freq += ( 1 / data.exp ); + if (data.exp) + data.exp *= 2; + else + data.exp = 1; + data.count += 1; + data.freq += (1 / data.exp); - numTokens++; - } + numTokens++; + } - for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) { + for (ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i) { + const string& term = i->first; + const ScoreHelperStruct& data = i->second; - const string& term = i->first; - const ScoreHelperStruct& data = i->second; + // in order to adjust weights as a function of term count as it + // relates to total field length. ie. is this the only word or + // a frequently occuring term? or does it only show up once in + // a long block of text? - // in order to adjust weights as a function of term count as it - // relates to total field length. ie. is this the only word or - // a frequently occuring term? 
or does it only show up once in - // a long block of text? + double coeff = (0.5 * data.count / numTokens) + 0.5; - double coeff = ( 0.5 * data.count / numTokens ) + 0.5; + // if term is identical to the raw form of the + // field (untokenized) give it a small boost. + double adjustment = 1; + if (raw.size() == term.length() && raw.equalCaseInsensitive(term)) + adjustment += 0.1; - // if term is identical to the raw form of the - // field (untokenized) give it a small boost. - double adjustment = 1; - if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) ) - adjustment += 0.1; + double& score = (*docScores)[term]; + score += (weight * data.freq * coeff * adjustment); + verify(score <= MAX_WEIGHT); + } +} - double& score = (*docScores)[term]; - score += ( weight * data.freq * coeff * adjustment ); - verify( score <= MAX_WEIGHT ); - } - } +bool FTSSpec::_weightV1(StringData field, double* out) const { + Weights::const_iterator i = _weights.find(field.toString()); + if (i == _weights.end()) + return false; + *out = i->second; + return true; +} - bool FTSSpec::_weightV1( StringData field, double* out ) const { - Weights::const_iterator i = _weights.find( field.toString() ); - if ( i == _weights.end() ) - return false; - *out = i->second; - return true; +/* + * Recurses over all fields of an obj (document in collection) + * and fills term,score map term_freqs + * @param tokenizer, tokenizer to tokenize a string into terms + * @param obj, object being parsed + * term_freqs, map <term,score> to be filled up + */ +void FTSSpec::_scoreRecurseV1(const Tools& tools, + const BSONObj& obj, + TermFrequencyMap* term_freqs) const { + BSONObjIterator j(obj); + while (j.more()) { + BSONElement x = j.next(); + + if (languageOverrideField() == x.fieldName()) + continue; + + if (x.type() == String) { + double w = 1; + _weightV1(x.fieldName(), &w); + _scoreStringV1(tools, x.valuestr(), term_freqs, w); + } else if (x.isABSONObj()) { + _scoreRecurseV1(tools, x.Obj(), 
term_freqs); } + } +} - /* - * Recurses over all fields of an obj (document in collection) - * and fills term,score map term_freqs - * @param tokenizer, tokenizer to tokenize a string into terms - * @param obj, object being parsed - * term_freqs, map <term,score> to be filled up - */ - void FTSSpec::_scoreRecurseV1( const Tools& tools, - const BSONObj& obj, - TermFrequencyMap* term_freqs ) const { - BSONObjIterator j( obj ); - while ( j.more() ) { - BSONElement x = j.next(); +void FTSSpec::_scoreDocumentV1(const BSONObj& obj, TermFrequencyMap* term_freqs) const { + const FTSLanguage& language = _getLanguageToUseV1(obj); - if ( languageOverrideField() == x.fieldName() ) - continue; + Stemmer stemmer(&language); + Tools tools(language, &stemmer, StopWords::getStopWords(&language)); - if (x.type() == String) { - double w = 1; - _weightV1( x.fieldName(), &w ); - _scoreStringV1(tools, x.valuestr(), term_freqs, w); - } - else if ( x.isABSONObj() ) { - _scoreRecurseV1( tools, x.Obj(), term_freqs); - } + if (wildcard()) { + // if * is specified for weight, we can recurse over all fields. 
+ _scoreRecurseV1(tools, obj, term_freqs); + return; + } + // otherwise, we need to remember the different weights for each field + // and act accordingly (in other words, call _score) + for (Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++) { + const char* leftOverName = i->first.c_str(); + // name of field + BSONElement e = obj.getFieldDottedOrArray(leftOverName); + // weight associated to name of field + double weight = i->second; + + if (e.eoo()) { + // do nothing + } else if (e.type() == Array) { + BSONObjIterator j(e.Obj()); + while (j.more()) { + BSONElement x = j.next(); + if (leftOverName[0] && x.isABSONObj()) + x = x.Obj().getFieldDotted(leftOverName); + if (x.type() == String) + _scoreStringV1(tools, x.valuestr(), term_freqs, weight); } + } else if (e.type() == String) { + _scoreStringV1(tools, e.valuestr(), term_freqs, weight); } + } +} - void FTSSpec::_scoreDocumentV1( const BSONObj& obj, - TermFrequencyMap* term_freqs ) const { - - const FTSLanguage& language = _getLanguageToUseV1( obj ); - - Stemmer stemmer(&language); - Tools tools(language, &stemmer, StopWords::getStopWords( &language )); - - if ( wildcard() ) { - // if * is specified for weight, we can recurse over all fields. 
- _scoreRecurseV1(tools, obj, term_freqs); - return; - } - - // otherwise, we need to remember the different weights for each field - // and act accordingly (in other words, call _score) - for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++ ) { - const char * leftOverName = i->first.c_str(); - // name of field - BSONElement e = obj.getFieldDottedOrArray(leftOverName); - // weight associated to name of field - double weight = i->second; - - if ( e.eoo() ) { - // do nothing - } - else if ( e.type() == Array ) { - BSONObjIterator j( e.Obj() ); - while ( j.more() ) { - BSONElement x = j.next(); - if ( leftOverName[0] && x.isABSONObj() ) - x = x.Obj().getFieldDotted( leftOverName ); - if ( x.type() == String ) - _scoreStringV1( tools, x.valuestr(), term_freqs, weight ); - } - } - else if ( e.type() == String ) { - _scoreStringV1( tools, e.valuestr(), term_freqs, weight ); +BSONObj FTSSpec::_fixSpecV1(const BSONObj& spec) { + map<string, int> m; + + BSONObj keyPattern; + { + BSONObjBuilder b; + bool addedFtsStuff = false; + + BSONObjIterator i(spec["key"].Obj()); + while (i.more()) { + BSONElement e = i.next(); + if (str::equals(e.fieldName(), "_fts") || str::equals(e.fieldName(), "_ftsx")) { + addedFtsStuff = true; + b.append(e); + } else if (e.type() == String && + (str::equals("fts", e.valuestr()) || str::equals("text", e.valuestr()))) { + if (!addedFtsStuff) { + _addFTSStuff(&b); + addedFtsStuff = true; } + m[e.fieldName()] = 1; + } else { + b.append(e); } } - BSONObj FTSSpec::_fixSpecV1( const BSONObj& spec ) { - map<string,int> m; - - BSONObj keyPattern; - { - BSONObjBuilder b; - bool addedFtsStuff = false; - - BSONObjIterator i( spec["key"].Obj() ); - while ( i.more() ) { - BSONElement e = i.next(); - if ( str::equals( e.fieldName(), "_fts" ) || - str::equals( e.fieldName(), "_ftsx" ) ) { - addedFtsStuff = true; - b.append( e ); - } - else if ( e.type() == String && - ( str::equals( "fts", e.valuestr() ) || - str::equals( "text", 
e.valuestr() ) ) ) { - - if ( !addedFtsStuff ) { - _addFTSStuff( &b ); - addedFtsStuff = true; - } - - m[e.fieldName()] = 1; - } - else { - b.append( e ); - } - } - - if ( !addedFtsStuff ) - _addFTSStuff( &b ); - - keyPattern = b.obj(); - } - - if ( spec["weights"].isABSONObj() ) { - BSONObjIterator i( spec["weights"].Obj() ); - while ( i.more() ) { - BSONElement e = i.next(); - m[e.fieldName()] = e.numberInt(); - } - } - else if ( spec["weights"].str() == WILDCARD ) { - m[WILDCARD] = 1; - } - - BSONObj weights; - { - BSONObjBuilder b; - for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) { - uassert( 17365, "score for word too high", - i->second > 0 && i->second < MAX_WORD_WEIGHT ); - b.append( i->first, i->second ); - } - weights = b.obj(); - } + if (!addedFtsStuff) + _addFTSStuff(&b); - string default_language(spec.getStringField("default_language")); - if ( default_language.empty() ) - default_language = "english"; + keyPattern = b.obj(); + } - string language_override(spec.getStringField("language_override")); - if ( language_override.empty() ) - language_override = "language"; + if (spec["weights"].isABSONObj()) { + BSONObjIterator i(spec["weights"].Obj()); + while (i.more()) { + BSONElement e = i.next(); + m[e.fieldName()] = e.numberInt(); + } + } else if (spec["weights"].str() == WILDCARD) { + m[WILDCARD] = 1; + } - int version = -1; - int textIndexVersion = 1; + BSONObj weights; + { + BSONObjBuilder b; + for (map<string, int>::iterator i = m.begin(); i != m.end(); ++i) { + uassert(17365, "score for word too high", i->second > 0 && i->second < MAX_WORD_WEIGHT); + b.append(i->first, i->second); + } + weights = b.obj(); + } - BSONObjBuilder b; - BSONObjIterator i( spec ); - while ( i.more() ) { - BSONElement e = i.next(); - if ( str::equals( e.fieldName(), "key" ) ) { - b.append( "key", keyPattern ); - } - else if ( str::equals( e.fieldName(), "weights" ) ) { - b.append( "weights", weights ); - weights = BSONObj(); - } - else if ( str::equals( 
e.fieldName(), "default_language" ) ) { - b.append( "default_language", default_language); - default_language = ""; - } - else if ( str::equals( e.fieldName(), "language_override" ) ) { - b.append( "language_override", language_override); - language_override = ""; - } - else if ( str::equals( e.fieldName(), "v" ) ) { - version = e.numberInt(); - } - else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) { - textIndexVersion = e.numberInt(); - uassert( 17366, - str::stream() << "bad textIndexVersion: " << textIndexVersion, - textIndexVersion == 1 ); - } - else { - b.append( e ); - } - } + string default_language(spec.getStringField("default_language")); + if (default_language.empty()) + default_language = "english"; + + string language_override(spec.getStringField("language_override")); + if (language_override.empty()) + language_override = "language"; + + int version = -1; + int textIndexVersion = 1; + + BSONObjBuilder b; + BSONObjIterator i(spec); + while (i.more()) { + BSONElement e = i.next(); + if (str::equals(e.fieldName(), "key")) { + b.append("key", keyPattern); + } else if (str::equals(e.fieldName(), "weights")) { + b.append("weights", weights); + weights = BSONObj(); + } else if (str::equals(e.fieldName(), "default_language")) { + b.append("default_language", default_language); + default_language = ""; + } else if (str::equals(e.fieldName(), "language_override")) { + b.append("language_override", language_override); + language_override = ""; + } else if (str::equals(e.fieldName(), "v")) { + version = e.numberInt(); + } else if (str::equals(e.fieldName(), "textIndexVersion")) { + textIndexVersion = e.numberInt(); + uassert(17366, + str::stream() << "bad textIndexVersion: " << textIndexVersion, + textIndexVersion == 1); + } else { + b.append(e); + } + } - if ( !weights.isEmpty() ) - b.append( "weights", weights ); - if ( !default_language.empty() ) - b.append( "default_language", default_language); - if ( !language_override.empty() ) - b.append( 
"language_override", language_override); + if (!weights.isEmpty()) + b.append("weights", weights); + if (!default_language.empty()) + b.append("default_language", default_language); + if (!language_override.empty()) + b.append("language_override", language_override); - if ( version >= 0 ) - b.append( "v", version ); + if (version >= 0) + b.append("v", version); - b.append( "textIndexVersion", textIndexVersion ); + b.append("textIndexVersion", textIndexVersion); - return b.obj(); - } - } + return b.obj(); +} +} } diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp index 832279eb18d..c9f628a2b28 100644 --- a/src/mongo/db/fts/fts_spec_test.cpp +++ b/src/mongo/db/fts/fts_spec_test.cpp @@ -36,541 +36,558 @@ namespace mongo { - using std::set; - using std::string; - - namespace fts { - - /** - * Assert that fixSpec() accepts the provided text index spec. - */ - void assertFixSuccess( const std::string& s ) { - BSONObj user = fromjson( s ); - - try { - // fixSpec() should not throw on a valid spec. - BSONObj fixed = FTSSpec::fixSpec( user ); - - // fixSpec() on an already-fixed spec shouldn't change it. - BSONObj fixed2 = FTSSpec::fixSpec( fixed ); - ASSERT_EQUALS( fixed, fixed2 ); - } - catch ( UserException& ) { - ASSERT( false ); - } - } - - /** - * Assert that fixSpec() rejects the provided text index spec. - */ - void assertFixFailure( const std::string& s ) { - BSONObj user = fromjson( s ); - - try { - // fixSpec() on an invalid spec should uassert. 
- BSONObj fixed = FTSSpec::fixSpec( user ); - } - catch ( UserException& ) { - return; - } - ASSERT( false ); - } - - TEST( FTSSpec, FixNormalKey1 ) { - assertFixSuccess("{key: {a: 'text'}}"); - assertFixSuccess("{key: {a: 'text', b: 'text'}}"); - assertFixSuccess("{key: {a: 'text', b: 'text', c: 'text'}}"); - - assertFixFailure("{key: {_fts: 'text'}}"); // not allowed to index reserved field - assertFixFailure("{key: {_ftsx: 'text'}}"); - } - - TEST( FTSSpec, FixCompoundKey1 ) { - assertFixSuccess("{key: {a: 'text', b: 1.0}}"); - assertFixSuccess("{key: {a: 'text', b: NumberInt(1)}}"); - assertFixSuccess("{key: {a: 'text', b: NumberLong(1)}}"); - assertFixSuccess("{key: {a: 'text', b: -1.0}}"); - assertFixSuccess("{key: {a: 'text', b: NumberInt(-1)}}"); - assertFixSuccess("{key: {a: 'text', b: NumberLong(-1)}}"); - assertFixSuccess("{key: {a: 1.0, b: 'text'}}"); - assertFixSuccess("{key: {a: NumberInt(1), b: 'text'}}"); - assertFixSuccess("{key: {a: NumberLong(1), b: 'text'}}"); - assertFixSuccess("{key: {a: -1, b: 'text'}}"); - assertFixSuccess("{key: {a: 1, b: 1, c: 'text'}}"); - assertFixSuccess("{key: {a: 1, b: -1, c: 'text'}}"); - assertFixSuccess("{key: {a: -1, b: 1, c: 'text'}}"); - assertFixSuccess("{key: {a: 1, b: 'text', c: 1}}"); - assertFixSuccess("{key: {a: 'text', b: 1, c: 1}}"); - assertFixSuccess("{key: {a: 'text', b: 1, c: -1}}"); - assertFixSuccess("{key: {a: 'text', b: 'text', c: 1}}"); - assertFixSuccess("{key: {a: 1, b: 'text', c: 'text'}}"); - - assertFixFailure("{key: {a: 'text', b: 0}}"); - assertFixFailure("{key: {a: 'text', b: '2d'}}"); // not allowed to mix special indexes - assertFixFailure("{key: {a: 'text', b: '1'}}"); - assertFixFailure("{key: {a: 'text', _fts: 1}}"); - assertFixFailure("{key: {a: 'text', _fts: 'text'}}"); - assertFixFailure("{key: {a: 'text', _ftsx: 1}}"); - assertFixFailure("{key: {a: 'text', _ftsx: 'text'}}"); - assertFixFailure("{key: {_fts: 1, a: 'text'}}"); - assertFixFailure("{key: {_fts: 'text', a: 
'text'}}"); - assertFixFailure("{key: {_ftsx: 1, a: 'text'}}"); - assertFixFailure("{key: {_ftsx: 'text', a: 'text'}}"); - assertFixFailure("{key: {a: 'text', b: 1, c: 'text'}}"); // 'text' must all be adjacent - assertFixFailure("{key: {a: 'text', b: 1, c: 'text', d: 1}}"); - assertFixFailure("{key: {a: 1, b: 'text', c: 1, d: 'text', e: 1}}"); - } - - TEST( FTSSpec, FixDefaultLanguage1 ) { - assertFixSuccess("{key: {a: 'text'}, default_language: 'english'}"); - assertFixSuccess("{key: {a: 'text'}, default_language: 'engLISH'}"); - assertFixSuccess("{key: {a: 'text'}, default_language: 'en'}"); - assertFixSuccess("{key: {a: 'text'}, default_language: 'eN'}"); - assertFixSuccess("{key: {a: 'text'}, default_language: 'spanish'}"); - assertFixSuccess("{key: {a: 'text'}, default_language: 'none'}"); - - assertFixFailure("{key: {a: 'text'}, default_language: 'engrish'}"); - assertFixFailure("{key: {a: 'text'}, default_language: ' english'}"); - assertFixFailure("{key: {a: 'text'}, default_language: ''}"); - } - - TEST( FTSSpec, FixWeights1 ) { - assertFixSuccess("{key: {a: 'text'}, weights: {}}"); - assertFixSuccess("{key: {a: 'text'}, weights: {a: 1.0}}"); - assertFixSuccess("{key: {a: 'text'}, weights: {a: NumberInt(1)}}"); - assertFixSuccess("{key: {a: 'text'}, weights: {a: NumberLong(1)}}"); - assertFixSuccess("{key: {a: 'text'}, weights: {a: 99999}}"); - assertFixSuccess("{key: {'$**': 'text'}, weights: {'a.b': 2}}"); - assertFixSuccess("{key: {'$**': 'text'}, weights: {a: 2, b: 2}}"); - assertFixSuccess("{key: {'$**': 'text'}, weights: {'$**': 2}}"); - - assertFixFailure("{key: {a: 'text'}, weights: 0}"); - assertFixFailure("{key: {a: 'text'}, weights: []}"); - assertFixFailure("{key: {a: 'text'}, weights: 'x'}"); - assertFixFailure("{key: {a: 'text'}, weights: {a: 0}}"); - assertFixFailure("{key: {a: 'text'}, weights: {a: -1}}"); - assertFixFailure("{key: {a: 'text'}, weights: {a: 100000}}"); // above max weight - assertFixFailure("{key: {a: 'text'}, weights: {a: 
'1'}}"); - assertFixFailure("{key: {a: 'text'}, weights: {'': 1}}"); // "invalid" path - assertFixFailure("{key: {a: 'text'}, weights: {'a.': 1}}"); - assertFixFailure("{key: {a: 'text'}, weights: {'.a': 1}}"); - assertFixFailure("{key: {a: 'text'}, weights: {'a..a': 1}}"); - assertFixFailure("{key: {a: 'text'}, weights: {$a: 1}}"); - assertFixFailure("{key: {a: 'text'}, weights: {'a.$a': 1}}"); - assertFixFailure("{key: {a: 'text'}, weights: {'a.$**': 1}}"); - } - - TEST( FTSSpec, FixLanguageOverride1 ) { - assertFixSuccess("{key: {a: 'text'}, language_override: 'foo'}"); - assertFixSuccess("{key: {a: 'text'}, language_override: 'foo$bar'}"); - - assertFixFailure("{key: {a: 'text'}, language_override: 'foo.bar'}"); // can't have '.' - assertFixFailure("{key: {a: 'text'}, language_override: ''}"); - assertFixFailure("{key: {a: 'text'}, language_override: '$foo'}"); - } - - TEST( FTSSpec, FixTextIndexVersion1 ) { - assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 1.0}}"); - assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(1)}}"); - assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(1)}}"); - assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 2.0}}"); - assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(2)}}"); - assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(2)}}"); - - assertFixFailure("{key: {a: 'text'}, textIndexVersion: 3}"); - assertFixFailure("{key: {a: 'text'}, textIndexVersion: '2'}"); - assertFixFailure("{key: {a: 'text'}, textIndexVersion: {}}"); - } - - TEST( FTSSpec, ScoreSingleField1 ) { - BSONObj user = BSON( "key" << BSON( "title" << "text" << - "text" << "text" ) << - "weights" << BSON( "title" << 10 ) ); - - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - TermFrequencyMap m; - spec.scoreDocument( BSON( "title" << "cat sat run" ), &m ); - ASSERT_EQUALS( 3U, m.size() ); - ASSERT_EQUALS( m["cat"], m["sat"] ); - ASSERT_EQUALS( m["cat"], m["run"] ); - ASSERT( m["cat"] > 0 ); - } - - 
TEST( FTSSpec, ScoreMultipleField1 ) { - BSONObj user = BSON( "key" << BSON( "title" << "text" << - "text" << "text" ) << - "weights" << BSON( "title" << 10 ) ); - - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - TermFrequencyMap m; - spec.scoreDocument( BSON( "title" << "cat sat run" << "text" << "cat book" ), &m ); - - ASSERT_EQUALS( 4U, m.size() ); - ASSERT_EQUALS( m["sat"], m["run"] ); - ASSERT( m["sat"] > 0 ); - - ASSERT( m["cat"] > m["sat"] ); - ASSERT( m["cat"] > m["book"] ); - ASSERT( m["book"] > 0 ); - ASSERT( m["book"] < m["sat"] ); - } - - TEST( FTSSpec, ScoreMultipleField2 ) { - // Test where one indexed field is a parent component of another indexed field. - BSONObj user = BSON( "key" << BSON( "a" << "text" << "a.b" << "text" ) ); - - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - TermFrequencyMap m; - spec.scoreDocument( BSON( "a" << BSON( "b" << "term" ) ), &m ); - ASSERT_EQUALS( 1U, m.size() ); - } - - TEST( FTSSpec, ScoreRepeatWord ) { - BSONObj user = BSON( "key" << BSON( "title" << "text" << - "text" << "text" ) << - "weights" << BSON( "title" << 10 ) ); - - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - TermFrequencyMap m; - spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ), &m ); - ASSERT_EQUALS( 3U, m.size() ); - ASSERT( m["cat"] > 0 ); - ASSERT( m["sat"] > m["cat"] ); - ASSERT( m["run"] > m["sat"] ); - - } - - TEST( FTSSpec, Extra1 ) { - BSONObj user = BSON( "key" << BSON( "data" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( user ) ); - ASSERT_EQUALS( 0U, spec.numExtraBefore() ); - ASSERT_EQUALS( 0U, spec.numExtraAfter() ); - } - - TEST( FTSSpec, Extra2 ) { - BSONObj user = BSON( "key" << BSON( "data" << "text" << "x" << 1 ) ); - BSONObj fixed = FTSSpec::fixSpec( user ); - FTSSpec spec( fixed ); - ASSERT_EQUALS( 0U, spec.numExtraBefore() ); - ASSERT_EQUALS( 1U, spec.numExtraAfter() ); - ASSERT_EQUALS( StringData("x"), spec.extraAfter(0) ); - - BSONObj fixed2 = FTSSpec::fixSpec( fixed ); - ASSERT_EQUALS( fixed, fixed2 ); - } - - 
TEST( FTSSpec, Extra3 ) { - BSONObj user = BSON( "key" << BSON( "x" << 1 << "data" << "text" ) ); - BSONObj fixed = FTSSpec::fixSpec( user ); - - ASSERT_EQUALS( BSON( "x" << 1 << - "_fts" << "text" << - "_ftsx" << 1 ), - fixed["key"].Obj() ); - ASSERT_EQUALS( BSON( "data" << 1 ), - fixed["weights"].Obj() ); - - BSONObj fixed2 = FTSSpec::fixSpec( fixed ); - ASSERT_EQUALS( fixed, fixed2 ); - - FTSSpec spec( fixed ); - ASSERT_EQUALS( 1U, spec.numExtraBefore() ); - ASSERT_EQUALS( StringData("x"), spec.extraBefore(0) ); - ASSERT_EQUALS( 0U, spec.numExtraAfter() ); - - BSONObj prefix; - - ASSERT( spec.getIndexPrefix( BSON( "x" << 2 ), &prefix ).isOK() ); - ASSERT_EQUALS( BSON( "x" << 2 ), prefix ); - - ASSERT( spec.getIndexPrefix( BSON( "x" << 3 << "y" << 4 ), &prefix ).isOK() ); - ASSERT_EQUALS( BSON( "x" << 3 ), prefix ); - - ASSERT( !spec.getIndexPrefix( BSON( "x" << BSON( "$gt" << 5 ) ), &prefix ).isOK() ); - ASSERT( !spec.getIndexPrefix( BSON( "y" << 4 ), &prefix ).isOK() ); - ASSERT( !spec.getIndexPrefix( BSONObj(), &prefix ).isOK() ); - } - - // Test for correct behavior when encountering nested arrays (both directly nested and - // indirectly nested). - - TEST( FTSSpec, NestedArraysPos1 ) { - BSONObj user = BSON( "key" << BSON( "a.b" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - // The following document matches {"a.b": {$type: 2}}, so "term" should be indexed. - BSONObj obj = fromjson("{a: [{b: ['term']}]}"); // indirectly nested arrays - TermFrequencyMap m; - spec.scoreDocument( obj, &m ); - ASSERT_EQUALS( 1U, m.size() ); - } - - TEST( FTSSpec, NestedArraysPos2 ) { - BSONObj user = BSON( "key" << BSON( "$**" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - // The wildcard spec implies a full recursive traversal, so "term" should be indexed. 
- BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays - TermFrequencyMap m; - spec.scoreDocument( obj, &m ); - ASSERT_EQUALS( 1U, m.size() ); - } - - TEST( FTSSpec, NestedArraysNeg1 ) { - BSONObj user = BSON( "key" << BSON( "a.b" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - // The following document does not match {"a.b": {$type: 2}}, so "term" should not be - // indexed. - BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays - TermFrequencyMap m; - spec.scoreDocument( obj, &m ); - ASSERT_EQUALS( 0U, m.size() ); - } - - // Multi-language test_1: test independent stemming per sub-document - TEST( FTSSpec, NestedLanguages_PerArrayItemStemming ) { - BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - - BSONObj obj = fromjson( - "{ a :" - " { b :" - " [ { c : \"walked\", language : \"english\" }," - " { c : \"camminato\", language : \"italian\" }," - " { c : \"ging\", language : \"german\" } ]" - " }" - " }" ); - - spec.scoreDocument( obj, &tfm ); - - set<string> hits; - hits.insert("walk"); - hits.insert("cammin"); - hits.insert("ging"); - - for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { - string term = i->first; - ASSERT_EQUALS( 1U, hits.count( term ) ); - } - - } - - // Multi-language test_2: test nested stemming per sub-document - TEST( FTSSpec, NestedLanguages_PerSubdocStemming ) { - BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - - BSONObj obj = fromjson( - "{ language : \"english\"," - " a :" - " { language : \"danish\"," - " b :" - " [ { c : \"foredrag\" }," - " { c : \"foredragsholder\" }," - " { c : \"lector\" } ]" - " }" - "}" ); - - spec.scoreDocument( obj, &tfm ); - - set<string> hits; - hits.insert("foredrag"); - hits.insert("foredragshold"); - hits.insert("lector"); - - for 
(TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { - string term = i->first; - ASSERT_EQUALS( 1U, hits.count( term ) ); - } - - } - - // Multi-language test_3: test nested arrays - TEST( FTSSpec, NestedLanguages_NestedArrays ) { - BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - - BSONObj obj = fromjson( - "{ language : \"english\"," - " a : [" - " { language : \"danish\"," - " b :" - " [ { c : [\"foredrag\"] }," - " { c : [\"foredragsholder\"] }," - " { c : [\"lector\"] } ]" - " } ]" - "}" ); - - spec.scoreDocument( obj, &tfm ); - - set<string> hits; - hits.insert("foredrag"); - hits.insert("foredragshold"); - hits.insert("lector"); - - for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { - string term = i->first; - ASSERT_EQUALS( 1U, hits.count( term ) ); - } - - } - - // Multi-language test_4: test pruning - TEST( FTSSpec, NestedLanguages_PathPruning ) { - BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - - BSONObj obj = fromjson( - "{ language : \"english\"," - " a : " - " { language : \"danish\"," - " bc : \"foo\"," - " b : { d: \"bar\" }," - " b :" - " [ { c : \"foredrag\" }," - " { c : \"foredragsholder\" }," - " { c : \"lector\" } ]" - " }" - "}" ); - - spec.scoreDocument( obj, &tfm ); - - set<string> hits; - hits.insert("foredrag"); - hits.insert("foredragshold"); - hits.insert("lector"); - - for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { - string term = i->first; - ASSERT_EQUALS( 1U, hits.count( term ) ); - } - - } - - // Multi-language test_5: test wildcard spec - TEST( FTSSpec, NestedLanguages_Wildcard ) { - BSONObj indexSpec = BSON( "key" << BSON( "$**" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - - BSONObj obj = fromjson( - "{ language : \"english\"," - " b : 
\"walking\"," - " c : { e: \"walked\" }," - " d : " - " { language : \"danish\"," - " e :" - " [ { f : \"foredrag\" }," - " { f : \"foredragsholder\" }," - " { f : \"lector\" } ]" - " }" - "}" ); - - spec.scoreDocument( obj, &tfm ); - - set<string> hits; - hits.insert("foredrag"); - hits.insert("foredragshold"); - hits.insert("lector"); - hits.insert("walk"); - - for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { - string term = i->first; - ASSERT_EQUALS( 1U, hits.count( term ) ); - } - - } - - // Multi-language test_6: test wildcard spec with override - TEST( FTSSpec, NestedLanguages_WildcardOverride ) { - BSONObj indexSpec = BSON( "key" << BSON( "$**" << "text" ) << - "weights" << BSON( "d.e.f" << 20 ) ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - - BSONObj obj = fromjson( - "{ language : \"english\"," - " b : \"walking\"," - " c : { e: \"walked\" }," - " d : " - " { language : \"danish\"," - " e :" - " [ { f : \"foredrag\" }," - " { f : \"foredragsholder\" }," - " { f : \"lector\" } ]" - " }" - "}" ); - - spec.scoreDocument( obj, &tfm ); - - set<string> hits; - hits.insert("foredrag"); - hits.insert("foredragshold"); - hits.insert("lector"); - hits.insert("walk"); - - for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { - string term = i->first; - ASSERT_EQUALS( 1U, hits.count( term ) ); - } - - } - - /** Test differences across textIndexVersion values in handling of nested arrays. */ - TEST( FTSSpec, TextIndexLegacyNestedArrays ) { - BSONObj obj = fromjson( "{a: [{b: ['hello']}]}" ); - - // textIndexVersion=1 FTSSpec objects do not index nested arrays. - { - BSONObj indexSpec = fromjson( "{key: {'a.b': 'text'}, textIndexVersion: 1}" ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - spec.scoreDocument( obj, &tfm ); - ASSERT_EQUALS( tfm.size(), 0U ); - } - - // textIndexVersion=2 FTSSpec objects do index nested arrays. 
- { - BSONObj indexSpec = fromjson( "{key: {'a.b': 'text'}, textIndexVersion: 2}" ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - spec.scoreDocument( obj, &tfm ); - ASSERT_EQUALS( tfm.size(), 1U ); - } - } - - /** Test differences across textIndexVersion values in handling of language annotations. */ - TEST( FTSSpec, TextIndexLegacyLanguageRecognition) { - BSONObj obj = fromjson( "{a: 'the', language: 'EN'}" ); - - // textIndexVersion=1 FTSSpec objects treat two-letter language annotations as "none" - // for purposes of stopword processing. - { - BSONObj indexSpec = fromjson( "{key: {'a': 'text'}, textIndexVersion: 1}" ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - spec.scoreDocument( obj, &tfm ); - ASSERT_EQUALS( tfm.size(), 1U ); // "the" not recognized as stopword - } - - // textIndexVersion=2 FTSSpec objects recognize two-letter codes. - { - BSONObj indexSpec = fromjson( "{key: {'a': 'text'}, textIndexVersion: 2}" ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - spec.scoreDocument( obj, &tfm ); - ASSERT_EQUALS( tfm.size(), 0U ); // "the" recognized as stopword - } - } +using std::set; +using std::string; +namespace fts { + +/** + * Assert that fixSpec() accepts the provided text index spec. + */ +void assertFixSuccess(const std::string& s) { + BSONObj user = fromjson(s); + + try { + // fixSpec() should not throw on a valid spec. + BSONObj fixed = FTSSpec::fixSpec(user); + + // fixSpec() on an already-fixed spec shouldn't change it. + BSONObj fixed2 = FTSSpec::fixSpec(fixed); + ASSERT_EQUALS(fixed, fixed2); + } catch (UserException&) { + ASSERT(false); + } +} + +/** + * Assert that fixSpec() rejects the provided text index spec. + */ +void assertFixFailure(const std::string& s) { + BSONObj user = fromjson(s); + + try { + // fixSpec() on an invalid spec should uassert. 
+ BSONObj fixed = FTSSpec::fixSpec(user); + } catch (UserException&) { + return; + } + ASSERT(false); +} + +TEST(FTSSpec, FixNormalKey1) { + assertFixSuccess("{key: {a: 'text'}}"); + assertFixSuccess("{key: {a: 'text', b: 'text'}}"); + assertFixSuccess("{key: {a: 'text', b: 'text', c: 'text'}}"); + + assertFixFailure("{key: {_fts: 'text'}}"); // not allowed to index reserved field + assertFixFailure("{key: {_ftsx: 'text'}}"); +} + +TEST(FTSSpec, FixCompoundKey1) { + assertFixSuccess("{key: {a: 'text', b: 1.0}}"); + assertFixSuccess("{key: {a: 'text', b: NumberInt(1)}}"); + assertFixSuccess("{key: {a: 'text', b: NumberLong(1)}}"); + assertFixSuccess("{key: {a: 'text', b: -1.0}}"); + assertFixSuccess("{key: {a: 'text', b: NumberInt(-1)}}"); + assertFixSuccess("{key: {a: 'text', b: NumberLong(-1)}}"); + assertFixSuccess("{key: {a: 1.0, b: 'text'}}"); + assertFixSuccess("{key: {a: NumberInt(1), b: 'text'}}"); + assertFixSuccess("{key: {a: NumberLong(1), b: 'text'}}"); + assertFixSuccess("{key: {a: -1, b: 'text'}}"); + assertFixSuccess("{key: {a: 1, b: 1, c: 'text'}}"); + assertFixSuccess("{key: {a: 1, b: -1, c: 'text'}}"); + assertFixSuccess("{key: {a: -1, b: 1, c: 'text'}}"); + assertFixSuccess("{key: {a: 1, b: 'text', c: 1}}"); + assertFixSuccess("{key: {a: 'text', b: 1, c: 1}}"); + assertFixSuccess("{key: {a: 'text', b: 1, c: -1}}"); + assertFixSuccess("{key: {a: 'text', b: 'text', c: 1}}"); + assertFixSuccess("{key: {a: 1, b: 'text', c: 'text'}}"); + + assertFixFailure("{key: {a: 'text', b: 0}}"); + assertFixFailure("{key: {a: 'text', b: '2d'}}"); // not allowed to mix special indexes + assertFixFailure("{key: {a: 'text', b: '1'}}"); + assertFixFailure("{key: {a: 'text', _fts: 1}}"); + assertFixFailure("{key: {a: 'text', _fts: 'text'}}"); + assertFixFailure("{key: {a: 'text', _ftsx: 1}}"); + assertFixFailure("{key: {a: 'text', _ftsx: 'text'}}"); + assertFixFailure("{key: {_fts: 1, a: 'text'}}"); + assertFixFailure("{key: {_fts: 'text', a: 'text'}}"); + 
assertFixFailure("{key: {_ftsx: 1, a: 'text'}}"); + assertFixFailure("{key: {_ftsx: 'text', a: 'text'}}"); + assertFixFailure("{key: {a: 'text', b: 1, c: 'text'}}"); // 'text' must all be adjacent + assertFixFailure("{key: {a: 'text', b: 1, c: 'text', d: 1}}"); + assertFixFailure("{key: {a: 1, b: 'text', c: 1, d: 'text', e: 1}}"); +} + +TEST(FTSSpec, FixDefaultLanguage1) { + assertFixSuccess("{key: {a: 'text'}, default_language: 'english'}"); + assertFixSuccess("{key: {a: 'text'}, default_language: 'engLISH'}"); + assertFixSuccess("{key: {a: 'text'}, default_language: 'en'}"); + assertFixSuccess("{key: {a: 'text'}, default_language: 'eN'}"); + assertFixSuccess("{key: {a: 'text'}, default_language: 'spanish'}"); + assertFixSuccess("{key: {a: 'text'}, default_language: 'none'}"); + + assertFixFailure("{key: {a: 'text'}, default_language: 'engrish'}"); + assertFixFailure("{key: {a: 'text'}, default_language: ' english'}"); + assertFixFailure("{key: {a: 'text'}, default_language: ''}"); +} + +TEST(FTSSpec, FixWeights1) { + assertFixSuccess("{key: {a: 'text'}, weights: {}}"); + assertFixSuccess("{key: {a: 'text'}, weights: {a: 1.0}}"); + assertFixSuccess("{key: {a: 'text'}, weights: {a: NumberInt(1)}}"); + assertFixSuccess("{key: {a: 'text'}, weights: {a: NumberLong(1)}}"); + assertFixSuccess("{key: {a: 'text'}, weights: {a: 99999}}"); + assertFixSuccess("{key: {'$**': 'text'}, weights: {'a.b': 2}}"); + assertFixSuccess("{key: {'$**': 'text'}, weights: {a: 2, b: 2}}"); + assertFixSuccess("{key: {'$**': 'text'}, weights: {'$**': 2}}"); + + assertFixFailure("{key: {a: 'text'}, weights: 0}"); + assertFixFailure("{key: {a: 'text'}, weights: []}"); + assertFixFailure("{key: {a: 'text'}, weights: 'x'}"); + assertFixFailure("{key: {a: 'text'}, weights: {a: 0}}"); + assertFixFailure("{key: {a: 'text'}, weights: {a: -1}}"); + assertFixFailure("{key: {a: 'text'}, weights: {a: 100000}}"); // above max weight + assertFixFailure("{key: {a: 'text'}, weights: {a: '1'}}"); + 
assertFixFailure("{key: {a: 'text'}, weights: {'': 1}}"); // "invalid" path + assertFixFailure("{key: {a: 'text'}, weights: {'a.': 1}}"); + assertFixFailure("{key: {a: 'text'}, weights: {'.a': 1}}"); + assertFixFailure("{key: {a: 'text'}, weights: {'a..a': 1}}"); + assertFixFailure("{key: {a: 'text'}, weights: {$a: 1}}"); + assertFixFailure("{key: {a: 'text'}, weights: {'a.$a': 1}}"); + assertFixFailure("{key: {a: 'text'}, weights: {'a.$**': 1}}"); +} + +TEST(FTSSpec, FixLanguageOverride1) { + assertFixSuccess("{key: {a: 'text'}, language_override: 'foo'}"); + assertFixSuccess("{key: {a: 'text'}, language_override: 'foo$bar'}"); + + assertFixFailure("{key: {a: 'text'}, language_override: 'foo.bar'}"); // can't have '.' + assertFixFailure("{key: {a: 'text'}, language_override: ''}"); + assertFixFailure("{key: {a: 'text'}, language_override: '$foo'}"); +} + +TEST(FTSSpec, FixTextIndexVersion1) { + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 1.0}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(1)}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(1)}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 2.0}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(2)}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(2)}}"); + + assertFixFailure("{key: {a: 'text'}, textIndexVersion: 3}"); + assertFixFailure("{key: {a: 'text'}, textIndexVersion: '2'}"); + assertFixFailure("{key: {a: 'text'}, textIndexVersion: {}}"); +} + +TEST(FTSSpec, ScoreSingleField1) { + BSONObj user = BSON("key" << BSON("title" + << "text" + << "text" + << "text") << "weights" << BSON("title" << 10)); + + FTSSpec spec(FTSSpec::fixSpec(user)); + + TermFrequencyMap m; + spec.scoreDocument(BSON("title" + << "cat sat run"), + &m); + ASSERT_EQUALS(3U, m.size()); + ASSERT_EQUALS(m["cat"], m["sat"]); + ASSERT_EQUALS(m["cat"], m["run"]); + ASSERT(m["cat"] > 0); +} + +TEST(FTSSpec, ScoreMultipleField1) { + 
BSONObj user = BSON("key" << BSON("title" + << "text" + << "text" + << "text") << "weights" << BSON("title" << 10)); + + FTSSpec spec(FTSSpec::fixSpec(user)); + + TermFrequencyMap m; + spec.scoreDocument(BSON("title" + << "cat sat run" + << "text" + << "cat book"), + &m); + + ASSERT_EQUALS(4U, m.size()); + ASSERT_EQUALS(m["sat"], m["run"]); + ASSERT(m["sat"] > 0); + + ASSERT(m["cat"] > m["sat"]); + ASSERT(m["cat"] > m["book"]); + ASSERT(m["book"] > 0); + ASSERT(m["book"] < m["sat"]); +} + +TEST(FTSSpec, ScoreMultipleField2) { + // Test where one indexed field is a parent component of another indexed field. + BSONObj user = BSON("key" << BSON("a" + << "text" + << "a.b" + << "text")); + + FTSSpec spec(FTSSpec::fixSpec(user)); + + TermFrequencyMap m; + spec.scoreDocument(BSON("a" << BSON("b" + << "term")), + &m); + ASSERT_EQUALS(1U, m.size()); +} + +TEST(FTSSpec, ScoreRepeatWord) { + BSONObj user = BSON("key" << BSON("title" + << "text" + << "text" + << "text") << "weights" << BSON("title" << 10)); + + FTSSpec spec(FTSSpec::fixSpec(user)); + + TermFrequencyMap m; + spec.scoreDocument(BSON("title" + << "cat sat sat run run run"), + &m); + ASSERT_EQUALS(3U, m.size()); + ASSERT(m["cat"] > 0); + ASSERT(m["sat"] > m["cat"]); + ASSERT(m["run"] > m["sat"]); +} + +TEST(FTSSpec, Extra1) { + BSONObj user = BSON("key" << BSON("data" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(user)); + ASSERT_EQUALS(0U, spec.numExtraBefore()); + ASSERT_EQUALS(0U, spec.numExtraAfter()); +} + +TEST(FTSSpec, Extra2) { + BSONObj user = BSON("key" << BSON("data" + << "text" + << "x" << 1)); + BSONObj fixed = FTSSpec::fixSpec(user); + FTSSpec spec(fixed); + ASSERT_EQUALS(0U, spec.numExtraBefore()); + ASSERT_EQUALS(1U, spec.numExtraAfter()); + ASSERT_EQUALS(StringData("x"), spec.extraAfter(0)); + + BSONObj fixed2 = FTSSpec::fixSpec(fixed); + ASSERT_EQUALS(fixed, fixed2); +} + +TEST(FTSSpec, Extra3) { + BSONObj user = BSON("key" << BSON("x" << 1 << "data" + << "text")); + BSONObj fixed = 
FTSSpec::fixSpec(user); + + ASSERT_EQUALS(BSON("x" << 1 << "_fts" + << "text" + << "_ftsx" << 1), + fixed["key"].Obj()); + ASSERT_EQUALS(BSON("data" << 1), fixed["weights"].Obj()); + + BSONObj fixed2 = FTSSpec::fixSpec(fixed); + ASSERT_EQUALS(fixed, fixed2); + + FTSSpec spec(fixed); + ASSERT_EQUALS(1U, spec.numExtraBefore()); + ASSERT_EQUALS(StringData("x"), spec.extraBefore(0)); + ASSERT_EQUALS(0U, spec.numExtraAfter()); + + BSONObj prefix; + + ASSERT(spec.getIndexPrefix(BSON("x" << 2), &prefix).isOK()); + ASSERT_EQUALS(BSON("x" << 2), prefix); + + ASSERT(spec.getIndexPrefix(BSON("x" << 3 << "y" << 4), &prefix).isOK()); + ASSERT_EQUALS(BSON("x" << 3), prefix); + + ASSERT(!spec.getIndexPrefix(BSON("x" << BSON("$gt" << 5)), &prefix).isOK()); + ASSERT(!spec.getIndexPrefix(BSON("y" << 4), &prefix).isOK()); + ASSERT(!spec.getIndexPrefix(BSONObj(), &prefix).isOK()); +} + +// Test for correct behavior when encountering nested arrays (both directly nested and +// indirectly nested). + +TEST(FTSSpec, NestedArraysPos1) { + BSONObj user = BSON("key" << BSON("a.b" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(user)); + + // The following document matches {"a.b": {$type: 2}}, so "term" should be indexed. + BSONObj obj = fromjson("{a: [{b: ['term']}]}"); // indirectly nested arrays + TermFrequencyMap m; + spec.scoreDocument(obj, &m); + ASSERT_EQUALS(1U, m.size()); +} + +TEST(FTSSpec, NestedArraysPos2) { + BSONObj user = BSON("key" << BSON("$**" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(user)); + + // The wildcard spec implies a full recursive traversal, so "term" should be indexed. 
+ BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays + TermFrequencyMap m; + spec.scoreDocument(obj, &m); + ASSERT_EQUALS(1U, m.size()); +} + +TEST(FTSSpec, NestedArraysNeg1) { + BSONObj user = BSON("key" << BSON("a.b" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(user)); + + // The following document does not match {"a.b": {$type: 2}}, so "term" should not be + // indexed. + BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays + TermFrequencyMap m; + spec.scoreDocument(obj, &m); + ASSERT_EQUALS(0U, m.size()); +} + +// Multi-language test_1: test independent stemming per sub-document +TEST(FTSSpec, NestedLanguages_PerArrayItemStemming) { + BSONObj indexSpec = BSON("key" << BSON("a.b.c" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ a :" + " { b :" + " [ { c : \"walked\", language : \"english\" }," + " { c : \"camminato\", language : \"italian\" }," + " { c : \"ging\", language : \"german\" } ]" + " }" + " }"); + + spec.scoreDocument(obj, &tfm); + + set<string> hits; + hits.insert("walk"); + hits.insert("cammin"); + hits.insert("ging"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS(1U, hits.count(term)); + } +} + +// Multi-language test_2: test nested stemming per sub-document +TEST(FTSSpec, NestedLanguages_PerSubdocStemming) { + BSONObj indexSpec = BSON("key" << BSON("a.b.c" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " a :" + " { language : \"danish\"," + " b :" + " [ { c : \"foredrag\" }," + " { c : \"foredragsholder\" }," + " { c : \"lector\" } ]" + " }" + "}"); + + spec.scoreDocument(obj, &tfm); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); 
++i) { + string term = i->first; + ASSERT_EQUALS(1U, hits.count(term)); } } + +// Multi-language test_3: test nested arrays +TEST(FTSSpec, NestedLanguages_NestedArrays) { + BSONObj indexSpec = BSON("key" << BSON("a.b.c" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " a : [" + " { language : \"danish\"," + " b :" + " [ { c : [\"foredrag\"] }," + " { c : [\"foredragsholder\"] }," + " { c : [\"lector\"] } ]" + " } ]" + "}"); + + spec.scoreDocument(obj, &tfm); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS(1U, hits.count(term)); + } +} + +// Multi-language test_4: test pruning +TEST(FTSSpec, NestedLanguages_PathPruning) { + BSONObj indexSpec = BSON("key" << BSON("a.b.c" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " a : " + " { language : \"danish\"," + " bc : \"foo\"," + " b : { d: \"bar\" }," + " b :" + " [ { c : \"foredrag\" }," + " { c : \"foredragsholder\" }," + " { c : \"lector\" } ]" + " }" + "}"); + + spec.scoreDocument(obj, &tfm); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS(1U, hits.count(term)); + } +} + +// Multi-language test_5: test wildcard spec +TEST(FTSSpec, NestedLanguages_Wildcard) { + BSONObj indexSpec = BSON("key" << BSON("$**" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " b : \"walking\"," + " c : { e: \"walked\" }," + " d : " + " { language : \"danish\"," + " e :" + " [ { f : \"foredrag\" }," + 
" { f : \"foredragsholder\" }," + " { f : \"lector\" } ]" + " }" + "}"); + + spec.scoreDocument(obj, &tfm); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + hits.insert("walk"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS(1U, hits.count(term)); + } +} + +// Multi-language test_6: test wildcard spec with override +TEST(FTSSpec, NestedLanguages_WildcardOverride) { + BSONObj indexSpec = BSON("key" << BSON("$**" + << "text") << "weights" << BSON("d.e.f" << 20)); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " b : \"walking\"," + " c : { e: \"walked\" }," + " d : " + " { language : \"danish\"," + " e :" + " [ { f : \"foredrag\" }," + " { f : \"foredragsholder\" }," + " { f : \"lector\" } ]" + " }" + "}"); + + spec.scoreDocument(obj, &tfm); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + hits.insert("walk"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS(1U, hits.count(term)); + } +} + +/** Test differences across textIndexVersion values in handling of nested arrays. */ +TEST(FTSSpec, TextIndexLegacyNestedArrays) { + BSONObj obj = fromjson("{a: [{b: ['hello']}]}"); + + // textIndexVersion=1 FTSSpec objects do not index nested arrays. + { + BSONObj indexSpec = fromjson("{key: {'a.b': 'text'}, textIndexVersion: 1}"); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + spec.scoreDocument(obj, &tfm); + ASSERT_EQUALS(tfm.size(), 0U); + } + + // textIndexVersion=2 FTSSpec objects do index nested arrays. 
+ { + BSONObj indexSpec = fromjson("{key: {'a.b': 'text'}, textIndexVersion: 2}"); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + spec.scoreDocument(obj, &tfm); + ASSERT_EQUALS(tfm.size(), 1U); + } +} + +/** Test differences across textIndexVersion values in handling of language annotations. */ +TEST(FTSSpec, TextIndexLegacyLanguageRecognition) { + BSONObj obj = fromjson("{a: 'the', language: 'EN'}"); + + // textIndexVersion=1 FTSSpec objects treat two-letter language annotations as "none" + // for purposes of stopword processing. + { + BSONObj indexSpec = fromjson("{key: {'a': 'text'}, textIndexVersion: 1}"); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + spec.scoreDocument(obj, &tfm); + ASSERT_EQUALS(tfm.size(), 1U); // "the" not recognized as stopword + } + + // textIndexVersion=2 FTSSpec objects recognize two-letter codes. + { + BSONObj indexSpec = fromjson("{key: {'a': 'text'}, textIndexVersion: 2}"); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + spec.scoreDocument(obj, &tfm); + ASSERT_EQUALS(tfm.size(), 0U); // "the" recognized as stopword + } +} +} +} diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h index 2b345d89266..40cdbde2cb8 100644 --- a/src/mongo/db/fts/fts_tokenizer.h +++ b/src/mongo/db/fts/fts_tokenizer.h @@ -35,58 +35,58 @@ namespace mongo { namespace fts { - class FTSLanguage; - class StopWords; +class FTSLanguage; +class StopWords; + +/** + * FTSTokenizer + * A iterator of "documents" where a document contains space delimited words. + * For each word returns a stem or lemma version of a word optimized for full text indexing. + * Supports various options to control how tokens are generated. + */ +class FTSTokenizer { +public: + virtual ~FTSTokenizer() = default; /** - * FTSTokenizer - * A iterator of "documents" where a document contains space delimited words. 
- * For each word returns a stem or lemma version of a word optimized for full text indexing. - * Supports various options to control how tokens are generated. + * Options for generating tokens */ - class FTSTokenizer { - public: - virtual ~FTSTokenizer() = default; - - /** - * Options for generating tokens - */ - enum Options { - /** - * Default means lower cased, and stop words are not filtered. - */ - None = 0, - - /** - * Do not lower case terms. - */ - GenerateCaseSensitiveTokens = 1 << 0, - - /** - * Filter out stop words from return tokens. - */ - FilterStopWords = 1 << 1, - }; - + enum Options { /** - * Process a new document, and discards any previous results. - * May be called multiple times on an instance of an iterator. + * Default means lower cased, and stop words are not filtered. */ - virtual void reset(StringData document, Options options) = 0; + None = 0, /** - * Moves to the next token in the iterator. - * Returns false when the iterator reaches end of the document. + * Do not lower case terms. */ - virtual bool moveNext() = 0; + GenerateCaseSensitiveTokens = 1 << 0, /** - * Returns stemmed form, normalized, and lowercased depending on the parameter - * to the reset method. - * Returned StringData is valid until next call to moveNext(). + * Filter out stop words from return tokens. */ - virtual StringData get() const = 0; + FilterStopWords = 1 << 1, }; -} // namespace fts -} // namespace mongo + /** + * Process a new document, and discards any previous results. + * May be called multiple times on an instance of an iterator. + */ + virtual void reset(StringData document, Options options) = 0; + + /** + * Moves to the next token in the iterator. + * Returns false when the iterator reaches end of the document. + */ + virtual bool moveNext() = 0; + + /** + * Returns stemmed form, normalized, and lowercased depending on the parameter + * to the reset method. + * Returned StringData is valid until next call to moveNext(). 
+ */ + virtual StringData get() const = 0; +}; + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_util.cpp b/src/mongo/db/fts/fts_util.cpp index f2bd4e50905..85420fc66ad 100644 --- a/src/mongo/db/fts/fts_util.cpp +++ b/src/mongo/db/fts/fts_util.cpp @@ -32,11 +32,9 @@ namespace mongo { - namespace fts { +namespace fts { - const std::string INDEX_NAME = "text"; - const std::string WILDCARD = "$**"; - - } +const std::string INDEX_NAME = "text"; +const std::string WILDCARD = "$**"; +} } - diff --git a/src/mongo/db/fts/fts_util.h b/src/mongo/db/fts/fts_util.h index 7cde2bbe985..a1377162443 100644 --- a/src/mongo/db/fts/fts_util.h +++ b/src/mongo/db/fts/fts_util.h @@ -36,16 +36,14 @@ namespace mongo { - namespace fts { +namespace fts { - extern const std::string WILDCARD; - extern const std::string INDEX_NAME; +extern const std::string WILDCARD; +extern const std::string INDEX_NAME; - enum TextIndexVersion { - TEXT_INDEX_VERSION_1 = 1, // Legacy index format. Deprecated. - TEXT_INDEX_VERSION_2 = 2 // Current index format. - }; - - } +enum TextIndexVersion { + TEXT_INDEX_VERSION_1 = 1, // Legacy index format. Deprecated. + TEXT_INDEX_VERSION_2 = 2 // Current index format. 
+}; +} } - diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp index 9353fccf297..07d17c050eb 100644 --- a/src/mongo/db/fts/stemmer.cpp +++ b/src/mongo/db/fts/stemmer.cpp @@ -36,39 +36,36 @@ namespace mongo { - namespace fts { +namespace fts { - using std::string; +using std::string; - Stemmer::Stemmer( const FTSLanguage* language ) { - _stemmer = NULL; - if ( language->str() != "none" ) - _stemmer = sb_stemmer_new(language->str().c_str(), "UTF_8"); - } - - Stemmer::~Stemmer() { - if ( _stemmer ) { - sb_stemmer_delete(_stemmer); - _stemmer = NULL; - } - } - - string Stemmer::stem( StringData word ) const { - if ( !_stemmer ) - return word.toString(); +Stemmer::Stemmer(const FTSLanguage* language) { + _stemmer = NULL; + if (language->str() != "none") + _stemmer = sb_stemmer_new(language->str().c_str(), "UTF_8"); +} - const sb_symbol* sb_sym = sb_stemmer_stem( _stemmer, - (const sb_symbol*)word.rawData(), - word.size() ); +Stemmer::~Stemmer() { + if (_stemmer) { + sb_stemmer_delete(_stemmer); + _stemmer = NULL; + } +} - if ( sb_sym == NULL ) { - // out of memory - invariant( false ); - } +string Stemmer::stem(StringData word) const { + if (!_stemmer) + return word.toString(); - return string( (const char*)(sb_sym), sb_stemmer_length( _stemmer ) ); - } + const sb_symbol* sb_sym = + sb_stemmer_stem(_stemmer, (const sb_symbol*)word.rawData(), word.size()); + if (sb_sym == NULL) { + // out of memory + invariant(false); } + return string((const char*)(sb_sym), sb_stemmer_length(_stemmer)); +} +} } diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h index d6d76e64218..59261bfb6a0 100644 --- a/src/mongo/db/fts/stemmer.h +++ b/src/mongo/db/fts/stemmer.h @@ -39,23 +39,24 @@ namespace mongo { - namespace fts { - - /** - * maintains case - * but works - * running/Running -> run/Run - */ - class Stemmer { - MONGO_DISALLOW_COPYING( Stemmer ); - public: - Stemmer( const FTSLanguage* language ); - ~Stemmer(); - - std::string stem( StringData 
word ) const; - private: - struct sb_stemmer* _stemmer; - }; - } -} +namespace fts { +/** + * maintains case + * but works + * running/Running -> run/Run + */ +class Stemmer { + MONGO_DISALLOW_COPYING(Stemmer); + +public: + Stemmer(const FTSLanguage* language); + ~Stemmer(); + + std::string stem(StringData word) const; + +private: + struct sb_stemmer* _stemmer; +}; +} +} diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp index bef556bf2ad..d40d25e8348 100644 --- a/src/mongo/db/fts/stemmer_test.cpp +++ b/src/mongo/db/fts/stemmer_test.cpp @@ -35,19 +35,18 @@ #include "mongo/db/fts/stemmer.h" namespace mongo { - namespace fts { +namespace fts { - TEST( English, Stemmer1 ) { - Stemmer s( &languageEnglishV2 ); - ASSERT_EQUALS( "run", s.stem( "running" ) ); - ASSERT_EQUALS( "Run", s.stem( "Running" ) ); - } - - TEST( English, Caps ) { - Stemmer s( &languagePorterV1 ); - ASSERT_EQUALS( "unit", s.stem( "united" ) ); - ASSERT_EQUALS( "Unite", s.stem( "United" ) ); - } +TEST(English, Stemmer1) { + Stemmer s(&languageEnglishV2); + ASSERT_EQUALS("run", s.stem("running")); + ASSERT_EQUALS("Run", s.stem("Running")); +} - } +TEST(English, Caps) { + Stemmer s(&languagePorterV1); + ASSERT_EQUALS("unit", s.stem("united")); + ASSERT_EQUALS("Unite", s.stem("United")); +} +} } diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp index 421bfae63db..0a44eaf25ff 100644 --- a/src/mongo/db/fts/stop_words.cpp +++ b/src/mongo/db/fts/stop_words.cpp @@ -38,43 +38,38 @@ namespace mongo { - namespace fts { +namespace fts { - void loadStopWordMap( StringMap< std::set< std::string > >* m ); - - namespace { - StringMap< std::shared_ptr<StopWords> > StopWordsMap; - StopWords empty; - } +void loadStopWordMap(StringMap<std::set<std::string>>* m); +namespace { +StringMap<std::shared_ptr<StopWords>> StopWordsMap; +StopWords empty; +} - StopWords::StopWords(){ - } - StopWords::StopWords( const std::set<std::string>& words ) { - for ( 
std::set<std::string>::const_iterator i = words.begin(); i != words.end(); ++i ) - _words.insert( *i ); - } +StopWords::StopWords() {} - const StopWords* StopWords::getStopWords( const FTSLanguage* language ) { - auto i = StopWordsMap.find( language->str() ); - if ( i == StopWordsMap.end() ) - return &empty; - return i->second.get(); - } +StopWords::StopWords(const std::set<std::string>& words) { + for (std::set<std::string>::const_iterator i = words.begin(); i != words.end(); ++i) + _words.insert(*i); +} +const StopWords* StopWords::getStopWords(const FTSLanguage* language) { + auto i = StopWordsMap.find(language->str()); + if (i == StopWordsMap.end()) + return &empty; + return i->second.get(); +} - MONGO_INITIALIZER(StopWords)(InitializerContext* context) { - StringMap< std::set< std::string > > raw; - loadStopWordMap( &raw ); - for ( StringMap< std::set< std::string > >::const_iterator i = raw.begin(); - i != raw.end(); - ++i ) { - StopWordsMap[i->first].reset(new StopWords( i->second )); - } - return Status::OK(); - } +MONGO_INITIALIZER(StopWords)(InitializerContext* context) { + StringMap<std::set<std::string>> raw; + loadStopWordMap(&raw); + for (StringMap<std::set<std::string>>::const_iterator i = raw.begin(); i != raw.end(); ++i) { + StopWordsMap[i->first].reset(new StopWords(i->second)); } - + return Status::OK(); +} +} } diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h index d989b4dcd32..eebc11c012a 100644 --- a/src/mongo/db/fts/stop_words.h +++ b/src/mongo/db/fts/stop_words.h @@ -39,25 +39,27 @@ namespace mongo { - namespace fts { +namespace fts { - class StopWords { - MONGO_DISALLOW_COPYING( StopWords ); - public: - StopWords(); - StopWords( const std::set<std::string>& words ); +class StopWords { + MONGO_DISALLOW_COPYING(StopWords); - bool isStopWord( const std::string& word ) const { - return _words.count( word ) > 0; - } +public: + StopWords(); + StopWords(const std::set<std::string>& words); - size_t numStopWords() const { return
_words.size(); } - - static const StopWords* getStopWords( const FTSLanguage* language ); - private: - unordered_set<std::string> _words; - }; + bool isStopWord(const std::string& word) const { + return _words.count(word) > 0; + } + size_t numStopWords() const { + return _words.size(); } -} + static const StopWords* getStopWords(const FTSLanguage* language); + +private: + unordered_set<std::string> _words; +}; +} +} diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp index 248c4d93407..5834503dd4a 100644 --- a/src/mongo/db/fts/stop_words_test.cpp +++ b/src/mongo/db/fts/stop_words_test.cpp @@ -33,13 +33,12 @@ #include "mongo/unittest/unittest.h" namespace mongo { - namespace fts { +namespace fts { - TEST( English, Basic1 ) { - const StopWords* englishStopWords = StopWords::getStopWords( &languageEnglishV2 ); - ASSERT( englishStopWords->isStopWord( "the" ) ); - ASSERT( !englishStopWords->isStopWord( "computer" ) ); - } - - } +TEST(English, Basic1) { + const StopWords* englishStopWords = StopWords::getStopWords(&languageEnglishV2); + ASSERT(englishStopWords->isStopWord("the")); + ASSERT(!englishStopWords->isStopWord("computer")); +} +} } diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp index 01037a85c8d..e1f595b9a4a 100644 --- a/src/mongo/db/fts/tokenizer.cpp +++ b/src/mongo/db/fts/tokenizer.cpp @@ -36,105 +36,103 @@ namespace mongo { - namespace fts { - - Tokenizer::Tokenizer(const FTSLanguage* language, StringData str) - : _pos(0), _raw( str ) { - _english = ( language->str() == "english" ); - _skipWhitespace(); - } - - bool Tokenizer::more() const { - return _pos < _raw.size(); - } - - Token Tokenizer::next() { - if ( _pos >= _raw.size() ) - return Token(Token::INVALID, "", 0); - - unsigned start = _pos++; - Token::Type type = _type( _raw[start] ); - if ( type == Token::WHITESPACE ) invariant( false ); - - if ( type == Token::TEXT ) - while ( _pos < _raw.size() && _type( _raw[_pos] ) == type ) - 
_pos++; - - StringData ret = _raw.substr( start, _pos - start ); - _skipWhitespace(); - return Token( type, ret, start ); - } - - - bool Tokenizer::_skipWhitespace() { - unsigned start = _pos; - while ( _pos < _raw.size() && _type( _raw[_pos] ) == Token::WHITESPACE ) - _pos++; - return _pos > start; - } - - - Token::Type Tokenizer::_type( char c ) const { - switch ( c ) { - case ' ': - case '\f': - case '\v': - case '\t': - case '\r': - case '\n': - return Token::WHITESPACE; - case '\'': - if ( _english ) - return Token::TEXT; - else - return Token::WHITESPACE; - - case '~': - case '`': - - case '!': - case '@': - case '#': - case '$': - case '%': - case '^': - case '&': - case '*': - case '(': - case ')': - - case '-': - - case '=': - case '+': - - case '[': - case ']': - case '{': - case '}': - case '|': - case '\\': - - case ';': - case ':': - - case '"': - - case '<': - case '>': - - case ',': - case '.': - - case '/': - case '?': - - return Token::DELIMITER; - default: +namespace fts { + +Tokenizer::Tokenizer(const FTSLanguage* language, StringData str) : _pos(0), _raw(str) { + _english = (language->str() == "english"); + _skipWhitespace(); +} + +bool Tokenizer::more() const { + return _pos < _raw.size(); +} + +Token Tokenizer::next() { + if (_pos >= _raw.size()) + return Token(Token::INVALID, "", 0); + + unsigned start = _pos++; + Token::Type type = _type(_raw[start]); + if (type == Token::WHITESPACE) + invariant(false); + + if (type == Token::TEXT) + while (_pos < _raw.size() && _type(_raw[_pos]) == type) + _pos++; + + StringData ret = _raw.substr(start, _pos - start); + _skipWhitespace(); + return Token(type, ret, start); +} + + +bool Tokenizer::_skipWhitespace() { + unsigned start = _pos; + while (_pos < _raw.size() && _type(_raw[_pos]) == Token::WHITESPACE) + _pos++; + return _pos > start; +} + + +Token::Type Tokenizer::_type(char c) const { + switch (c) { + case ' ': + case '\f': + case '\v': + case '\t': + case '\r': + case '\n': + return 
Token::WHITESPACE; + case '\'': + if (_english) return Token::TEXT; - } - } + else + return Token::WHITESPACE; - } + case '~': + case '`': + + case '!': + case '@': + case '#': + case '$': + case '%': + case '^': + case '&': + case '*': + case '(': + case ')': + + case '-': + + case '=': + case '+': + + case '[': + case ']': + case '{': + case '}': + case '|': + case '\\': + + case ';': + case ':': + case '"': + + case '<': + case '>': + + case ',': + case '.': + + case '/': + case '?': + + return Token::DELIMITER; + default: + return Token::TEXT; + } +} +} } diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h index 503816cc434..f1184a455f2 100644 --- a/src/mongo/db/fts/tokenizer.h +++ b/src/mongo/db/fts/tokenizer.h @@ -38,41 +38,37 @@ namespace mongo { - namespace fts { +namespace fts { - struct Token { - enum Type { WHITESPACE, DELIMITER, TEXT, INVALID }; - Token( Type type, StringData data, unsigned offset) - : type( type ), - data( data ), - offset( offset ) - {} +struct Token { + enum Type { WHITESPACE, DELIMITER, TEXT, INVALID }; + Token(Type type, StringData data, unsigned offset) : type(type), data(data), offset(offset) {} - bool ok() const { return type != INVALID; } - - Type type; - StringData data; - unsigned offset; - }; + bool ok() const { + return type != INVALID; + } - class Tokenizer { - MONGO_DISALLOW_COPYING( Tokenizer ); - public: + Type type; + StringData data; + unsigned offset; +}; - Tokenizer( const FTSLanguage* language, StringData str); +class Tokenizer { + MONGO_DISALLOW_COPYING(Tokenizer); - bool more() const; - Token next(); +public: + Tokenizer(const FTSLanguage* language, StringData str); - private: - Token::Type _type( char c ) const; - bool _skipWhitespace(); + bool more() const; + Token next(); - unsigned _pos; - const StringData _raw; - bool _english; - }; +private: + Token::Type _type(char c) const; + bool _skipWhitespace(); - } + unsigned _pos; + const StringData _raw; + bool _english; +}; +} } - diff --git 
a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp index d370c9f6c0b..143e3b372ce 100644 --- a/src/mongo/db/fts/tokenizer_test.cpp +++ b/src/mongo/db/fts/tokenizer_test.cpp @@ -33,91 +33,88 @@ #include "mongo/unittest/unittest.h" namespace mongo { - namespace fts { +namespace fts { - TEST( Tokenizer, Empty1 ) { - Tokenizer i( &languageEnglishV2, "" ); - ASSERT( !i.more() ); - } - - TEST( Tokenizer, Basic1 ) { - Tokenizer i( &languageEnglishV2, "blue red green" ); +TEST(Tokenizer, Empty1) { + Tokenizer i(&languageEnglishV2, ""); + ASSERT(!i.more()); +} - ASSERT( i.more() ); - ASSERT_EQUALS( i.next().data.toString(), "blue" ); +TEST(Tokenizer, Basic1) { + Tokenizer i(&languageEnglishV2, "blue red green"); - ASSERT( i.more() ); - ASSERT_EQUALS( i.next().data.toString(), "red" ); + ASSERT(i.more()); + ASSERT_EQUALS(i.next().data.toString(), "blue"); - ASSERT( i.more() ); - ASSERT_EQUALS( i.next().data.toString(), "green" ); + ASSERT(i.more()); + ASSERT_EQUALS(i.next().data.toString(), "red"); - ASSERT( !i.more() ); - } + ASSERT(i.more()); + ASSERT_EQUALS(i.next().data.toString(), "green"); - TEST( Tokenizer, Basic2 ) { - Tokenizer i( &languageEnglishV2, "blue-red" ); + ASSERT(!i.more()); +} - Token a = i.next(); - Token b = i.next(); - Token c = i.next(); - Token d = i.next(); +TEST(Tokenizer, Basic2) { + Tokenizer i(&languageEnglishV2, "blue-red"); - ASSERT_EQUALS( Token::TEXT, a.type ); - ASSERT_EQUALS( Token::DELIMITER, b.type ); - ASSERT_EQUALS( Token::TEXT, c.type ); - ASSERT_EQUALS( Token::INVALID, d.type ); + Token a = i.next(); + Token b = i.next(); + Token c = i.next(); + Token d = i.next(); - ASSERT_EQUALS( "blue", a.data.toString() ); - ASSERT_EQUALS( "-", b.data.toString() ); - ASSERT_EQUALS( "red", c.data.toString() ); - } + ASSERT_EQUALS(Token::TEXT, a.type); + ASSERT_EQUALS(Token::DELIMITER, b.type); + ASSERT_EQUALS(Token::TEXT, c.type); + ASSERT_EQUALS(Token::INVALID, d.type); - TEST( Tokenizer, Basic3 ) { - Tokenizer i( 
&languageEnglishV2, "blue -red" ); + ASSERT_EQUALS("blue", a.data.toString()); + ASSERT_EQUALS("-", b.data.toString()); + ASSERT_EQUALS("red", c.data.toString()); +} - Token a = i.next(); - Token b = i.next(); - Token c = i.next(); - Token d = i.next(); +TEST(Tokenizer, Basic3) { + Tokenizer i(&languageEnglishV2, "blue -red"); - ASSERT_EQUALS( Token::TEXT, a.type ); - ASSERT_EQUALS( Token::DELIMITER, b.type ); - ASSERT_EQUALS( Token::TEXT, c.type ); - ASSERT_EQUALS( Token::INVALID, d.type ); + Token a = i.next(); + Token b = i.next(); + Token c = i.next(); + Token d = i.next(); - ASSERT_EQUALS( "blue", a.data.toString() ); - ASSERT_EQUALS( "-", b.data.toString() ); - ASSERT_EQUALS( "red", c.data.toString() ); + ASSERT_EQUALS(Token::TEXT, a.type); + ASSERT_EQUALS(Token::DELIMITER, b.type); + ASSERT_EQUALS(Token::TEXT, c.type); + ASSERT_EQUALS(Token::INVALID, d.type); - ASSERT_EQUALS( 0U, a.offset ); - ASSERT_EQUALS( 5U, b.offset ); - ASSERT_EQUALS( 6U, c.offset ); - } + ASSERT_EQUALS("blue", a.data.toString()); + ASSERT_EQUALS("-", b.data.toString()); + ASSERT_EQUALS("red", c.data.toString()); - TEST( Tokenizer, Quote1English ) { - Tokenizer i( &languageEnglishV2, "eliot's car" ); + ASSERT_EQUALS(0U, a.offset); + ASSERT_EQUALS(5U, b.offset); + ASSERT_EQUALS(6U, c.offset); +} - Token a = i.next(); - Token b = i.next(); +TEST(Tokenizer, Quote1English) { + Tokenizer i(&languageEnglishV2, "eliot's car"); - ASSERT_EQUALS( "eliot's", a.data.toString() ); - ASSERT_EQUALS( "car", b.data.toString() ); - } + Token a = i.next(); + Token b = i.next(); - TEST( Tokenizer, Quote1French ) { - Tokenizer i( &languageFrenchV2, "eliot's car" ); + ASSERT_EQUALS("eliot's", a.data.toString()); + ASSERT_EQUALS("car", b.data.toString()); +} - Token a = i.next(); - Token b = i.next(); - Token c = i.next(); +TEST(Tokenizer, Quote1French) { + Tokenizer i(&languageFrenchV2, "eliot's car"); - ASSERT_EQUALS( "eliot", a.data.toString() ); - ASSERT_EQUALS( "s", b.data.toString() ); - ASSERT_EQUALS( 
"car", c.data.toString() ); - } + Token a = i.next(); + Token b = i.next(); + Token c = i.next(); - } + ASSERT_EQUALS("eliot", a.data.toString()); + ASSERT_EQUALS("s", b.data.toString()); + ASSERT_EQUALS("car", c.data.toString()); +} +} } - - |