diff options
Diffstat (limited to 'src/mongo/db/fts')
32 files changed, 3388 insertions, 3481 deletions
diff --git a/src/mongo/db/fts/fts_element_iterator.cpp b/src/mongo/db/fts/fts_element_iterator.cpp index f57e1097c14..4df642dc66a 100644 --- a/src/mongo/db/fts/fts_element_iterator.cpp +++ b/src/mongo/db/fts/fts_element_iterator.cpp @@ -37,152 +37,149 @@ namespace mongo { - namespace fts { +namespace fts { + +using std::string; + +extern const double DEFAULT_WEIGHT; +extern const double MAX_WEIGHT; + +std::ostream& operator<<(std::ostream& os, FTSElementIterator::FTSIteratorFrame& frame) { + BSONObjIterator it = frame._it; + return os << "FTSIteratorFrame[" + " element=" << (*it).toString() << ", _language=" << frame._language->str() + << ", _parentPath=" << frame._parentPath << ", _isArray=" << frame._isArray << "]"; +} + +FTSElementIterator::FTSElementIterator(const FTSSpec& spec, const BSONObj& obj) + : _frame(obj, spec, &spec.defaultLanguage(), "", false), + _spec(spec), + _currentValue(advance()) {} + +namespace { +/** Check for exact match or path prefix match. */ +inline bool _matchPrefix(const string& dottedName, const string& weight) { + if (weight == dottedName) { + return true; + } + return mongoutils::str::startsWith(weight, dottedName + '.'); +} +} + +bool FTSElementIterator::more() { + //_currentValue = advance(); + return _currentValue.valid(); +} + +FTSIteratorValue FTSElementIterator::next() { + FTSIteratorValue result = _currentValue; + _currentValue = advance(); + return result; +} - using std::string; - - extern const double DEFAULT_WEIGHT; - extern const double MAX_WEIGHT; - - std::ostream& operator<<( std::ostream& os, FTSElementIterator::FTSIteratorFrame& frame ) { - BSONObjIterator it = frame._it; - return os << "FTSIteratorFrame[" - " element=" << (*it).toString() << - ", _language=" << frame._language->str() << - ", _parentPath=" << frame._parentPath << - ", _isArray=" << frame._isArray << "]"; - } - - FTSElementIterator::FTSElementIterator( const FTSSpec& spec, const BSONObj& obj ) - : _frame( obj, spec, &spec.defaultLanguage(), "", false 
), - _spec( spec ), - _currentValue( advance() ) - { } - - namespace { - /** Check for exact match or path prefix match. */ - inline bool _matchPrefix( const string& dottedName, const string& weight ) { - if ( weight == dottedName ) { - return true; - } - return mongoutils::str::startsWith( weight, dottedName + '.' ); - } - } - - bool FTSElementIterator::more() { - //_currentValue = advance(); - return _currentValue.valid(); +/** + * Helper method: + * if (current object iterator not exhausted) return true; + * while (frame stack not empty) { + * resume object iterator popped from stack; + * if (resumed iterator not exhausted) return true; + * } + * return false; + */ +bool FTSElementIterator::moreFrames() { + if (_frame._it.more()) + return true; + while (!_frameStack.empty()) { + _frame = _frameStack.top(); + _frameStack.pop(); + if (_frame._it.more()) { + return true; } - - FTSIteratorValue FTSElementIterator::next() { - FTSIteratorValue result = _currentValue; - _currentValue = advance(); - return result; + } + return false; +} + +FTSIteratorValue FTSElementIterator::advance() { + while (moreFrames()) { + BSONElement elem = _frame._it.next(); + string fieldName = elem.fieldName(); + + // Skip "language" specifier fields if wildcard. + if (_spec.wildcard() && _spec.languageOverrideField() == fieldName) { + continue; } - /** - * Helper method: - * if (current object iterator not exhausted) return true; - * while (frame stack not empty) { - * resume object iterator popped from stack; - * if (resumed iterator not exhausted) return true; - * } - * return false; - */ - bool FTSElementIterator::moreFrames() { - if (_frame._it.more()) return true; - while (!_frameStack.empty()) { - _frame = _frameStack.top(); - _frameStack.pop(); - if (_frame._it.more()) { - return true; - } + // Compose the dotted name of the current field: + // 1. parent path empty (top level): use the current field name + // 2. parent path non-empty and obj is an array: use the parent path + // 3. 
parent path non-empty and obj is a sub-doc: append field name to parent path + string dottedName = (_frame._parentPath.empty() ? fieldName : _frame._isArray + ? _frame._parentPath + : _frame._parentPath + '.' + fieldName); + + // Find lower bound of dottedName in _weights. lower_bound leaves us at the first + // weight that could possibly match or be a prefix of dottedName. And if this + // element fails to match, then no subsequent weight can match, since the weights + // are lexicographically ordered. + Weights::const_iterator i = + _spec.weights().lower_bound(elem.type() == Object ? dottedName + '.' : dottedName); + + // possibleWeightMatch is set if the weight map contains either a match or some item + // lexicographically larger than fieldName. This boolean acts as a guard on + // dereferences of iterator 'i'. + bool possibleWeightMatch = (i != _spec.weights().end()); + + // Optimize away two cases, when not wildcard: + // 1. lower_bound seeks to end(): no prefix match possible + // 2. lower_bound seeks to a name which is not a prefix + if (!_spec.wildcard()) { + if (!possibleWeightMatch) { + continue; + } else if (!_matchPrefix(dottedName, i->first)) { + continue; } - return false; } - FTSIteratorValue FTSElementIterator::advance() { - while ( moreFrames() ) { - - BSONElement elem = _frame._it.next(); - string fieldName = elem.fieldName(); + // Is the current field an exact match on a weight? + bool exactMatch = (possibleWeightMatch && i->first == dottedName); + double weight = (possibleWeightMatch ? i->second : DEFAULT_WEIGHT); - // Skip "language" specifier fields if wildcard. - if ( _spec.wildcard() && _spec.languageOverrideField() == fieldName ) { - continue; + switch (elem.type()) { + case String: + // Only index strings on exact match or wildcard. + if (exactMatch || _spec.wildcard()) { + return FTSIteratorValue(elem.valuestr(), _frame._language, weight); } - - // Compose the dotted name of the current field: - // 1. 
parent path empty (top level): use the current field name - // 2. parent path non-empty and obj is an array: use the parent path - // 3. parent path non-empty and obj is a sub-doc: append field name to parent path - string dottedName = ( _frame._parentPath.empty() ? fieldName - : _frame._isArray ? _frame._parentPath - : _frame._parentPath + '.' + fieldName ); - - // Find lower bound of dottedName in _weights. lower_bound leaves us at the first - // weight that could possibly match or be a prefix of dottedName. And if this - // element fails to match, then no subsequent weight can match, since the weights - // are lexicographically ordered. - Weights::const_iterator i = _spec.weights().lower_bound( elem.type() == Object - ? dottedName + '.' - : dottedName ); - - // possibleWeightMatch is set if the weight map contains either a match or some item - // lexicographically larger than fieldName. This boolean acts as a guard on - // dereferences of iterator 'i'. - bool possibleWeightMatch = ( i != _spec.weights().end() ); - - // Optimize away two cases, when not wildcard: - // 1. lower_bound seeks to end(): no prefix match possible - // 2. lower_bound seeks to a name which is not a prefix - if ( !_spec.wildcard() ) { - if ( !possibleWeightMatch ) { - continue; - } - else if ( !_matchPrefix( dottedName, i->first ) ) { - continue; - } + break; + + case Object: + // Only descend into a sub-document on proper prefix or wildcard. Note that + // !exactMatch is a sufficient test for proper prefix match, because of + // if ( !matchPrefix( dottedName, i->first ) ) continue; + // block above. + if (!exactMatch || _spec.wildcard()) { + _frameStack.push(_frame); + _frame = + FTSIteratorFrame(elem.Obj(), _spec, _frame._language, dottedName, false); } - - // Is the current field an exact match on a weight? - bool exactMatch = ( possibleWeightMatch && i->first == dottedName ); - double weight = ( possibleWeightMatch ? 
i->second : DEFAULT_WEIGHT ); - - switch ( elem.type() ) { - case String: - // Only index strings on exact match or wildcard. - if ( exactMatch || _spec.wildcard() ) { - return FTSIteratorValue( elem.valuestr(), _frame._language, weight ); - } - break; - - case Object: - // Only descend into a sub-document on proper prefix or wildcard. Note that - // !exactMatch is a sufficient test for proper prefix match, because of - // if ( !matchPrefix( dottedName, i->first ) ) continue; - // block above. - if ( !exactMatch || _spec.wildcard() ) { - _frameStack.push( _frame ); - _frame = FTSIteratorFrame( elem.Obj(), _spec, _frame._language, dottedName, false ); - } - break; - - case Array: - // Only descend into arrays from non-array parents or on wildcard. - if ( !_frame._isArray || _spec.wildcard() ) { - _frameStack.push( _frame ); - _frame = FTSIteratorFrame( elem.Obj(), _spec, _frame._language, dottedName, true ); - } - break; - - default: - // Skip over all other BSON types. - break; + break; + + case Array: + // Only descend into arrays from non-array parents or on wildcard. + if (!_frame._isArray || _spec.wildcard()) { + _frameStack.push(_frame); + _frame = + FTSIteratorFrame(elem.Obj(), _spec, _frame._language, dottedName, true); } - } - return FTSIteratorValue(); // valid()==false + break; + + default: + // Skip over all other BSON types. 
+ break; } + } + return FTSIteratorValue(); // valid()==false +} - } // namespace fts -} // namespace mongo +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_element_iterator.h b/src/mongo/db/fts/fts_element_iterator.h index 3ca2372d1be..5c00f39c8bf 100644 --- a/src/mongo/db/fts/fts_element_iterator.h +++ b/src/mongo/db/fts/fts_element_iterator.h @@ -41,133 +41,121 @@ namespace mongo { - namespace fts { - - /** - * Encapsulates data fields returned by FTSElementIterator - */ - struct FTSIteratorValue { - - FTSIteratorValue( const char* text, - const FTSLanguage* language, - double weight ) - : _text(text), - _language(language), - _weight(weight), - _valid(true) - {} - - FTSIteratorValue() - : _text(NULL), - _language(), - _weight(0.0), - _valid(false) - {} - - bool valid() const { return _valid; } - - const char* _text; - const FTSLanguage* _language; - double _weight; - bool _valid; - }; - - /** - * Iterator pattern for walking through text-indexed fields of a - * BSON document. - * - * Example usage: - * FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - * FTSElementIterator it( spec, obj ); - * while ( it.more() ) { - * FTSIteratorValue val = it.next(); - * std::cout << val._text << '[' << val._language.str() - * << ',' << val._weight << ']' << std::endl; - * } - * - */ - class FTSElementIterator { - public: - /** - * Iterator constructor - * - * Note: Caller must ensure that the constructed FTSElementIterator - * does >not< outlive either spec or obj. - * - * @arg spec text index specifier - * @arg obj document that the iterator will traverse - */ - FTSElementIterator( const FTSSpec& spec, const BSONObj& obj); - - /** - * Iterator interface: returns false iff there are no further text-indexable fields. - */ - bool more(); - - /** - * Iterator interface: advances to the next text-indexable field. - */ - FTSIteratorValue next(); - - /** - * Iterator frame needed for iterative implementation of - * recursive sub-documents. 
- */ - struct FTSIteratorFrame { - FTSIteratorFrame( const BSONObj& obj, - const FTSSpec& spec, - const FTSLanguage* parentLanguage, - const std::string& parentPath, - bool isArray ) - : _it( obj ), - _language( spec._getLanguageToUseV2( obj, parentLanguage ) ), - _parentPath( parentPath ), - _isArray( isArray ) - {} - - friend std::ostream& operator<<(std::ostream&, FTSIteratorFrame&); - - BSONObjIterator _it; - const FTSLanguage* _language; - std::string _parentPath; - bool _isArray; - }; - - private: - /** - * Helper method: - * returns false iff all FTSIteratorFrames on _frameStack are exhausted. - */ - bool moreFrames(); - - /** - * Helper method: - * advances to the next text-indexable field, possibly pushing frames as - * needed for recursive sub-documents. - */ - FTSIteratorValue advance(); - - /** - * Stack used by iterative implementation of recursive sub-document traversal. - */ - std::stack<FTSIteratorFrame> _frameStack; - - /** - * Current frame, not yet pushed to stack. - */ - FTSIteratorFrame _frame; - - /** - * Constructor input parameter: text index specification. - */ - const FTSSpec& _spec; - - /** - * Current iterator return value, computed by 'more()', returned by 'next()'. - */ - FTSIteratorValue _currentValue; - }; - - } // namespace fts -} // namespace mongo +namespace fts { +/** + * Encapsulates data fields returned by FTSElementIterator + */ +struct FTSIteratorValue { + FTSIteratorValue(const char* text, const FTSLanguage* language, double weight) + : _text(text), _language(language), _weight(weight), _valid(true) {} + + FTSIteratorValue() : _text(NULL), _language(), _weight(0.0), _valid(false) {} + + bool valid() const { + return _valid; + } + + const char* _text; + const FTSLanguage* _language; + double _weight; + bool _valid; +}; + +/** + * Iterator pattern for walking through text-indexed fields of a + * BSON document. 
+ * + * Example usage: + * FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); + * FTSElementIterator it( spec, obj ); + * while ( it.more() ) { + * FTSIteratorValue val = it.next(); + * std::cout << val._text << '[' << val._language.str() + * << ',' << val._weight << ']' << std::endl; + * } + * + */ +class FTSElementIterator { +public: + /** + * Iterator constructor + * + * Note: Caller must ensure that the constructed FTSElementIterator + * does >not< outlive either spec or obj. + * + * @arg spec text index specifier + * @arg obj document that the iterator will traverse + */ + FTSElementIterator(const FTSSpec& spec, const BSONObj& obj); + + /** + * Iterator interface: returns false iff there are no further text-indexable fields. + */ + bool more(); + + /** + * Iterator interface: advances to the next text-indexable field. + */ + FTSIteratorValue next(); + + /** + * Iterator frame needed for iterative implementation of + * recursive sub-documents. + */ + struct FTSIteratorFrame { + FTSIteratorFrame(const BSONObj& obj, + const FTSSpec& spec, + const FTSLanguage* parentLanguage, + const std::string& parentPath, + bool isArray) + : _it(obj), + _language(spec._getLanguageToUseV2(obj, parentLanguage)), + _parentPath(parentPath), + _isArray(isArray) {} + + friend std::ostream& operator<<(std::ostream&, FTSIteratorFrame&); + + BSONObjIterator _it; + const FTSLanguage* _language; + std::string _parentPath; + bool _isArray; + }; + +private: + /** + * Helper method: + * returns false iff all FTSIteratorFrames on _frameStack are exhausted. + */ + bool moreFrames(); + + /** + * Helper method: + * advances to the next text-indexable field, possibly pushing frames as + * needed for recursive sub-documents. + */ + FTSIteratorValue advance(); + + /** + * Stack used by iterative implementation of recursive sub-document traversal. + */ + std::stack<FTSIteratorFrame> _frameStack; + + /** + * Current frame, not yet pushed to stack. 
+ */ + FTSIteratorFrame _frame; + + /** + * Constructor input parameter: text index specification. + */ + const FTSSpec& _spec; + + /** + * Current iterator return value, computed by 'more()', returned by 'next()'. + */ + FTSIteratorValue _currentValue; +}; + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_element_iterator_test.cpp b/src/mongo/db/fts/fts_element_iterator_test.cpp index 6d5694c5990..2a16c14b5a5 100644 --- a/src/mongo/db/fts/fts_element_iterator_test.cpp +++ b/src/mongo/db/fts/fts_element_iterator_test.cpp @@ -34,279 +34,267 @@ #include "mongo/unittest/unittest.h" namespace mongo { - namespace fts { - - using std::string; - - TEST( FTSElementIterator, Test1 ) { - - BSONObj obj = fromjson( - "{ b : \"walking\"," - " c : { e: \"walked\" }," - " d : \"walker\"" - " }" ); - - BSONObj indexSpec = fromjson( - "{ key : { a : \"text\" }, weights : { b : 10, d : 5 } }" ); - - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - Weights::const_iterator itt = spec.weights().begin(); - ASSERT( itt != spec.weights().end() ); - ASSERT_EQUALS( "a", itt->first ); - ASSERT_EQUALS( 1, itt->second ); - ++itt; - ASSERT( itt != spec.weights().end() ); - ASSERT_EQUALS( "b", itt->first ); - ASSERT_EQUALS( 10, itt->second ); - ++itt; - ASSERT( itt != spec.weights().end() ); - ASSERT_EQUALS( "d", itt->first ); - ASSERT_EQUALS( 5, itt->second ); - ++itt; - - FTSElementIterator it( spec, obj ); - - ASSERT( it.more() ); - FTSIteratorValue val = it.next(); - ASSERT_EQUALS( "walking", string(val._text) ); - ASSERT_EQUALS( "english", val._language->str() ); - ASSERT_EQUALS( 10, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "walker", string(val._text) ); - ASSERT_EQUALS( "english", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - } - - // Multi-language : test - TEST( FTSElementIterator, Test2 ) { - - BSONObj obj = fromjson( - "{ a :" - " { b :" - " [ { c : \"walked\", language : \"english\" }," - " { c : 
\"camminato\", language : \"italian\" }," - " { c : \"ging\", language : \"german\" } ]" - " }," - " d : \"Feliz Año Nuevo!\"," - " language : \"spanish\"" - " }" ); - - BSONObj indexSpec = fromjson( - "{ key : { \"a.b.c\" : \"text\", d : \"text\" } }" ); - - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - - FTSElementIterator it( spec, obj ); - - ASSERT( it.more() ); - FTSIteratorValue val = it.next(); - ASSERT_EQUALS( "walked", string(val._text) ); - ASSERT_EQUALS( "english", val._language->str() ); - ASSERT_EQUALS( 1, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "camminato", string(val._text) ); - ASSERT_EQUALS( "italian", val._language->str() ); - ASSERT_EQUALS( 1, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "ging", string(val._text) ); - ASSERT_EQUALS( "german", val._language->str() ); - ASSERT_EQUALS( 1, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "Feliz Año Nuevo!", string(val._text) ); - ASSERT_EQUALS( "spanish", val._language->str() ); - ASSERT_EQUALS( 1, val._weight ); - } - - // Multi-language : test nested stemming per sub-document - TEST( FTSElementIterator, Test3 ) { - - BSONObj obj = fromjson( - "{ language : \"english\"," - " a :" - " { language : \"danish\"," - " b :" - " [ { c : \"foredrag\" }," - " { c : \"foredragsholder\" }," - " { c : \"lector\" } ]" - " }" - "}" ); - - BSONObj indexSpec = fromjson( - "{ key : { a : \"text\", \"a.b.c\" : \"text\" }, weights : { \"a.b.c\" : 5 } }" ); - - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - Weights::const_iterator itt = spec.weights().begin(); - ASSERT( itt != spec.weights().end() ); - ASSERT_EQUALS( "a", itt->first ); - ASSERT_EQUALS( 1, itt->second ); - ++itt; - ASSERT( itt != spec.weights().end() ); - ASSERT_EQUALS( "a.b.c", itt->first ); - ASSERT_EQUALS( 5, itt->second ); - - FTSElementIterator it( spec, obj ); - - ASSERT( it.more() ); - FTSIteratorValue val = it.next(); - ASSERT_EQUALS( "foredrag", 
string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "foredragsholder", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "lector", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - } - - // Multi-language : test nested arrays - TEST( FTSElementIterator, Test4 ) { - - BSONObj obj = fromjson( - "{ language : \"english\"," - " a : [" - " { language : \"danish\"," - " b :" - " [ { c : [\"foredrag\"] }," - " { c : [\"foredragsholder\"] }," - " { c : [\"lector\"] } ]" - " } ]" - "}" ); - - BSONObj indexSpec = fromjson( - "{ key : { \"a.b.c\" : \"text\" }, weights : { \"a.b.c\" : 5 } }" ); - - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - FTSElementIterator it( spec, obj ); - - ASSERT( it.more() ); - FTSIteratorValue val = it.next(); - ASSERT_EQUALS( "foredrag", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "foredragsholder", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "lector", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - } - - // Multi-language : test wildcard spec - TEST( FTSElementIterator, Test5 ) { - - BSONObj obj = fromjson( - "{ language : \"english\"," - " b : \"these boots were made for walking\"," - " c : { e: \"I walked half way to the market before seeing the sunrise\" }," - " d : " - " { language : \"danish\"," - " e :" - " [ { f : \"foredrag\", g : 12 }," - " { f : \"foredragsholder\", g : 13 }," - " { f : \"lector\", g : 14 } ]" - " }" - "}" ); - - BSONObj 
indexSpec = fromjson( - "{ key : { a : \"text\" }, weights : { b : 20, c : 10, \"d.e.f\" : 5 } }" ); - - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - FTSElementIterator it( spec, obj ); - - ASSERT( it.more() ); - FTSIteratorValue val = it.next(); - ASSERT_EQUALS( "these boots were made for walking", string(val._text) ); - ASSERT_EQUALS( "english", val._language->str() ); - ASSERT_EQUALS( 20, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "foredrag", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "foredragsholder", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "lector", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - } - - // Multi-language : test wildcard spec - TEST( FTSElementIterator, Test6 ) { - - BSONObj obj = fromjson( - "{ language : \"english\"," - " b : \"these boots were made for walking\"," - " c : { e: \"I walked half way to the market before seeing the sunrise\" }," - " d : " - " { language : \"danish\"," - " e :" - " [ { f : \"foredrag\", g : 12 }," - " { f : \"foredragsholder\", g : 13 }," - " { f : \"lector\", g : 14 } ]" - " }" - "}" ); - - BSONObj indexSpec = fromjson( - "{ key : { a : \"text\" }, weights : { b : 20, c : 10, \"d.e.f\" : 5 } }" ); - - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - FTSElementIterator it( spec, obj ); - - ASSERT( it.more() ); - FTSIteratorValue val = it.next(); - ASSERT_EQUALS( "these boots were made for walking", string(val._text) ); - ASSERT_EQUALS( "english", val._language->str() ); - ASSERT_EQUALS( 20, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "foredrag", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, 
val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "foredragsholder", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - - ASSERT( it.more() ); - val = it.next(); - ASSERT_EQUALS( "lector", string(val._text) ); - ASSERT_EQUALS( "danish", val._language->str() ); - ASSERT_EQUALS( 5, val._weight ); - } - } +namespace fts { + +using std::string; + +TEST(FTSElementIterator, Test1) { + BSONObj obj = fromjson( + "{ b : \"walking\"," + " c : { e: \"walked\" }," + " d : \"walker\"" + " }"); + + BSONObj indexSpec = fromjson("{ key : { a : \"text\" }, weights : { b : 10, d : 5 } }"); + + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + Weights::const_iterator itt = spec.weights().begin(); + ASSERT(itt != spec.weights().end()); + ASSERT_EQUALS("a", itt->first); + ASSERT_EQUALS(1, itt->second); + ++itt; + ASSERT(itt != spec.weights().end()); + ASSERT_EQUALS("b", itt->first); + ASSERT_EQUALS(10, itt->second); + ++itt; + ASSERT(itt != spec.weights().end()); + ASSERT_EQUALS("d", itt->first); + ASSERT_EQUALS(5, itt->second); + ++itt; + + FTSElementIterator it(spec, obj); + + ASSERT(it.more()); + FTSIteratorValue val = it.next(); + ASSERT_EQUALS("walking", string(val._text)); + ASSERT_EQUALS("english", val._language->str()); + ASSERT_EQUALS(10, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("walker", string(val._text)); + ASSERT_EQUALS("english", val._language->str()); + ASSERT_EQUALS(5, val._weight); } +// Multi-language : test +TEST(FTSElementIterator, Test2) { + BSONObj obj = fromjson( + "{ a :" + " { b :" + " [ { c : \"walked\", language : \"english\" }," + " { c : \"camminato\", language : \"italian\" }," + " { c : \"ging\", language : \"german\" } ]" + " }," + " d : \"Feliz Año Nuevo!\"," + " language : \"spanish\"" + " }"); + + BSONObj indexSpec = fromjson("{ key : { \"a.b.c\" : \"text\", d : \"text\" } }"); + + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + + 
FTSElementIterator it(spec, obj); + + ASSERT(it.more()); + FTSIteratorValue val = it.next(); + ASSERT_EQUALS("walked", string(val._text)); + ASSERT_EQUALS("english", val._language->str()); + ASSERT_EQUALS(1, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("camminato", string(val._text)); + ASSERT_EQUALS("italian", val._language->str()); + ASSERT_EQUALS(1, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("ging", string(val._text)); + ASSERT_EQUALS("german", val._language->str()); + ASSERT_EQUALS(1, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("Feliz Año Nuevo!", string(val._text)); + ASSERT_EQUALS("spanish", val._language->str()); + ASSERT_EQUALS(1, val._weight); +} + +// Multi-language : test nested stemming per sub-document +TEST(FTSElementIterator, Test3) { + BSONObj obj = fromjson( + "{ language : \"english\"," + " a :" + " { language : \"danish\"," + " b :" + " [ { c : \"foredrag\" }," + " { c : \"foredragsholder\" }," + " { c : \"lector\" } ]" + " }" + "}"); + + BSONObj indexSpec = + fromjson("{ key : { a : \"text\", \"a.b.c\" : \"text\" }, weights : { \"a.b.c\" : 5 } }"); + + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + Weights::const_iterator itt = spec.weights().begin(); + ASSERT(itt != spec.weights().end()); + ASSERT_EQUALS("a", itt->first); + ASSERT_EQUALS(1, itt->second); + ++itt; + ASSERT(itt != spec.weights().end()); + ASSERT_EQUALS("a.b.c", itt->first); + ASSERT_EQUALS(5, itt->second); + + FTSElementIterator it(spec, obj); + + ASSERT(it.more()); + FTSIteratorValue val = it.next(); + ASSERT_EQUALS("foredrag", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("foredragsholder", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("lector", string(val._text)); + 
ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); +} + +// Multi-language : test nested arrays +TEST(FTSElementIterator, Test4) { + BSONObj obj = fromjson( + "{ language : \"english\"," + " a : [" + " { language : \"danish\"," + " b :" + " [ { c : [\"foredrag\"] }," + " { c : [\"foredragsholder\"] }," + " { c : [\"lector\"] } ]" + " } ]" + "}"); + + BSONObj indexSpec = fromjson("{ key : { \"a.b.c\" : \"text\" }, weights : { \"a.b.c\" : 5 } }"); + + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + FTSElementIterator it(spec, obj); + + ASSERT(it.more()); + FTSIteratorValue val = it.next(); + ASSERT_EQUALS("foredrag", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("foredragsholder", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("lector", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); +} + +// Multi-language : test wildcard spec +TEST(FTSElementIterator, Test5) { + BSONObj obj = fromjson( + "{ language : \"english\"," + " b : \"these boots were made for walking\"," + " c : { e: \"I walked half way to the market before seeing the sunrise\" }," + " d : " + " { language : \"danish\"," + " e :" + " [ { f : \"foredrag\", g : 12 }," + " { f : \"foredragsholder\", g : 13 }," + " { f : \"lector\", g : 14 } ]" + " }" + "}"); + + BSONObj indexSpec = + fromjson("{ key : { a : \"text\" }, weights : { b : 20, c : 10, \"d.e.f\" : 5 } }"); + + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + FTSElementIterator it(spec, obj); + + ASSERT(it.more()); + FTSIteratorValue val = it.next(); + ASSERT_EQUALS("these boots were made for walking", string(val._text)); + ASSERT_EQUALS("english", val._language->str()); + ASSERT_EQUALS(20, val._weight); + + ASSERT(it.more()); + val = it.next(); + 
ASSERT_EQUALS("foredrag", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("foredragsholder", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("lector", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); +} + +// Multi-language : test wildcard spec +TEST(FTSElementIterator, Test6) { + BSONObj obj = fromjson( + "{ language : \"english\"," + " b : \"these boots were made for walking\"," + " c : { e: \"I walked half way to the market before seeing the sunrise\" }," + " d : " + " { language : \"danish\"," + " e :" + " [ { f : \"foredrag\", g : 12 }," + " { f : \"foredragsholder\", g : 13 }," + " { f : \"lector\", g : 14 } ]" + " }" + "}"); + + BSONObj indexSpec = + fromjson("{ key : { a : \"text\" }, weights : { b : 20, c : 10, \"d.e.f\" : 5 } }"); + + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + FTSElementIterator it(spec, obj); + + ASSERT(it.more()); + FTSIteratorValue val = it.next(); + ASSERT_EQUALS("these boots were made for walking", string(val._text)); + ASSERT_EQUALS("english", val._language->str()); + ASSERT_EQUALS(20, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("foredrag", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("foredragsholder", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); + + ASSERT(it.more()); + val = it.next(); + ASSERT_EQUALS("lector", string(val._text)); + ASSERT_EQUALS("danish", val._language->str()); + ASSERT_EQUALS(5, val._weight); +} +} +} diff --git a/src/mongo/db/fts/fts_enabled.cpp b/src/mongo/db/fts/fts_enabled.cpp index b8e071bd62a..fb261194db1 100644 --- 
a/src/mongo/db/fts/fts_enabled.cpp +++ b/src/mongo/db/fts/fts_enabled.cpp @@ -35,46 +35,42 @@ #include "mongo/util/log.h" namespace mongo { - namespace fts { - namespace { +namespace fts { +namespace { - bool dummyEnabledFlag = true; // Unused, needed for server parameter. +bool dummyEnabledFlag = true; // Unused, needed for server parameter. - /** - * Declaration for the "textSearchEnabled" server parameter, which is now deprecated. - * Note that: - * - setting to true performs a no-op and logs a deprecation message. - * - setting to false will fail. - */ - class ExportedTextSearchEnabledParameter : public ExportedServerParameter<bool> { - public: - ExportedTextSearchEnabledParameter() : - ExportedServerParameter<bool>( ServerParameterSet::getGlobal(), - "textSearchEnabled", - &dummyEnabledFlag, - true, - true ) {} - - virtual Status validate( const bool& potentialNewValue ) { - if ( !potentialNewValue ) { - return Status( ErrorCodes::BadValue, - "textSearchEnabled cannot be set to false"); - } - - log() << "Attempted to set textSearchEnabled server parameter."; - log() << "Text search is enabled by default and cannot be disabled."; - log() << "The following are now deprecated and will be removed in a future " - << "release:"; - log() << "- the \"textSearchEnabled\" server parameter (setting it has no " - << "effect)"; - log() << "- the \"text\" command (has been replaced by the $text query " - "operator)"; +/** + * Declaration for the "textSearchEnabled" server parameter, which is now deprecated. + * Note that: + * - setting to true performs a no-op and logs a deprecation message. + * - setting to false will fail. 
+ */ +class ExportedTextSearchEnabledParameter : public ExportedServerParameter<bool> { +public: + ExportedTextSearchEnabledParameter() + : ExportedServerParameter<bool>( + ServerParameterSet::getGlobal(), "textSearchEnabled", &dummyEnabledFlag, true, true) { + } - return Status::OK(); - } + virtual Status validate(const bool& potentialNewValue) { + if (!potentialNewValue) { + return Status(ErrorCodes::BadValue, "textSearchEnabled cannot be set to false"); + } - } exportedTextSearchEnabledParam; + log() << "Attempted to set textSearchEnabled server parameter."; + log() << "Text search is enabled by default and cannot be disabled."; + log() << "The following are now deprecated and will be removed in a future " + << "release:"; + log() << "- the \"textSearchEnabled\" server parameter (setting it has no " + << "effect)"; + log() << "- the \"text\" command (has been replaced by the $text query " + "operator)"; - } + return Status::OK(); } + +} exportedTextSearchEnabledParam; +} +} } diff --git a/src/mongo/db/fts/fts_index_format.cpp b/src/mongo/db/fts/fts_index_format.cpp index b2311602248..d15cddfcb76 100644 --- a/src/mongo/db/fts/fts_index_format.cpp +++ b/src/mongo/db/fts/fts_index_format.cpp @@ -39,178 +39,168 @@ namespace mongo { - namespace fts { - - using std::string; - using std::vector; - - namespace { - BSONObj nullObj; - BSONElement nullElt; - - // New in textIndexVersion 2. - // If the term is longer than 32 characters, it may - // result in the generated key being too large - // for the index. In that case, we generate a 64-character key - // from the concatenation of the first 32 characters - // and the hex string of the murmur3 hash value of the entire - // term value. - const size_t termKeyPrefixLength = 32U; - // 128-bit hash value expressed in hex = 32 characters - const size_t termKeySuffixLength = 32U; - const size_t termKeyLength = termKeyPrefixLength + termKeySuffixLength; - - /** - * Returns size of buffer required to store term in index key. 
- * In version 1, terms are stored verbatim in key. - * In version 2, terms longer than 32 characters are hashed and combined - * with a prefix. - */ - int guessTermSize( const std::string& term, TextIndexVersion textIndexVersion ) { - if ( TEXT_INDEX_VERSION_1 == textIndexVersion ) { - return term.size(); - } - else { - invariant( TEXT_INDEX_VERSION_2 == textIndexVersion ); - if ( term.size() <= termKeyPrefixLength ) { - return term.size(); - } - return termKeyLength; - } - } - } +namespace fts { + +using std::string; +using std::vector; + +namespace { +BSONObj nullObj; +BSONElement nullElt; + +// New in textIndexVersion 2. +// If the term is longer than 32 characters, it may +// result in the generated key being too large +// for the index. In that case, we generate a 64-character key +// from the concatenation of the first 32 characters +// and the hex string of the murmur3 hash value of the entire +// term value. +const size_t termKeyPrefixLength = 32U; +// 128-bit hash value expressed in hex = 32 characters +const size_t termKeySuffixLength = 32U; +const size_t termKeyLength = termKeyPrefixLength + termKeySuffixLength; - MONGO_INITIALIZER( FTSIndexFormat )( InitializerContext* context ) { - BSONObjBuilder b; - b.appendNull( "" ); - nullObj = b.obj(); - nullElt = nullObj.firstElement(); - return Status::OK(); +/** + * Returns size of buffer required to store term in index key. + * In version 1, terms are stored verbatim in key. + * In version 2, terms longer than 32 characters are hashed and combined + * with a prefix. 
+ */ +int guessTermSize(const std::string& term, TextIndexVersion textIndexVersion) { + if (TEXT_INDEX_VERSION_1 == textIndexVersion) { + return term.size(); + } else { + invariant(TEXT_INDEX_VERSION_2 == textIndexVersion); + if (term.size() <= termKeyPrefixLength) { + return term.size(); } + return termKeyLength; + } +} +} - void FTSIndexFormat::getKeys( const FTSSpec& spec, - const BSONObj& obj, - BSONObjSet* keys ) { - - int extraSize = 0; - vector<BSONElement> extrasBefore; - vector<BSONElement> extrasAfter; - - // compute the non FTS key elements - for ( unsigned i = 0; i < spec.numExtraBefore(); i++ ) { - BSONElement e = obj.getFieldDotted(spec.extraBefore(i)); - if ( e.eoo() ) - e = nullElt; - uassert( 16675, "cannot have a multi-key as a prefix to a text index", - e.type() != Array ); - extrasBefore.push_back(e); - extraSize += e.size(); - } - for ( unsigned i = 0; i < spec.numExtraAfter(); i++ ) { - BSONElement e = obj.getFieldDotted(spec.extraAfter(i)); - if ( e.eoo() ) - e = nullElt; - extrasAfter.push_back(e); - extraSize += e.size(); - } - - - TermFrequencyMap term_freqs; - spec.scoreDocument( obj, &term_freqs ); - - // create index keys from raw scores - // only 1 per string - - uassert( 16732, - mongoutils::str::stream() << "too many unique keys for a single document to" - << " have a text index, max is " << term_freqs.size() << obj["_id"], - term_freqs.size() <= 400000 ); - - long long keyBSONSize = 0; - const int MaxKeyBSONSizeMB = 4; - - for ( TermFrequencyMap::const_iterator i = term_freqs.begin(); i != term_freqs.end(); ++i ) { - - const string& term = i->first; - double weight = i->second; - - // guess the total size of the btree entry based on the size of the weight, term tuple - int guess = - 5 /* bson overhead */ + - 10 /* weight */ + - 8 /* term overhead */ + - /* term size (could be truncated/hashed) */ - guessTermSize( term, spec.getTextIndexVersion() ) + - extraSize; - - BSONObjBuilder b(guess); // builds a BSON object with guess length. 
- for ( unsigned k = 0; k < extrasBefore.size(); k++ ) { - b.appendAs( extrasBefore[k], "" ); - } - _appendIndexKey( b, weight, term, spec.getTextIndexVersion() ); - for ( unsigned k = 0; k < extrasAfter.size(); k++ ) { - b.appendAs( extrasAfter[k], "" ); - } - BSONObj res = b.obj(); - - verify( guess >= res.objsize() ); - - keys->insert( res ); - keyBSONSize += res.objsize(); - - uassert( 16733, - mongoutils::str::stream() - << "trying to index text where term list is too big, max is " - << MaxKeyBSONSizeMB << "mb " << obj["_id"], - keyBSONSize <= ( MaxKeyBSONSizeMB * 1024 * 1024 ) ); - - } - } +MONGO_INITIALIZER(FTSIndexFormat)(InitializerContext* context) { + BSONObjBuilder b; + b.appendNull(""); + nullObj = b.obj(); + nullElt = nullObj.firstElement(); + return Status::OK(); +} + +void FTSIndexFormat::getKeys(const FTSSpec& spec, const BSONObj& obj, BSONObjSet* keys) { + int extraSize = 0; + vector<BSONElement> extrasBefore; + vector<BSONElement> extrasAfter; + + // compute the non FTS key elements + for (unsigned i = 0; i < spec.numExtraBefore(); i++) { + BSONElement e = obj.getFieldDotted(spec.extraBefore(i)); + if (e.eoo()) + e = nullElt; + uassert(16675, "cannot have a multi-key as a prefix to a text index", e.type() != Array); + extrasBefore.push_back(e); + extraSize += e.size(); + } + for (unsigned i = 0; i < spec.numExtraAfter(); i++) { + BSONElement e = obj.getFieldDotted(spec.extraAfter(i)); + if (e.eoo()) + e = nullElt; + extrasAfter.push_back(e); + extraSize += e.size(); + } + + + TermFrequencyMap term_freqs; + spec.scoreDocument(obj, &term_freqs); + + // create index keys from raw scores + // only 1 per string + + uassert(16732, + mongoutils::str::stream() << "too many unique keys for a single document to" + << " have a text index, max is " << term_freqs.size() + << obj["_id"], + term_freqs.size() <= 400000); + + long long keyBSONSize = 0; + const int MaxKeyBSONSizeMB = 4; - BSONObj FTSIndexFormat::getIndexKey( double weight, - const string& term, - 
const BSONObj& indexPrefix, - TextIndexVersion textIndexVersion ) { - BSONObjBuilder b; + for (TermFrequencyMap::const_iterator i = term_freqs.begin(); i != term_freqs.end(); ++i) { + const string& term = i->first; + double weight = i->second; - BSONObjIterator i( indexPrefix ); - while ( i.more() ) { - b.appendAs( i.next(), "" ); - } + // guess the total size of the btree entry based on the size of the weight, term tuple + int guess = 5 /* bson overhead */ + 10 /* weight */ + 8 /* term overhead */ + + /* term size (could be truncated/hashed) */ + guessTermSize(term, spec.getTextIndexVersion()) + extraSize; - _appendIndexKey( b, weight, term, textIndexVersion ); - return b.obj(); + BSONObjBuilder b(guess); // builds a BSON object with guess length. + for (unsigned k = 0; k < extrasBefore.size(); k++) { + b.appendAs(extrasBefore[k], ""); } + _appendIndexKey(b, weight, term, spec.getTextIndexVersion()); + for (unsigned k = 0; k < extrasAfter.size(); k++) { + b.appendAs(extrasAfter[k], ""); + } + BSONObj res = b.obj(); + + verify(guess >= res.objsize()); + + keys->insert(res); + keyBSONSize += res.objsize(); + + uassert(16733, + mongoutils::str::stream() + << "trying to index text where term list is too big, max is " + << MaxKeyBSONSizeMB << "mb " << obj["_id"], + keyBSONSize <= (MaxKeyBSONSizeMB * 1024 * 1024)); + } +} + +BSONObj FTSIndexFormat::getIndexKey(double weight, + const string& term, + const BSONObj& indexPrefix, + TextIndexVersion textIndexVersion) { + BSONObjBuilder b; - void FTSIndexFormat::_appendIndexKey( BSONObjBuilder& b, double weight, const string& term, - TextIndexVersion textIndexVersion ) { - verify( weight >= 0 && weight <= MAX_WEIGHT ); // FTSmaxweight = defined in fts_header - // Terms are added to index key verbatim. - if ( TEXT_INDEX_VERSION_1 == textIndexVersion ) { - b.append( "", term ); - b.append( "", weight ); - } - // See comments at the top of file for termKeyPrefixLength. 
- // Apply hash for text index version 2 to long terms (longer than 32 characters). - else { - invariant( TEXT_INDEX_VERSION_2 == textIndexVersion ); - if ( term.size() <= termKeyPrefixLength ) { - b.append( "", term ); - } - else { - union { - uint64_t hash[2]; - char data[16]; - } t; - uint32_t seed = 0; - MurmurHash3_x64_128( term.data(), term.size(), seed, t.hash ); - string keySuffix = mongo::toHexLower( t.data, sizeof( t.data ) ); - invariant( termKeySuffixLength == keySuffix.size() ); - b.append( "", term.substr( 0, termKeyPrefixLength ) + - keySuffix ); - } - b.append( "", weight ); - } + BSONObjIterator i(indexPrefix); + while (i.more()) { + b.appendAs(i.next(), ""); + } + + _appendIndexKey(b, weight, term, textIndexVersion); + return b.obj(); +} + +void FTSIndexFormat::_appendIndexKey(BSONObjBuilder& b, + double weight, + const string& term, + TextIndexVersion textIndexVersion) { + verify(weight >= 0 && weight <= MAX_WEIGHT); // FTSmaxweight = defined in fts_header + // Terms are added to index key verbatim. + if (TEXT_INDEX_VERSION_1 == textIndexVersion) { + b.append("", term); + b.append("", weight); + } + // See comments at the top of file for termKeyPrefixLength. + // Apply hash for text index version 2 to long terms (longer than 32 characters). 
+ else { + invariant(TEXT_INDEX_VERSION_2 == textIndexVersion); + if (term.size() <= termKeyPrefixLength) { + b.append("", term); + } else { + union { + uint64_t hash[2]; + char data[16]; + } t; + uint32_t seed = 0; + MurmurHash3_x64_128(term.data(), term.size(), seed, t.hash); + string keySuffix = mongo::toHexLower(t.data, sizeof(t.data)); + invariant(termKeySuffixLength == keySuffix.size()); + b.append("", term.substr(0, termKeyPrefixLength) + keySuffix); } + b.append("", weight); } } +} +} diff --git a/src/mongo/db/fts/fts_index_format.h b/src/mongo/db/fts/fts_index_format.h index 75084e822ad..0c8c7787bd1 100644 --- a/src/mongo/db/fts/fts_index_format.h +++ b/src/mongo/db/fts/fts_index_format.h @@ -34,38 +34,36 @@ namespace mongo { - namespace fts { +namespace fts { - class FTSIndexFormat { - public: +class FTSIndexFormat { +public: + static void getKeys(const FTSSpec& spec, const BSONObj& document, BSONObjSet* keys); - static void getKeys( const FTSSpec& spec, - const BSONObj& document, - BSONObjSet* keys ); + /* + * Helper method to get return entry from the FTSIndex as a BSONObj + * @param weight, the weight of the term in the entry + * @param term, the std::string term in the entry + * @param indexPrefix, the fields that go in the index first + * @param textIndexVersion, index version. affects key format. + */ + static BSONObj getIndexKey(double weight, + const std::string& term, + const BSONObj& indexPrefix, + TextIndexVersion textIndexVersion); - /* - * Helper method to get return entry from the FTSIndex as a BSONObj - * @param weight, the weight of the term in the entry - * @param term, the std::string term in the entry - * @param indexPrefix, the fields that go in the index first - * @param textIndexVersion, index version. affects key format. 
- */ - static BSONObj getIndexKey( double weight, - const std::string& term, - const BSONObj& indexPrefix, - TextIndexVersion textIndexVersion ); - - private: - /* - * Helper method to get return entry from the FTSIndex as a BSONObj - * @param b, reference to the BSONOBjBuilder - * @param weight, the weight of the term in the entry - * @param term, the std::string term in the entry - * @param textIndexVersion, index version. affects key format. - */ - static void _appendIndexKey( BSONObjBuilder& b, double weight, const std::string& term, - TextIndexVersion textIndexVersion ); - }; - - } +private: + /* + * Helper method to get return entry from the FTSIndex as a BSONObj + * @param b, reference to the BSONOBjBuilder + * @param weight, the weight of the term in the entry + * @param term, the std::string term in the entry + * @param textIndexVersion, index version. affects key format. + */ + static void _appendIndexKey(BSONObjBuilder& b, + double weight, + const std::string& term, + TextIndexVersion textIndexVersion); +}; +} } diff --git a/src/mongo/db/fts/fts_index_format_test.cpp b/src/mongo/db/fts/fts_index_format_test.cpp index 58d2e620b8d..0752dd58f74 100644 --- a/src/mongo/db/fts/fts_index_format_test.cpp +++ b/src/mongo/db/fts/fts_index_format_test.cpp @@ -41,165 +41,184 @@ namespace mongo { - namespace fts { - - using std::string; - - TEST( FTSIndexFormat, Simple1 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) ) ) ); - BSONObjSet keys; - FTSIndexFormat::getKeys( spec, BSON( "data" << "cat sat" ), &keys ); - - ASSERT_EQUALS( 2U, keys.size() ); - for ( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) { - BSONObj key = *i; - ASSERT_EQUALS( 2, key.nFields() ); - ASSERT_EQUALS( String, key.firstElement().type() ); - } - } - - TEST( FTSIndexFormat, ExtraBack1 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" << - "x" << 1 ) ) ) ); - BSONObjSet keys; - FTSIndexFormat::getKeys( spec, BSON( 
"data" << "cat" << "x" << 5 ), &keys ); - - ASSERT_EQUALS( 1U, keys.size() ); - BSONObj key = *(keys.begin()); - ASSERT_EQUALS( 3, key.nFields() ); - BSONObjIterator i( key ); - ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); - ASSERT( i.next().numberDouble() > 0 ); - ASSERT_EQUALS( 5, i.next().numberInt() ); - } +namespace fts { + +using std::string; + +TEST(FTSIndexFormat, Simple1) { + FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("data" + << "text")))); + BSONObjSet keys; + FTSIndexFormat::getKeys(spec, + BSON("data" + << "cat sat"), + &keys); + + ASSERT_EQUALS(2U, keys.size()); + for (BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i) { + BSONObj key = *i; + ASSERT_EQUALS(2, key.nFields()); + ASSERT_EQUALS(String, key.firstElement().type()); + } +} - /* - TEST( FTSIndexFormat, ExtraBackArray1 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" << - "x.y" << 1 ) ) ) ); - BSONObjSet keys; - FTSIndexFormat::getKeys( spec, - BSON( "data" << "cat" << - "x" << BSON_ARRAY( BSON( "y" << 1 ) << - BSON( "y" << 2 ) ) ), - &keys ); - - ASSERT_EQUALS( 1U, keys.size() ); - BSONObj key = *(keys.begin()); - log() << "e: " << key << endl; - ASSERT_EQUALS( 3, key.nFields() ); - BSONObjIterator i( key ); - ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); - ASSERT( i.next().numberDouble() > 0 ); - ASSERT_EQUALS( 5, i.next().numberInt() ); - } - */ - - TEST( FTSIndexFormat, ExtraFront1 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << 1 << - "data" << "text" ) ) ) ); - BSONObjSet keys; - FTSIndexFormat::getKeys( spec, BSON( "data" << "cat" << "x" << 5 ), &keys ); - - ASSERT_EQUALS( 1U, keys.size() ); - BSONObj key = *(keys.begin()); - ASSERT_EQUALS( 3, key.nFields() ); - BSONObjIterator i( key ); - ASSERT_EQUALS( 5, i.next().numberInt() ); - ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); - ASSERT( i.next().numberDouble() > 0 ); - } +TEST(FTSIndexFormat, ExtraBack1) { + FTSSpec 
spec(FTSSpec::fixSpec(BSON("key" << BSON("data" + << "text" + << "x" << 1)))); + BSONObjSet keys; + FTSIndexFormat::getKeys(spec, + BSON("data" + << "cat" + << "x" << 5), + &keys); + + ASSERT_EQUALS(1U, keys.size()); + BSONObj key = *(keys.begin()); + ASSERT_EQUALS(3, key.nFields()); + BSONObjIterator i(key); + ASSERT_EQUALS(StringData("cat"), i.next().valuestr()); + ASSERT(i.next().numberDouble() > 0); + ASSERT_EQUALS(5, i.next().numberInt()); +} - TEST( FTSIndexFormat, StopWords1 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) ) ) ); +/* +TEST( FTSIndexFormat, ExtraBackArray1 ) { + FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" << + "x.y" << 1 ) ) ) ); + BSONObjSet keys; + FTSIndexFormat::getKeys( spec, + BSON( "data" << "cat" << + "x" << BSON_ARRAY( BSON( "y" << 1 ) << + BSON( "y" << 2 ) ) ), + &keys ); + + ASSERT_EQUALS( 1U, keys.size() ); + BSONObj key = *(keys.begin()); + log() << "e: " << key << endl; + ASSERT_EQUALS( 3, key.nFields() ); + BSONObjIterator i( key ); + ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); + ASSERT( i.next().numberDouble() > 0 ); + ASSERT_EQUALS( 5, i.next().numberInt() ); +} +*/ - BSONObjSet keys1; - FTSIndexFormat::getKeys( spec, BSON( "data" << "computer" ), &keys1 ); - ASSERT_EQUALS( 1U, keys1.size() ); +TEST(FTSIndexFormat, ExtraFront1) { + FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("x" << 1 << "data" + << "text")))); + BSONObjSet keys; + FTSIndexFormat::getKeys(spec, + BSON("data" + << "cat" + << "x" << 5), + &keys); + + ASSERT_EQUALS(1U, keys.size()); + BSONObj key = *(keys.begin()); + ASSERT_EQUALS(3, key.nFields()); + BSONObjIterator i(key); + ASSERT_EQUALS(5, i.next().numberInt()); + ASSERT_EQUALS(StringData("cat"), i.next().valuestr()); + ASSERT(i.next().numberDouble() > 0); +} - BSONObjSet keys2; - FTSIndexFormat::getKeys( spec, BSON( "data" << "any computer" ), &keys2 ); - ASSERT_EQUALS( 1U, keys2.size() ); - } +TEST(FTSIndexFormat, StopWords1) { + 
FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("data" + << "text")))); + + BSONObjSet keys1; + FTSIndexFormat::getKeys(spec, + BSON("data" + << "computer"), + &keys1); + ASSERT_EQUALS(1U, keys1.size()); + + BSONObjSet keys2; + FTSIndexFormat::getKeys(spec, + BSON("data" + << "any computer"), + &keys2); + ASSERT_EQUALS(1U, keys2.size()); +} - /** - * Helper function to compare keys returned in getKeys() result - * with expected values. - */ - void assertEqualsIndexKeys( std::set<std::string>& expectedKeys, const BSONObjSet& keys ) { - ASSERT_EQUALS( expectedKeys.size(), keys.size() ); - for ( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) { - BSONObj key = *i; - ASSERT_EQUALS( 2, key.nFields() ); - ASSERT_EQUALS( String, key.firstElement().type() ); - string s = key.firstElement().String(); - std::set<string>::const_iterator j = expectedKeys.find(s); - if (j == expectedKeys.end()) { - mongoutils::str::stream ss; - ss << "unexpected key " << s << " in FTSIndexFormat::getKeys result. " - << "expected keys:"; - for (std::set<string>::const_iterator k = expectedKeys.begin(); - k != expectedKeys.end(); ++k) { - ss << "\n " << *k; - } - FAIL(ss); - } +/** + * Helper function to compare keys returned in getKeys() result + * with expected values. + */ +void assertEqualsIndexKeys(std::set<std::string>& expectedKeys, const BSONObjSet& keys) { + ASSERT_EQUALS(expectedKeys.size(), keys.size()); + for (BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i) { + BSONObj key = *i; + ASSERT_EQUALS(2, key.nFields()); + ASSERT_EQUALS(String, key.firstElement().type()); + string s = key.firstElement().String(); + std::set<string>::const_iterator j = expectedKeys.find(s); + if (j == expectedKeys.end()) { + mongoutils::str::stream ss; + ss << "unexpected key " << s << " in FTSIndexFormat::getKeys result. 
" + << "expected keys:"; + for (std::set<string>::const_iterator k = expectedKeys.begin(); k != expectedKeys.end(); + ++k) { + ss << "\n " << *k; } + FAIL(ss); } + } +} - /** - * Tests keys for long terms using text index version 1. - * Terms that are too long are not truncated in version 1. - */ - TEST( FTSIndexFormat, LongWordsTextIndexVersion1 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) << - "textIndexVersion" << 1 ) ) ); - BSONObjSet keys; - string longPrefix( 1024U, 'a' ); - // "aaa...aaacat" - string longWordCat = longPrefix + "cat"; - // "aaa...aaasat" - string longWordSat = longPrefix + "sat"; - string text = mongoutils::str::stream() << longWordCat << " " << longWordSat; - FTSIndexFormat::getKeys( spec, BSON( "data" << text ), &keys ); - - // Hard-coded expected computed keys for future-proofing. - std::set<string> expectedKeys; - // cat - expectedKeys.insert( longWordCat ); - // sat - expectedKeys.insert( longWordSat ); - - assertEqualsIndexKeys( expectedKeys, keys); - } - - /** - * Tests keys for long terms using text index version 2. - * In version 2, long terms (longer than 32 characters) - * are hashed with murmur3 and appended to the first 32 - * characters of the term to form the index key. - */ - TEST( FTSIndexFormat, LongWordTextIndexVersion2 ) { - FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) << - "textIndexVersion" << 2 ) ) ); - BSONObjSet keys; - string longPrefix( 1024U, 'a' ); - // "aaa...aaacat" - string longWordCat = longPrefix + "cat"; - // "aaa...aaasat" - string longWordSat = longPrefix + "sat"; - string text = mongoutils::str::stream() << longWordCat << " " << longWordSat; - FTSIndexFormat::getKeys( spec, BSON( "data" << text ), &keys ); - - // Hard-coded expected computed keys for future-proofing. 
- std::set<string> expectedKeys; - // cat - expectedKeys.insert( "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab8e78455d827ebb87cbe87f392bf45f6" ); - // sat - expectedKeys.insert( "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaf2d6f58bb3b81b97e611ae7ccac6dea7" ); - - assertEqualsIndexKeys( expectedKeys, keys); - } +/** + * Tests keys for long terms using text index version 1. + * Terms that are too long are not truncated in version 1. + */ +TEST(FTSIndexFormat, LongWordsTextIndexVersion1) { + FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("data" + << "text") << "textIndexVersion" << 1))); + BSONObjSet keys; + string longPrefix(1024U, 'a'); + // "aaa...aaacat" + string longWordCat = longPrefix + "cat"; + // "aaa...aaasat" + string longWordSat = longPrefix + "sat"; + string text = mongoutils::str::stream() << longWordCat << " " << longWordSat; + FTSIndexFormat::getKeys(spec, BSON("data" << text), &keys); + + // Hard-coded expected computed keys for future-proofing. + std::set<string> expectedKeys; + // cat + expectedKeys.insert(longWordCat); + // sat + expectedKeys.insert(longWordSat); + + assertEqualsIndexKeys(expectedKeys, keys); +} - } +/** + * Tests keys for long terms using text index version 2. + * In version 2, long terms (longer than 32 characters) + * are hashed with murmur3 and appended to the first 32 + * characters of the term to form the index key. + */ +TEST(FTSIndexFormat, LongWordTextIndexVersion2) { + FTSSpec spec(FTSSpec::fixSpec(BSON("key" << BSON("data" + << "text") << "textIndexVersion" << 2))); + BSONObjSet keys; + string longPrefix(1024U, 'a'); + // "aaa...aaacat" + string longWordCat = longPrefix + "cat"; + // "aaa...aaasat" + string longWordSat = longPrefix + "sat"; + string text = mongoutils::str::stream() << longWordCat << " " << longWordSat; + FTSIndexFormat::getKeys(spec, BSON("data" << text), &keys); + + // Hard-coded expected computed keys for future-proofing. 
+ std::set<string> expectedKeys; + // cat + expectedKeys.insert("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab8e78455d827ebb87cbe87f392bf45f6"); + // sat + expectedKeys.insert("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaf2d6f58bb3b81b97e611ae7ccac6dea7"); + + assertEqualsIndexKeys(expectedKeys, keys); +} +} } diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp index 21474038f06..f778a9ec821 100644 --- a/src/mongo/db/fts/fts_language.cpp +++ b/src/mongo/db/fts/fts_language.cpp @@ -31,7 +31,7 @@ #include "mongo/db/fts/fts_language.h" #include <string> - + #include "mongo/base/init.h" #include "mongo/util/assert_util.h" #include "mongo/util/mongoutils/str.h" @@ -40,223 +40,219 @@ namespace mongo { - namespace fts { - - namespace { +namespace fts { - /** - * Case-insensitive StringData comparator. - */ - struct LanguageStringCompare { - /** Returns true if lhs < rhs. */ - bool operator()( const StringData& lhs, const StringData& rhs ) const { - size_t minSize = std::min( lhs.size(), rhs.size() ); +namespace { - for ( size_t x = 0; x < minSize; x++ ) { - char a = tolower( lhs[x] ); - char b = tolower( rhs[x] ); - if ( a < b ) { - return true; - } - if ( a > b ) { - return false; - } - } +/** + * Case-insensitive StringData comparator. + */ +struct LanguageStringCompare { + /** Returns true if lhs < rhs. */ + bool operator()(const StringData& lhs, const StringData& rhs) const { + size_t minSize = std::min(lhs.size(), rhs.size()); - return lhs.size() < rhs.size(); - } - }; + for (size_t x = 0; x < minSize; x++) { + char a = tolower(lhs[x]); + char b = tolower(rhs[x]); + if (a < b) { + return true; + } + if (a > b) { + return false; + } + } - // Lookup table from user language string (case-insensitive) to FTSLanguage. Populated - // by initializers in group FTSAllLanguagesRegistered and initializer - // FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes only. 
- typedef std::map<StringData, const FTSLanguage*, LanguageStringCompare> LanguageMapV2; - LanguageMapV2 languageMapV2; + return lhs.size() < rhs.size(); + } +}; - // Like languageMapV2, but for use with TEXT_INDEX_VERSION_1 text indexes. - // Case-sensitive by lookup key. - typedef std::map<StringData, const FTSLanguage*> LanguageMapV1; - LanguageMapV1 languageMapV1; - } +// Lookup table from user language string (case-insensitive) to FTSLanguage. Populated +// by initializers in group FTSAllLanguagesRegistered and initializer +// FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes only. +typedef std::map<StringData, const FTSLanguage*, LanguageStringCompare> LanguageMapV2; +LanguageMapV2 languageMapV2; - MONGO_INITIALIZER_GROUP( FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, - MONGO_NO_DEPENDENTS ); +// Like languageMapV2, but for use with TEXT_INDEX_VERSION_1 text indexes. +// Case-sensitive by lookup key. +typedef std::map<StringData, const FTSLanguage*> LanguageMapV1; +LanguageMapV1 languageMapV1; +} - // - // Register supported languages' canonical names for TEXT_INDEX_VERSION_2. 
- // +MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS); - MONGO_FTS_LANGUAGE_DECLARE( languageNoneV2, "none", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDanishV2, "danish", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDutchV2, "dutch", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageEnglishV2, "english", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFinnishV2, "finnish", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFrenchV2, "french", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageGermanV2, "german", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageHungarianV2, "hungarian", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageItalianV2, "italian", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageNorwegianV2, "norwegian", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languagePortugueseV2, "portuguese", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRomanianV2, "romanian", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRussianV2, "russian", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSpanishV2, "spanish", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSwedishV2, "swedish", TEXT_INDEX_VERSION_2 ); - MONGO_FTS_LANGUAGE_DECLARE( languageTurkishV2, "turkish", TEXT_INDEX_VERSION_2 ); +// +// Register supported languages' canonical names for TEXT_INDEX_VERSION_2. +// - // - // Register all Snowball language modules for TEXT_INDEX_VERSION_1. Note that only the full - // names are recognized by the StopWords class (as such, the language string "dan" in - // TEXT_INDEX_VERSION_1 will generate the Danish stemmer and the empty stopword list). 
- // +MONGO_FTS_LANGUAGE_DECLARE(languageNoneV2, "none", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageDanishV2, "danish", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageDutchV2, "dutch", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageEnglishV2, "english", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageFinnishV2, "finnish", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageFrenchV2, "french", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageGermanV2, "german", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageHungarianV2, "hungarian", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageItalianV2, "italian", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageNorwegianV2, "norwegian", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languagePortugueseV2, "portuguese", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageRomanianV2, "romanian", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageRussianV2, "russian", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageSpanishV2, "spanish", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageSwedishV2, "swedish", TEXT_INDEX_VERSION_2); +MONGO_FTS_LANGUAGE_DECLARE(languageTurkishV2, "turkish", TEXT_INDEX_VERSION_2); - MONGO_FTS_LANGUAGE_DECLARE( languageNoneV1, "none", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDaV1, "da", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDanV1, "dan", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDanishV1, "danish", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDeV1, "de", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDeuV1, "deu", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDutV1, "dut", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageDutchV1, "dutch", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageEnV1, "en", TEXT_INDEX_VERSION_1 ); - 
MONGO_FTS_LANGUAGE_DECLARE( languageEngV1, "eng", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageEnglishV1, "english", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageEsV1, "es", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageEslV1, "esl", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFiV1, "fi", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFinV1, "fin", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFinnishV1, "finnish", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFrV1, "fr", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFraV1, "fra", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFreV1, "fre", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageFrenchV1, "french", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageGerV1, "ger", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageGermanV1, "german", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageHuV1, "hu", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageHunV1, "hun", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageHungarianV1, "hungarian", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageItV1, "it", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageItaV1, "ita", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageItalianV1, "italian", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageNlV1, "nl", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageNldV1, "nld", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageNoV1, "no", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageNorV1, "nor", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageNorwegianV1, "norwegian", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languagePorV1, "por", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languagePorterV1, 
"porter", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languagePortugueseV1, "portuguese", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languagePtV1, "pt", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRoV1, "ro", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRomanianV1, "romanian", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRonV1, "ron", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRuV1, "ru", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRumV1, "rum", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRusV1, "rus", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageRussianV1, "russian", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSpaV1, "spa", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSpanishV1, "spanish", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSvV1, "sv", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSweV1, "swe", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageSwedishV1, "swedish", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageTrV1, "tr", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageTurV1, "tur", TEXT_INDEX_VERSION_1 ); - MONGO_FTS_LANGUAGE_DECLARE( languageTurkishV1, "turkish", TEXT_INDEX_VERSION_1 ); +// +// Register all Snowball language modules for TEXT_INDEX_VERSION_1. Note that only the full +// names are recognized by the StopWords class (as such, the language string "dan" in +// TEXT_INDEX_VERSION_1 will generate the Danish stemmer and the empty stopword list). +// - MONGO_INITIALIZER_WITH_PREREQUISITES( FTSRegisterLanguageAliases, - ( "FTSAllLanguagesRegistered" ) ) - ( InitializerContext* context ) { - // Register language aliases for TEXT_INDEX_VERSION_2. 
- FTSLanguage::registerLanguageAlias( &languageDanishV2, "da", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageDutchV2, "nl", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageEnglishV2, "en", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageFinnishV2, "fi", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageFrenchV2, "fr", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageGermanV2, "de", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageHungarianV2, "hu", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageItalianV2, "it", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageNorwegianV2, "nb", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languagePortugueseV2, "pt", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageRomanianV2, "ro", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageRussianV2, "ru", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageSpanishV2, "es", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageSwedishV2, "sv", TEXT_INDEX_VERSION_2 ); - FTSLanguage::registerLanguageAlias( &languageTurkishV2, "tr", TEXT_INDEX_VERSION_2 ); - return Status::OK(); - } +MONGO_FTS_LANGUAGE_DECLARE(languageNoneV1, "none", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDaV1, "da", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDanV1, "dan", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDanishV1, "danish", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDeV1, "de", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDeuV1, "deu", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDutV1, "dut", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageDutchV1, "dutch", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageEnV1, "en", 
TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageEngV1, "eng", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageEnglishV1, "english", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageEsV1, "es", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageEslV1, "esl", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFiV1, "fi", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFinV1, "fin", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFinnishV1, "finnish", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFrV1, "fr", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFraV1, "fra", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFreV1, "fre", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageFrenchV1, "french", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageGerV1, "ger", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageGermanV1, "german", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageHuV1, "hu", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageHunV1, "hun", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageHungarianV1, "hungarian", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageItV1, "it", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageItaV1, "ita", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageItalianV1, "italian", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageNlV1, "nl", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageNldV1, "nld", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageNoV1, "no", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageNorV1, "nor", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageNorwegianV1, "norwegian", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languagePorV1, "por", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languagePorterV1, "porter", TEXT_INDEX_VERSION_1); 
+MONGO_FTS_LANGUAGE_DECLARE(languagePortugueseV1, "portuguese", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languagePtV1, "pt", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRoV1, "ro", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRomanianV1, "romanian", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRonV1, "ron", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRuV1, "ru", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRumV1, "rum", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRusV1, "rus", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageRussianV1, "russian", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageSpaV1, "spa", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageSpanishV1, "spanish", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageSvV1, "sv", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageSweV1, "swe", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageSwedishV1, "swedish", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageTrV1, "tr", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageTurV1, "tur", TEXT_INDEX_VERSION_1); +MONGO_FTS_LANGUAGE_DECLARE(languageTurkishV1, "turkish", TEXT_INDEX_VERSION_1); - // static - void FTSLanguage::registerLanguage( const StringData& languageName, - TextIndexVersion textIndexVersion, - FTSLanguage* language ) { - verify( !languageName.empty() ); - language->_canonicalName = languageName.toString(); - switch ( textIndexVersion ) { - case TEXT_INDEX_VERSION_2: - verify( languageMapV2.find( languageName ) == languageMapV2.end() ); - languageMapV2[ languageName ] = language; - return; - case TEXT_INDEX_VERSION_1: - verify( languageMapV1.find( languageName ) == languageMapV1.end() ); - languageMapV1[ languageName ] = language; - return; - } - verify( false ); - } +MONGO_INITIALIZER_WITH_PREREQUISITES(FTSRegisterLanguageAliases, ("FTSAllLanguagesRegistered")) 
+(InitializerContext* context) { + // Register language aliases for TEXT_INDEX_VERSION_2. + FTSLanguage::registerLanguageAlias(&languageDanishV2, "da", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageDutchV2, "nl", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageEnglishV2, "en", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageFinnishV2, "fi", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageFrenchV2, "fr", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageGermanV2, "de", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageHungarianV2, "hu", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageItalianV2, "it", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageNorwegianV2, "nb", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languagePortugueseV2, "pt", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageRomanianV2, "ro", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageRussianV2, "ru", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageSpanishV2, "es", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageSwedishV2, "sv", TEXT_INDEX_VERSION_2); + FTSLanguage::registerLanguageAlias(&languageTurkishV2, "tr", TEXT_INDEX_VERSION_2); + return Status::OK(); +} - // static - void FTSLanguage::registerLanguageAlias( const FTSLanguage* language, - const StringData& alias, - TextIndexVersion textIndexVersion ) { - switch ( textIndexVersion ) { - case TEXT_INDEX_VERSION_2: - verify( languageMapV2.find( alias ) == languageMapV2.end() ); - languageMapV2[ alias ] = language; - return; - case TEXT_INDEX_VERSION_1: - verify( languageMapV1.find( alias ) == languageMapV1.end() ); - languageMapV1[ alias ] = language; - return; - } - verify( false ); - } +// static +void FTSLanguage::registerLanguage(const StringData& languageName, + TextIndexVersion 
textIndexVersion, + FTSLanguage* language) { + verify(!languageName.empty()); + language->_canonicalName = languageName.toString(); + switch (textIndexVersion) { + case TEXT_INDEX_VERSION_2: + verify(languageMapV2.find(languageName) == languageMapV2.end()); + languageMapV2[languageName] = language; + return; + case TEXT_INDEX_VERSION_1: + verify(languageMapV1.find(languageName) == languageMapV1.end()); + languageMapV1[languageName] = language; + return; + } + verify(false); +} - FTSLanguage::FTSLanguage() : _canonicalName() { - } +// static +void FTSLanguage::registerLanguageAlias(const FTSLanguage* language, + const StringData& alias, + TextIndexVersion textIndexVersion) { + switch (textIndexVersion) { + case TEXT_INDEX_VERSION_2: + verify(languageMapV2.find(alias) == languageMapV2.end()); + languageMapV2[alias] = language; + return; + case TEXT_INDEX_VERSION_1: + verify(languageMapV1.find(alias) == languageMapV1.end()); + languageMapV1[alias] = language; + return; + } + verify(false); +} - const std::string& FTSLanguage::str() const { - verify( !_canonicalName.empty() ); - return _canonicalName; - } +FTSLanguage::FTSLanguage() : _canonicalName() {} - // static - StatusWithFTSLanguage FTSLanguage::make( const StringData& langName, - TextIndexVersion textIndexVersion ) { - switch ( textIndexVersion ) { - case TEXT_INDEX_VERSION_2: { - LanguageMapV2::const_iterator it = languageMapV2.find( langName ); - if ( it == languageMapV2.end() ) { - // TEXT_INDEX_VERSION_2 rejects unrecognized language strings. 
- Status status = Status( ErrorCodes::BadValue, - mongoutils::str::stream() << - "unsupported language: \"" << langName << - "\"" ); - return StatusWithFTSLanguage( status ); - } +const std::string& FTSLanguage::str() const { + verify(!_canonicalName.empty()); + return _canonicalName; +} - return StatusWithFTSLanguage( it->second ); - } - case TEXT_INDEX_VERSION_1: { - LanguageMapV1::const_iterator it = languageMapV1.find( langName ); - if ( it == languageMapV1.end() ) { - // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none". - return StatusWithFTSLanguage( &languageNoneV1 ); - } - return StatusWithFTSLanguage( it->second ); - } +// static +StatusWithFTSLanguage FTSLanguage::make(const StringData& langName, + TextIndexVersion textIndexVersion) { + switch (textIndexVersion) { + case TEXT_INDEX_VERSION_2: { + LanguageMapV2::const_iterator it = languageMapV2.find(langName); + if (it == languageMapV2.end()) { + // TEXT_INDEX_VERSION_2 rejects unrecognized language strings. + Status status = Status(ErrorCodes::BadValue, + mongoutils::str::stream() << "unsupported language: \"" + << langName << "\""); + return StatusWithFTSLanguage(status); } - verify( false ); - return StatusWithFTSLanguage( Status::OK() ); + return StatusWithFTSLanguage(it->second); + } + case TEXT_INDEX_VERSION_1: { + LanguageMapV1::const_iterator it = languageMapV1.find(langName); + if (it == languageMapV1.end()) { + // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none". 
+ return StatusWithFTSLanguage(&languageNoneV1); + } + return StatusWithFTSLanguage(it->second); } } + + verify(false); + return StatusWithFTSLanguage(Status::OK()); +} +} } diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h index 5877c8a2756..87f2a7c5529 100644 --- a/src/mongo/db/fts/fts_language.h +++ b/src/mongo/db/fts/fts_language.h @@ -37,92 +37,91 @@ namespace mongo { - namespace fts { - - #define MONGO_FTS_LANGUAGE_DECLARE( language, name, version ) \ - FTSLanguage language; \ - MONGO_INITIALIZER_GENERAL( language, MONGO_NO_PREREQUISITES, \ - ( "FTSAllLanguagesRegistered" ) ) \ - ( ::mongo::InitializerContext* context ) { \ - FTSLanguage::registerLanguage( name, version, &language ); \ - return Status::OK(); \ - } - - /** - * A FTSLanguage represents a language for a text-indexed document or a text search. - * FTSLanguage objects are not copyable. - * - * Recommended usage: - * - * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_2 ); - * if ( !swl.getStatus().isOK() ) { - * // Error. - * } - * else { - * const FTSLanguage* language = swl.getValue(); - * // Use language. - * } - */ - class FTSLanguage { - // Use make() instead of copying. - MONGO_DISALLOW_COPYING( FTSLanguage ); - public: - /** Create an uninitialized language. */ - FTSLanguage(); - - /** - * Returns the language as a std::string in canonical form (lowercased English name). It is - * an error to call str() on an uninitialized language. - */ - const std::string& str() const; - - /** - * Register std::string 'languageName' as a new language with text index version - * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'. - * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language - * string. 
- */ - static void registerLanguage( const StringData& languageName, - TextIndexVersion textIndexVersion, - FTSLanguage *languageOut ); - - /** - * Register 'alias' as an alias for 'language' with text index version - * 'textIndexVersion'. Subsequent calls to FTSLanguage::make() will recognize the - * newly-registered alias. - */ - static void registerLanguageAlias( const FTSLanguage* language, - const StringData& alias, - TextIndexVersion textIndexVersion ); - - /** - * Return the FTSLanguage associated with the given language string. Returns an error - * Status if an invalid language std::string is passed. - * - * For textIndexVersion=TEXT_INDEX_VERSION_2, language strings are - * case-insensitive, and need to be in one of the two following forms: - * - English name, like "spanish". - * - Two-letter code, like "es". - * - * For textIndexVersion=TEXT_INDEX_VERSION_1, no validation or normalization of - * language strings is performed. This is necessary to preserve indexing behavior for - * documents with language strings like "en": for compatibility, text data in these - * documents needs to be processed with the English stemmer and the empty stopword list - * (since "en" is recognized by Snowball but not the stopword processing logic). - */ - static StatusWith<const FTSLanguage*> make( const StringData& langName, - TextIndexVersion textIndexVersion ); - - private: - // std::string representation of language in canonical form. 
- std::string _canonicalName; - }; - - typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage; - - extern FTSLanguage languagePorterV1; - extern FTSLanguage languageEnglishV2; - extern FTSLanguage languageFrenchV2; - +namespace fts { + +#define MONGO_FTS_LANGUAGE_DECLARE(language, name, version) \ + FTSLanguage language; \ + MONGO_INITIALIZER_GENERAL(language, MONGO_NO_PREREQUISITES, ("FTSAllLanguagesRegistered")) \ + (::mongo::InitializerContext * context) { \ + FTSLanguage::registerLanguage(name, version, &language); \ + return Status::OK(); \ } + +/** + * A FTSLanguage represents a language for a text-indexed document or a text search. + * FTSLanguage objects are not copyable. + * + * Recommended usage: + * + * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_2 ); + * if ( !swl.getStatus().isOK() ) { + * // Error. + * } + * else { + * const FTSLanguage* language = swl.getValue(); + * // Use language. + * } + */ +class FTSLanguage { + // Use make() instead of copying. + MONGO_DISALLOW_COPYING(FTSLanguage); + +public: + /** Create an uninitialized language. */ + FTSLanguage(); + + /** + * Returns the language as a std::string in canonical form (lowercased English name). It is + * an error to call str() on an uninitialized language. + */ + const std::string& str() const; + + /** + * Register std::string 'languageName' as a new language with text index version + * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'. + * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language + * string. + */ + static void registerLanguage(const StringData& languageName, + TextIndexVersion textIndexVersion, + FTSLanguage* languageOut); + + /** + * Register 'alias' as an alias for 'language' with text index version + * 'textIndexVersion'. Subsequent calls to FTSLanguage::make() will recognize the + * newly-registered alias. 
+ */ + static void registerLanguageAlias(const FTSLanguage* language, + const StringData& alias, + TextIndexVersion textIndexVersion); + + /** + * Return the FTSLanguage associated with the given language string. Returns an error + * Status if an invalid language std::string is passed. + * + * For textIndexVersion=TEXT_INDEX_VERSION_2, language strings are + * case-insensitive, and need to be in one of the two following forms: + * - English name, like "spanish". + * - Two-letter code, like "es". + * + * For textIndexVersion=TEXT_INDEX_VERSION_1, no validation or normalization of + * language strings is performed. This is necessary to preserve indexing behavior for + * documents with language strings like "en": for compatibility, text data in these + * documents needs to be processed with the English stemmer and the empty stopword list + * (since "en" is recognized by Snowball but not the stopword processing logic). + */ + static StatusWith<const FTSLanguage*> make(const StringData& langName, + TextIndexVersion textIndexVersion); + +private: + // std::string representation of language in canonical form. + std::string _canonicalName; +}; + +typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage; + +extern FTSLanguage languagePorterV1; +extern FTSLanguage languageEnglishV2; +extern FTSLanguage languageFrenchV2; +} } diff --git a/src/mongo/db/fts/fts_language_test.cpp b/src/mongo/db/fts/fts_language_test.cpp index 0fb46ef2df7..c24f02ff7fd 100644 --- a/src/mongo/db/fts/fts_language_test.cpp +++ b/src/mongo/db/fts/fts_language_test.cpp @@ -35,103 +35,102 @@ namespace mongo { - namespace fts { - - // Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2. 
- - TEST( FTSLanguageV2, ExactLanguage ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "spanish", TEXT_INDEX_VERSION_2 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); - } - - TEST( FTSLanguageV2, ExactCode ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "es", TEXT_INDEX_VERSION_2 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); - } - - TEST( FTSLanguageV2, UpperCaseLanguage ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "SPANISH", TEXT_INDEX_VERSION_2 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); - } - - TEST( FTSLanguageV2, UpperCaseCode ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "ES", TEXT_INDEX_VERSION_2 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); - } - - TEST( FTSLanguageV2, NoneLanguage ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "none", TEXT_INDEX_VERSION_2 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "none" ); - } - - // Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2. - - TEST( FTSLanguageV2, Unknown ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "spanglish", TEXT_INDEX_VERSION_2 ); - ASSERT( !swl.getStatus().isOK() ); - } - - TEST( FTSLanguageV2, Empty ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "", TEXT_INDEX_VERSION_2 ); - ASSERT( !swl.getStatus().isOK() ); - } - - // Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1. 
- - TEST( FTSLanguageV1, ExactLanguage ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "spanish", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); - } - - TEST( FTSLanguageV1, DeprecatedLanguage ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "porter", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "porter" ); - } - - TEST( FTSLanguageV1, StemmerOnlyLanguage1 ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "en" ); - } - - TEST( FTSLanguageV1, StemmerOnlyLanguage2 ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "eng", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "eng" ); - } - - TEST( FTSLanguageV1, NoneLanguage ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "none", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "none" ); - } - - // Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1. - - TEST( FTSLanguageV1, CaseSensitive ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "SPANISH", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "none" ); - } - - TEST( FTSLanguageV1, Unknown ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "asdf", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "none" ); - } - - TEST( FTSLanguageV1, Empty ) { - StatusWithFTSLanguage swl = FTSLanguage::make( "", TEXT_INDEX_VERSION_1 ); - ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue()->str(), "none" ); - } - - } +namespace fts { + +// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2. 
+ +TEST(FTSLanguageV2, ExactLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_2); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV2, ExactCode) { + StatusWithFTSLanguage swl = FTSLanguage::make("es", TEXT_INDEX_VERSION_2); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV2, UpperCaseLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_2); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV2, UpperCaseCode) { + StatusWithFTSLanguage swl = FTSLanguage::make("ES", TEXT_INDEX_VERSION_2); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV2, NoneLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_2); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "none"); +} + +// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2. + +TEST(FTSLanguageV2, Unknown) { + StatusWithFTSLanguage swl = FTSLanguage::make("spanglish", TEXT_INDEX_VERSION_2); + ASSERT(!swl.getStatus().isOK()); +} + +TEST(FTSLanguageV2, Empty) { + StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_2); + ASSERT(!swl.getStatus().isOK()); +} + +// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1. 
+ +TEST(FTSLanguageV1, ExactLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV1, DeprecatedLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("porter", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "porter"); +} + +TEST(FTSLanguageV1, StemmerOnlyLanguage1) { + StatusWithFTSLanguage swl = FTSLanguage::make("en", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "en"); +} + +TEST(FTSLanguageV1, StemmerOnlyLanguage2) { + StatusWithFTSLanguage swl = FTSLanguage::make("eng", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "eng"); +} + +TEST(FTSLanguageV1, NoneLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "none"); +} + +// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1. 
+ +TEST(FTSLanguageV1, CaseSensitive) { + StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "none"); +} + +TEST(FTSLanguageV1, Unknown) { + StatusWithFTSLanguage swl = FTSLanguage::make("asdf", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "none"); +} + +TEST(FTSLanguageV1, Empty) { + StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_1); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "none"); +} +} } diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp index e4a38726011..daa258be1c8 100644 --- a/src/mongo/db/fts/fts_matcher.cpp +++ b/src/mongo/db/fts/fts_matcher.cpp @@ -36,101 +36,96 @@ namespace mongo { - namespace fts { +namespace fts { - using std::string; +using std::string; - FTSMatcher::FTSMatcher( const FTSQuery& query, const FTSSpec& spec ) - : _query( query ), - _spec( spec ) { - } +FTSMatcher::FTSMatcher(const FTSQuery& query, const FTSSpec& spec) : _query(query), _spec(spec) {} - /* - * Checks if the obj contains any of the negTerms, if so returns true, otherwise false - * @param obj, object to be checked - */ - bool FTSMatcher::hasNegativeTerm(const BSONObj& obj ) const { - // called during search. deals with the case in which we have a term - // flagged for exclusion, i.e. "hello -world" we want to remove all - // results that include "world" +/* + * Checks if the obj contains any of the negTerms, if so returns true, otherwise false + * @param obj, object to be checked + */ +bool FTSMatcher::hasNegativeTerm(const BSONObj& obj) const { + // called during search. deals with the case in which we have a term + // flagged for exclusion, i.e. 
"hello -world" we want to remove all + // results that include "world" - if ( _query.getNegatedTerms().size() == 0 ) { - return false; - } + if (_query.getNegatedTerms().size() == 0) { + return false; + } - FTSElementIterator it( _spec, obj); + FTSElementIterator it(_spec, obj); - while ( it.more() ) { - FTSIteratorValue val = it.next(); - if (_hasNegativeTerm_string( val._language, val._text )) { - return true; - } - } + while (it.more()) { + FTSIteratorValue val = it.next(); + if (_hasNegativeTerm_string(val._language, val._text)) { + return true; + } + } + + return false; +} +/* + * Checks if any of the negTerms is in the tokenized string + * @param raw, the raw string to be tokenized + */ +bool FTSMatcher::_hasNegativeTerm_string(const FTSLanguage* language, const string& raw) const { + Tokenizer i(*language, raw); + Stemmer stemmer(*language); + while (i.more()) { + Token t = i.next(); + if (t.type != Token::TEXT) + continue; + string word = stemmer.stem(tolowerString(t.data)); + if (_query.getNegatedTerms().count(word) > 0) + return true; + } + return false; +} + +bool FTSMatcher::phrasesMatch(const BSONObj& obj) const { + for (unsigned i = 0; i < _query.getPhr().size(); i++) { + if (!phraseMatch(_query.getPhr()[i], obj)) { return false; } + } - /* - * Checks if any of the negTerms is in the tokenized string - * @param raw, the raw string to be tokenized - */ - bool FTSMatcher::_hasNegativeTerm_string( const FTSLanguage* language, - const string& raw ) const { - - Tokenizer i( *language, raw ); - Stemmer stemmer( *language ); - while ( i.more() ) { - Token t = i.next(); - if ( t.type != Token::TEXT ) - continue; - string word = stemmer.stem( tolowerString( t.data ) ); - if ( _query.getNegatedTerms().count( word ) > 0 ) - return true; - } + for (unsigned i = 0; i < _query.getNegatedPhr().size(); i++) { + if (phraseMatch(_query.getNegatedPhr()[i], obj)) { return false; } + } - bool FTSMatcher::phrasesMatch( const BSONObj& obj ) const { - for (unsigned i = 0; i 
< _query.getPhr().size(); i++ ) { - if ( !phraseMatch( _query.getPhr()[i], obj ) ) { - return false; - } - } - - for (unsigned i = 0; i < _query.getNegatedPhr().size(); i++ ) { - if ( phraseMatch( _query.getNegatedPhr()[i], obj ) ) { - return false; - } - } + return true; +} +/** + * Checks if phrase is exactly matched in obj, returns true if so, false otherwise + * @param phrase, the string to be matched + * @param obj, document in the collection to match against + */ +bool FTSMatcher::phraseMatch(const string& phrase, const BSONObj& obj) const { + FTSElementIterator it(_spec, obj); + + while (it.more()) { + FTSIteratorValue val = it.next(); + if (_phraseMatches(phrase, val._text)) { return true; } + } - /** - * Checks if phrase is exactly matched in obj, returns true if so, false otherwise - * @param phrase, the string to be matched - * @param obj, document in the collection to match against - */ - bool FTSMatcher::phraseMatch( const string& phrase, const BSONObj& obj ) const { - FTSElementIterator it( _spec, obj); - - while ( it.more() ) { - FTSIteratorValue val = it.next(); - if (_phraseMatches( phrase, val._text )) { - return true; - } - } - - return false; - } + return false; +} - /* - * Looks for phrase in a raw string - * @param phrase, phrase to match - * @param haystack, raw string to be parsed - */ - bool FTSMatcher::_phraseMatches( const string& phrase, const string& haystack ) const { - return strcasestr( haystack.c_str(), phrase.c_str() ) != NULL; - } - } +/* + * Looks for phrase in a raw string + * @param phrase, phrase to match + * @param haystack, raw string to be parsed + */ +bool FTSMatcher::_phraseMatches(const string& phrase, const string& haystack) const { + return strcasestr(haystack.c_str(), phrase.c_str()) != NULL; +} +} } diff --git a/src/mongo/db/fts/fts_matcher.h b/src/mongo/db/fts/fts_matcher.h index 3e3b971ddc3..ad5c26d8969 100644 --- a/src/mongo/db/fts/fts_matcher.h +++ b/src/mongo/db/fts/fts_matcher.h @@ -36,42 +36,41 @@ namespace 
mongo { - namespace fts { +namespace fts { - class FTSMatcher { - public: - FTSMatcher( const FTSQuery& query, const FTSSpec& spec ); +class FTSMatcher { +public: + FTSMatcher(const FTSQuery& query, const FTSSpec& spec); - /** - * @return true if obj has a negated term - */ - bool hasNegativeTerm(const BSONObj& obj ) const; + /** + * @return true if obj has a negated term + */ + bool hasNegativeTerm(const BSONObj& obj) const; - /** - * @return true if obj is ok by all phrases - * so all full phrases and no negated - */ - bool phrasesMatch( const BSONObj& obj ) const; - bool phraseMatch( const std::string& phrase, const BSONObj& obj ) const; + /** + * @return true if obj is ok by all phrases + * so all full phrases and no negated + */ + bool phrasesMatch(const BSONObj& obj) const; + bool phraseMatch(const std::string& phrase, const BSONObj& obj) const; - bool matchesNonTerm( const BSONObj& obj ) const { - return !hasNegativeTerm( obj ) && phrasesMatch( obj ); - } - - private: - /** - * @return true if raw has a negated term - */ - bool _hasNegativeTerm_string( const FTSLanguage* language, const std::string& raw ) const; + bool matchesNonTerm(const BSONObj& obj) const { + return !hasNegativeTerm(obj) && phrasesMatch(obj); + } - /** - * @return true if raw has a phrase - */ - bool _phraseMatches( const std::string& phrase, const std::string& raw ) const; +private: + /** + * @return true if raw has a negated term + */ + bool _hasNegativeTerm_string(const FTSLanguage* language, const std::string& raw) const; - FTSQuery _query; - FTSSpec _spec; - }; + /** + * @return true if raw has a phrase + */ + bool _phraseMatches(const std::string& phrase, const std::string& raw) const; - } + FTSQuery _query; + FTSSpec _spec; +}; +} } diff --git a/src/mongo/db/fts/fts_matcher_test.cpp b/src/mongo/db/fts/fts_matcher_test.cpp index 0201ed4ba09..80518d0bc68 100644 --- a/src/mongo/db/fts/fts_matcher_test.cpp +++ b/src/mongo/db/fts/fts_matcher_test.cpp @@ -34,66 +34,83 @@ #include 
"mongo/unittest/unittest.h" namespace mongo { - namespace fts { +namespace fts { - TEST( FTSMatcher, NegWild1 ) { - FTSQuery q; - ASSERT_OK( q.parse( "foo -bar", "english", TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "text" ) ) ) ) ); +TEST(FTSMatcher, NegWild1) { + FTSQuery q; + ASSERT_OK(q.parse("foo -bar", "english", TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**" + << "text"))))); - ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) ); - ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) ); - } - - // Regression test for SERVER-11994. - TEST( FTSMatcher, NegWild2 ) { - FTSQuery q; - ASSERT_OK( q.parse( "pizza -restaurant", "english", TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "text" ) ) ) ) ); + ASSERT(m.hasNegativeTerm(BSON("x" << BSON("y" + << "bar")))); + ASSERT(m.hasNegativeTerm(BSON("x" << BSON("y" + << "bar")))); +} - ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "pizza restaurant" ) ) ) ); - ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "PIZZA RESTAURANT" ) ) ) ); - } +// Regression test for SERVER-11994. 
+TEST(FTSMatcher, NegWild2) { + FTSQuery q; + ASSERT_OK(q.parse("pizza -restaurant", "english", TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**" + << "text"))))); - TEST( FTSMatcher, Phrase1 ) { - FTSQuery q; - ASSERT_OK( q.parse( "foo \"table top\"", "english", TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "text" ) ) ) ) ); - - ASSERT( m.phraseMatch( "table top", BSON( "x" << "table top" ) ) ); - ASSERT( m.phraseMatch( "table top", BSON( "x" << " asd table top asd" ) ) ); - ASSERT( !m.phraseMatch( "table top", BSON( "x" << "tablz top" ) ) ); - ASSERT( !m.phraseMatch( "table top", BSON( "x" << " asd tablz top asd" ) ) ); + ASSERT(m.hasNegativeTerm(BSON("x" << BSON("y" + << "pizza restaurant")))); + ASSERT(m.hasNegativeTerm(BSON("x" << BSON("y" + << "PIZZA RESTAURANT")))); +} - ASSERT( m.phrasesMatch( BSON( "x" << "table top" ) ) ); - ASSERT( !m.phrasesMatch( BSON( "x" << "table a top" ) ) ); +TEST(FTSMatcher, Phrase1) { + FTSQuery q; + ASSERT_OK(q.parse("foo \"table top\"", "english", TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**" + << "text"))))); - } + ASSERT(m.phraseMatch("table top", + BSON("x" + << "table top"))); + ASSERT(m.phraseMatch("table top", + BSON("x" + << " asd table top asd"))); + ASSERT(!m.phraseMatch("table top", + BSON("x" + << "tablz top"))); + ASSERT(!m.phraseMatch("table top", + BSON("x" + << " asd tablz top asd"))); - TEST( FTSMatcher, Phrase2 ) { - FTSQuery q; - ASSERT_OK( q.parse( "foo \"table top\"", "english", TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) ); - ASSERT( m.phraseMatch( "table top", - BSON( "x" << BSON_ARRAY( "table top" ) ) ) ); - } + ASSERT(m.phrasesMatch(BSON("x" + << "table top"))); + ASSERT(!m.phrasesMatch(BSON("x" + << "table a top"))); +} - // Test that the matcher parses the document with the document 
language, not the search - // language. - TEST( FTSMatcher, ParsesUsingDocLanguage ) { - FTSQuery q; - ASSERT_OK( q.parse( "-glad", "none", TEXT_INDEX_VERSION_2 ) ); - FTSMatcher m( q, - FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "text" ) ) ) ) ); +TEST(FTSMatcher, Phrase2) { + FTSQuery q; + ASSERT_OK(q.parse("foo \"table top\"", "english", TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" + << "text"))))); + ASSERT(m.phraseMatch("table top", BSON("x" << BSON_ARRAY("table top")))); +} - // Even though the search language is "none", the document {x: "gladly"} should be - // parsed using the English stemmer, and as such should match the negated term "glad". - ASSERT( m.hasNegativeTerm( BSON( "x" << "gladly" ) ) ); - } +// Test that the matcher parses the document with the document language, not the search +// language. +TEST(FTSMatcher, ParsesUsingDocLanguage) { + FTSQuery q; + ASSERT_OK(q.parse("-glad", "none", TEXT_INDEX_VERSION_2)); + FTSMatcher m(q, + FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" + << "text"))))); - } + // Even though the search language is "none", the document {x: "gladly"} should be + // parsed using the English stemmer, and as such should match the negated term "glad". 
+ ASSERT(m.hasNegativeTerm(BSON("x" + << "gladly"))); +} +} } diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp index 82300215b45..67a6d6cc8cb 100644 --- a/src/mongo/db/fts/fts_query.cpp +++ b/src/mongo/db/fts/fts_query.cpp @@ -38,164 +38,158 @@ namespace mongo { - namespace fts { - - using namespace mongoutils; - - using std::set; - using std::string; - using std::stringstream; - using std::vector; - - Status FTSQuery::parse(const string& query, const StringData& language, - TextIndexVersion textIndexVersion) { - _search = query; - StatusWithFTSLanguage swl = FTSLanguage::make( language, textIndexVersion ); - if ( !swl.getStatus().isOK() ) { - return swl.getStatus(); - } - _language = swl.getValue(); +namespace fts { + +using namespace mongoutils; + +using std::set; +using std::string; +using std::stringstream; +using std::vector; + +Status FTSQuery::parse(const string& query, + const StringData& language, + TextIndexVersion textIndexVersion) { + _search = query; + StatusWithFTSLanguage swl = FTSLanguage::make(language, textIndexVersion); + if (!swl.getStatus().isOK()) { + return swl.getStatus(); + } + _language = swl.getValue(); - const StopWords* stopWords = StopWords::getStopWords( *_language ); - Stemmer stemmer( *_language ); + const StopWords* stopWords = StopWords::getStopWords(*_language); + Stemmer stemmer(*_language); - bool inNegation = false; - bool inPhrase = false; + bool inNegation = false; + bool inPhrase = false; - unsigned quoteOffset = 0; + unsigned quoteOffset = 0; - Tokenizer i( *_language, query ); - while ( i.more() ) { - Token t = i.next(); + Tokenizer i(*_language, query); + while (i.more()) { + Token t = i.next(); - if ( t.type == Token::TEXT ) { - string s = t.data.toString(); + if (t.type == Token::TEXT) { + string s = t.data.toString(); - if ( inPhrase && inNegation ) { - // don't add term - } - else { - _addTerm( stopWords, stemmer, s, inNegation ); - } + if (inPhrase && inNegation) { + // don't add term + } 
else { + _addTerm(stopWords, stemmer, s, inNegation); + } - if ( inNegation && !inPhrase ) - inNegation = false; + if (inNegation && !inPhrase) + inNegation = false; + } else if (t.type == Token::DELIMITER) { + char c = t.data[0]; + if (c == '-') { + if (!inPhrase && t.previousWhiteSpace) { + // phrases can be negated, and terms not in phrases can be negated. + // terms in phrases can not be negated. + inNegation = true; } - else if ( t.type == Token::DELIMITER ) { - char c = t.data[0]; - if ( c == '-' ) { - if ( !inPhrase && t.previousWhiteSpace ) { - // phrases can be negated, and terms not in phrases can be negated. - // terms in phrases can not be negated. - inNegation = true; - } - } - else if ( c == '"' ) { - if ( inPhrase ) { - // end of a phrase - unsigned phraseStart = quoteOffset + 1; - unsigned phraseLength = t.offset - phraseStart; - StringData phrase = StringData( query ).substr( phraseStart, - phraseLength ); - if ( inNegation ) - _negatedPhrases.push_back( tolowerString( phrase ) ); - else - _phrases.push_back( tolowerString( phrase ) ); - inNegation = false; - inPhrase = false; - } - else { - // start of a phrase - inPhrase = true; - quoteOffset = t.offset; - } - } - } - else { - abort(); + } else if (c == '"') { + if (inPhrase) { + // end of a phrase + unsigned phraseStart = quoteOffset + 1; + unsigned phraseLength = t.offset - phraseStart; + StringData phrase = StringData(query).substr(phraseStart, phraseLength); + if (inNegation) + _negatedPhrases.push_back(tolowerString(phrase)); + else + _phrases.push_back(tolowerString(phrase)); + inNegation = false; + inPhrase = false; + } else { + // start of a phrase + inPhrase = true; + quoteOffset = t.offset; } } - - return Status::OK(); + } else { + abort(); } + } - void FTSQuery::_addTerm( const StopWords* sw, Stemmer& stemmer, const string& term, bool negated ) { - string word = tolowerString( term ); - if ( sw->isStopWord( word ) ) - return; - word = stemmer.stem( word ); - if ( negated ) - 
_negatedTerms.insert( word ); - else - _terms.push_back( word ); - } + return Status::OK(); +} - namespace { - void _debugHelp( stringstream& ss, const set<string>& s, const string& sep ) { - bool first = true; - for ( set<string>::const_iterator i = s.begin(); i != s.end(); ++i ) { - if ( first ) - first = false; - else - ss << sep; - ss << *i; - } - } +void FTSQuery::_addTerm(const StopWords* sw, Stemmer& stemmer, const string& term, bool negated) { + string word = tolowerString(term); + if (sw->isStopWord(word)) + return; + word = stemmer.stem(word); + if (negated) + _negatedTerms.insert(word); + else + _terms.push_back(word); +} - void _debugHelp( stringstream& ss, const vector<string>& v, const string& sep ) { - set<string> s( v.begin(), v.end() ); - _debugHelp( ss, s, sep ); - } +namespace { +void _debugHelp(stringstream& ss, const set<string>& s, const string& sep) { + bool first = true; + for (set<string>::const_iterator i = s.begin(); i != s.end(); ++i) { + if (first) + first = false; + else + ss << sep; + ss << *i; + } +} - } +void _debugHelp(stringstream& ss, const vector<string>& v, const string& sep) { + set<string> s(v.begin(), v.end()); + _debugHelp(ss, s, sep); +} +} - string FTSQuery::toString() const { - stringstream ss; - ss << "FTSQuery\n"; +string FTSQuery::toString() const { + stringstream ss; + ss << "FTSQuery\n"; - ss << " terms: "; - _debugHelp( ss, getTerms(), ", " ); - ss << "\n"; + ss << " terms: "; + _debugHelp(ss, getTerms(), ", "); + ss << "\n"; - ss << " negated terms: "; - _debugHelp( ss, getNegatedTerms(), ", " ); - ss << "\n"; + ss << " negated terms: "; + _debugHelp(ss, getNegatedTerms(), ", "); + ss << "\n"; - ss << " phrases: "; - _debugHelp( ss, getPhr(), ", " ); - ss << "\n"; + ss << " phrases: "; + _debugHelp(ss, getPhr(), ", "); + ss << "\n"; - ss << " negated phrases: "; - _debugHelp( ss, getNegatedPhr(), ", " ); - ss << "\n"; + ss << " negated phrases: "; + _debugHelp(ss, getNegatedPhr(), ", "); + ss << "\n"; - return 
ss.str(); - } + return ss.str(); +} - string FTSQuery::debugString() const { - stringstream ss; +string FTSQuery::debugString() const { + stringstream ss; - _debugHelp( ss, getTerms(), "|" ); - ss << "||"; + _debugHelp(ss, getTerms(), "|"); + ss << "||"; - _debugHelp( ss, getNegatedTerms(), "|" ); - ss << "||"; + _debugHelp(ss, getNegatedTerms(), "|"); + ss << "||"; - _debugHelp( ss, getPhr(), "|" ); - ss << "||"; + _debugHelp(ss, getPhr(), "|"); + ss << "||"; - _debugHelp( ss, getNegatedPhr(), "|" ); + _debugHelp(ss, getNegatedPhr(), "|"); - return ss.str(); - } + return ss.str(); +} - BSONObj FTSQuery::toBSON() const { - BSONObjBuilder bob; - bob.append( "terms", getTerms() ); - bob.append( "negatedTerms", getNegatedTerms() ); - bob.append( "phrases", getPhr() ); - bob.append( "negatedPhrases", getNegatedPhr() ); - return bob.obj(); - } - } +BSONObj FTSQuery::toBSON() const { + BSONObjBuilder bob; + bob.append("terms", getTerms()); + bob.append("negatedTerms", getNegatedTerms()); + bob.append("phrases", getPhr()); + bob.append("negatedPhrases", getNegatedPhr()); + return bob.obj(); +} +} } diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h index 0bee769172c..d8cf024e975 100644 --- a/src/mongo/db/fts/fts_query.h +++ b/src/mongo/db/fts/fts_query.h @@ -40,56 +40,63 @@ namespace mongo { - namespace fts { - - class FTSQuery { - - public: - // Initializes an FTSQuery. Note that the parsing of "language" depends on the text - // index version, since a query which doesn't specify a language and is against a - // version 1 text index with a version 1 default language string needs to be parsed as - // version 1 (see fts_language.cpp for a list of language strings specific to version - // 1). 
- Status parse(const std::string& query, const StringData& language, - TextIndexVersion textIndexVersion); - - const std::vector<std::string>& getTerms() const { return _terms; } - const std::set<std::string>& getNegatedTerms() const { return _negatedTerms; } - - const std::vector<std::string>& getPhr() const { return _phrases; } - const std::vector<std::string>& getNegatedPhr() const { return _negatedPhrases; } +namespace fts { + +class FTSQuery { +public: + // Initializes an FTSQuery. Note that the parsing of "language" depends on the text + // index version, since a query which doesn't specify a language and is against a + // version 1 text index with a version 1 default language string needs to be parsed as + // version 1 (see fts_language.cpp for a list of language strings specific to version + // 1). + Status parse(const std::string& query, + const StringData& language, + TextIndexVersion textIndexVersion); + + const std::vector<std::string>& getTerms() const { + return _terms; + } + const std::set<std::string>& getNegatedTerms() const { + return _negatedTerms; + } - /** - * @return true if any negations or phrase + or - - */ - bool hasNonTermPieces() const { - return - _negatedTerms.size() > 0 || - _phrases.size() > 0 || - _negatedPhrases.size() > 0; - } + const std::vector<std::string>& getPhr() const { + return _phrases; + } + const std::vector<std::string>& getNegatedPhr() const { + return _negatedPhrases; + } - std::string getSearch() const { return _search; } - const FTSLanguage& getLanguage() const { return *_language; } + /** + * @return true if any negations or phrase + or - + */ + bool hasNonTermPieces() const { + return _negatedTerms.size() > 0 || _phrases.size() > 0 || _negatedPhrases.size() > 0; + } - std::string toString() const; + std::string getSearch() const { + return _search; + } + const FTSLanguage& getLanguage() const { + return *_language; + } - std::string debugString() const; + std::string toString() const; - BSONObj toBSON() const; + 
std::string debugString() const; - protected: - std::string _search; - const FTSLanguage* _language; - std::vector<std::string> _terms; - std::set<std::string> _negatedTerms; - std::vector<std::string> _phrases; - std::vector<std::string> _negatedPhrases; + BSONObj toBSON() const; - private: - void _addTerm( const StopWords* sw, Stemmer& stemmer, const std::string& term, bool negated ); - }; +protected: + std::string _search; + const FTSLanguage* _language; + std::vector<std::string> _terms; + std::set<std::string> _negatedTerms; + std::vector<std::string> _phrases; + std::vector<std::string> _negatedPhrases; - } +private: + void _addTerm(const StopWords* sw, Stemmer& stemmer, const std::string& term, bool negated); +}; +} } - diff --git a/src/mongo/db/fts/fts_query_test.cpp b/src/mongo/db/fts/fts_query_test.cpp index 1e5318a1592..36a27a26dd6 100644 --- a/src/mongo/db/fts/fts_query_test.cpp +++ b/src/mongo/db/fts/fts_query_test.cpp @@ -33,148 +33,143 @@ #include "mongo/unittest/unittest.h" namespace mongo { - namespace fts { - - TEST( FTSQuery, Basic1 ) { - FTSQuery q; - ASSERT( q.parse( "this is fun", "english", TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( 1U, q.getTerms().size() ); - ASSERT_EQUALS( "fun", q.getTerms()[0] ); - ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q.getPhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - } - - TEST( FTSQuery, Neg1 ) { - FTSQuery q; - ASSERT( q.parse( "this is -really fun", "english", TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( 1U, q.getTerms().size() ); - ASSERT_EQUALS( "fun", q.getTerms()[0] ); - ASSERT_EQUALS( 1U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( "realli", *q.getNegatedTerms().begin() ); - } - - TEST( FTSQuery, Phrase1 ) { - FTSQuery q; - ASSERT( q.parse( "doing a \"phrase test\" for fun", "english", - TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( 3U, q.getTerms().size() ); - ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( 1U, 
q.getPhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - - ASSERT_EQUALS( "phrase test", q.getPhr()[0] ); - ASSERT_EQUALS( "fun|phrase|test||||phrase test||", q.debugString() ); - } - - TEST( FTSQuery, Phrase2 ) { - FTSQuery q; - ASSERT( q.parse( "doing a \"phrase-test\" for fun", "english", - TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT_EQUALS( 1U, q.getPhr().size() ); - ASSERT_EQUALS( "phrase-test", q.getPhr()[0] ); - } - - TEST( FTSQuery, NegPhrase1 ) { - FTSQuery q; - ASSERT( q.parse( "doing a -\"phrase test\" for fun", "english", - TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT_EQUALS( "fun||||||phrase test", q.debugString() ); - } - - TEST( FTSQuery, Mix1 ) { - FTSQuery q; - ASSERT( q.parse( "\"industry\" -Melbourne -Physics", "english", - TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT_EQUALS( "industri||melbourn|physic||industry||", q.debugString() ); - } - - TEST( FTSQuery, NegPhrase2) { - FTSQuery q1, q2, q3; - ASSERT( q1.parse( "foo \"bar\"", "english", TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT( q2.parse( "foo \"-bar\"", "english", TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT( q3.parse( "foo \" -bar\"", "english", TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( 2U, q1.getTerms().size() ); - ASSERT_EQUALS( 2U, q2.getTerms().size() ); - ASSERT_EQUALS( 2U, q3.getTerms().size() ); - - ASSERT_EQUALS( 0U, q1.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q2.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q3.getNegatedTerms().size() ); - - ASSERT_EQUALS( 1U, q1.getPhr().size() ); - ASSERT_EQUALS( 1U, q2.getPhr().size() ); - ASSERT_EQUALS( 1U, q3.getPhr().size() ); - - ASSERT_EQUALS( 0U, q1.getNegatedPhr().size() ); - ASSERT_EQUALS( 0U, q2.getNegatedPhr().size() ); - ASSERT_EQUALS( 0U, q3.getNegatedPhr().size() ); - } - - TEST( FTSQuery, NegPhrase3) { - FTSQuery q1, q2, q3; - ASSERT( q1.parse( "foo -\"bar\"", "english", TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT( q2.parse( "foo -\"-bar\"", "english", TEXT_INDEX_VERSION_2 ).isOK() ); - ASSERT( q3.parse( "foo 
-\" -bar\"", "english", TEXT_INDEX_VERSION_2 ).isOK() ); - - ASSERT_EQUALS( 1U, q1.getTerms().size() ); - ASSERT_EQUALS( 1U, q2.getTerms().size() ); - ASSERT_EQUALS( 1U, q3.getTerms().size() ); - - ASSERT_EQUALS( 0U, q1.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q2.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q3.getNegatedTerms().size() ); - - ASSERT_EQUALS( 0U, q1.getPhr().size() ); - ASSERT_EQUALS( 0U, q2.getPhr().size() ); - ASSERT_EQUALS( 0U, q3.getPhr().size() ); - - ASSERT_EQUALS( 1U, q1.getNegatedPhr().size() ); - ASSERT_EQUALS( 1U, q2.getNegatedPhr().size() ); - ASSERT_EQUALS( 1U, q3.getNegatedPhr().size() ); - } - - // Test textIndexVersion:1 query with language "english". This invokes the standard English - // stemmer and stopword list. - TEST( FTSQuery, TextIndexVersion1LanguageEnglish ) { - FTSQuery q; - ASSERT( q.parse( "the running", "english", TEXT_INDEX_VERSION_1 ).isOK() ); - ASSERT_EQUALS( 1U, q.getTerms().size() ); - ASSERT_EQUALS( "run", q.getTerms()[0] ); - ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q.getPhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - } - - // Test textIndexVersion:1 query with language "eng". "eng" uses the English stemmer, and - // no stopword list. - TEST( FTSQuery, TextIndexVersion1LanguageEng ) { - FTSQuery q; - ASSERT( q.parse( "the running", "eng", TEXT_INDEX_VERSION_1 ).isOK() ); - ASSERT_EQUALS( 2U, q.getTerms().size() ); - ASSERT_EQUALS( 1, std::count( q.getTerms().begin(), q.getTerms().end(), "the" ) ); - ASSERT_EQUALS( 1, std::count( q.getTerms().begin(), q.getTerms().end(), "run" ) ); - ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q.getPhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - } - - // Test textIndexVersion:1 query with language "invalid". No stemming will be performed, - // and no stopword list will be used. 
- TEST( FTSQuery, TextIndexVersion1LanguageInvalid ) { - FTSQuery q; - ASSERT( q.parse( "the running", "invalid", TEXT_INDEX_VERSION_1 ).isOK() ); - ASSERT_EQUALS( 2U, q.getTerms().size() ); - ASSERT_EQUALS( 1, std::count( q.getTerms().begin(), q.getTerms().end(), "the" ) ); - ASSERT_EQUALS( 1, std::count( q.getTerms().begin(), q.getTerms().end(), "running" ) ); - ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); - ASSERT_EQUALS( 0U, q.getPhr().size() ); - ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); - } - - } +namespace fts { + +TEST(FTSQuery, Basic1) { + FTSQuery q; + ASSERT(q.parse("this is fun", "english", TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(1U, q.getTerms().size()); + ASSERT_EQUALS("fun", q.getTerms()[0]); + ASSERT_EQUALS(0U, q.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q.getPhr().size()); + ASSERT_EQUALS(0U, q.getNegatedPhr().size()); +} + +TEST(FTSQuery, Neg1) { + FTSQuery q; + ASSERT(q.parse("this is -really fun", "english", TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(1U, q.getTerms().size()); + ASSERT_EQUALS("fun", q.getTerms()[0]); + ASSERT_EQUALS(1U, q.getNegatedTerms().size()); + ASSERT_EQUALS("realli", *q.getNegatedTerms().begin()); +} + +TEST(FTSQuery, Phrase1) { + FTSQuery q; + ASSERT(q.parse("doing a \"phrase test\" for fun", "english", TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(3U, q.getTerms().size()); + ASSERT_EQUALS(0U, q.getNegatedTerms().size()); + ASSERT_EQUALS(1U, q.getPhr().size()); + ASSERT_EQUALS(0U, q.getNegatedPhr().size()); + + ASSERT_EQUALS("phrase test", q.getPhr()[0]); + ASSERT_EQUALS("fun|phrase|test||||phrase test||", q.debugString()); +} + +TEST(FTSQuery, Phrase2) { + FTSQuery q; + ASSERT(q.parse("doing a \"phrase-test\" for fun", "english", TEXT_INDEX_VERSION_2).isOK()); + ASSERT_EQUALS(1U, q.getPhr().size()); + ASSERT_EQUALS("phrase-test", q.getPhr()[0]); +} + +TEST(FTSQuery, NegPhrase1) { + FTSQuery q; + ASSERT(q.parse("doing a -\"phrase test\" for fun", "english", TEXT_INDEX_VERSION_2).isOK()); + 
ASSERT_EQUALS("fun||||||phrase test", q.debugString()); +} + +TEST(FTSQuery, Mix1) { + FTSQuery q; + ASSERT(q.parse("\"industry\" -Melbourne -Physics", "english", TEXT_INDEX_VERSION_2).isOK()); + ASSERT_EQUALS("industri||melbourn|physic||industry||", q.debugString()); +} + +TEST(FTSQuery, NegPhrase2) { + FTSQuery q1, q2, q3; + ASSERT(q1.parse("foo \"bar\"", "english", TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q2.parse("foo \"-bar\"", "english", TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q3.parse("foo \" -bar\"", "english", TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(2U, q1.getTerms().size()); + ASSERT_EQUALS(2U, q2.getTerms().size()); + ASSERT_EQUALS(2U, q3.getTerms().size()); + + ASSERT_EQUALS(0U, q1.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q2.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q3.getNegatedTerms().size()); + + ASSERT_EQUALS(1U, q1.getPhr().size()); + ASSERT_EQUALS(1U, q2.getPhr().size()); + ASSERT_EQUALS(1U, q3.getPhr().size()); + + ASSERT_EQUALS(0U, q1.getNegatedPhr().size()); + ASSERT_EQUALS(0U, q2.getNegatedPhr().size()); + ASSERT_EQUALS(0U, q3.getNegatedPhr().size()); +} + +TEST(FTSQuery, NegPhrase3) { + FTSQuery q1, q2, q3; + ASSERT(q1.parse("foo -\"bar\"", "english", TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q2.parse("foo -\"-bar\"", "english", TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q3.parse("foo -\" -bar\"", "english", TEXT_INDEX_VERSION_2).isOK()); + + ASSERT_EQUALS(1U, q1.getTerms().size()); + ASSERT_EQUALS(1U, q2.getTerms().size()); + ASSERT_EQUALS(1U, q3.getTerms().size()); + + ASSERT_EQUALS(0U, q1.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q2.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q3.getNegatedTerms().size()); + + ASSERT_EQUALS(0U, q1.getPhr().size()); + ASSERT_EQUALS(0U, q2.getPhr().size()); + ASSERT_EQUALS(0U, q3.getPhr().size()); + + ASSERT_EQUALS(1U, q1.getNegatedPhr().size()); + ASSERT_EQUALS(1U, q2.getNegatedPhr().size()); + ASSERT_EQUALS(1U, q3.getNegatedPhr().size()); +} + +// Test textIndexVersion:1 query with 
language "english". This invokes the standard English +// stemmer and stopword list. +TEST(FTSQuery, TextIndexVersion1LanguageEnglish) { + FTSQuery q; + ASSERT(q.parse("the running", "english", TEXT_INDEX_VERSION_1).isOK()); + ASSERT_EQUALS(1U, q.getTerms().size()); + ASSERT_EQUALS("run", q.getTerms()[0]); + ASSERT_EQUALS(0U, q.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q.getPhr().size()); + ASSERT_EQUALS(0U, q.getNegatedPhr().size()); +} + +// Test textIndexVersion:1 query with language "eng". "eng" uses the English stemmer, and +// no stopword list. +TEST(FTSQuery, TextIndexVersion1LanguageEng) { + FTSQuery q; + ASSERT(q.parse("the running", "eng", TEXT_INDEX_VERSION_1).isOK()); + ASSERT_EQUALS(2U, q.getTerms().size()); + ASSERT_EQUALS(1, std::count(q.getTerms().begin(), q.getTerms().end(), "the")); + ASSERT_EQUALS(1, std::count(q.getTerms().begin(), q.getTerms().end(), "run")); + ASSERT_EQUALS(0U, q.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q.getPhr().size()); + ASSERT_EQUALS(0U, q.getNegatedPhr().size()); +} + +// Test textIndexVersion:1 query with language "invalid". No stemming will be performed, +// and no stopword list will be used. 
+TEST(FTSQuery, TextIndexVersion1LanguageInvalid) { + FTSQuery q; + ASSERT(q.parse("the running", "invalid", TEXT_INDEX_VERSION_1).isOK()); + ASSERT_EQUALS(2U, q.getTerms().size()); + ASSERT_EQUALS(1, std::count(q.getTerms().begin(), q.getTerms().end(), "the")); + ASSERT_EQUALS(1, std::count(q.getTerms().begin(), q.getTerms().end(), "running")); + ASSERT_EQUALS(0U, q.getNegatedTerms().size()); + ASSERT_EQUALS(0U, q.getPhr().size()); + ASSERT_EQUALS(0U, q.getNegatedPhr().size()); +} +} } diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp index 20142cbf2f5..55e8637969e 100644 --- a/src/mongo/db/fts/fts_spec.cpp +++ b/src/mongo/db/fts/fts_spec.cpp @@ -39,458 +39,409 @@ namespace mongo { - namespace fts { - - using std::map; - using std::string; - using namespace mongoutils; - - const double DEFAULT_WEIGHT = 1; - const double MAX_WEIGHT = 1000000000; - const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000; - - namespace { - // Default language. Used for new indexes. - const std::string moduleDefaultLanguage( "english" ); - - /** Validate the given language override string. */ - bool validateOverride( const string& override ) { - // The override field can't be empty, can't be prefixed with a dollar sign, and - // can't contain a dot. - return !override.empty() && - override[0] != '$' && - override.find('.') == std::string::npos; - } - } - - FTSSpec::FTSSpec( const BSONObj& indexInfo ) { - // indexInfo is a text index spec. Text index specs pass through fixSpec() before - // being saved to the system.indexes collection. fixSpec() enforces a schema, such that - // required fields must exist and be of the correct type (e.g. weights, - // textIndexVersion). 
- massert( 16739, "found invalid spec for text index", - indexInfo["weights"].isABSONObj() ); - BSONElement textIndexVersionElt = indexInfo["textIndexVersion"]; - massert( 17367, - "found invalid spec for text index, expected number for textIndexVersion", - textIndexVersionElt.isNumber() ); - - // We currently support TEXT_INDEX_VERSION_1 (deprecated) and TEXT_INDEX_VERSION_2. - // Reject all other values. - massert( 17364, - str::stream() << "attempt to use unsupported textIndexVersion " << - textIndexVersionElt.numberInt() << "; versions supported: " << - TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1, - textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 || - textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_1 ); - - _textIndexVersion = ( textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 ) ? - TEXT_INDEX_VERSION_2 : TEXT_INDEX_VERSION_1; - - // Initialize _defaultLanguage. Note that the FTSLanguage constructor requires - // textIndexVersion, since language parsing is version-specific. - StatusWithFTSLanguage swl = - FTSLanguage::make( indexInfo["default_language"].String(), _textIndexVersion ); - verify( swl.getStatus().isOK() ); // should not fail, since validated by fixSpec(). 
- _defaultLanguage = swl.getValue(); - - _languageOverrideField = indexInfo["language_override"].valuestrsafe(); - - _wildcard = false; - - // in this block we fill in the _weights map - { - BSONObjIterator i( indexInfo["weights"].Obj() ); - while ( i.more() ) { - BSONElement e = i.next(); - verify( e.isNumber() ); - - if ( WILDCARD == e.fieldName() ) { - _wildcard = true; - } - else { - double num = e.number(); - _weights[ e.fieldName() ] = num; - verify( num > 0 && num < MAX_WORD_WEIGHT ); - } - } - verify( _wildcard || _weights.size() ); - } - - // extra information - { - BSONObj keyPattern = indexInfo["key"].Obj(); - verify( keyPattern.nFields() >= 2 ); - BSONObjIterator i( keyPattern ); +namespace fts { - bool passedFTS = false; +using std::map; +using std::string; +using namespace mongoutils; - while ( i.more() ) { - BSONElement e = i.next(); - if ( str::equals( e.fieldName(), "_fts" ) || - str::equals( e.fieldName(), "_ftsx" ) ) { - passedFTS = true; - continue; - } +const double DEFAULT_WEIGHT = 1; +const double MAX_WEIGHT = 1000000000; +const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000; - if ( passedFTS ) - _extraAfter.push_back( e.fieldName() ); - else - _extraBefore.push_back( e.fieldName() ); - } +namespace { +// Default language. Used for new indexes. +const std::string moduleDefaultLanguage("english"); - } - } +/** Validate the given language override string. */ +bool validateOverride(const string& override) { + // The override field can't be empty, can't be prefixed with a dollar sign, and + // can't contain a dot. + return !override.empty() && override[0] != '$' && override.find('.') == std::string::npos; +} +} - const FTSLanguage* FTSSpec::_getLanguageToUseV2( const BSONObj& userDoc, - const FTSLanguage* currentLanguage ) const { - BSONElement e = userDoc[_languageOverrideField]; - if ( e.eoo() ) { - return currentLanguage; +FTSSpec::FTSSpec(const BSONObj& indexInfo) { + // indexInfo is a text index spec. 
Text index specs pass through fixSpec() before + // being saved to the system.indexes collection. fixSpec() enforces a schema, such that + // required fields must exist and be of the correct type (e.g. weights, + // textIndexVersion). + massert(16739, "found invalid spec for text index", indexInfo["weights"].isABSONObj()); + BSONElement textIndexVersionElt = indexInfo["textIndexVersion"]; + massert(17367, + "found invalid spec for text index, expected number for textIndexVersion", + textIndexVersionElt.isNumber()); + + // We currently support TEXT_INDEX_VERSION_1 (deprecated) and TEXT_INDEX_VERSION_2. + // Reject all other values. + massert(17364, + str::stream() << "attempt to use unsupported textIndexVersion " + << textIndexVersionElt.numberInt() << "; versions supported: " + << TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1, + textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 || + textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_1); + + _textIndexVersion = (textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2) + ? TEXT_INDEX_VERSION_2 + : TEXT_INDEX_VERSION_1; + + // Initialize _defaultLanguage. Note that the FTSLanguage constructor requires + // textIndexVersion, since language parsing is version-specific. + StatusWithFTSLanguage swl = + FTSLanguage::make(indexInfo["default_language"].String(), _textIndexVersion); + verify(swl.getStatus().isOK()); // should not fail, since validated by fixSpec(). 
+ _defaultLanguage = swl.getValue(); + + _languageOverrideField = indexInfo["language_override"].valuestrsafe(); + + _wildcard = false; + + // in this block we fill in the _weights map + { + BSONObjIterator i(indexInfo["weights"].Obj()); + while (i.more()) { + BSONElement e = i.next(); + verify(e.isNumber()); + + if (WILDCARD == e.fieldName()) { + _wildcard = true; + } else { + double num = e.number(); + _weights[e.fieldName()] = num; + verify(num > 0 && num < MAX_WORD_WEIGHT); } - uassert( 17261, - "found language override field in document with non-string type", - e.type() == mongo::String ); - StatusWithFTSLanguage swl = FTSLanguage::make( e.String(), TEXT_INDEX_VERSION_2 ); - uassert( 17262, - "language override unsupported: " + e.String(), - swl.getStatus().isOK() ); - return swl.getValue(); } + verify(_wildcard || _weights.size()); + } - void FTSSpec::scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const { - if ( _textIndexVersion == TEXT_INDEX_VERSION_1 ) { - return _scoreDocumentV1( obj, term_freqs ); - } + // extra information + { + BSONObj keyPattern = indexInfo["key"].Obj(); + verify(keyPattern.nFields() >= 2); + BSONObjIterator i(keyPattern); - FTSElementIterator it( *this, obj ); + bool passedFTS = false; - while ( it.more() ) { - FTSIteratorValue val = it.next(); - Stemmer stemmer( *val._language ); - Tools tools( *val._language, &stemmer, StopWords::getStopWords( *val._language ) ); - _scoreStringV2( tools, val._text, term_freqs, val._weight ); + while (i.more()) { + BSONElement e = i.next(); + if (str::equals(e.fieldName(), "_fts") || str::equals(e.fieldName(), "_ftsx")) { + passedFTS = true; + continue; } + + if (passedFTS) + _extraAfter.push_back(e.fieldName()); + else + _extraBefore.push_back(e.fieldName()); } + } +} - void FTSSpec::_scoreStringV2( const Tools& tools, - const StringData& raw, - TermFrequencyMap* docScores, - double weight ) const { +const FTSLanguage* FTSSpec::_getLanguageToUseV2(const BSONObj& userDoc, + const 
FTSLanguage* currentLanguage) const { + BSONElement e = userDoc[_languageOverrideField]; + if (e.eoo()) { + return currentLanguage; + } + uassert(17261, + "found language override field in document with non-string type", + e.type() == mongo::String); + StatusWithFTSLanguage swl = FTSLanguage::make(e.String(), TEXT_INDEX_VERSION_2); + uassert(17262, "language override unsupported: " + e.String(), swl.getStatus().isOK()); + return swl.getValue(); +} - ScoreHelperMap terms; +void FTSSpec::scoreDocument(const BSONObj& obj, TermFrequencyMap* term_freqs) const { + if (_textIndexVersion == TEXT_INDEX_VERSION_1) { + return _scoreDocumentV1(obj, term_freqs); + } - unsigned numTokens = 0; + FTSElementIterator it(*this, obj); - Tokenizer i( tools.language, raw ); - while ( i.more() ) { - Token t = i.next(); - if ( t.type != Token::TEXT ) - continue; + while (it.more()) { + FTSIteratorValue val = it.next(); + Stemmer stemmer(*val._language); + Tools tools(*val._language, &stemmer, StopWords::getStopWords(*val._language)); + _scoreStringV2(tools, val._text, term_freqs, val._weight); + } +} - string term = t.data.toString(); - makeLower( &term ); - if ( tools.stopwords->isStopWord( term ) ) { - continue; - } - term = tools.stemmer->stem( term ); +void FTSSpec::_scoreStringV2(const Tools& tools, + const StringData& raw, + TermFrequencyMap* docScores, + double weight) const { + ScoreHelperMap terms; - ScoreHelperStruct& data = terms[term]; + unsigned numTokens = 0; - if ( data.exp ) { - data.exp *= 2; - } - else { - data.exp = 1; - } - data.count += 1; - data.freq += ( 1 / data.exp ); - numTokens++; - } + Tokenizer i(tools.language, raw); + while (i.more()) { + Token t = i.next(); + if (t.type != Token::TEXT) + continue; - for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) { + string term = t.data.toString(); + makeLower(&term); + if (tools.stopwords->isStopWord(term)) { + continue; + } + term = tools.stemmer->stem(term); - const string& term = 
i->first; - const ScoreHelperStruct& data = i->second; + ScoreHelperStruct& data = terms[term]; - // in order to adjust weights as a function of term count as it - // relates to total field length. ie. is this the only word or - // a frequently occuring term? or does it only show up once in - // a long block of text? + if (data.exp) { + data.exp *= 2; + } else { + data.exp = 1; + } + data.count += 1; + data.freq += (1 / data.exp); + numTokens++; + } - double coeff = ( 0.5 * data.count / numTokens ) + 0.5; + for (ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i) { + const string& term = i->first; + const ScoreHelperStruct& data = i->second; - // if term is identical to the raw form of the - // field (untokenized) give it a small boost. - double adjustment = 1; - if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) ) - adjustment += 0.1; + // in order to adjust weights as a function of term count as it + // relates to total field length. ie. is this the only word or + // a frequently occuring term? or does it only show up once in + // a long block of text? - double& score = (*docScores)[term]; - score += ( weight * data.freq * coeff * adjustment ); - verify( score <= MAX_WEIGHT ); - } - } + double coeff = (0.5 * data.count / numTokens) + 0.5; - Status FTSSpec::getIndexPrefix( const BSONObj& query, BSONObj* out ) const { - if ( numExtraBefore() == 0 ) { - *out = BSONObj(); - return Status::OK(); - } + // if term is identical to the raw form of the + // field (untokenized) give it a small boost. 
+ double adjustment = 1; + if (raw.size() == term.length() && raw.equalCaseInsensitive(term)) + adjustment += 0.1; - BSONObjBuilder b; - for ( unsigned i = 0; i < numExtraBefore(); i++ ) { - BSONElement e = query.getFieldDotted(extraBefore(i)); - if ( e.eoo() ) - return Status( ErrorCodes::BadValue, - str::stream() - << "need have an equality filter on: " - << extraBefore(i) ); - - if ( e.isABSONObj() && e.Obj().firstElement().getGtLtOp( -1 ) != -1 ) - return Status( ErrorCodes::BadValue, - str::stream() - << "need have an equality filter on: " - << extraBefore(i) ); - - b.append( e ); - } - *out = b.obj(); - return Status::OK(); - } + double& score = (*docScores)[term]; + score += (weight * data.freq * coeff * adjustment); + verify(score <= MAX_WEIGHT); + } +} - namespace { - void _addFTSStuff( BSONObjBuilder* b ) { - b->append( "_fts", INDEX_NAME ); - b->append( "_ftsx", 1 ); - } +Status FTSSpec::getIndexPrefix(const BSONObj& query, BSONObj* out) const { + if (numExtraBefore() == 0) { + *out = BSONObj(); + return Status::OK(); + } - void verifyFieldNameNotReserved( StringData s ) { - uassert( 17289, - "text index with reserved fields _fts/_ftsx not allowed", - s != "_fts" && s != "_ftsx" ); - } - } + BSONObjBuilder b; + for (unsigned i = 0; i < numExtraBefore(); i++) { + BSONElement e = query.getFieldDotted(extraBefore(i)); + if (e.eoo()) + return Status(ErrorCodes::BadValue, + str::stream() << "need have an equality filter on: " << extraBefore(i)); - BSONObj FTSSpec::fixSpec( const BSONObj& spec ) { - if ( spec["textIndexVersion"].numberInt() == TEXT_INDEX_VERSION_1 ) { - return _fixSpecV1( spec ); - } + if (e.isABSONObj() && e.Obj().firstElement().getGtLtOp(-1) != -1) + return Status(ErrorCodes::BadValue, + str::stream() << "need have an equality filter on: " << extraBefore(i)); - map<string,int> m; - - BSONObj keyPattern; - { - BSONObjBuilder b; - - // Populate m and keyPattern. 
- { - bool addedFtsStuff = false; - BSONObjIterator i( spec["key"].Obj() ); - while ( i.more() ) { - BSONElement e = i.next(); - if ( str::equals( e.fieldName(), "_fts" ) ) { - uassert( 17271, - "expecting _fts:\"text\"", - INDEX_NAME == e.valuestrsafe() ); - addedFtsStuff = true; - b.append( e ); - } - else if ( str::equals( e.fieldName(), "_ftsx" ) ) { - uassert( 17272, "expecting _ftsx:1", e.numberInt() == 1 ); - b.append( e ); - } - else if ( e.type() == String && INDEX_NAME == e.valuestr() ) { - - if ( !addedFtsStuff ) { - _addFTSStuff( &b ); - addedFtsStuff = true; - } - - m[e.fieldName()] = 1; - } - else { - uassert( 17273, - "expected value 1 or -1 for non-text key in compound index", - e.numberInt() == 1 || e.numberInt() == -1 ); - b.append( e ); - } - } - verify( addedFtsStuff ); - } - keyPattern = b.obj(); - - // Verify that index key is in the correct format: extraBefore fields, then text - // fields, then extraAfter fields. - { - BSONObjIterator i( spec["key"].Obj() ); - verify( i.more() ); - BSONElement e = i.next(); - - // extraBefore fields - while ( String != e.type() ) { - verifyFieldNameNotReserved( e.fieldNameStringData() ); - verify( i.more() ); - e = i.next(); - } + b.append(e); + } + *out = b.obj(); + return Status::OK(); +} - // text fields - bool alreadyFixed = str::equals( e.fieldName(), "_fts" ); - if ( alreadyFixed ) { - uassert( 17288, "expected _ftsx after _fts", i.more() ); - e = i.next(); - uassert( 17274, - "expected _ftsx after _fts", - str::equals( e.fieldName(), "_ftsx" ) ); - e = i.next(); - } - else { - do { - verifyFieldNameNotReserved( e.fieldNameStringData() ); - e = i.next(); - } while ( !e.eoo() && e.type() == String ); - } +namespace { +void _addFTSStuff(BSONObjBuilder* b) { + b->append("_fts", INDEX_NAME); + b->append("_ftsx", 1); +} - // extraAfterFields - while ( !e.eoo() ) { - uassert( 17389, - "'text' fields in index must all be adjacent", - e.type() != String ); - verifyFieldNameNotReserved( e.fieldNameStringData() 
); - e = i.next(); - } - } +void verifyFieldNameNotReserved(StringData s) { + uassert(17289, + "text index with reserved fields _fts/_ftsx not allowed", + s != "_fts" && s != "_ftsx"); +} +} - } +BSONObj FTSSpec::fixSpec(const BSONObj& spec) { + if (spec["textIndexVersion"].numberInt() == TEXT_INDEX_VERSION_1) { + return _fixSpecV1(spec); + } - if ( spec["weights"].type() == Object ) { - BSONObjIterator i( spec["weights"].Obj() ); - while ( i.more() ) { - BSONElement e = i.next(); - uassert( 17283, - "weight for text index needs numeric type", - e.isNumber() ); - m[e.fieldName()] = e.numberInt(); - } - } - else if ( spec["weights"].str() == WILDCARD ) { - m[WILDCARD] = 1; - } - else if ( !spec["weights"].eoo() ) { - uasserted( 17284, "text index option 'weights' must be an object" ); - } + map<string, int> m; + + BSONObj keyPattern; + { + BSONObjBuilder b; - BSONObj weights; - { - BSONObjBuilder b; - for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) { - uassert( 16674, "score for word too high", - i->second > 0 && i->second < MAX_WORD_WEIGHT ); - - // Verify weight refers to a valid field. - if ( i->first != "$**" ) { - FieldRef keyField( i->first ); - uassert( 17294, - "weight cannot be on an empty field", - keyField.numParts() != 0 ); - for ( size_t partNum = 0; partNum < keyField.numParts(); partNum++ ) { - StringData part = keyField.getPart(partNum); - uassert( 17291, - "weight cannot have empty path component", - !part.empty() ); - uassert( 17292, - "weight cannot have path component with $ prefix", - !part.startsWith( "$" ) ); - } + // Populate m and keyPattern. 
+ { + bool addedFtsStuff = false; + BSONObjIterator i(spec["key"].Obj()); + while (i.more()) { + BSONElement e = i.next(); + if (str::equals(e.fieldName(), "_fts")) { + uassert(17271, "expecting _fts:\"text\"", INDEX_NAME == e.valuestrsafe()); + addedFtsStuff = true; + b.append(e); + } else if (str::equals(e.fieldName(), "_ftsx")) { + uassert(17272, "expecting _ftsx:1", e.numberInt() == 1); + b.append(e); + } else if (e.type() == String && INDEX_NAME == e.valuestr()) { + if (!addedFtsStuff) { + _addFTSStuff(&b); + addedFtsStuff = true; } - b.append( i->first, i->second ); + m[e.fieldName()] = 1; + } else { + uassert(17273, + "expected value 1 or -1 for non-text key in compound index", + e.numberInt() == 1 || e.numberInt() == -1); + b.append(e); } - weights = b.obj(); } - - BSONElement default_language_elt = spec["default_language"]; - string default_language( default_language_elt.str() ); - if ( default_language_elt.eoo() ) { - default_language = moduleDefaultLanguage; - } - else { - uassert( 17263, - "default_language needs a string type", - default_language_elt.type() == String ); + verify(addedFtsStuff); + } + keyPattern = b.obj(); + + // Verify that index key is in the correct format: extraBefore fields, then text + // fields, then extraAfter fields. 
+ { + BSONObjIterator i(spec["key"].Obj()); + verify(i.more()); + BSONElement e = i.next(); + + // extraBefore fields + while (String != e.type()) { + verifyFieldNameNotReserved(e.fieldNameStringData()); + verify(i.more()); + e = i.next(); } - uassert( 17264, - "default_language is not valid", - FTSLanguage::make( default_language, - TEXT_INDEX_VERSION_2 ).getStatus().isOK() ); - - BSONElement language_override_elt = spec["language_override"]; - string language_override( language_override_elt.str() ); - if ( language_override_elt.eoo() ) { - language_override = "language"; + + // text fields + bool alreadyFixed = str::equals(e.fieldName(), "_fts"); + if (alreadyFixed) { + uassert(17288, "expected _ftsx after _fts", i.more()); + e = i.next(); + uassert(17274, "expected _ftsx after _fts", str::equals(e.fieldName(), "_ftsx")); + e = i.next(); + } else { + do { + verifyFieldNameNotReserved(e.fieldNameStringData()); + e = i.next(); + } while (!e.eoo() && e.type() == String); } - else { - uassert( 17136, - "language_override is not valid", - language_override_elt.type() == String - && validateOverride( language_override ) ); + + // extraAfterFields + while (!e.eoo()) { + uassert(17389, "'text' fields in index must all be adjacent", e.type() != String); + verifyFieldNameNotReserved(e.fieldNameStringData()); + e = i.next(); } + } + } - int version = -1; - int textIndexVersion = TEXT_INDEX_VERSION_2; + if (spec["weights"].type() == Object) { + BSONObjIterator i(spec["weights"].Obj()); + while (i.more()) { + BSONElement e = i.next(); + uassert(17283, "weight for text index needs numeric type", e.isNumber()); + m[e.fieldName()] = e.numberInt(); + } + } else if (spec["weights"].str() == WILDCARD) { + m[WILDCARD] = 1; + } else if (!spec["weights"].eoo()) { + uasserted(17284, "text index option 'weights' must be an object"); + } - BSONObjBuilder b; - BSONObjIterator i( spec ); - while ( i.more() ) { - BSONElement e = i.next(); - if ( str::equals( e.fieldName(), "key" ) ) { - 
b.append( "key", keyPattern ); - } - else if ( str::equals( e.fieldName(), "weights" ) ) { - b.append( "weights", weights ); - weights = BSONObj(); - } - else if ( str::equals( e.fieldName(), "default_language" ) ) { - b.append( "default_language", default_language); - default_language = ""; - } - else if ( str::equals( e.fieldName(), "language_override" ) ) { - b.append( "language_override", language_override); - language_override = ""; - } - else if ( str::equals( e.fieldName(), "v" ) ) { - version = e.numberInt(); - } - else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) { - uassert( 17293, - "text index option 'textIndexVersion' must be a number", - e.isNumber() ); - textIndexVersion = e.numberInt(); - uassert( 16730, - str::stream() << "bad textIndexVersion: " << textIndexVersion, - textIndexVersion == TEXT_INDEX_VERSION_2 ); - } - else { - b.append( e ); + BSONObj weights; + { + BSONObjBuilder b; + for (map<string, int>::iterator i = m.begin(); i != m.end(); ++i) { + uassert(16674, "score for word too high", i->second > 0 && i->second < MAX_WORD_WEIGHT); + + // Verify weight refers to a valid field. 
+ if (i->first != "$**") { + FieldRef keyField(i->first); + uassert(17294, "weight cannot be on an empty field", keyField.numParts() != 0); + for (size_t partNum = 0; partNum < keyField.numParts(); partNum++) { + StringData part = keyField.getPart(partNum); + uassert(17291, "weight cannot have empty path component", !part.empty()); + uassert(17292, + "weight cannot have path component with $ prefix", + !part.startsWith("$")); } } - if ( !weights.isEmpty() ) { - b.append( "weights", weights ); - } - if ( !default_language.empty() ) { - b.append( "default_language", default_language); - } - if ( !language_override.empty() ) { - b.append( "language_override", language_override); - } - if ( version >= 0 ) { - b.append( "v", version ); - } - b.append( "textIndexVersion", textIndexVersion ); + b.append(i->first, i->second); + } + weights = b.obj(); + } + + BSONElement default_language_elt = spec["default_language"]; + string default_language(default_language_elt.str()); + if (default_language_elt.eoo()) { + default_language = moduleDefaultLanguage; + } else { + uassert( + 17263, "default_language needs a string type", default_language_elt.type() == String); + } + uassert(17264, + "default_language is not valid", + FTSLanguage::make(default_language, TEXT_INDEX_VERSION_2).getStatus().isOK()); + + BSONElement language_override_elt = spec["language_override"]; + string language_override(language_override_elt.str()); + if (language_override_elt.eoo()) { + language_override = "language"; + } else { + uassert(17136, + "language_override is not valid", + language_override_elt.type() == String && validateOverride(language_override)); + } - return b.obj(); + int version = -1; + int textIndexVersion = TEXT_INDEX_VERSION_2; + + BSONObjBuilder b; + BSONObjIterator i(spec); + while (i.more()) { + BSONElement e = i.next(); + if (str::equals(e.fieldName(), "key")) { + b.append("key", keyPattern); + } else if (str::equals(e.fieldName(), "weights")) { + b.append("weights", weights); + 
weights = BSONObj(); + } else if (str::equals(e.fieldName(), "default_language")) { + b.append("default_language", default_language); + default_language = ""; + } else if (str::equals(e.fieldName(), "language_override")) { + b.append("language_override", language_override); + language_override = ""; + } else if (str::equals(e.fieldName(), "v")) { + version = e.numberInt(); + } else if (str::equals(e.fieldName(), "textIndexVersion")) { + uassert(17293, "text index option 'textIndexVersion' must be a number", e.isNumber()); + textIndexVersion = e.numberInt(); + uassert(16730, + str::stream() << "bad textIndexVersion: " << textIndexVersion, + textIndexVersion == TEXT_INDEX_VERSION_2); + } else { + b.append(e); } + } + if (!weights.isEmpty()) { + b.append("weights", weights); + } + if (!default_language.empty()) { + b.append("default_language", default_language); } + if (!language_override.empty()) { + b.append("language_override", language_override); + } + if (version >= 0) { + b.append("v", version); + } + b.append("textIndexVersion", textIndexVersion); + + return b.obj(); +} +} } diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h index 7f6365002fb..23a54a08c87 100644 --- a/src/mongo/db/fts/fts_spec.h +++ b/src/mongo/db/fts/fts_spec.h @@ -43,136 +43,146 @@ namespace mongo { - namespace fts { - - extern const double MAX_WEIGHT; - extern const double MAX_WORD_WEIGHT; - extern const double DEFAULT_WEIGHT; - - typedef std::map<std::string,double> Weights; // TODO cool map - typedef unordered_map<std::string,double> TermFrequencyMap; - - struct ScoreHelperStruct { - ScoreHelperStruct() - : freq(0), count(0), exp(0){ - } - double freq; - double count; - double exp; - }; - typedef unordered_map<std::string,ScoreHelperStruct> ScoreHelperMap; - - class FTSSpec { - - struct Tools { - Tools( const FTSLanguage& _language, - const Stemmer* _stemmer, - const StopWords* _stopwords ) - : language( _language ) - , stemmer( _stemmer ) - , stopwords( _stopwords ) {} 
- - const FTSLanguage& language; - const Stemmer* stemmer; - const StopWords* stopwords; - }; - - public: - FTSSpec( const BSONObj& indexInfo ); - - bool wildcard() const { return _wildcard; } - const FTSLanguage& defaultLanguage() const { return *_defaultLanguage; } - const std::string& languageOverrideField() const { return _languageOverrideField; } - - size_t numExtraBefore() const { return _extraBefore.size(); } - const std::string& extraBefore( unsigned i ) const { return _extraBefore[i]; } - - size_t numExtraAfter() const { return _extraAfter.size(); } - const std::string& extraAfter( unsigned i ) const { return _extraAfter[i]; } - - /** - * Calculates term/score pairs for a BSONObj as applied to this spec. - * @arg obj document to traverse; can be a subdocument or array - * @arg term_freqs output parameter to store (term,score) results - */ - void scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const; - - /** - * given a query, pulls out the pieces (in order) that go in the index first - */ - Status getIndexPrefix( const BSONObj& filter, BSONObj* out ) const; - - const Weights& weights() const { return _weights; } - static BSONObj fixSpec( const BSONObj& spec ); - - /** - * Returns text index version. - */ - TextIndexVersion getTextIndexVersion() const { return _textIndexVersion; } - - private: - // - // Helper methods. Invoked for TEXT_INDEX_VERSION_2 spec objects only. - // - - /** - * Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses - * 'raw' using 'tools', and weights term scores based on 'weight'. - */ - void _scoreStringV2( const Tools& tools, - const StringData& raw, - TermFrequencyMap* term_freqs, - double weight ) const; - - public: - /** - * Get the language override for the given BSON doc. If no language override is - * specified, returns currentLanguage. 
- */ - const FTSLanguage* _getLanguageToUseV2( const BSONObj& userDoc, - const FTSLanguage* currentLanguage ) const; - - private: - // - // Deprecated helper methods. Invoked for TEXT_INDEX_VERSION_1 spec objects only. - // - - void _scoreStringV1( const Tools& tools, - const StringData& raw, - TermFrequencyMap* docScores, - double weight ) const; - - bool _weightV1( const StringData& field, double* out ) const; - - void _scoreRecurseV1( const Tools& tools, - const BSONObj& obj, - TermFrequencyMap* term_freqs ) const; - - void _scoreDocumentV1( const BSONObj& obj, TermFrequencyMap* term_freqs ) const; - - const FTSLanguage& _getLanguageToUseV1( const BSONObj& userDoc ) const; - - static BSONObj _fixSpecV1( const BSONObj& spec ); - - // - // Instance variables. - // - - TextIndexVersion _textIndexVersion; +namespace fts { + +extern const double MAX_WEIGHT; +extern const double MAX_WORD_WEIGHT; +extern const double DEFAULT_WEIGHT; + +typedef std::map<std::string, double> Weights; // TODO cool map +typedef unordered_map<std::string, double> TermFrequencyMap; + +struct ScoreHelperStruct { + ScoreHelperStruct() : freq(0), count(0), exp(0) {} + double freq; + double count; + double exp; +}; +typedef unordered_map<std::string, ScoreHelperStruct> ScoreHelperMap; + +class FTSSpec { + struct Tools { + Tools(const FTSLanguage& _language, const Stemmer* _stemmer, const StopWords* _stopwords) + : language(_language), stemmer(_stemmer), stopwords(_stopwords) {} + + const FTSLanguage& language; + const Stemmer* stemmer; + const StopWords* stopwords; + }; + +public: + FTSSpec(const BSONObj& indexInfo); + + bool wildcard() const { + return _wildcard; + } + const FTSLanguage& defaultLanguage() const { + return *_defaultLanguage; + } + const std::string& languageOverrideField() const { + return _languageOverrideField; + } + + size_t numExtraBefore() const { + return _extraBefore.size(); + } + const std::string& extraBefore(unsigned i) const { + return _extraBefore[i]; + } + + size_t 
numExtraAfter() const { + return _extraAfter.size(); + } + const std::string& extraAfter(unsigned i) const { + return _extraAfter[i]; + } - const FTSLanguage* _defaultLanguage; - std::string _languageOverrideField; - bool _wildcard; + /** + * Calculates term/score pairs for a BSONObj as applied to this spec. + * @arg obj document to traverse; can be a subdocument or array + * @arg term_freqs output parameter to store (term,score) results + */ + void scoreDocument(const BSONObj& obj, TermFrequencyMap* term_freqs) const; - // mapping : fieldname -> weight - Weights _weights; - - // Prefix compound key - used to partition search index - std::vector<std::string> _extraBefore; + /** + * given a query, pulls out the pieces (in order) that go in the index first + */ + Status getIndexPrefix(const BSONObj& filter, BSONObj* out) const; - // Suffix compound key - used for covering index behavior - std::vector<std::string> _extraAfter; - }; + const Weights& weights() const { + return _weights; + } + static BSONObj fixSpec(const BSONObj& spec); + /** + * Returns text index version. + */ + TextIndexVersion getTextIndexVersion() const { + return _textIndexVersion; } + +private: + // + // Helper methods. Invoked for TEXT_INDEX_VERSION_2 spec objects only. + // + + /** + * Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses + * 'raw' using 'tools', and weights term scores based on 'weight'. + */ + void _scoreStringV2(const Tools& tools, + const StringData& raw, + TermFrequencyMap* term_freqs, + double weight) const; + +public: + /** + * Get the language override for the given BSON doc. If no language override is + * specified, returns currentLanguage. + */ + const FTSLanguage* _getLanguageToUseV2(const BSONObj& userDoc, + const FTSLanguage* currentLanguage) const; + +private: + // + // Deprecated helper methods. Invoked for TEXT_INDEX_VERSION_1 spec objects only. 
+ // + + void _scoreStringV1(const Tools& tools, + const StringData& raw, + TermFrequencyMap* docScores, + double weight) const; + + bool _weightV1(const StringData& field, double* out) const; + + void _scoreRecurseV1(const Tools& tools, + const BSONObj& obj, + TermFrequencyMap* term_freqs) const; + + void _scoreDocumentV1(const BSONObj& obj, TermFrequencyMap* term_freqs) const; + + const FTSLanguage& _getLanguageToUseV1(const BSONObj& userDoc) const; + + static BSONObj _fixSpecV1(const BSONObj& spec); + + // + // Instance variables. + // + + TextIndexVersion _textIndexVersion; + + const FTSLanguage* _defaultLanguage; + std::string _languageOverrideField; + bool _wildcard; + + // mapping : fieldname -> weight + Weights _weights; + + // Prefix compound key - used to partition search index + std::vector<std::string> _extraBefore; + + // Suffix compound key - used for covering index behavior + std::vector<std::string> _extraAfter; +}; +} } diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp index 5f958f97b38..d8bf658097d 100644 --- a/src/mongo/db/fts/fts_spec_legacy.cpp +++ b/src/mongo/db/fts/fts_spec_legacy.cpp @@ -32,291 +32,269 @@ namespace mongo { - namespace fts { +namespace fts { - // - // This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1 - // text indexes. - // +// +// This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1 +// text indexes. 
+// - using std::map; - using std::string; - using namespace mongoutils; +using std::map; +using std::string; +using namespace mongoutils; - namespace { - void _addFTSStuff( BSONObjBuilder* b ) { - b->append( "_fts", INDEX_NAME ); - b->append( "_ftsx", 1 ); - } - } +namespace { +void _addFTSStuff(BSONObjBuilder* b) { + b->append("_fts", INDEX_NAME); + b->append("_ftsx", 1); +} +} - const FTSLanguage& FTSSpec::_getLanguageToUseV1( const BSONObj& userDoc ) const { - BSONElement e = userDoc[_languageOverrideField]; - if ( e.type() == String ) { - const char * x = e.valuestrsafe(); - if ( strlen( x ) > 0 ) { - StatusWithFTSLanguage swl = FTSLanguage::make( x, TEXT_INDEX_VERSION_1 ); - dassert( swl.isOK() ); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail. - return *swl.getValue(); - } - } - return *_defaultLanguage; +const FTSLanguage& FTSSpec::_getLanguageToUseV1(const BSONObj& userDoc) const { + BSONElement e = userDoc[_languageOverrideField]; + if (e.type() == String) { + const char* x = e.valuestrsafe(); + if (strlen(x) > 0) { + StatusWithFTSLanguage swl = FTSLanguage::make(x, TEXT_INDEX_VERSION_1); + dassert(swl.isOK()); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail. 
+ return *swl.getValue(); } + } + return *_defaultLanguage; +} - void FTSSpec::_scoreStringV1( const Tools& tools, - const StringData& raw, - TermFrequencyMap* docScores, - double weight ) const { - - ScoreHelperMap terms; +void FTSSpec::_scoreStringV1(const Tools& tools, + const StringData& raw, + TermFrequencyMap* docScores, + double weight) const { + ScoreHelperMap terms; - unsigned numTokens = 0; + unsigned numTokens = 0; - Tokenizer i( tools.language, raw ); - while ( i.more() ) { - Token t = i.next(); - if ( t.type != Token::TEXT ) - continue; + Tokenizer i(tools.language, raw); + while (i.more()) { + Token t = i.next(); + if (t.type != Token::TEXT) + continue; - string term = t.data.toString(); - makeLower( &term ); - if ( tools.stopwords->isStopWord( term ) ) - continue; - term = tools.stemmer->stem( term ); + string term = t.data.toString(); + makeLower(&term); + if (tools.stopwords->isStopWord(term)) + continue; + term = tools.stemmer->stem(term); - ScoreHelperStruct& data = terms[term]; + ScoreHelperStruct& data = terms[term]; - if ( data.exp ) - data.exp *= 2; - else - data.exp = 1; - data.count += 1; - data.freq += ( 1 / data.exp ); + if (data.exp) + data.exp *= 2; + else + data.exp = 1; + data.count += 1; + data.freq += (1 / data.exp); - numTokens++; - } + numTokens++; + } - for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) { + for (ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i) { + const string& term = i->first; + const ScoreHelperStruct& data = i->second; - const string& term = i->first; - const ScoreHelperStruct& data = i->second; + // in order to adjust weights as a function of term count as it + // relates to total field length. ie. is this the only word or + // a frequently occuring term? or does it only show up once in + // a long block of text? - // in order to adjust weights as a function of term count as it - // relates to total field length. ie. 
is this the only word or - // a frequently occuring term? or does it only show up once in - // a long block of text? + double coeff = (0.5 * data.count / numTokens) + 0.5; - double coeff = ( 0.5 * data.count / numTokens ) + 0.5; + // if term is identical to the raw form of the + // field (untokenized) give it a small boost. + double adjustment = 1; + if (raw.size() == term.length() && raw.equalCaseInsensitive(term)) + adjustment += 0.1; - // if term is identical to the raw form of the - // field (untokenized) give it a small boost. - double adjustment = 1; - if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) ) - adjustment += 0.1; + double& score = (*docScores)[term]; + score += (weight * data.freq * coeff * adjustment); + verify(score <= MAX_WEIGHT); + } +} - double& score = (*docScores)[term]; - score += ( weight * data.freq * coeff * adjustment ); - verify( score <= MAX_WEIGHT ); - } - } +bool FTSSpec::_weightV1(const StringData& field, double* out) const { + Weights::const_iterator i = _weights.find(field.toString()); + if (i == _weights.end()) + return false; + *out = i->second; + return true; +} - bool FTSSpec::_weightV1( const StringData& field, double* out ) const { - Weights::const_iterator i = _weights.find( field.toString() ); - if ( i == _weights.end() ) - return false; - *out = i->second; - return true; +/* + * Recurses over all fields of an obj (document in collection) + * and fills term,score map term_freqs + * @param tokenizer, tokenizer to tokenize a string into terms + * @param obj, object being parsed + * term_freqs, map <term,score> to be filled up + */ +void FTSSpec::_scoreRecurseV1(const Tools& tools, + const BSONObj& obj, + TermFrequencyMap* term_freqs) const { + BSONObjIterator j(obj); + while (j.more()) { + BSONElement x = j.next(); + + if (languageOverrideField() == x.fieldName()) + continue; + + if (x.type() == String) { + double w = 1; + _weightV1(x.fieldName(), &w); + _scoreStringV1(tools, x.valuestr(), term_freqs, w); 
+ } else if (x.isABSONObj()) { + _scoreRecurseV1(tools, x.Obj(), term_freqs); } + } +} - /* - * Recurses over all fields of an obj (document in collection) - * and fills term,score map term_freqs - * @param tokenizer, tokenizer to tokenize a string into terms - * @param obj, object being parsed - * term_freqs, map <term,score> to be filled up - */ - void FTSSpec::_scoreRecurseV1( const Tools& tools, - const BSONObj& obj, - TermFrequencyMap* term_freqs ) const { - BSONObjIterator j( obj ); - while ( j.more() ) { - BSONElement x = j.next(); +void FTSSpec::_scoreDocumentV1(const BSONObj& obj, TermFrequencyMap* term_freqs) const { + const FTSLanguage& language = _getLanguageToUseV1(obj); - if ( languageOverrideField() == x.fieldName() ) - continue; + Stemmer stemmer(language); + Tools tools(language, &stemmer, StopWords::getStopWords(language)); - if (x.type() == String) { - double w = 1; - _weightV1( x.fieldName(), &w ); - _scoreStringV1(tools, x.valuestr(), term_freqs, w); - } - else if ( x.isABSONObj() ) { - _scoreRecurseV1( tools, x.Obj(), term_freqs); - } + if (wildcard()) { + // if * is specified for weight, we can recurse over all fields. 
+ _scoreRecurseV1(tools, obj, term_freqs); + return; + } + // otherwise, we need to remember the different weights for each field + // and act accordingly (in other words, call _score) + for (Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++) { + const char* leftOverName = i->first.c_str(); + // name of field + BSONElement e = obj.getFieldDottedOrArray(leftOverName); + // weight associated to name of field + double weight = i->second; + + if (e.eoo()) { + // do nothing + } else if (e.type() == Array) { + BSONObjIterator j(e.Obj()); + while (j.more()) { + BSONElement x = j.next(); + if (leftOverName[0] && x.isABSONObj()) + x = x.Obj().getFieldDotted(leftOverName); + if (x.type() == String) + _scoreStringV1(tools, x.valuestr(), term_freqs, weight); } + } else if (e.type() == String) { + _scoreStringV1(tools, e.valuestr(), term_freqs, weight); } + } +} - void FTSSpec::_scoreDocumentV1( const BSONObj& obj, - TermFrequencyMap* term_freqs ) const { - - const FTSLanguage& language = _getLanguageToUseV1( obj ); - - Stemmer stemmer(language); - Tools tools(language, &stemmer, StopWords::getStopWords( language )); - - if ( wildcard() ) { - // if * is specified for weight, we can recurse over all fields. 
- _scoreRecurseV1(tools, obj, term_freqs); - return; - } - - // otherwise, we need to remember the different weights for each field - // and act accordingly (in other words, call _score) - for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++ ) { - const char * leftOverName = i->first.c_str(); - // name of field - BSONElement e = obj.getFieldDottedOrArray(leftOverName); - // weight associated to name of field - double weight = i->second; - - if ( e.eoo() ) { - // do nothing - } - else if ( e.type() == Array ) { - BSONObjIterator j( e.Obj() ); - while ( j.more() ) { - BSONElement x = j.next(); - if ( leftOverName[0] && x.isABSONObj() ) - x = x.Obj().getFieldDotted( leftOverName ); - if ( x.type() == String ) - _scoreStringV1( tools, x.valuestr(), term_freqs, weight ); - } - } - else if ( e.type() == String ) { - _scoreStringV1( tools, e.valuestr(), term_freqs, weight ); +BSONObj FTSSpec::_fixSpecV1(const BSONObj& spec) { + map<string, int> m; + + BSONObj keyPattern; + { + BSONObjBuilder b; + bool addedFtsStuff = false; + + BSONObjIterator i(spec["key"].Obj()); + while (i.more()) { + BSONElement e = i.next(); + if (str::equals(e.fieldName(), "_fts") || str::equals(e.fieldName(), "_ftsx")) { + addedFtsStuff = true; + b.append(e); + } else if (e.type() == String && + (str::equals("fts", e.valuestr()) || str::equals("text", e.valuestr()))) { + if (!addedFtsStuff) { + _addFTSStuff(&b); + addedFtsStuff = true; } + m[e.fieldName()] = 1; + } else { + b.append(e); } } - BSONObj FTSSpec::_fixSpecV1( const BSONObj& spec ) { - map<string,int> m; - - BSONObj keyPattern; - { - BSONObjBuilder b; - bool addedFtsStuff = false; - - BSONObjIterator i( spec["key"].Obj() ); - while ( i.more() ) { - BSONElement e = i.next(); - if ( str::equals( e.fieldName(), "_fts" ) || - str::equals( e.fieldName(), "_ftsx" ) ) { - addedFtsStuff = true; - b.append( e ); - } - else if ( e.type() == String && - ( str::equals( "fts", e.valuestr() ) || - str::equals( "text", 
e.valuestr() ) ) ) { - - if ( !addedFtsStuff ) { - _addFTSStuff( &b ); - addedFtsStuff = true; - } - - m[e.fieldName()] = 1; - } - else { - b.append( e ); - } - } - - if ( !addedFtsStuff ) - _addFTSStuff( &b ); - - keyPattern = b.obj(); - } - - if ( spec["weights"].isABSONObj() ) { - BSONObjIterator i( spec["weights"].Obj() ); - while ( i.more() ) { - BSONElement e = i.next(); - m[e.fieldName()] = e.numberInt(); - } - } - else if ( spec["weights"].str() == WILDCARD ) { - m[WILDCARD] = 1; - } - - BSONObj weights; - { - BSONObjBuilder b; - for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) { - uassert( 17365, "score for word too high", - i->second > 0 && i->second < MAX_WORD_WEIGHT ); - b.append( i->first, i->second ); - } - weights = b.obj(); - } + if (!addedFtsStuff) + _addFTSStuff(&b); - string default_language(spec.getStringField("default_language")); - if ( default_language.empty() ) - default_language = "english"; + keyPattern = b.obj(); + } - string language_override(spec.getStringField("language_override")); - if ( language_override.empty() ) - language_override = "language"; + if (spec["weights"].isABSONObj()) { + BSONObjIterator i(spec["weights"].Obj()); + while (i.more()) { + BSONElement e = i.next(); + m[e.fieldName()] = e.numberInt(); + } + } else if (spec["weights"].str() == WILDCARD) { + m[WILDCARD] = 1; + } - int version = -1; - int textIndexVersion = 1; + BSONObj weights; + { + BSONObjBuilder b; + for (map<string, int>::iterator i = m.begin(); i != m.end(); ++i) { + uassert(17365, "score for word too high", i->second > 0 && i->second < MAX_WORD_WEIGHT); + b.append(i->first, i->second); + } + weights = b.obj(); + } - BSONObjBuilder b; - BSONObjIterator i( spec ); - while ( i.more() ) { - BSONElement e = i.next(); - if ( str::equals( e.fieldName(), "key" ) ) { - b.append( "key", keyPattern ); - } - else if ( str::equals( e.fieldName(), "weights" ) ) { - b.append( "weights", weights ); - weights = BSONObj(); - } - else if ( str::equals( 
e.fieldName(), "default_language" ) ) { - b.append( "default_language", default_language); - default_language = ""; - } - else if ( str::equals( e.fieldName(), "language_override" ) ) { - b.append( "language_override", language_override); - language_override = ""; - } - else if ( str::equals( e.fieldName(), "v" ) ) { - version = e.numberInt(); - } - else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) { - textIndexVersion = e.numberInt(); - uassert( 17366, - str::stream() << "bad textIndexVersion: " << textIndexVersion, - textIndexVersion == 1 ); - } - else { - b.append( e ); - } - } + string default_language(spec.getStringField("default_language")); + if (default_language.empty()) + default_language = "english"; + + string language_override(spec.getStringField("language_override")); + if (language_override.empty()) + language_override = "language"; + + int version = -1; + int textIndexVersion = 1; + + BSONObjBuilder b; + BSONObjIterator i(spec); + while (i.more()) { + BSONElement e = i.next(); + if (str::equals(e.fieldName(), "key")) { + b.append("key", keyPattern); + } else if (str::equals(e.fieldName(), "weights")) { + b.append("weights", weights); + weights = BSONObj(); + } else if (str::equals(e.fieldName(), "default_language")) { + b.append("default_language", default_language); + default_language = ""; + } else if (str::equals(e.fieldName(), "language_override")) { + b.append("language_override", language_override); + language_override = ""; + } else if (str::equals(e.fieldName(), "v")) { + version = e.numberInt(); + } else if (str::equals(e.fieldName(), "textIndexVersion")) { + textIndexVersion = e.numberInt(); + uassert(17366, + str::stream() << "bad textIndexVersion: " << textIndexVersion, + textIndexVersion == 1); + } else { + b.append(e); + } + } - if ( !weights.isEmpty() ) - b.append( "weights", weights ); - if ( !default_language.empty() ) - b.append( "default_language", default_language); - if ( !language_override.empty() ) - b.append( 
"language_override", language_override); + if (!weights.isEmpty()) + b.append("weights", weights); + if (!default_language.empty()) + b.append("default_language", default_language); + if (!language_override.empty()) + b.append("language_override", language_override); - if ( version >= 0 ) - b.append( "v", version ); + if (version >= 0) + b.append("v", version); - b.append( "textIndexVersion", textIndexVersion ); + b.append("textIndexVersion", textIndexVersion); - return b.obj(); - } - } + return b.obj(); +} +} } diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp index 832279eb18d..c9f628a2b28 100644 --- a/src/mongo/db/fts/fts_spec_test.cpp +++ b/src/mongo/db/fts/fts_spec_test.cpp @@ -36,541 +36,558 @@ namespace mongo { - using std::set; - using std::string; - - namespace fts { - - /** - * Assert that fixSpec() accepts the provided text index spec. - */ - void assertFixSuccess( const std::string& s ) { - BSONObj user = fromjson( s ); - - try { - // fixSpec() should not throw on a valid spec. - BSONObj fixed = FTSSpec::fixSpec( user ); - - // fixSpec() on an already-fixed spec shouldn't change it. - BSONObj fixed2 = FTSSpec::fixSpec( fixed ); - ASSERT_EQUALS( fixed, fixed2 ); - } - catch ( UserException& ) { - ASSERT( false ); - } - } - - /** - * Assert that fixSpec() rejects the provided text index spec. - */ - void assertFixFailure( const std::string& s ) { - BSONObj user = fromjson( s ); - - try { - // fixSpec() on an invalid spec should uassert. 
- BSONObj fixed = FTSSpec::fixSpec( user ); - } - catch ( UserException& ) { - return; - } - ASSERT( false ); - } - - TEST( FTSSpec, FixNormalKey1 ) { - assertFixSuccess("{key: {a: 'text'}}"); - assertFixSuccess("{key: {a: 'text', b: 'text'}}"); - assertFixSuccess("{key: {a: 'text', b: 'text', c: 'text'}}"); - - assertFixFailure("{key: {_fts: 'text'}}"); // not allowed to index reserved field - assertFixFailure("{key: {_ftsx: 'text'}}"); - } - - TEST( FTSSpec, FixCompoundKey1 ) { - assertFixSuccess("{key: {a: 'text', b: 1.0}}"); - assertFixSuccess("{key: {a: 'text', b: NumberInt(1)}}"); - assertFixSuccess("{key: {a: 'text', b: NumberLong(1)}}"); - assertFixSuccess("{key: {a: 'text', b: -1.0}}"); - assertFixSuccess("{key: {a: 'text', b: NumberInt(-1)}}"); - assertFixSuccess("{key: {a: 'text', b: NumberLong(-1)}}"); - assertFixSuccess("{key: {a: 1.0, b: 'text'}}"); - assertFixSuccess("{key: {a: NumberInt(1), b: 'text'}}"); - assertFixSuccess("{key: {a: NumberLong(1), b: 'text'}}"); - assertFixSuccess("{key: {a: -1, b: 'text'}}"); - assertFixSuccess("{key: {a: 1, b: 1, c: 'text'}}"); - assertFixSuccess("{key: {a: 1, b: -1, c: 'text'}}"); - assertFixSuccess("{key: {a: -1, b: 1, c: 'text'}}"); - assertFixSuccess("{key: {a: 1, b: 'text', c: 1}}"); - assertFixSuccess("{key: {a: 'text', b: 1, c: 1}}"); - assertFixSuccess("{key: {a: 'text', b: 1, c: -1}}"); - assertFixSuccess("{key: {a: 'text', b: 'text', c: 1}}"); - assertFixSuccess("{key: {a: 1, b: 'text', c: 'text'}}"); - - assertFixFailure("{key: {a: 'text', b: 0}}"); - assertFixFailure("{key: {a: 'text', b: '2d'}}"); // not allowed to mix special indexes - assertFixFailure("{key: {a: 'text', b: '1'}}"); - assertFixFailure("{key: {a: 'text', _fts: 1}}"); - assertFixFailure("{key: {a: 'text', _fts: 'text'}}"); - assertFixFailure("{key: {a: 'text', _ftsx: 1}}"); - assertFixFailure("{key: {a: 'text', _ftsx: 'text'}}"); - assertFixFailure("{key: {_fts: 1, a: 'text'}}"); - assertFixFailure("{key: {_fts: 'text', a: 
'text'}}"); - assertFixFailure("{key: {_ftsx: 1, a: 'text'}}"); - assertFixFailure("{key: {_ftsx: 'text', a: 'text'}}"); - assertFixFailure("{key: {a: 'text', b: 1, c: 'text'}}"); // 'text' must all be adjacent - assertFixFailure("{key: {a: 'text', b: 1, c: 'text', d: 1}}"); - assertFixFailure("{key: {a: 1, b: 'text', c: 1, d: 'text', e: 1}}"); - } - - TEST( FTSSpec, FixDefaultLanguage1 ) { - assertFixSuccess("{key: {a: 'text'}, default_language: 'english'}"); - assertFixSuccess("{key: {a: 'text'}, default_language: 'engLISH'}"); - assertFixSuccess("{key: {a: 'text'}, default_language: 'en'}"); - assertFixSuccess("{key: {a: 'text'}, default_language: 'eN'}"); - assertFixSuccess("{key: {a: 'text'}, default_language: 'spanish'}"); - assertFixSuccess("{key: {a: 'text'}, default_language: 'none'}"); - - assertFixFailure("{key: {a: 'text'}, default_language: 'engrish'}"); - assertFixFailure("{key: {a: 'text'}, default_language: ' english'}"); - assertFixFailure("{key: {a: 'text'}, default_language: ''}"); - } - - TEST( FTSSpec, FixWeights1 ) { - assertFixSuccess("{key: {a: 'text'}, weights: {}}"); - assertFixSuccess("{key: {a: 'text'}, weights: {a: 1.0}}"); - assertFixSuccess("{key: {a: 'text'}, weights: {a: NumberInt(1)}}"); - assertFixSuccess("{key: {a: 'text'}, weights: {a: NumberLong(1)}}"); - assertFixSuccess("{key: {a: 'text'}, weights: {a: 99999}}"); - assertFixSuccess("{key: {'$**': 'text'}, weights: {'a.b': 2}}"); - assertFixSuccess("{key: {'$**': 'text'}, weights: {a: 2, b: 2}}"); - assertFixSuccess("{key: {'$**': 'text'}, weights: {'$**': 2}}"); - - assertFixFailure("{key: {a: 'text'}, weights: 0}"); - assertFixFailure("{key: {a: 'text'}, weights: []}"); - assertFixFailure("{key: {a: 'text'}, weights: 'x'}"); - assertFixFailure("{key: {a: 'text'}, weights: {a: 0}}"); - assertFixFailure("{key: {a: 'text'}, weights: {a: -1}}"); - assertFixFailure("{key: {a: 'text'}, weights: {a: 100000}}"); // above max weight - assertFixFailure("{key: {a: 'text'}, weights: {a: 
'1'}}"); - assertFixFailure("{key: {a: 'text'}, weights: {'': 1}}"); // "invalid" path - assertFixFailure("{key: {a: 'text'}, weights: {'a.': 1}}"); - assertFixFailure("{key: {a: 'text'}, weights: {'.a': 1}}"); - assertFixFailure("{key: {a: 'text'}, weights: {'a..a': 1}}"); - assertFixFailure("{key: {a: 'text'}, weights: {$a: 1}}"); - assertFixFailure("{key: {a: 'text'}, weights: {'a.$a': 1}}"); - assertFixFailure("{key: {a: 'text'}, weights: {'a.$**': 1}}"); - } - - TEST( FTSSpec, FixLanguageOverride1 ) { - assertFixSuccess("{key: {a: 'text'}, language_override: 'foo'}"); - assertFixSuccess("{key: {a: 'text'}, language_override: 'foo$bar'}"); - - assertFixFailure("{key: {a: 'text'}, language_override: 'foo.bar'}"); // can't have '.' - assertFixFailure("{key: {a: 'text'}, language_override: ''}"); - assertFixFailure("{key: {a: 'text'}, language_override: '$foo'}"); - } - - TEST( FTSSpec, FixTextIndexVersion1 ) { - assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 1.0}}"); - assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(1)}}"); - assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(1)}}"); - assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 2.0}}"); - assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(2)}}"); - assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(2)}}"); - - assertFixFailure("{key: {a: 'text'}, textIndexVersion: 3}"); - assertFixFailure("{key: {a: 'text'}, textIndexVersion: '2'}"); - assertFixFailure("{key: {a: 'text'}, textIndexVersion: {}}"); - } - - TEST( FTSSpec, ScoreSingleField1 ) { - BSONObj user = BSON( "key" << BSON( "title" << "text" << - "text" << "text" ) << - "weights" << BSON( "title" << 10 ) ); - - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - TermFrequencyMap m; - spec.scoreDocument( BSON( "title" << "cat sat run" ), &m ); - ASSERT_EQUALS( 3U, m.size() ); - ASSERT_EQUALS( m["cat"], m["sat"] ); - ASSERT_EQUALS( m["cat"], m["run"] ); - ASSERT( m["cat"] > 0 ); - } - - 
TEST( FTSSpec, ScoreMultipleField1 ) { - BSONObj user = BSON( "key" << BSON( "title" << "text" << - "text" << "text" ) << - "weights" << BSON( "title" << 10 ) ); - - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - TermFrequencyMap m; - spec.scoreDocument( BSON( "title" << "cat sat run" << "text" << "cat book" ), &m ); - - ASSERT_EQUALS( 4U, m.size() ); - ASSERT_EQUALS( m["sat"], m["run"] ); - ASSERT( m["sat"] > 0 ); - - ASSERT( m["cat"] > m["sat"] ); - ASSERT( m["cat"] > m["book"] ); - ASSERT( m["book"] > 0 ); - ASSERT( m["book"] < m["sat"] ); - } - - TEST( FTSSpec, ScoreMultipleField2 ) { - // Test where one indexed field is a parent component of another indexed field. - BSONObj user = BSON( "key" << BSON( "a" << "text" << "a.b" << "text" ) ); - - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - TermFrequencyMap m; - spec.scoreDocument( BSON( "a" << BSON( "b" << "term" ) ), &m ); - ASSERT_EQUALS( 1U, m.size() ); - } - - TEST( FTSSpec, ScoreRepeatWord ) { - BSONObj user = BSON( "key" << BSON( "title" << "text" << - "text" << "text" ) << - "weights" << BSON( "title" << 10 ) ); - - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - TermFrequencyMap m; - spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ), &m ); - ASSERT_EQUALS( 3U, m.size() ); - ASSERT( m["cat"] > 0 ); - ASSERT( m["sat"] > m["cat"] ); - ASSERT( m["run"] > m["sat"] ); - - } - - TEST( FTSSpec, Extra1 ) { - BSONObj user = BSON( "key" << BSON( "data" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( user ) ); - ASSERT_EQUALS( 0U, spec.numExtraBefore() ); - ASSERT_EQUALS( 0U, spec.numExtraAfter() ); - } - - TEST( FTSSpec, Extra2 ) { - BSONObj user = BSON( "key" << BSON( "data" << "text" << "x" << 1 ) ); - BSONObj fixed = FTSSpec::fixSpec( user ); - FTSSpec spec( fixed ); - ASSERT_EQUALS( 0U, spec.numExtraBefore() ); - ASSERT_EQUALS( 1U, spec.numExtraAfter() ); - ASSERT_EQUALS( StringData("x"), spec.extraAfter(0) ); - - BSONObj fixed2 = FTSSpec::fixSpec( fixed ); - ASSERT_EQUALS( fixed, fixed2 ); - } - - 
TEST( FTSSpec, Extra3 ) { - BSONObj user = BSON( "key" << BSON( "x" << 1 << "data" << "text" ) ); - BSONObj fixed = FTSSpec::fixSpec( user ); - - ASSERT_EQUALS( BSON( "x" << 1 << - "_fts" << "text" << - "_ftsx" << 1 ), - fixed["key"].Obj() ); - ASSERT_EQUALS( BSON( "data" << 1 ), - fixed["weights"].Obj() ); - - BSONObj fixed2 = FTSSpec::fixSpec( fixed ); - ASSERT_EQUALS( fixed, fixed2 ); - - FTSSpec spec( fixed ); - ASSERT_EQUALS( 1U, spec.numExtraBefore() ); - ASSERT_EQUALS( StringData("x"), spec.extraBefore(0) ); - ASSERT_EQUALS( 0U, spec.numExtraAfter() ); - - BSONObj prefix; - - ASSERT( spec.getIndexPrefix( BSON( "x" << 2 ), &prefix ).isOK() ); - ASSERT_EQUALS( BSON( "x" << 2 ), prefix ); - - ASSERT( spec.getIndexPrefix( BSON( "x" << 3 << "y" << 4 ), &prefix ).isOK() ); - ASSERT_EQUALS( BSON( "x" << 3 ), prefix ); - - ASSERT( !spec.getIndexPrefix( BSON( "x" << BSON( "$gt" << 5 ) ), &prefix ).isOK() ); - ASSERT( !spec.getIndexPrefix( BSON( "y" << 4 ), &prefix ).isOK() ); - ASSERT( !spec.getIndexPrefix( BSONObj(), &prefix ).isOK() ); - } - - // Test for correct behavior when encountering nested arrays (both directly nested and - // indirectly nested). - - TEST( FTSSpec, NestedArraysPos1 ) { - BSONObj user = BSON( "key" << BSON( "a.b" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - // The following document matches {"a.b": {$type: 2}}, so "term" should be indexed. - BSONObj obj = fromjson("{a: [{b: ['term']}]}"); // indirectly nested arrays - TermFrequencyMap m; - spec.scoreDocument( obj, &m ); - ASSERT_EQUALS( 1U, m.size() ); - } - - TEST( FTSSpec, NestedArraysPos2 ) { - BSONObj user = BSON( "key" << BSON( "$**" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - // The wildcard spec implies a full recursive traversal, so "term" should be indexed. 
- BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays - TermFrequencyMap m; - spec.scoreDocument( obj, &m ); - ASSERT_EQUALS( 1U, m.size() ); - } - - TEST( FTSSpec, NestedArraysNeg1 ) { - BSONObj user = BSON( "key" << BSON( "a.b" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( user ) ); - - // The following document does not match {"a.b": {$type: 2}}, so "term" should not be - // indexed. - BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays - TermFrequencyMap m; - spec.scoreDocument( obj, &m ); - ASSERT_EQUALS( 0U, m.size() ); - } - - // Multi-language test_1: test independent stemming per sub-document - TEST( FTSSpec, NestedLanguages_PerArrayItemStemming ) { - BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - - BSONObj obj = fromjson( - "{ a :" - " { b :" - " [ { c : \"walked\", language : \"english\" }," - " { c : \"camminato\", language : \"italian\" }," - " { c : \"ging\", language : \"german\" } ]" - " }" - " }" ); - - spec.scoreDocument( obj, &tfm ); - - set<string> hits; - hits.insert("walk"); - hits.insert("cammin"); - hits.insert("ging"); - - for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { - string term = i->first; - ASSERT_EQUALS( 1U, hits.count( term ) ); - } - - } - - // Multi-language test_2: test nested stemming per sub-document - TEST( FTSSpec, NestedLanguages_PerSubdocStemming ) { - BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - - BSONObj obj = fromjson( - "{ language : \"english\"," - " a :" - " { language : \"danish\"," - " b :" - " [ { c : \"foredrag\" }," - " { c : \"foredragsholder\" }," - " { c : \"lector\" } ]" - " }" - "}" ); - - spec.scoreDocument( obj, &tfm ); - - set<string> hits; - hits.insert("foredrag"); - hits.insert("foredragshold"); - hits.insert("lector"); - - for 
(TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { - string term = i->first; - ASSERT_EQUALS( 1U, hits.count( term ) ); - } - - } - - // Multi-language test_3: test nested arrays - TEST( FTSSpec, NestedLanguages_NestedArrays ) { - BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - - BSONObj obj = fromjson( - "{ language : \"english\"," - " a : [" - " { language : \"danish\"," - " b :" - " [ { c : [\"foredrag\"] }," - " { c : [\"foredragsholder\"] }," - " { c : [\"lector\"] } ]" - " } ]" - "}" ); - - spec.scoreDocument( obj, &tfm ); - - set<string> hits; - hits.insert("foredrag"); - hits.insert("foredragshold"); - hits.insert("lector"); - - for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { - string term = i->first; - ASSERT_EQUALS( 1U, hits.count( term ) ); - } - - } - - // Multi-language test_4: test pruning - TEST( FTSSpec, NestedLanguages_PathPruning ) { - BSONObj indexSpec = BSON( "key" << BSON( "a.b.c" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - - BSONObj obj = fromjson( - "{ language : \"english\"," - " a : " - " { language : \"danish\"," - " bc : \"foo\"," - " b : { d: \"bar\" }," - " b :" - " [ { c : \"foredrag\" }," - " { c : \"foredragsholder\" }," - " { c : \"lector\" } ]" - " }" - "}" ); - - spec.scoreDocument( obj, &tfm ); - - set<string> hits; - hits.insert("foredrag"); - hits.insert("foredragshold"); - hits.insert("lector"); - - for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { - string term = i->first; - ASSERT_EQUALS( 1U, hits.count( term ) ); - } - - } - - // Multi-language test_5: test wildcard spec - TEST( FTSSpec, NestedLanguages_Wildcard ) { - BSONObj indexSpec = BSON( "key" << BSON( "$**" << "text" ) ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - - BSONObj obj = fromjson( - "{ language : \"english\"," - " b : 
\"walking\"," - " c : { e: \"walked\" }," - " d : " - " { language : \"danish\"," - " e :" - " [ { f : \"foredrag\" }," - " { f : \"foredragsholder\" }," - " { f : \"lector\" } ]" - " }" - "}" ); - - spec.scoreDocument( obj, &tfm ); - - set<string> hits; - hits.insert("foredrag"); - hits.insert("foredragshold"); - hits.insert("lector"); - hits.insert("walk"); - - for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { - string term = i->first; - ASSERT_EQUALS( 1U, hits.count( term ) ); - } - - } - - // Multi-language test_6: test wildcard spec with override - TEST( FTSSpec, NestedLanguages_WildcardOverride ) { - BSONObj indexSpec = BSON( "key" << BSON( "$**" << "text" ) << - "weights" << BSON( "d.e.f" << 20 ) ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - - BSONObj obj = fromjson( - "{ language : \"english\"," - " b : \"walking\"," - " c : { e: \"walked\" }," - " d : " - " { language : \"danish\"," - " e :" - " [ { f : \"foredrag\" }," - " { f : \"foredragsholder\" }," - " { f : \"lector\" } ]" - " }" - "}" ); - - spec.scoreDocument( obj, &tfm ); - - set<string> hits; - hits.insert("foredrag"); - hits.insert("foredragshold"); - hits.insert("lector"); - hits.insert("walk"); - - for (TermFrequencyMap::const_iterator i = tfm.begin(); i!=tfm.end(); ++i) { - string term = i->first; - ASSERT_EQUALS( 1U, hits.count( term ) ); - } - - } - - /** Test differences across textIndexVersion values in handling of nested arrays. */ - TEST( FTSSpec, TextIndexLegacyNestedArrays ) { - BSONObj obj = fromjson( "{a: [{b: ['hello']}]}" ); - - // textIndexVersion=1 FTSSpec objects do not index nested arrays. - { - BSONObj indexSpec = fromjson( "{key: {'a.b': 'text'}, textIndexVersion: 1}" ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - spec.scoreDocument( obj, &tfm ); - ASSERT_EQUALS( tfm.size(), 0U ); - } - - // textIndexVersion=2 FTSSpec objects do index nested arrays. 
- { - BSONObj indexSpec = fromjson( "{key: {'a.b': 'text'}, textIndexVersion: 2}" ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - spec.scoreDocument( obj, &tfm ); - ASSERT_EQUALS( tfm.size(), 1U ); - } - } - - /** Test differences across textIndexVersion values in handling of language annotations. */ - TEST( FTSSpec, TextIndexLegacyLanguageRecognition) { - BSONObj obj = fromjson( "{a: 'the', language: 'EN'}" ); - - // textIndexVersion=1 FTSSpec objects treat two-letter language annotations as "none" - // for purposes of stopword processing. - { - BSONObj indexSpec = fromjson( "{key: {'a': 'text'}, textIndexVersion: 1}" ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - spec.scoreDocument( obj, &tfm ); - ASSERT_EQUALS( tfm.size(), 1U ); // "the" not recognized as stopword - } - - // textIndexVersion=2 FTSSpec objects recognize two-letter codes. - { - BSONObj indexSpec = fromjson( "{key: {'a': 'text'}, textIndexVersion: 2}" ); - FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); - TermFrequencyMap tfm; - spec.scoreDocument( obj, &tfm ); - ASSERT_EQUALS( tfm.size(), 0U ); // "the" recognized as stopword - } - } +using std::set; +using std::string; +namespace fts { + +/** + * Assert that fixSpec() accepts the provided text index spec. + */ +void assertFixSuccess(const std::string& s) { + BSONObj user = fromjson(s); + + try { + // fixSpec() should not throw on a valid spec. + BSONObj fixed = FTSSpec::fixSpec(user); + + // fixSpec() on an already-fixed spec shouldn't change it. + BSONObj fixed2 = FTSSpec::fixSpec(fixed); + ASSERT_EQUALS(fixed, fixed2); + } catch (UserException&) { + ASSERT(false); + } +} + +/** + * Assert that fixSpec() rejects the provided text index spec. + */ +void assertFixFailure(const std::string& s) { + BSONObj user = fromjson(s); + + try { + // fixSpec() on an invalid spec should uassert. 
+ BSONObj fixed = FTSSpec::fixSpec(user); + } catch (UserException&) { + return; + } + ASSERT(false); +} + +TEST(FTSSpec, FixNormalKey1) { + assertFixSuccess("{key: {a: 'text'}}"); + assertFixSuccess("{key: {a: 'text', b: 'text'}}"); + assertFixSuccess("{key: {a: 'text', b: 'text', c: 'text'}}"); + + assertFixFailure("{key: {_fts: 'text'}}"); // not allowed to index reserved field + assertFixFailure("{key: {_ftsx: 'text'}}"); +} + +TEST(FTSSpec, FixCompoundKey1) { + assertFixSuccess("{key: {a: 'text', b: 1.0}}"); + assertFixSuccess("{key: {a: 'text', b: NumberInt(1)}}"); + assertFixSuccess("{key: {a: 'text', b: NumberLong(1)}}"); + assertFixSuccess("{key: {a: 'text', b: -1.0}}"); + assertFixSuccess("{key: {a: 'text', b: NumberInt(-1)}}"); + assertFixSuccess("{key: {a: 'text', b: NumberLong(-1)}}"); + assertFixSuccess("{key: {a: 1.0, b: 'text'}}"); + assertFixSuccess("{key: {a: NumberInt(1), b: 'text'}}"); + assertFixSuccess("{key: {a: NumberLong(1), b: 'text'}}"); + assertFixSuccess("{key: {a: -1, b: 'text'}}"); + assertFixSuccess("{key: {a: 1, b: 1, c: 'text'}}"); + assertFixSuccess("{key: {a: 1, b: -1, c: 'text'}}"); + assertFixSuccess("{key: {a: -1, b: 1, c: 'text'}}"); + assertFixSuccess("{key: {a: 1, b: 'text', c: 1}}"); + assertFixSuccess("{key: {a: 'text', b: 1, c: 1}}"); + assertFixSuccess("{key: {a: 'text', b: 1, c: -1}}"); + assertFixSuccess("{key: {a: 'text', b: 'text', c: 1}}"); + assertFixSuccess("{key: {a: 1, b: 'text', c: 'text'}}"); + + assertFixFailure("{key: {a: 'text', b: 0}}"); + assertFixFailure("{key: {a: 'text', b: '2d'}}"); // not allowed to mix special indexes + assertFixFailure("{key: {a: 'text', b: '1'}}"); + assertFixFailure("{key: {a: 'text', _fts: 1}}"); + assertFixFailure("{key: {a: 'text', _fts: 'text'}}"); + assertFixFailure("{key: {a: 'text', _ftsx: 1}}"); + assertFixFailure("{key: {a: 'text', _ftsx: 'text'}}"); + assertFixFailure("{key: {_fts: 1, a: 'text'}}"); + assertFixFailure("{key: {_fts: 'text', a: 'text'}}"); + 
assertFixFailure("{key: {_ftsx: 1, a: 'text'}}"); + assertFixFailure("{key: {_ftsx: 'text', a: 'text'}}"); + assertFixFailure("{key: {a: 'text', b: 1, c: 'text'}}"); // 'text' must all be adjacent + assertFixFailure("{key: {a: 'text', b: 1, c: 'text', d: 1}}"); + assertFixFailure("{key: {a: 1, b: 'text', c: 1, d: 'text', e: 1}}"); +} + +TEST(FTSSpec, FixDefaultLanguage1) { + assertFixSuccess("{key: {a: 'text'}, default_language: 'english'}"); + assertFixSuccess("{key: {a: 'text'}, default_language: 'engLISH'}"); + assertFixSuccess("{key: {a: 'text'}, default_language: 'en'}"); + assertFixSuccess("{key: {a: 'text'}, default_language: 'eN'}"); + assertFixSuccess("{key: {a: 'text'}, default_language: 'spanish'}"); + assertFixSuccess("{key: {a: 'text'}, default_language: 'none'}"); + + assertFixFailure("{key: {a: 'text'}, default_language: 'engrish'}"); + assertFixFailure("{key: {a: 'text'}, default_language: ' english'}"); + assertFixFailure("{key: {a: 'text'}, default_language: ''}"); +} + +TEST(FTSSpec, FixWeights1) { + assertFixSuccess("{key: {a: 'text'}, weights: {}}"); + assertFixSuccess("{key: {a: 'text'}, weights: {a: 1.0}}"); + assertFixSuccess("{key: {a: 'text'}, weights: {a: NumberInt(1)}}"); + assertFixSuccess("{key: {a: 'text'}, weights: {a: NumberLong(1)}}"); + assertFixSuccess("{key: {a: 'text'}, weights: {a: 99999}}"); + assertFixSuccess("{key: {'$**': 'text'}, weights: {'a.b': 2}}"); + assertFixSuccess("{key: {'$**': 'text'}, weights: {a: 2, b: 2}}"); + assertFixSuccess("{key: {'$**': 'text'}, weights: {'$**': 2}}"); + + assertFixFailure("{key: {a: 'text'}, weights: 0}"); + assertFixFailure("{key: {a: 'text'}, weights: []}"); + assertFixFailure("{key: {a: 'text'}, weights: 'x'}"); + assertFixFailure("{key: {a: 'text'}, weights: {a: 0}}"); + assertFixFailure("{key: {a: 'text'}, weights: {a: -1}}"); + assertFixFailure("{key: {a: 'text'}, weights: {a: 100000}}"); // above max weight + assertFixFailure("{key: {a: 'text'}, weights: {a: '1'}}"); + 
assertFixFailure("{key: {a: 'text'}, weights: {'': 1}}"); // "invalid" path + assertFixFailure("{key: {a: 'text'}, weights: {'a.': 1}}"); + assertFixFailure("{key: {a: 'text'}, weights: {'.a': 1}}"); + assertFixFailure("{key: {a: 'text'}, weights: {'a..a': 1}}"); + assertFixFailure("{key: {a: 'text'}, weights: {$a: 1}}"); + assertFixFailure("{key: {a: 'text'}, weights: {'a.$a': 1}}"); + assertFixFailure("{key: {a: 'text'}, weights: {'a.$**': 1}}"); +} + +TEST(FTSSpec, FixLanguageOverride1) { + assertFixSuccess("{key: {a: 'text'}, language_override: 'foo'}"); + assertFixSuccess("{key: {a: 'text'}, language_override: 'foo$bar'}"); + + assertFixFailure("{key: {a: 'text'}, language_override: 'foo.bar'}"); // can't have '.' + assertFixFailure("{key: {a: 'text'}, language_override: ''}"); + assertFixFailure("{key: {a: 'text'}, language_override: '$foo'}"); +} + +TEST(FTSSpec, FixTextIndexVersion1) { + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 1.0}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(1)}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(1)}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 2.0}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(2)}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(2)}}"); + + assertFixFailure("{key: {a: 'text'}, textIndexVersion: 3}"); + assertFixFailure("{key: {a: 'text'}, textIndexVersion: '2'}"); + assertFixFailure("{key: {a: 'text'}, textIndexVersion: {}}"); +} + +TEST(FTSSpec, ScoreSingleField1) { + BSONObj user = BSON("key" << BSON("title" + << "text" + << "text" + << "text") << "weights" << BSON("title" << 10)); + + FTSSpec spec(FTSSpec::fixSpec(user)); + + TermFrequencyMap m; + spec.scoreDocument(BSON("title" + << "cat sat run"), + &m); + ASSERT_EQUALS(3U, m.size()); + ASSERT_EQUALS(m["cat"], m["sat"]); + ASSERT_EQUALS(m["cat"], m["run"]); + ASSERT(m["cat"] > 0); +} + +TEST(FTSSpec, ScoreMultipleField1) { + 
BSONObj user = BSON("key" << BSON("title" + << "text" + << "text" + << "text") << "weights" << BSON("title" << 10)); + + FTSSpec spec(FTSSpec::fixSpec(user)); + + TermFrequencyMap m; + spec.scoreDocument(BSON("title" + << "cat sat run" + << "text" + << "cat book"), + &m); + + ASSERT_EQUALS(4U, m.size()); + ASSERT_EQUALS(m["sat"], m["run"]); + ASSERT(m["sat"] > 0); + + ASSERT(m["cat"] > m["sat"]); + ASSERT(m["cat"] > m["book"]); + ASSERT(m["book"] > 0); + ASSERT(m["book"] < m["sat"]); +} + +TEST(FTSSpec, ScoreMultipleField2) { + // Test where one indexed field is a parent component of another indexed field. + BSONObj user = BSON("key" << BSON("a" + << "text" + << "a.b" + << "text")); + + FTSSpec spec(FTSSpec::fixSpec(user)); + + TermFrequencyMap m; + spec.scoreDocument(BSON("a" << BSON("b" + << "term")), + &m); + ASSERT_EQUALS(1U, m.size()); +} + +TEST(FTSSpec, ScoreRepeatWord) { + BSONObj user = BSON("key" << BSON("title" + << "text" + << "text" + << "text") << "weights" << BSON("title" << 10)); + + FTSSpec spec(FTSSpec::fixSpec(user)); + + TermFrequencyMap m; + spec.scoreDocument(BSON("title" + << "cat sat sat run run run"), + &m); + ASSERT_EQUALS(3U, m.size()); + ASSERT(m["cat"] > 0); + ASSERT(m["sat"] > m["cat"]); + ASSERT(m["run"] > m["sat"]); +} + +TEST(FTSSpec, Extra1) { + BSONObj user = BSON("key" << BSON("data" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(user)); + ASSERT_EQUALS(0U, spec.numExtraBefore()); + ASSERT_EQUALS(0U, spec.numExtraAfter()); +} + +TEST(FTSSpec, Extra2) { + BSONObj user = BSON("key" << BSON("data" + << "text" + << "x" << 1)); + BSONObj fixed = FTSSpec::fixSpec(user); + FTSSpec spec(fixed); + ASSERT_EQUALS(0U, spec.numExtraBefore()); + ASSERT_EQUALS(1U, spec.numExtraAfter()); + ASSERT_EQUALS(StringData("x"), spec.extraAfter(0)); + + BSONObj fixed2 = FTSSpec::fixSpec(fixed); + ASSERT_EQUALS(fixed, fixed2); +} + +TEST(FTSSpec, Extra3) { + BSONObj user = BSON("key" << BSON("x" << 1 << "data" + << "text")); + BSONObj fixed = 
FTSSpec::fixSpec(user); + + ASSERT_EQUALS(BSON("x" << 1 << "_fts" + << "text" + << "_ftsx" << 1), + fixed["key"].Obj()); + ASSERT_EQUALS(BSON("data" << 1), fixed["weights"].Obj()); + + BSONObj fixed2 = FTSSpec::fixSpec(fixed); + ASSERT_EQUALS(fixed, fixed2); + + FTSSpec spec(fixed); + ASSERT_EQUALS(1U, spec.numExtraBefore()); + ASSERT_EQUALS(StringData("x"), spec.extraBefore(0)); + ASSERT_EQUALS(0U, spec.numExtraAfter()); + + BSONObj prefix; + + ASSERT(spec.getIndexPrefix(BSON("x" << 2), &prefix).isOK()); + ASSERT_EQUALS(BSON("x" << 2), prefix); + + ASSERT(spec.getIndexPrefix(BSON("x" << 3 << "y" << 4), &prefix).isOK()); + ASSERT_EQUALS(BSON("x" << 3), prefix); + + ASSERT(!spec.getIndexPrefix(BSON("x" << BSON("$gt" << 5)), &prefix).isOK()); + ASSERT(!spec.getIndexPrefix(BSON("y" << 4), &prefix).isOK()); + ASSERT(!spec.getIndexPrefix(BSONObj(), &prefix).isOK()); +} + +// Test for correct behavior when encountering nested arrays (both directly nested and +// indirectly nested). + +TEST(FTSSpec, NestedArraysPos1) { + BSONObj user = BSON("key" << BSON("a.b" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(user)); + + // The following document matches {"a.b": {$type: 2}}, so "term" should be indexed. + BSONObj obj = fromjson("{a: [{b: ['term']}]}"); // indirectly nested arrays + TermFrequencyMap m; + spec.scoreDocument(obj, &m); + ASSERT_EQUALS(1U, m.size()); +} + +TEST(FTSSpec, NestedArraysPos2) { + BSONObj user = BSON("key" << BSON("$**" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(user)); + + // The wildcard spec implies a full recursive traversal, so "term" should be indexed. 
+ BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays + TermFrequencyMap m; + spec.scoreDocument(obj, &m); + ASSERT_EQUALS(1U, m.size()); +} + +TEST(FTSSpec, NestedArraysNeg1) { + BSONObj user = BSON("key" << BSON("a.b" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(user)); + + // The following document does not match {"a.b": {$type: 2}}, so "term" should not be + // indexed. + BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays + TermFrequencyMap m; + spec.scoreDocument(obj, &m); + ASSERT_EQUALS(0U, m.size()); +} + +// Multi-language test_1: test independent stemming per sub-document +TEST(FTSSpec, NestedLanguages_PerArrayItemStemming) { + BSONObj indexSpec = BSON("key" << BSON("a.b.c" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ a :" + " { b :" + " [ { c : \"walked\", language : \"english\" }," + " { c : \"camminato\", language : \"italian\" }," + " { c : \"ging\", language : \"german\" } ]" + " }" + " }"); + + spec.scoreDocument(obj, &tfm); + + set<string> hits; + hits.insert("walk"); + hits.insert("cammin"); + hits.insert("ging"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS(1U, hits.count(term)); + } +} + +// Multi-language test_2: test nested stemming per sub-document +TEST(FTSSpec, NestedLanguages_PerSubdocStemming) { + BSONObj indexSpec = BSON("key" << BSON("a.b.c" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " a :" + " { language : \"danish\"," + " b :" + " [ { c : \"foredrag\" }," + " { c : \"foredragsholder\" }," + " { c : \"lector\" } ]" + " }" + "}"); + + spec.scoreDocument(obj, &tfm); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); 
++i) { + string term = i->first; + ASSERT_EQUALS(1U, hits.count(term)); } } + +// Multi-language test_3: test nested arrays +TEST(FTSSpec, NestedLanguages_NestedArrays) { + BSONObj indexSpec = BSON("key" << BSON("a.b.c" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " a : [" + " { language : \"danish\"," + " b :" + " [ { c : [\"foredrag\"] }," + " { c : [\"foredragsholder\"] }," + " { c : [\"lector\"] } ]" + " } ]" + "}"); + + spec.scoreDocument(obj, &tfm); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS(1U, hits.count(term)); + } +} + +// Multi-language test_4: test pruning +TEST(FTSSpec, NestedLanguages_PathPruning) { + BSONObj indexSpec = BSON("key" << BSON("a.b.c" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " a : " + " { language : \"danish\"," + " bc : \"foo\"," + " b : { d: \"bar\" }," + " b :" + " [ { c : \"foredrag\" }," + " { c : \"foredragsholder\" }," + " { c : \"lector\" } ]" + " }" + "}"); + + spec.scoreDocument(obj, &tfm); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS(1U, hits.count(term)); + } +} + +// Multi-language test_5: test wildcard spec +TEST(FTSSpec, NestedLanguages_Wildcard) { + BSONObj indexSpec = BSON("key" << BSON("$**" + << "text")); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " b : \"walking\"," + " c : { e: \"walked\" }," + " d : " + " { language : \"danish\"," + " e :" + " [ { f : \"foredrag\" }," + 
" { f : \"foredragsholder\" }," + " { f : \"lector\" } ]" + " }" + "}"); + + spec.scoreDocument(obj, &tfm); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + hits.insert("walk"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS(1U, hits.count(term)); + } +} + +// Multi-language test_6: test wildcard spec with override +TEST(FTSSpec, NestedLanguages_WildcardOverride) { + BSONObj indexSpec = BSON("key" << BSON("$**" + << "text") << "weights" << BSON("d.e.f" << 20)); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + + BSONObj obj = fromjson( + "{ language : \"english\"," + " b : \"walking\"," + " c : { e: \"walked\" }," + " d : " + " { language : \"danish\"," + " e :" + " [ { f : \"foredrag\" }," + " { f : \"foredragsholder\" }," + " { f : \"lector\" } ]" + " }" + "}"); + + spec.scoreDocument(obj, &tfm); + + set<string> hits; + hits.insert("foredrag"); + hits.insert("foredragshold"); + hits.insert("lector"); + hits.insert("walk"); + + for (TermFrequencyMap::const_iterator i = tfm.begin(); i != tfm.end(); ++i) { + string term = i->first; + ASSERT_EQUALS(1U, hits.count(term)); + } +} + +/** Test differences across textIndexVersion values in handling of nested arrays. */ +TEST(FTSSpec, TextIndexLegacyNestedArrays) { + BSONObj obj = fromjson("{a: [{b: ['hello']}]}"); + + // textIndexVersion=1 FTSSpec objects do not index nested arrays. + { + BSONObj indexSpec = fromjson("{key: {'a.b': 'text'}, textIndexVersion: 1}"); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + spec.scoreDocument(obj, &tfm); + ASSERT_EQUALS(tfm.size(), 0U); + } + + // textIndexVersion=2 FTSSpec objects do index nested arrays. 
+ { + BSONObj indexSpec = fromjson("{key: {'a.b': 'text'}, textIndexVersion: 2}"); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + spec.scoreDocument(obj, &tfm); + ASSERT_EQUALS(tfm.size(), 1U); + } +} + +/** Test differences across textIndexVersion values in handling of language annotations. */ +TEST(FTSSpec, TextIndexLegacyLanguageRecognition) { + BSONObj obj = fromjson("{a: 'the', language: 'EN'}"); + + // textIndexVersion=1 FTSSpec objects treat two-letter language annotations as "none" + // for purposes of stopword processing. + { + BSONObj indexSpec = fromjson("{key: {'a': 'text'}, textIndexVersion: 1}"); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + spec.scoreDocument(obj, &tfm); + ASSERT_EQUALS(tfm.size(), 1U); // "the" not recognized as stopword + } + + // textIndexVersion=2 FTSSpec objects recognize two-letter codes. + { + BSONObj indexSpec = fromjson("{key: {'a': 'text'}, textIndexVersion: 2}"); + FTSSpec spec(FTSSpec::fixSpec(indexSpec)); + TermFrequencyMap tfm; + spec.scoreDocument(obj, &tfm); + ASSERT_EQUALS(tfm.size(), 0U); // "the" recognized as stopword + } +} +} +} diff --git a/src/mongo/db/fts/fts_util.cpp b/src/mongo/db/fts/fts_util.cpp index f2bd4e50905..85420fc66ad 100644 --- a/src/mongo/db/fts/fts_util.cpp +++ b/src/mongo/db/fts/fts_util.cpp @@ -32,11 +32,9 @@ namespace mongo { - namespace fts { +namespace fts { - const std::string INDEX_NAME = "text"; - const std::string WILDCARD = "$**"; - - } +const std::string INDEX_NAME = "text"; +const std::string WILDCARD = "$**"; +} } - diff --git a/src/mongo/db/fts/fts_util.h b/src/mongo/db/fts/fts_util.h index 5a749b27f53..4e67a5cee2e 100644 --- a/src/mongo/db/fts/fts_util.h +++ b/src/mongo/db/fts/fts_util.h @@ -38,63 +38,59 @@ namespace mongo { - namespace fts { +namespace fts { - extern const std::string WILDCARD; - extern const std::string INDEX_NAME; +extern const std::string WILDCARD; +extern const std::string INDEX_NAME; - enum TextIndexVersion { 
- TEXT_INDEX_VERSION_1 = 1, // Legacy index format. Deprecated. - TEXT_INDEX_VERSION_2 = 2 // Current index format. - }; +enum TextIndexVersion { + TEXT_INDEX_VERSION_1 = 1, // Legacy index format. Deprecated. + TEXT_INDEX_VERSION_2 = 2 // Current index format. +}; - /** - * destructive! - */ - inline void makeLower( std::string* s ) { - std::string::size_type sz = s->size(); - for ( std::string::size_type i = 0; i < sz; i++ ) - (*s)[i] = (char)tolower( (int)(*s)[i] ); - } - - struct _be_hash { - size_t operator()( const BSONElement& e ) const { - return static_cast<size_t>( BSONElementHasher::hash64( e, 17 ) ); - } - }; - - struct _be_equals { - bool operator()( const BSONElement& a, const BSONElement& b ) const { - return a == b; - } - }; - - struct _be_convert { - BSONElement operator()( const BSONObj& o ) const { - const BSONElement& x = o.firstElement(); - BSONElement y( x.rawdata() ); - return y; - } - }; +/** + * destructive! + */ +inline void makeLower(std::string* s) { + std::string::size_type sz = s->size(); + for (std::string::size_type i = 0; i < sz; i++) + (*s)[i] = (char)tolower((int)(*s)[i]); +} - struct _be_convert_other { - BSONObj operator()( const BSONElement& e ) const { - return e.wrap(); - } - }; +struct _be_hash { + size_t operator()(const BSONElement& e) const { + return static_cast<size_t>(BSONElementHasher::hash64(e, 17)); + } +}; - template< typename V > - class BSONElementMap : public UnorderedFastKeyTable<BSONElement, - BSONObj, - V, - _be_hash, - _be_equals, - _be_convert, - _be_convert_other > { - }; +struct _be_equals { + bool operator()(const BSONElement& a, const BSONElement& b) const { + return a == b; + } +}; +struct _be_convert { + BSONElement operator()(const BSONObj& o) const { + const BSONElement& x = o.firstElement(); + BSONElement y(x.rawdata()); + return y; + } +}; +struct _be_convert_other { + BSONObj operator()(const BSONElement& e) const { + return e.wrap(); } +}; + +template <typename V> +class BSONElementMap : public 
UnorderedFastKeyTable<BSONElement, + BSONObj, + V, + _be_hash, + _be_equals, + _be_convert, + _be_convert_other> {}; +} } - diff --git a/src/mongo/db/fts/fts_util_test.cpp b/src/mongo/db/fts/fts_util_test.cpp index 381ed1bf86a..04871a86adf 100644 --- a/src/mongo/db/fts/fts_util_test.cpp +++ b/src/mongo/db/fts/fts_util_test.cpp @@ -34,15 +34,14 @@ #include "mongo/db/fts/fts_util.h" namespace mongo { - namespace fts { +namespace fts { - TEST( BSONElementMap, Simple1 ) { - BSONElementMap<double> m; +TEST(BSONElementMap, Simple1) { + BSONElementMap<double> m; - BSONObj x = BSON( "x" << 5 ); - m[x.firstElement()] = 5; - ASSERT_EQUALS( 5, m[x.firstElement()] ); - } - - } + BSONObj x = BSON("x" << 5); + m[x.firstElement()] = 5; + ASSERT_EQUALS(5, m[x.firstElement()]); +} +} } diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp index dd260f92305..6ce0fdfb538 100644 --- a/src/mongo/db/fts/stemmer.cpp +++ b/src/mongo/db/fts/stemmer.cpp @@ -36,39 +36,36 @@ namespace mongo { - namespace fts { +namespace fts { - using std::string; +using std::string; - Stemmer::Stemmer( const FTSLanguage& language ) { - _stemmer = NULL; - if ( language.str() != "none" ) - _stemmer = sb_stemmer_new(language.str().c_str(), "UTF_8"); - } - - Stemmer::~Stemmer() { - if ( _stemmer ) { - sb_stemmer_delete(_stemmer); - _stemmer = NULL; - } - } - - string Stemmer::stem( const StringData& word ) const { - if ( !_stemmer ) - return word.toString(); +Stemmer::Stemmer(const FTSLanguage& language) { + _stemmer = NULL; + if (language.str() != "none") + _stemmer = sb_stemmer_new(language.str().c_str(), "UTF_8"); +} - const sb_symbol* sb_sym = sb_stemmer_stem( _stemmer, - (const sb_symbol*)word.rawData(), - word.size() ); +Stemmer::~Stemmer() { + if (_stemmer) { + sb_stemmer_delete(_stemmer); + _stemmer = NULL; + } +} - if ( sb_sym == NULL ) { - // out of memory - abort(); - } +string Stemmer::stem(const StringData& word) const { + if (!_stemmer) + return word.toString(); - return string( 
(const char*)(sb_sym), sb_stemmer_length( _stemmer ) ); - } + const sb_symbol* sb_sym = + sb_stemmer_stem(_stemmer, (const sb_symbol*)word.rawData(), word.size()); + if (sb_sym == NULL) { + // out of memory + abort(); } + return string((const char*)(sb_sym), sb_stemmer_length(_stemmer)); +} +} } diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h index fe028e2aba7..8d38af2de97 100644 --- a/src/mongo/db/fts/stemmer.h +++ b/src/mongo/db/fts/stemmer.h @@ -39,22 +39,22 @@ namespace mongo { - namespace fts { - - /** - * maintains case - * but works - * running/Running -> run/Run - */ - class Stemmer { - public: - Stemmer( const FTSLanguage& language ); - ~Stemmer(); - - std::string stem( const StringData& word ) const; - private: - struct sb_stemmer* _stemmer; - }; - } -} +namespace fts { +/** + * maintains case + * but works + * running/Running -> run/Run + */ +class Stemmer { +public: + Stemmer(const FTSLanguage& language); + ~Stemmer(); + + std::string stem(const StringData& word) const; + +private: + struct sb_stemmer* _stemmer; +}; +} +} diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp index 9037715d4da..7ff32b0f7e6 100644 --- a/src/mongo/db/fts/stemmer_test.cpp +++ b/src/mongo/db/fts/stemmer_test.cpp @@ -35,19 +35,18 @@ #include "mongo/db/fts/stemmer.h" namespace mongo { - namespace fts { +namespace fts { - TEST( English, Stemmer1 ) { - Stemmer s( languageEnglishV2 ); - ASSERT_EQUALS( "run", s.stem( "running" ) ); - ASSERT_EQUALS( "Run", s.stem( "Running" ) ); - } - - TEST( English, Caps ) { - Stemmer s( languagePorterV1 ); - ASSERT_EQUALS( "unit", s.stem( "united" ) ); - ASSERT_EQUALS( "Unite", s.stem( "United" ) ); - } +TEST(English, Stemmer1) { + Stemmer s(languageEnglishV2); + ASSERT_EQUALS("run", s.stem("running")); + ASSERT_EQUALS("Run", s.stem("Running")); +} - } +TEST(English, Caps) { + Stemmer s(languagePorterV1); + ASSERT_EQUALS("unit", s.stem("united")); + ASSERT_EQUALS("Unite", s.stem("United")); +} +} 
} diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp index 66240a1ce2d..51f64a16764 100644 --- a/src/mongo/db/fts/stop_words.cpp +++ b/src/mongo/db/fts/stop_words.cpp @@ -38,48 +38,42 @@ #include "mongo/util/string_map.h" - namespace mongo { - using boost::shared_ptr; - - namespace fts { +using boost::shared_ptr; - void loadStopWordMap( StringMap< std::set< std::string > >* m ); +namespace fts { - namespace { - StringMap< boost::shared_ptr<StopWords> > STOP_WORDS; - StopWords empty; - } +void loadStopWordMap(StringMap<std::set<std::string>>* m); +namespace { +StringMap<boost::shared_ptr<StopWords>> STOP_WORDS; +StopWords empty; +} - StopWords::StopWords(){ - } - StopWords::StopWords( const std::set<std::string>& words ) { - for ( std::set<std::string>::const_iterator i = words.begin(); i != words.end(); ++i ) - _words.insert( *i ); - } +StopWords::StopWords() {} - const StopWords* StopWords::getStopWords( const FTSLanguage& language ) { - StringMap< boost::shared_ptr<StopWords> >::const_iterator i = STOP_WORDS.find( language.str() ); - if ( i == STOP_WORDS.end() ) - return &empty; - return i->second.get(); - } +StopWords::StopWords(const std::set<std::string>& words) { + for (std::set<std::string>::const_iterator i = words.begin(); i != words.end(); ++i) + _words.insert(*i); +} +const StopWords* StopWords::getStopWords(const FTSLanguage& language) { + StringMap<boost::shared_ptr<StopWords>>::const_iterator i = STOP_WORDS.find(language.str()); + if (i == STOP_WORDS.end()) + return &empty; + return i->second.get(); +} - MONGO_INITIALIZER(StopWords)(InitializerContext* context) { - StringMap< std::set< std::string > > raw; - loadStopWordMap( &raw ); - for ( StringMap< std::set< std::string > >::const_iterator i = raw.begin(); - i != raw.end(); - ++i ) { - STOP_WORDS[i->first].reset(new StopWords( i->second )); - } - return Status::OK(); - } +MONGO_INITIALIZER(StopWords)(InitializerContext* context) { + StringMap<std::set<std::string>> raw; + 
loadStopWordMap(&raw); + for (StringMap<std::set<std::string>>::const_iterator i = raw.begin(); i != raw.end(); ++i) { + STOP_WORDS[i->first].reset(new StopWords(i->second)); } - + return Status::OK(); +} +} } diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h index 04fb35c3d4d..7bc8de810d2 100644 --- a/src/mongo/db/fts/stop_words.h +++ b/src/mongo/db/fts/stop_words.h @@ -39,24 +39,25 @@ namespace mongo { - namespace fts { +namespace fts { - class StopWords { - public: - StopWords(); - StopWords( const std::set<std::string>& words ); +class StopWords { +public: + StopWords(); + StopWords(const std::set<std::string>& words); - bool isStopWord( const std::string& word ) const { - return _words.count( word ) > 0; - } + bool isStopWord(const std::string& word) const { + return _words.count(word) > 0; + } - size_t numStopWords() const { return _words.size(); } + size_t numStopWords() const { + return _words.size(); + } - static const StopWords* getStopWords( const FTSLanguage& language ); - private: - unordered_set<std::string> _words; - }; + static const StopWords* getStopWords(const FTSLanguage& language); - } +private: + unordered_set<std::string> _words; +}; +} } - diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp index 0edf4e2540c..5835d623497 100644 --- a/src/mongo/db/fts/stop_words_test.cpp +++ b/src/mongo/db/fts/stop_words_test.cpp @@ -33,13 +33,12 @@ #include "mongo/unittest/unittest.h" namespace mongo { - namespace fts { +namespace fts { - TEST( English, Basic1 ) { - const StopWords* englishStopWords = StopWords::getStopWords( languageEnglishV2 ); - ASSERT( englishStopWords->isStopWord( "the" ) ); - ASSERT( !englishStopWords->isStopWord( "computer" ) ); - } - - } +TEST(English, Basic1) { + const StopWords* englishStopWords = StopWords::getStopWords(languageEnglishV2); + ASSERT(englishStopWords->isStopWord("the")); + ASSERT(!englishStopWords->isStopWord("computer")); +} +} } diff --git 
a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp index 53580fca4be..22e67287736 100644 --- a/src/mongo/db/fts/tokenizer.cpp +++ b/src/mongo/db/fts/tokenizer.cpp @@ -36,107 +36,105 @@ namespace mongo { - namespace fts { - - Tokenizer::Tokenizer( const FTSLanguage& language, const StringData& str ) - : _pos(0), _raw( str ) { - _english = ( language.str() == "english" ); - _skipWhitespace(); - _previousWhiteSpace = true; - } - - bool Tokenizer::more() const { - return _pos < _raw.size(); - } - - Token Tokenizer::next() { - if ( _pos >= _raw.size() ) - return Token( Token::INVALID, "", 0, false ); - - unsigned start = _pos++; - Token::Type type = _type( _raw[start] ); - if ( type == Token::WHITESPACE ) abort(); - - if ( type == Token::TEXT ) - while ( _pos < _raw.size() && _type( _raw[_pos] ) == type ) - _pos++; - - StringData ret = _raw.substr( start, _pos - start ); - bool old = _previousWhiteSpace; - _previousWhiteSpace = _skipWhitespace(); - return Token( type, ret, start, old ); - } - - - bool Tokenizer::_skipWhitespace() { - unsigned start = _pos; - while ( _pos < _raw.size() && _type( _raw[_pos] ) == Token::WHITESPACE ) - _pos++; - return _pos > start; - } - - - Token::Type Tokenizer::_type( char c ) const { - switch ( c ) { - case ' ': - case '\f': - case '\v': - case '\t': - case '\r': - case '\n': - return Token::WHITESPACE; - case '\'': - if ( _english ) - return Token::TEXT; - else - return Token::WHITESPACE; - - case '~': - case '`': - - case '!': - case '@': - case '#': - case '$': - case '%': - case '^': - case '&': - case '*': - case '(': - case ')': - - case '-': - - case '=': - case '+': - - case '[': - case ']': - case '{': - case '}': - case '|': - case '\\': - - case ';': - case ':': - - case '"': - - case '<': - case '>': - - case ',': - case '.': - - case '/': - case '?': - - return Token::DELIMITER; - default: +namespace fts { + +Tokenizer::Tokenizer(const FTSLanguage& language, const StringData& str) : _pos(0), _raw(str) { + 
_english = (language.str() == "english"); + _skipWhitespace(); + _previousWhiteSpace = true; +} + +bool Tokenizer::more() const { + return _pos < _raw.size(); +} + +Token Tokenizer::next() { + if (_pos >= _raw.size()) + return Token(Token::INVALID, "", 0, false); + + unsigned start = _pos++; + Token::Type type = _type(_raw[start]); + if (type == Token::WHITESPACE) + abort(); + + if (type == Token::TEXT) + while (_pos < _raw.size() && _type(_raw[_pos]) == type) + _pos++; + + StringData ret = _raw.substr(start, _pos - start); + bool old = _previousWhiteSpace; + _previousWhiteSpace = _skipWhitespace(); + return Token(type, ret, start, old); +} + + +bool Tokenizer::_skipWhitespace() { + unsigned start = _pos; + while (_pos < _raw.size() && _type(_raw[_pos]) == Token::WHITESPACE) + _pos++; + return _pos > start; +} + + +Token::Type Tokenizer::_type(char c) const { + switch (c) { + case ' ': + case '\f': + case '\v': + case '\t': + case '\r': + case '\n': + return Token::WHITESPACE; + case '\'': + if (_english) return Token::TEXT; - } - } + else + return Token::WHITESPACE; - } + case '~': + case '`': + + case '!': + case '@': + case '#': + case '$': + case '%': + case '^': + case '&': + case '*': + case '(': + case ')': + + case '-': + + case '=': + case '+': + + case '[': + case ']': + case '{': + case '}': + case '|': + case '\\': + + case ';': + case ':': + case '"': + + case '<': + case '>': + + case ',': + case '.': + + case '/': + case '?': + + return Token::DELIMITER; + default: + return Token::TEXT; + } +} +} } diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h index 06e63f6aa2f..29f7a164ce3 100644 --- a/src/mongo/db/fts/tokenizer.h +++ b/src/mongo/db/fts/tokenizer.h @@ -40,42 +40,38 @@ namespace mongo { - namespace fts { +namespace fts { - struct Token { - enum Type { WHITESPACE, DELIMITER, TEXT, INVALID }; - Token( Type type, const StringData& data, unsigned offset, bool previousWhiteSpace ) - : type( type ), - data( data ), - offset( 
offset ), - previousWhiteSpace( previousWhiteSpace ) {} +struct Token { + enum Type { WHITESPACE, DELIMITER, TEXT, INVALID }; + Token(Type type, const StringData& data, unsigned offset, bool previousWhiteSpace) + : type(type), data(data), offset(offset), previousWhiteSpace(previousWhiteSpace) {} - bool ok() const { return type != INVALID; } - - Type type; - StringData data; - unsigned offset; - bool previousWhiteSpace; - }; - - class Tokenizer { - public: + bool ok() const { + return type != INVALID; + } - Tokenizer( const FTSLanguage& language, const StringData& str ); + Type type; + StringData data; + unsigned offset; + bool previousWhiteSpace; +}; - bool more() const; - Token next(); +class Tokenizer { +public: + Tokenizer(const FTSLanguage& language, const StringData& str); - private: - Token::Type _type( char c ) const; - bool _skipWhitespace(); + bool more() const; + Token next(); - unsigned _pos; - bool _previousWhiteSpace; - const StringData _raw; - bool _english; - }; +private: + Token::Type _type(char c) const; + bool _skipWhitespace(); - } + unsigned _pos; + bool _previousWhiteSpace; + const StringData _raw; + bool _english; +}; +} } - diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp index 29153a329a6..f75593becb0 100644 --- a/src/mongo/db/fts/tokenizer_test.cpp +++ b/src/mongo/db/fts/tokenizer_test.cpp @@ -33,100 +33,97 @@ #include "mongo/unittest/unittest.h" namespace mongo { - namespace fts { +namespace fts { - TEST( Tokenizer, Empty1 ) { - Tokenizer i( languageEnglishV2, "" ); - ASSERT( !i.more() ); - } - - TEST( Tokenizer, Basic1 ) { - Tokenizer i( languageEnglishV2, "blue red green" ); +TEST(Tokenizer, Empty1) { + Tokenizer i(languageEnglishV2, ""); + ASSERT(!i.more()); +} - ASSERT( i.more() ); - ASSERT_EQUALS( i.next().data.toString(), "blue" ); +TEST(Tokenizer, Basic1) { + Tokenizer i(languageEnglishV2, "blue red green"); - ASSERT( i.more() ); - ASSERT_EQUALS( i.next().data.toString(), "red" ); + 
ASSERT(i.more()); + ASSERT_EQUALS(i.next().data.toString(), "blue"); - ASSERT( i.more() ); - ASSERT_EQUALS( i.next().data.toString(), "green" ); + ASSERT(i.more()); + ASSERT_EQUALS(i.next().data.toString(), "red"); - ASSERT( !i.more() ); - } + ASSERT(i.more()); + ASSERT_EQUALS(i.next().data.toString(), "green"); - TEST( Tokenizer, Basic2 ) { - Tokenizer i( languageEnglishV2, "blue-red" ); + ASSERT(!i.more()); +} - Token a = i.next(); - Token b = i.next(); - Token c = i.next(); - Token d = i.next(); +TEST(Tokenizer, Basic2) { + Tokenizer i(languageEnglishV2, "blue-red"); - ASSERT_EQUALS( Token::TEXT, a.type ); - ASSERT_EQUALS( Token::DELIMITER, b.type ); - ASSERT_EQUALS( Token::TEXT, c.type ); - ASSERT_EQUALS( Token::INVALID, d.type ); + Token a = i.next(); + Token b = i.next(); + Token c = i.next(); + Token d = i.next(); - ASSERT_EQUALS( "blue", a.data.toString() ); - ASSERT_EQUALS( "-", b.data.toString() ); - ASSERT_EQUALS( "red", c.data.toString() ); + ASSERT_EQUALS(Token::TEXT, a.type); + ASSERT_EQUALS(Token::DELIMITER, b.type); + ASSERT_EQUALS(Token::TEXT, c.type); + ASSERT_EQUALS(Token::INVALID, d.type); - ASSERT( a.previousWhiteSpace ); - ASSERT( !b.previousWhiteSpace ); - ASSERT( !c.previousWhiteSpace ); - } + ASSERT_EQUALS("blue", a.data.toString()); + ASSERT_EQUALS("-", b.data.toString()); + ASSERT_EQUALS("red", c.data.toString()); - TEST( Tokenizer, Basic3 ) { - Tokenizer i( languageEnglishV2, "blue -red" ); + ASSERT(a.previousWhiteSpace); + ASSERT(!b.previousWhiteSpace); + ASSERT(!c.previousWhiteSpace); +} - Token a = i.next(); - Token b = i.next(); - Token c = i.next(); - Token d = i.next(); +TEST(Tokenizer, Basic3) { + Tokenizer i(languageEnglishV2, "blue -red"); - ASSERT_EQUALS( Token::TEXT, a.type ); - ASSERT_EQUALS( Token::DELIMITER, b.type ); - ASSERT_EQUALS( Token::TEXT, c.type ); - ASSERT_EQUALS( Token::INVALID, d.type ); + Token a = i.next(); + Token b = i.next(); + Token c = i.next(); + Token d = i.next(); - ASSERT_EQUALS( "blue", 
a.data.toString() ); - ASSERT_EQUALS( "-", b.data.toString() ); - ASSERT_EQUALS( "red", c.data.toString() ); + ASSERT_EQUALS(Token::TEXT, a.type); + ASSERT_EQUALS(Token::DELIMITER, b.type); + ASSERT_EQUALS(Token::TEXT, c.type); + ASSERT_EQUALS(Token::INVALID, d.type); - ASSERT( a.previousWhiteSpace ); - ASSERT( b.previousWhiteSpace ); - ASSERT( !c.previousWhiteSpace ); + ASSERT_EQUALS("blue", a.data.toString()); + ASSERT_EQUALS("-", b.data.toString()); + ASSERT_EQUALS("red", c.data.toString()); + ASSERT(a.previousWhiteSpace); + ASSERT(b.previousWhiteSpace); + ASSERT(!c.previousWhiteSpace); - ASSERT_EQUALS( 0U, a.offset ); - ASSERT_EQUALS( 5U, b.offset ); - ASSERT_EQUALS( 6U, c.offset ); - } - TEST( Tokenizer, Quote1English ) { - Tokenizer i( languageEnglishV2, "eliot's car" ); + ASSERT_EQUALS(0U, a.offset); + ASSERT_EQUALS(5U, b.offset); + ASSERT_EQUALS(6U, c.offset); +} - Token a = i.next(); - Token b = i.next(); +TEST(Tokenizer, Quote1English) { + Tokenizer i(languageEnglishV2, "eliot's car"); - ASSERT_EQUALS( "eliot's", a.data.toString() ); - ASSERT_EQUALS( "car", b.data.toString() ); - } + Token a = i.next(); + Token b = i.next(); - TEST( Tokenizer, Quote1French ) { - Tokenizer i( languageFrenchV2, "eliot's car" ); + ASSERT_EQUALS("eliot's", a.data.toString()); + ASSERT_EQUALS("car", b.data.toString()); +} - Token a = i.next(); - Token b = i.next(); - Token c = i.next(); +TEST(Tokenizer, Quote1French) { + Tokenizer i(languageFrenchV2, "eliot's car"); - ASSERT_EQUALS( "eliot", a.data.toString() ); - ASSERT_EQUALS( "s", b.data.toString() ); - ASSERT_EQUALS( "car", c.data.toString() ); - } + Token a = i.next(); + Token b = i.next(); + Token c = i.next(); - } + ASSERT_EQUALS("eliot", a.data.toString()); + ASSERT_EQUALS("s", b.data.toString()); + ASSERT_EQUALS("car", c.data.toString()); +} +} } - - |