diff options
Diffstat (limited to 'src/mongo/db/fts/tokenizer.cpp')
-rw-r--r-- | src/mongo/db/fts/tokenizer.cpp | 194 |
1 file changed, 96 insertions, 98 deletions
diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp index 01037a85c8d..e1f595b9a4a 100644 --- a/src/mongo/db/fts/tokenizer.cpp +++ b/src/mongo/db/fts/tokenizer.cpp @@ -36,105 +36,103 @@ namespace mongo { - namespace fts { - - Tokenizer::Tokenizer(const FTSLanguage* language, StringData str) - : _pos(0), _raw( str ) { - _english = ( language->str() == "english" ); - _skipWhitespace(); - } - - bool Tokenizer::more() const { - return _pos < _raw.size(); - } - - Token Tokenizer::next() { - if ( _pos >= _raw.size() ) - return Token(Token::INVALID, "", 0); - - unsigned start = _pos++; - Token::Type type = _type( _raw[start] ); - if ( type == Token::WHITESPACE ) invariant( false ); - - if ( type == Token::TEXT ) - while ( _pos < _raw.size() && _type( _raw[_pos] ) == type ) - _pos++; - - StringData ret = _raw.substr( start, _pos - start ); - _skipWhitespace(); - return Token( type, ret, start ); - } - - - bool Tokenizer::_skipWhitespace() { - unsigned start = _pos; - while ( _pos < _raw.size() && _type( _raw[_pos] ) == Token::WHITESPACE ) - _pos++; - return _pos > start; - } - - - Token::Type Tokenizer::_type( char c ) const { - switch ( c ) { - case ' ': - case '\f': - case '\v': - case '\t': - case '\r': - case '\n': - return Token::WHITESPACE; - case '\'': - if ( _english ) - return Token::TEXT; - else - return Token::WHITESPACE; - - case '~': - case '`': - - case '!': - case '@': - case '#': - case '$': - case '%': - case '^': - case '&': - case '*': - case '(': - case ')': - - case '-': - - case '=': - case '+': - - case '[': - case ']': - case '{': - case '}': - case '|': - case '\\': - - case ';': - case ':': - - case '"': - - case '<': - case '>': - - case ',': - case '.': - - case '/': - case '?': - - return Token::DELIMITER; - default: +namespace fts { + +Tokenizer::Tokenizer(const FTSLanguage* language, StringData str) : _pos(0), _raw(str) { + _english = (language->str() == "english"); + _skipWhitespace(); +} + +bool Tokenizer::more() 
const { + return _pos < _raw.size(); +} + +Token Tokenizer::next() { + if (_pos >= _raw.size()) + return Token(Token::INVALID, "", 0); + + unsigned start = _pos++; + Token::Type type = _type(_raw[start]); + if (type == Token::WHITESPACE) + invariant(false); + + if (type == Token::TEXT) + while (_pos < _raw.size() && _type(_raw[_pos]) == type) + _pos++; + + StringData ret = _raw.substr(start, _pos - start); + _skipWhitespace(); + return Token(type, ret, start); +} + + +bool Tokenizer::_skipWhitespace() { + unsigned start = _pos; + while (_pos < _raw.size() && _type(_raw[_pos]) == Token::WHITESPACE) + _pos++; + return _pos > start; +} + + +Token::Type Tokenizer::_type(char c) const { + switch (c) { + case ' ': + case '\f': + case '\v': + case '\t': + case '\r': + case '\n': + return Token::WHITESPACE; + case '\'': + if (_english) return Token::TEXT; - } - } + else + return Token::WHITESPACE; - } + case '~': + case '`': + + case '!': + case '@': + case '#': + case '$': + case '%': + case '^': + case '&': + case '*': + case '(': + case ')': + + case '-': + + case '=': + case '+': + + case '[': + case ']': + case '{': + case '}': + case '|': + case '\\': + + case ';': + case ':': + case '"': + + case '<': + case '>': + + case ',': + case '.': + + case '/': + case '?': + + return Token::DELIMITER; + default: + return Token::TEXT; + } +} +} } |