diff options
Diffstat (limited to 'qpid/cpp/src/qpid/broker/SelectorToken.cpp')
-rw-r--r-- | qpid/cpp/src/qpid/broker/SelectorToken.cpp | 298 |
1 files changed, 298 insertions, 0 deletions
diff --git a/qpid/cpp/src/qpid/broker/SelectorToken.cpp b/qpid/cpp/src/qpid/broker/SelectorToken.cpp new file mode 100644 index 0000000000..d69267b2e5 --- /dev/null +++ b/qpid/cpp/src/qpid/broker/SelectorToken.cpp @@ -0,0 +1,298 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +#include "qpid/broker/SelectorToken.h" + +#include <string> +#include <algorithm> +#include <iostream> +#include <cassert> +#include <cctype> + +namespace qpid { +namespace broker { + +// Tokeniserss always take string const_iterators to mark the beginning and end of the string being tokenised +// if the tokenise is successful then the start iterator is advanced, if the tokenise fails then the start +// iterator is unchanged. + +std::ostream& operator<<(std::ostream& os, const Token& t) +{ + os << "T<" << t.type << ", " << t.val << ">"; + return os; +} + +TokenException::TokenException(const std::string& msg) : + range_error(msg) +{} + +// Lexically, reserved words are a subset of identifiers +// so we parse an identifier first then check if it is a reserved word and +// convert it if it is a reserved word +namespace { + +struct RWEntry { + const char* word; + TokenType type; +}; + +inline bool caseless(const char* s1, const char* s2) +{ + do { + char ls1 = std::tolower(*s1); + char ls2 = std::tolower(*s2); + if (ls1<ls2) + return true; + else if (ls1>ls2) + return false; + } while ( *s1++ && *s2++ ); + // Equal + return false; +} + +inline bool operator<(const RWEntry& lhs, const RWEntry& rhs) { + return caseless(lhs.word, rhs.word); +} + +} + +bool tokeniseReservedWord(Token& tok) +{ + // This must be sorted!! + static const RWEntry reserved[] = { + {"and", T_AND}, + {"between", T_BETWEEN}, + {"escape", T_ESCAPE}, + {"false", T_FALSE}, + {"in", T_IN}, + {"is", T_IS}, + {"like", T_LIKE}, + {"not", T_NOT}, + {"null", T_NULL}, + {"or", T_OR}, + {"true", T_TRUE} + }; + + const int reserved_size = sizeof(reserved)/sizeof(RWEntry); + + if ( tok.type != T_IDENTIFIER ) return false; + + RWEntry rw; + rw.word = tok.val.c_str(); + std::pair<const RWEntry*, const RWEntry*> entry = std::equal_range(&reserved[0], &reserved[reserved_size], rw); + + if ( entry.first==entry.second ) return false; + + tok.type = entry.first->type; + return true; +} + +// parsing strings is complicated by the need to allow embedded quotes by doubling the quote character +bool processString(std::string::const_iterator& s, std::string::const_iterator& e, char quoteChar, TokenType type, Token& tok) +{ + // We only get here once the tokeniser recognises the initial quote for a string + // so we don't need to check for it again. + std::string::const_iterator q = std::find(s+1, e, quoteChar); + if ( q==e ) return false; + + std::string content(s+1, q); + ++q; + + while ( q!=e && *q==quoteChar ) { + std::string::const_iterator p = q; + q = std::find(p+1, e, quoteChar); + if ( q==e ) return false; + content += std::string(p, q); + ++q; + } + + tok = Token(type, s, content); + s = q; + return true; +} + +inline bool isIdentifierStart(char c) +{ + return std::isalpha(c) || c=='_' || c=='$'; +} + +inline bool isIdentifierPart(char c) +{ + return std::isalnum(c) || c=='_' || c=='$' || c=='.'; +} + +static const std::string END("<END>"); +bool tokenise(std::string::const_iterator& s, std::string::const_iterator& e, Token& tok) +{ + std::string::const_iterator t = s; + + // Hand constructed state machine recogniser + enum { + START, + REJECT, + IDENTIFIER, + DIGIT, + DECIMAL_START, + DECIMAL, + EXPONENT_SIGN, + EXPONENT_START, + EXPONENT, + ACCEPT_IDENTIFIER, + ACCEPT_INC, + ACCEPT_NOINC + } state = START; + + TokenType tokType = T_EOS; + while (true) + switch (state) { + case START: + if (t==e) {tok = Token(T_EOS, s, END); return true;} + else if (std::isspace(*t)) {++t; ++s; continue;} + else switch (*t) { + case '(': tokType = T_LPAREN; state = ACCEPT_INC; continue; + case ')': tokType = T_RPAREN; state = ACCEPT_INC; continue; + case ',': tokType = T_COMMA; state = ACCEPT_INC; continue; + case '+': tokType = T_PLUS; state = ACCEPT_INC; continue; + case '-': tokType = T_MINUS; state = ACCEPT_INC; continue; + case '*': tokType = T_MULT; state = ACCEPT_INC; continue; + case '/': tokType = T_DIV; state = ACCEPT_INC; continue; + case '=': tokType = T_EQUAL; state = ACCEPT_INC; continue; + case '<': + ++t; + if (t==e || (*t!='>' && *t!='=')) + {tokType = T_LESS; state = ACCEPT_NOINC; continue; } + else + {tokType = (*t=='>') ? T_NEQ : T_LSEQ; state = ACCEPT_INC; continue; } + case '>': + ++t; + if (t==e || *t!='=') + {tokType = T_GRT; state = ACCEPT_NOINC; continue;} + else + {tokType = T_GREQ; state = ACCEPT_INC; continue;} + default: + break; + } + if (isIdentifierStart(*t)) {++t; state = IDENTIFIER;} + else if (*t=='\'') {return processString(s, e, '\'', T_STRING, tok);} + else if (*t=='\"') {return processString(s, e, '\"', T_IDENTIFIER, tok);} + else if (std::isdigit(*t)) {++t; state = DIGIT;} + else if (*t=='.') {++t; state = DECIMAL_START;} + else state = REJECT; + continue; + case IDENTIFIER: + if (t==e) {state = ACCEPT_IDENTIFIER;} + else if (isIdentifierPart(*t)) {++t; state = IDENTIFIER;} + else state = ACCEPT_IDENTIFIER; + continue; + case DECIMAL_START: + if (t==e) {state = REJECT;} + else if (std::isdigit(*t)) {++t; state = DECIMAL;} + else state = REJECT; + continue; + case EXPONENT_SIGN: + if (t==e) {state = REJECT;} + else if (*t=='-' || *t=='+') {++t; state = EXPONENT_START;} + else if (std::isdigit(*t)) {++t; state = EXPONENT;} + else state = REJECT; + continue; + case EXPONENT_START: + if (t==e) {state = REJECT;} + else if (std::isdigit(*t)) {++t; state = EXPONENT;} + else state = REJECT; + continue; + case DIGIT: + if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} + else if (std::isdigit(*t)) {++t; state = DIGIT;} + else if (*t=='.') {++t; state = DECIMAL;} + else if (*t=='e' || *t=='E') {++t; state = EXPONENT_SIGN;} + else {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} + continue; + case DECIMAL: + if (t==e) {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;} + else if (std::isdigit(*t)) {++t; state = DECIMAL;} + else if (*t=='e' || *t=='E') {++t; state = EXPONENT_SIGN;} + else {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;} + continue; + case EXPONENT: + if (t==e) {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;} + else if (std::isdigit(*t)) {++t; state = EXPONENT;} + else {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;} + continue; + case ACCEPT_INC: + ++t; + case ACCEPT_NOINC: + tok = Token(tokType, s, t); + s = t; + return true; + case ACCEPT_IDENTIFIER: + tok = Token(T_IDENTIFIER, s, t); + s = t; + tokeniseReservedWord(tok); + return true; + case REJECT: + return false; + }; +} + +Tokeniser::Tokeniser(const std::string::const_iterator& s, const std::string::const_iterator& e) : + tokp(0), + inStart(s), + inp(s), + inEnd(e) +{ +} + +/** + * Skip any whitespace then look for a token, throwing an exception if no valid token + * is found. + * + * Advance the string iterator past the parsed token on success. On failure the string iterator is + * in an undefined location. + */ +const Token& Tokeniser::nextToken() +{ + if ( tokens.size()>tokp ) return tokens[tokp++]; + + // Don't extend stream of tokens further than the end of stream; + if ( tokp>0 && tokens[tokp-1].type==T_EOS ) return tokens[tokp-1]; + + tokens.push_back(Token()); + Token& tok = tokens[tokp++]; + + if (tokenise(inp, inEnd, tok)) return tok; + + throw TokenException("Found illegal character"); +} + +void Tokeniser::returnTokens(unsigned int n) +{ + assert( n<=tokp ); + tokp-=n; +} + +std::string Tokeniser::remaining() +{ + Token& currentTok = tokens[tokp]; + return std::string(currentTok.tokenStart, inEnd); +} + + +}} |