diff options
Diffstat (limited to 'CIAO/CCF/CCF/IDL2/LexicalAnalyzer.cpp')
-rw-r--r-- | CIAO/CCF/CCF/IDL2/LexicalAnalyzer.cpp | 964 |
1 files changed, 964 insertions, 0 deletions
diff --git a/CIAO/CCF/CCF/IDL2/LexicalAnalyzer.cpp b/CIAO/CCF/CCF/IDL2/LexicalAnalyzer.cpp new file mode 100644 index 00000000000..e357a1a0c28 --- /dev/null +++ b/CIAO/CCF/CCF/IDL2/LexicalAnalyzer.cpp @@ -0,0 +1,964 @@ +// file : CCF/IDL2/LexicalAnalyzer.cpp +// author : Boris Kolpackov <boris@dre.vanderbilt.edu> +// cvs-id : $Id$ + +#include "CCF/IDL2/LexicalAnalyzer.hpp" + +#include <stdlib.h> // strtol +#include <iostream> + +using std::cerr; +using std::endl; +using std::string; +using std::pair; +using std::size_t; + +namespace CCF +{ + namespace IDL2 + { + LexicalAnalyzer:: + LexicalAnalyzer (CompilerElements::TokenStream<Char>& is) + : loc_ ("C"), is_ (is) + { + // Keywords (alphabetic order). + // + + keyword_table_.insert ("abstract" ); + keyword_table_.insert ("attribute" ); + keyword_table_.insert ("__binclude" ); + keyword_table_.insert ("case" ); + keyword_table_.insert ("const" ); + keyword_table_.insert ("custom" ); + keyword_table_.insert ("default" ); + keyword_table_.insert ("exception" ); + keyword_table_.insert ("enum" ); + keyword_table_.insert ("factory" ); + keyword_table_.insert ("getraises" ); + keyword_table_.insert ("in" ); + keyword_table_.insert ("inout" ); + keyword_table_.insert ("interface" ); + keyword_table_.insert ("__qinclude" ); + keyword_table_.insert ("local" ); + keyword_table_.insert ("module" ); + keyword_table_.insert ("native" ); + keyword_table_.insert ("oneway" ); + keyword_table_.insert ("out" ); + keyword_table_.insert ("private" ); + keyword_table_.insert ("public" ); + keyword_table_.insert ("raises" ); + keyword_table_.insert ("readonly" ); + keyword_table_.insert ("sequence" ); + keyword_table_.insert ("setraises" ); + keyword_table_.insert ("struct" ); + keyword_table_.insert ("supports" ); + keyword_table_.insert ("switch" ); + keyword_table_.insert ("truncatable"); + keyword_table_.insert ("typedef" ); + keyword_table_.insert ("typeid" ); + keyword_table_.insert ("typeprefix" ); + keyword_table_.insert ("union" ); + keyword_table_.insert ("valuetype" ); + + // Identifiers (alphabetic order). + // + // Note: if you are planning to hack something up + // in the code below, first make sure you understand + // how everything works! + // + + IdentifierTreeNode end; + + identifier_tree_["Object" ] = end; + identifier_tree_["ValueBase"] = end; + identifier_tree_["any" ] = end; + identifier_tree_["boolean" ] = end; + identifier_tree_["char" ] = end; + identifier_tree_["double" ] = end; + identifier_tree_["float" ] = end; + + IdentifierTreeNode long_; + long_["" ] = end; + long_["double"] = end; + long_["long" ] = end; + + identifier_tree_["long" ] = long_; + identifier_tree_["octet" ] = end; + identifier_tree_["short" ] = end; + identifier_tree_["string" ] = end; + + IdentifierTreeNode unsigned_long_; + unsigned_long_["" ] = end; + unsigned_long_["long"] = end; + + IdentifierTreeNode unsigned_; + unsigned_["long" ] = unsigned_long_; + unsigned_["short" ] = end; + + identifier_tree_["unsigned" ] = unsigned_; + identifier_tree_["void" ] = end; + identifier_tree_["wchar" ] = end; + identifier_tree_["wstring" ] = end; + + // punctuation (pair-matched). + // + punctuation_table_.insert (":"); + punctuation_table_.insert (","); + punctuation_table_.insert ("{"); + punctuation_table_.insert ("}"); + punctuation_table_.insert ("["); + punctuation_table_.insert ("]"); + punctuation_table_.insert ("("); + punctuation_table_.insert (")"); + punctuation_table_.insert ("<"); + punctuation_table_.insert (">"); + punctuation_table_.insert (";"); + + // operators + // + operator_table_.insert ("+"); // add + operator_table_.insert ("&"); // and + operator_table_.insert ("~"); // com + operator_table_.insert ("/"); // div + operator_table_.insert ("="); // eq + operator_table_.insert ("<<"); // lsh + operator_table_.insert ("*"); // mul + operator_table_.insert ("|"); // or + operator_table_.insert ("%"); // rem + operator_table_.insert ("-"); // sub + operator_table_.insert (">>"); // rsh + operator_table_.insert ("^"); // xor + } + + + LexicalAnalyzer::Char LexicalAnalyzer:: + get () + { + if (!ibuffer_.empty ()) + { + Char c = ibuffer_.front (); + ibuffer_.pop_front (); + return c; + } + else + { + return is_.next (); + } + } + + LexicalAnalyzer::Char LexicalAnalyzer:: + peek () + { + if (ibuffer_.empty ()) + { + ibuffer_.push_back (is_.next ()); + } + + return ibuffer_.front (); + } + + LexicalAnalyzer::Char LexicalAnalyzer:: + peek_more () + { + while (ibuffer_.size () < 2) + { + ibuffer_.push_back (is_.next ()); + } + + return ibuffer_.at (1); + } + + void LexicalAnalyzer:: + ret (Char const& c) + { + ibuffer_.push_front (c); + } + + + TokenPtr LexicalAnalyzer:: + next () + { + while (true) // Recovery loop. + { + Char c = skip_space (get ()); + + if (is_eos (c)) return TokenPtr (new EndOfStream (0)); + + TokenPtr token; + + if (character_literal (c, token)) return token; + + if (string_literal (c, token)) return token; + + if (integer_literal (c, token)) return token; + + // Check for identifier after literals because it can be + // triggered by wide string prefix (L"..."). + // + if (is_alpha (c) || c == '_' || (c == ':' && peek () == ':')) + { + return identifier (c); + } + + // Check for punctuation after identifier because ':' in + // scoped name will trigger. + // + if (operator_ (c, token)) return token; + + if (punctuation (c, token)) return token; + + cerr << c.line () << ": error: unable to derive any token " + << "from \'" << c << "\'" << endl; + + + //Do some primitive error recovery. + // + while (c != ';') + { + c = skip_space (get ()); + if (is_eos (c)) return TokenPtr (new EndOfStream (0)); + } + } + } + + LexicalAnalyzer::Char LexicalAnalyzer:: + skip_space (Char c) + { + while (!is_eos (c) && is_space (c)) c = get (); + + return c; + } + + bool LexicalAnalyzer:: + read_simple_identifier (string& lexeme, CharBuffer& buf) + { + Char c = skip_space (get ()); + + buf.push_back (c); + + if (is_eos (c)) return false; + + if (is_alpha (c) || c == '_') + { + lexeme += c; + + while (true) + { + c = peek (); + + if (is_eos (c)) + { + cerr << "warning: no new line at the end of file" << endl; + break; + } + + if (is_alnum (c) || c == '_') + { + get (); + buf.push_back (c); + lexeme += c; + continue; + } + + break; + } + + return true; + } + + return false; + } + + + bool LexicalAnalyzer:: + traverse_identifier_tree (string& lexeme, IdentifierTreeNode const& node) + { + if (node.map_.empty ()) return true; + + CharBuffer buf; + string part; + + if (read_simple_identifier (part, buf)) + { + IdentifierTreeNode::PrefixMap::const_iterator i ( + node.map_.find (part)); + + if (i != node.map_.end ()) + { + if (traverse_identifier_tree (part, i->second)) + { + lexeme += " " + part; + return true; + } + } + } + + // Return characters to the buffer in case we couldn't + // match anything. + + for(;!buf.empty (); buf.pop_back ()) ret (buf.back ()); + + // Check is the node.map_ contains empty key which indicates + // that what we've got is good enough. + // + return node.map_.find ("") != node.map_.end (); + } + + + TokenPtr LexicalAnalyzer:: + identifier (Char c) + { + unsigned long line (c.line ()); + + string lexeme; + + enum + { + simple, + scoped, + other + } type = simple; + + if (c == ':') + { + //@@ not checking for eos here + if ((c = get ()) != ':') + { + cerr << "error: " << c.line () << ": \':\' expected." + << endl; + + return TokenPtr (new EndOfStream (0)); + //@@ error handling is lame for lexical analyzer. + } + + lexeme = "::"; + type = scoped; + c = get (); + } + + // First caracter of an identifier. + // + if (is_eos (c)) + { + cerr << "error: invalid identifier" << endl; + return TokenPtr (new EndOfStream (0)); + } + + if (is_alpha (c) || c == '_') + { + lexeme += c; + } + else + { + cerr << "error: invalid identifier" << endl; + return TokenPtr (new EndOfStream (0)); + } + + while (true) + { + c = peek (); + + if (is_eos (c)) + { + cerr << "warning: no new line at the end of file" << endl; + break; + } + + // cerr << "lexer::identifier: peeking on \'" << c.char_ () + // << "\'; current lexeme \'" << lexeme << "\'" + // << endl; + + if (is_alnum (c) || c == '_') + { + get (); + lexeme += c; + continue; + } + + if (c == ':' && peek_more () == ':') + { + get (); + get (); + lexeme += "::"; + if (type == simple) type = other; + continue; + } + + break; + } + + //cerr << "lexer: found identifier with lexeme \'" + // << lexeme << "\'" << endl; + + if (type == simple) + { + // Check if it's a keyword. + { + KeywordTable::const_iterator i (keyword_table_.find (lexeme)); + + if (i != keyword_table_.end ()) + { + return TokenPtr (new Keyword (*i, line)); + } + + // This part is tricky. If it's after 6pm then come back + // in the morning. In essence I want the same name + // ('string' and 'wstring') to be recognized as a keyword + // in one case and as an identifier in the other. When + // we see 'string' followed by '<' we want it to be a + // keyword. If it's just all by itself then we want to treat + // it as an identifier (since it is a complete construct + // by itself). So here we are going to check for that. + // + + if (lexeme == "string" || lexeme == "wstring") + { + Char c = skip_space (get ()); + ret (c); + + if (c == '<') + { + return TokenPtr (new Keyword (lexeme, line)); + } + } + } + + // Check if it is a reserved identifier. + // + + { + IdentifierTreeNode::PrefixMap::const_iterator i ( + identifier_tree_.map_.find (lexeme)); + + if (i != identifier_tree_.map_.end ()) + { + if (traverse_identifier_tree (lexeme, i->second)) + { + return TokenPtr ( + new SimpleIdentifier (lexeme, line)); + } + else + { + //@@ error + } + } + } + + // Check if it is a boolean literal. + // + if (lexeme == "TRUE" || lexeme == "FALSE") + { + return TokenPtr (new BooleanLiteral (lexeme, line)); + } + + // Default to SimpleIdentifier. + // + return TokenPtr (new SimpleIdentifier (lexeme, line)); + } + else if (type == scoped) + { + return TokenPtr (new ScopedIdentifier (lexeme, line)); + } + else //type == other + { + return TokenPtr (new Identifier (lexeme, line)); + } + } + + bool LexicalAnalyzer:: + punctuation (Char c, TokenPtr& token) + { + unsigned long line (c.line ()); + + PunctuationTable::const_iterator i = punctuation_table_.begin (); + + while (true) + { + for (;i != punctuation_table_.end () && (*i)[0] != c; ++i); + + if (i == punctuation_table_.end ()) return false; + + if (i->size () == 2) // two-character punctuation + { + Char pc (peek ()); + + if (!is_eos (pc) && (*i)[1] == pc) + { + get (); + } + else + { + // Move on to the next candidate. + // + ++i; + continue; + } + } + + token = TokenPtr (new Punctuation (*i, line)); + return true; + } + } + + bool LexicalAnalyzer:: + operator_ (Char c, TokenPtr& token) + { + unsigned long line (c.line ()); + + OperatorTable::const_iterator i = operator_table_.begin (); + + while (true) + { + for (;i != operator_table_.end () && (*i)[0] != c; ++i); + + if (i == operator_table_.end ()) return false; + + if (i->size () == 2) // two-character operator + { + Char pc (peek ()); + + if (!is_eos (pc) && (*i)[1] == pc) + { + get (); + } + else + { + // Move on to the next candidate. + // + ++i; + continue; + } + } + + token = TokenPtr (new Operator (*i, line)); + return true; + } + } + + pair<char, size_t> LexicalAnalyzer:: + scan_char (char const* s) throw (Format) + { + if (*s == '\0') + { + throw Format (); + } + else if (*s != '\\') + { + return pair<char, size_t> (*s, 1); + } + else + { + // Expected size is 2. + // + pair<char, size_t> r ('\0', 2); + + switch (*++s) + { + case 'n': + r.first = '\n'; + break; + + case 't': + r.first = '\t'; + break; + + case 'v': + r.first = '\v'; + break; + + case 'b': + r.first = '\b'; + break; + + case 'r': + r.first = '\r'; + break; + + case 'f': + r.first = '\f'; + break; + + case 'a': + r.first = '\a'; + break; + + case '\\': + r.first = '\\'; + break; + + case '?': + r.first = '\?'; + break; + + case '\'': + r.first = '\''; + break; + + case '"': + r.first = '\"'; + break; + + case 'x': + { + // hex + + char c (*++s); + + if(is_hex_digit (c)) + { + // Maximum 2 digits. + // + string holder (s, 2); + + char* end; + + // Cannot fail. -1 < v < 256. + // + long v (strtol(holder.c_str (), &end, 16)); + + r.first = static_cast<char> (v); + r.second = 2 + end - holder.c_str (); + } + else + { + throw Format (); + } + break; + } + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + { + // Maximum 3 digits. + // + string holder (s, 3); + + char* end; + + // Cannot fail. + // + long v (strtol(holder.c_str (), &end, 8)); + + if (v < 0 || v > 255) throw Format (); + + r.first = static_cast<char> (v); + r.second = 1 + end - holder.c_str (); + + break; + } + default: + { + throw Format (); + } + } + + return r; + + } + } + + string LexicalAnalyzer:: + scan_string (string const& s) throw (Format) + { + string r; + + char const* p = s.c_str (); + + while (*p != '\0') + { + pair<char, size_t> c (scan_char (p)); + + if (c.first == '\0') throw Format (); + + r += c.first; + p += c.second; + } + + return r; + } + + + bool LexicalAnalyzer:: + character_literal (Char c, TokenPtr& token) + { + if (c != '\'') return false; + + unsigned long line (c.line ()); + string lexeme; + + Char prev (c); + + while (true) + { + c = get (); + + if (is_eos (c)) + { + cerr << "error: end of file while reading character literal" + << endl; + break; + } + + if (c == '\'' && prev != '\\') break; + + lexeme += c; + prev = c; + } + + try + { + pair<char, size_t> r (scan_char (lexeme.c_str ())); + if (r.second != lexeme.size ()) throw Format (); + + token = TokenPtr (new CharacterLiteral (r.first, lexeme, line)); + return true; + } + catch (Format const&) + { + cerr << "error: invalid character literal format" << endl; + return false; + } + } + + bool LexicalAnalyzer:: + string_literal (Char c, TokenPtr& token) + { + if (c != '\"') return false; + + unsigned long line (c.line ()); + string lexeme; + string value; + + try + { + while (true) + { + string r (string_literal_trailer ()); + value += scan_string (r); + lexeme += '\"' + r + '\"'; + + // Check if there are more strings. + // + + c = skip_space (get ()); + + if (c != '\"') + { + ret (c); // put it back + break; + } + + + // Add single space as a string separator. + // + lexeme += " "; + } + + // cerr << "string literal: <" << lexeme << ">/<" << value << ">" + // << endl; + + token = TokenPtr (new StringLiteral (value, lexeme, line)); + return true; + } + catch (Format const&) + { + cerr << "error: invalid string literal format" << endl; + return false; + } + } + + string LexicalAnalyzer:: + string_literal_trailer () + { + string r; + + Char prev ('\"', 0); + + while (true) + { + Char c = get (); + + if (is_eos (c)) + { + cerr << "error: end of file while reading string literal" << endl; + break; + } + + if (c == '\"' && prev != '\\') break; + + r += c; + prev = c; + } + + return r; + } + + unsigned long long LexicalAnalyzer:: + scan_integer (string const& s, unsigned short base) + throw (Format, Boundary) + { + unsigned long long const max (~0ULL); + unsigned long long bound (max / base); + + + char const* p (s.c_str ()); + + // Skip leading 0 if any. + // + while (*p != '\0' && *p == '0') ++p; + + + unsigned long long result (0); + + while(*p != '\0') + { + unsigned short digit; + + char c (to_upper (*p)); + + if (is_dec_digit (c)) + { + digit = c - '0'; + } + else if (is_hex_digit (c)) + { + digit = c - 'A' + 10; + } + else + { + throw Format (); + } + + if (digit > base) throw Format (); + + if (result > bound) + { + // cerr << "boundary: base: " << base << "; bound: " << std::hex + // << bound << "; result: " << std::hex << result << endl; + + throw Boundary (); + } + + + result *= base; + result += digit; + + ++p; + } + + return result; + } + + //@@ need to return unparsed characters for recovery (like in + // integer_literal). + // + bool LexicalAnalyzer:: + integer_literal (Char c, TokenPtr& token) + { + try + { + if (!is_dec_digit (c)) return false; + + unsigned long line (c.line ()); + + ret (c); // Temporarily return the character. + + string lexeme, number; + + unsigned short base (10); // assume 10 + + // Determine base and get rid of its identifications. + // + // + if (c == '0') + { + lexeme += c; + + get (); + + Char pc (peek ()); + + if (!is_eos (pc)) + { + if (pc == 'x' || pc == 'X') + { + get (); + base = 16; + lexeme += pc; + + c = peek (); + } + else + { + base = 8; + if (!is_oct_digit (pc)) + { + number += c; // this is needed to handle single 0 + } + + c = pc; + } + } + else + { + number += c; // this is needed to handle single 0 + } + } + + while (true) + { + // Make sure c is a legal character. + // + + if (is_eos (c)) break; + + if (base == 8 && !is_oct_digit (c)) + { + break; + } + else if (base == 10 && !is_dec_digit (c)) + { + break; + } + else if (!is_hex_digit (c)) + { + break; + } + + get (); + + lexeme += c; + number += c; + + c = peek (); + } + + if (number.empty ()) throw Format (); + + unsigned long long value (scan_integer (number, base)); + + //cerr << "integer literal: <" << lexeme << ">/<" << number << ">/<" + // << value << ">" << endl; + + token = TokenPtr (new IntegerLiteral (value, lexeme, line)); + return true; + } + catch (Format const&) + { + cerr << "error: invalid integer literal format" << endl; + return false; + } + catch (Boundary const&) + { + cerr << "error: integer literal is too big" << endl; + return false; + } + } + } +} |