summaryrefslogtreecommitdiff
path: root/CIAO/CCF/CCF/IDL2/LexicalAnalyzer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'CIAO/CCF/CCF/IDL2/LexicalAnalyzer.cpp')
-rw-r--r--CIAO/CCF/CCF/IDL2/LexicalAnalyzer.cpp964
1 files changed, 964 insertions, 0 deletions
diff --git a/CIAO/CCF/CCF/IDL2/LexicalAnalyzer.cpp b/CIAO/CCF/CCF/IDL2/LexicalAnalyzer.cpp
new file mode 100644
index 00000000000..e357a1a0c28
--- /dev/null
+++ b/CIAO/CCF/CCF/IDL2/LexicalAnalyzer.cpp
@@ -0,0 +1,964 @@
+// file : CCF/IDL2/LexicalAnalyzer.cpp
+// author : Boris Kolpackov <boris@dre.vanderbilt.edu>
+// cvs-id : $Id$
+
+#include "CCF/IDL2/LexicalAnalyzer.hpp"
+
+#include <stdlib.h> // strtol
+#include <iostream>
+
+using std::cerr;
+using std::endl;
+using std::string;
+using std::pair;
+using std::size_t;
+
+namespace CCF
+{
+ namespace IDL2
+ {
+ LexicalAnalyzer::
+ LexicalAnalyzer (CompilerElements::TokenStream<Char>& is)
+ : loc_ ("C"), is_ (is)
+ {
+ // Keywords (alphabetic order).
+ //
+
+ keyword_table_.insert ("abstract" );
+ keyword_table_.insert ("attribute" );
+ keyword_table_.insert ("__binclude" );
+ keyword_table_.insert ("case" );
+ keyword_table_.insert ("const" );
+ keyword_table_.insert ("custom" );
+ keyword_table_.insert ("default" );
+ keyword_table_.insert ("exception" );
+ keyword_table_.insert ("enum" );
+ keyword_table_.insert ("factory" );
+ keyword_table_.insert ("getraises" );
+ keyword_table_.insert ("in" );
+ keyword_table_.insert ("inout" );
+ keyword_table_.insert ("interface" );
+ keyword_table_.insert ("__qinclude" );
+ keyword_table_.insert ("local" );
+ keyword_table_.insert ("module" );
+ keyword_table_.insert ("native" );
+ keyword_table_.insert ("oneway" );
+ keyword_table_.insert ("out" );
+ keyword_table_.insert ("private" );
+ keyword_table_.insert ("public" );
+ keyword_table_.insert ("raises" );
+ keyword_table_.insert ("readonly" );
+ keyword_table_.insert ("sequence" );
+ keyword_table_.insert ("setraises" );
+ keyword_table_.insert ("struct" );
+ keyword_table_.insert ("supports" );
+ keyword_table_.insert ("switch" );
+ keyword_table_.insert ("truncatable");
+ keyword_table_.insert ("typedef" );
+ keyword_table_.insert ("typeid" );
+ keyword_table_.insert ("typeprefix" );
+ keyword_table_.insert ("union" );
+ keyword_table_.insert ("valuetype" );
+
+ // Identifiers (alphabetic order).
+ //
+ // Note: if you are planning to hack something up
+ // in the code below, first make sure you understand
+ // how everything works!
+ //
+
+ IdentifierTreeNode end;
+
+ identifier_tree_["Object" ] = end;
+ identifier_tree_["ValueBase"] = end;
+ identifier_tree_["any" ] = end;
+ identifier_tree_["boolean" ] = end;
+ identifier_tree_["char" ] = end;
+ identifier_tree_["double" ] = end;
+ identifier_tree_["float" ] = end;
+
+ IdentifierTreeNode long_;
+ long_["" ] = end;
+ long_["double"] = end;
+ long_["long" ] = end;
+
+ identifier_tree_["long" ] = long_;
+ identifier_tree_["octet" ] = end;
+ identifier_tree_["short" ] = end;
+ identifier_tree_["string" ] = end;
+
+ IdentifierTreeNode unsigned_long_;
+ unsigned_long_["" ] = end;
+ unsigned_long_["long"] = end;
+
+ IdentifierTreeNode unsigned_;
+ unsigned_["long" ] = unsigned_long_;
+ unsigned_["short" ] = end;
+
+ identifier_tree_["unsigned" ] = unsigned_;
+ identifier_tree_["void" ] = end;
+ identifier_tree_["wchar" ] = end;
+ identifier_tree_["wstring" ] = end;
+
+ // punctuation (pair-matched).
+ //
+ punctuation_table_.insert (":");
+ punctuation_table_.insert (",");
+ punctuation_table_.insert ("{");
+ punctuation_table_.insert ("}");
+ punctuation_table_.insert ("[");
+ punctuation_table_.insert ("]");
+ punctuation_table_.insert ("(");
+ punctuation_table_.insert (")");
+ punctuation_table_.insert ("<");
+ punctuation_table_.insert (">");
+ punctuation_table_.insert (";");
+
+ // operators
+ //
+ operator_table_.insert ("+"); // add
+ operator_table_.insert ("&"); // and
+ operator_table_.insert ("~"); // com
+ operator_table_.insert ("/"); // div
+ operator_table_.insert ("="); // eq
+ operator_table_.insert ("<<"); // lsh
+ operator_table_.insert ("*"); // mul
+ operator_table_.insert ("|"); // or
+ operator_table_.insert ("%"); // rem
+ operator_table_.insert ("-"); // sub
+ operator_table_.insert (">>"); // rsh
+ operator_table_.insert ("^"); // xor
+ }
+
+
+ LexicalAnalyzer::Char LexicalAnalyzer::
+ get ()
+ {
+ if (!ibuffer_.empty ())
+ {
+ Char c = ibuffer_.front ();
+ ibuffer_.pop_front ();
+ return c;
+ }
+ else
+ {
+ return is_.next ();
+ }
+ }
+
+ LexicalAnalyzer::Char LexicalAnalyzer::
+ peek ()
+ {
+ if (ibuffer_.empty ())
+ {
+ ibuffer_.push_back (is_.next ());
+ }
+
+ return ibuffer_.front ();
+ }
+
+ LexicalAnalyzer::Char LexicalAnalyzer::
+ peek_more ()
+ {
+ while (ibuffer_.size () < 2)
+ {
+ ibuffer_.push_back (is_.next ());
+ }
+
+ return ibuffer_.at (1);
+ }
+
+ void LexicalAnalyzer::
+ ret (Char const& c)
+ {
+ ibuffer_.push_front (c);
+ }
+
+
+ TokenPtr LexicalAnalyzer::
+ next ()
+ {
+ while (true) // Recovery loop.
+ {
+ Char c = skip_space (get ());
+
+ if (is_eos (c)) return TokenPtr (new EndOfStream (0));
+
+ TokenPtr token;
+
+ if (character_literal (c, token)) return token;
+
+ if (string_literal (c, token)) return token;
+
+ if (integer_literal (c, token)) return token;
+
+ // Check for identifier after literals because it can be
+ // triggered by wide string prefix (L"...").
+ //
+ if (is_alpha (c) || c == '_' || (c == ':' && peek () == ':'))
+ {
+ return identifier (c);
+ }
+
+ // Check for punctuation after identifier because ':' in
+ // scoped name will trigger.
+ //
+ if (operator_ (c, token)) return token;
+
+ if (punctuation (c, token)) return token;
+
+ cerr << c.line () << ": error: unable to derive any token "
+ << "from \'" << c << "\'" << endl;
+
+
+ //Do some primitive error recovery.
+ //
+ while (c != ';')
+ {
+ c = skip_space (get ());
+ if (is_eos (c)) return TokenPtr (new EndOfStream (0));
+ }
+ }
+ }
+
+ LexicalAnalyzer::Char LexicalAnalyzer::
+ skip_space (Char c)
+ {
+ while (!is_eos (c) && is_space (c)) c = get ();
+
+ return c;
+ }
+
+ bool LexicalAnalyzer::
+ read_simple_identifier (string& lexeme, CharBuffer& buf)
+ {
+ Char c = skip_space (get ());
+
+ buf.push_back (c);
+
+ if (is_eos (c)) return false;
+
+ if (is_alpha (c) || c == '_')
+ {
+ lexeme += c;
+
+ while (true)
+ {
+ c = peek ();
+
+ if (is_eos (c))
+ {
+ cerr << "warning: no new line at the end of file" << endl;
+ break;
+ }
+
+ if (is_alnum (c) || c == '_')
+ {
+ get ();
+ buf.push_back (c);
+ lexeme += c;
+ continue;
+ }
+
+ break;
+ }
+
+ return true;
+ }
+
+ return false;
+ }
+
+
+ bool LexicalAnalyzer::
+ traverse_identifier_tree (string& lexeme, IdentifierTreeNode const& node)
+ {
+ if (node.map_.empty ()) return true;
+
+ CharBuffer buf;
+ string part;
+
+ if (read_simple_identifier (part, buf))
+ {
+ IdentifierTreeNode::PrefixMap::const_iterator i (
+ node.map_.find (part));
+
+ if (i != node.map_.end ())
+ {
+ if (traverse_identifier_tree (part, i->second))
+ {
+ lexeme += " " + part;
+ return true;
+ }
+ }
+ }
+
+ // Return characters to the buffer in case we couldn't
+ // match anything.
+
+ for(;!buf.empty (); buf.pop_back ()) ret (buf.back ());
+
+ // Check is the node.map_ contains empty key which indicates
+ // that what we've got is good enough.
+ //
+ return node.map_.find ("") != node.map_.end ();
+ }
+
+
+ TokenPtr LexicalAnalyzer::
+ identifier (Char c)
+ {
+ unsigned long line (c.line ());
+
+ string lexeme;
+
+ enum
+ {
+ simple,
+ scoped,
+ other
+ } type = simple;
+
+ if (c == ':')
+ {
+ //@@ not checking for eos here
+ if ((c = get ()) != ':')
+ {
+ cerr << "error: " << c.line () << ": \':\' expected."
+ << endl;
+
+ return TokenPtr (new EndOfStream (0));
+ //@@ error handling is lame for lexical analyzer.
+ }
+
+ lexeme = "::";
+ type = scoped;
+ c = get ();
+ }
+
+ // First caracter of an identifier.
+ //
+ if (is_eos (c))
+ {
+ cerr << "error: invalid identifier" << endl;
+ return TokenPtr (new EndOfStream (0));
+ }
+
+ if (is_alpha (c) || c == '_')
+ {
+ lexeme += c;
+ }
+ else
+ {
+ cerr << "error: invalid identifier" << endl;
+ return TokenPtr (new EndOfStream (0));
+ }
+
+ while (true)
+ {
+ c = peek ();
+
+ if (is_eos (c))
+ {
+ cerr << "warning: no new line at the end of file" << endl;
+ break;
+ }
+
+ // cerr << "lexer::identifier: peeking on \'" << c.char_ ()
+ // << "\'; current lexeme \'" << lexeme << "\'"
+ // << endl;
+
+ if (is_alnum (c) || c == '_')
+ {
+ get ();
+ lexeme += c;
+ continue;
+ }
+
+ if (c == ':' && peek_more () == ':')
+ {
+ get ();
+ get ();
+ lexeme += "::";
+ if (type == simple) type = other;
+ continue;
+ }
+
+ break;
+ }
+
+ //cerr << "lexer: found identifier with lexeme \'"
+ // << lexeme << "\'" << endl;
+
+ if (type == simple)
+ {
+ // Check if it's a keyword.
+ {
+ KeywordTable::const_iterator i (keyword_table_.find (lexeme));
+
+ if (i != keyword_table_.end ())
+ {
+ return TokenPtr (new Keyword (*i, line));
+ }
+
+ // This part is tricky. If it's after 6pm then come back
+ // in the morning. In essence I want the same name
+ // ('string' and 'wstring') to be recognized as a keyword
+ // in one case and as an identifier in the other. When
+ // we see 'string' followed by '<' we want it to be a
+ // keyword. If it's just all by itself then we want to treat
+ // it as an identifier (since it is a complete construct
+ // by itself). So here we are going to check for that.
+ //
+
+ if (lexeme == "string" || lexeme == "wstring")
+ {
+ Char c = skip_space (get ());
+ ret (c);
+
+ if (c == '<')
+ {
+ return TokenPtr (new Keyword (lexeme, line));
+ }
+ }
+ }
+
+ // Check if it is a reserved identifier.
+ //
+
+ {
+ IdentifierTreeNode::PrefixMap::const_iterator i (
+ identifier_tree_.map_.find (lexeme));
+
+ if (i != identifier_tree_.map_.end ())
+ {
+ if (traverse_identifier_tree (lexeme, i->second))
+ {
+ return TokenPtr (
+ new SimpleIdentifier (lexeme, line));
+ }
+ else
+ {
+ //@@ error
+ }
+ }
+ }
+
+ // Check if it is a boolean literal.
+ //
+ if (lexeme == "TRUE" || lexeme == "FALSE")
+ {
+ return TokenPtr (new BooleanLiteral (lexeme, line));
+ }
+
+ // Default to SimpleIdentifier.
+ //
+ return TokenPtr (new SimpleIdentifier (lexeme, line));
+ }
+ else if (type == scoped)
+ {
+ return TokenPtr (new ScopedIdentifier (lexeme, line));
+ }
+ else //type == other
+ {
+ return TokenPtr (new Identifier (lexeme, line));
+ }
+ }
+
+ bool LexicalAnalyzer::
+ punctuation (Char c, TokenPtr& token)
+ {
+ unsigned long line (c.line ());
+
+ PunctuationTable::const_iterator i = punctuation_table_.begin ();
+
+ while (true)
+ {
+ for (;i != punctuation_table_.end () && (*i)[0] != c; ++i);
+
+ if (i == punctuation_table_.end ()) return false;
+
+ if (i->size () == 2) // two-character punctuation
+ {
+ Char pc (peek ());
+
+ if (!is_eos (pc) && (*i)[1] == pc)
+ {
+ get ();
+ }
+ else
+ {
+ // Move on to the next candidate.
+ //
+ ++i;
+ continue;
+ }
+ }
+
+ token = TokenPtr (new Punctuation (*i, line));
+ return true;
+ }
+ }
+
+ bool LexicalAnalyzer::
+ operator_ (Char c, TokenPtr& token)
+ {
+ unsigned long line (c.line ());
+
+ OperatorTable::const_iterator i = operator_table_.begin ();
+
+ while (true)
+ {
+ for (;i != operator_table_.end () && (*i)[0] != c; ++i);
+
+ if (i == operator_table_.end ()) return false;
+
+ if (i->size () == 2) // two-character operator
+ {
+ Char pc (peek ());
+
+ if (!is_eos (pc) && (*i)[1] == pc)
+ {
+ get ();
+ }
+ else
+ {
+ // Move on to the next candidate.
+ //
+ ++i;
+ continue;
+ }
+ }
+
+ token = TokenPtr (new Operator (*i, line));
+ return true;
+ }
+ }
+
+ pair<char, size_t> LexicalAnalyzer::
+ scan_char (char const* s) throw (Format)
+ {
+ if (*s == '\0')
+ {
+ throw Format ();
+ }
+ else if (*s != '\\')
+ {
+ return pair<char, size_t> (*s, 1);
+ }
+ else
+ {
+ // Expected size is 2.
+ //
+ pair<char, size_t> r ('\0', 2);
+
+ switch (*++s)
+ {
+ case 'n':
+ r.first = '\n';
+ break;
+
+ case 't':
+ r.first = '\t';
+ break;
+
+ case 'v':
+ r.first = '\v';
+ break;
+
+ case 'b':
+ r.first = '\b';
+ break;
+
+ case 'r':
+ r.first = '\r';
+ break;
+
+ case 'f':
+ r.first = '\f';
+ break;
+
+ case 'a':
+ r.first = '\a';
+ break;
+
+ case '\\':
+ r.first = '\\';
+ break;
+
+ case '?':
+ r.first = '\?';
+ break;
+
+ case '\'':
+ r.first = '\'';
+ break;
+
+ case '"':
+ r.first = '\"';
+ break;
+
+ case 'x':
+ {
+ // hex
+
+ char c (*++s);
+
+ if(is_hex_digit (c))
+ {
+ // Maximum 2 digits.
+ //
+ string holder (s, 2);
+
+ char* end;
+
+ // Cannot fail. -1 < v < 256.
+ //
+ long v (strtol(holder.c_str (), &end, 16));
+
+ r.first = static_cast<char> (v);
+ r.second = 2 + end - holder.c_str ();
+ }
+ else
+ {
+ throw Format ();
+ }
+ break;
+ }
+
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ {
+ // Maximum 3 digits.
+ //
+ string holder (s, 3);
+
+ char* end;
+
+ // Cannot fail.
+ //
+ long v (strtol(holder.c_str (), &end, 8));
+
+ if (v < 0 || v > 255) throw Format ();
+
+ r.first = static_cast<char> (v);
+ r.second = 1 + end - holder.c_str ();
+
+ break;
+ }
+ default:
+ {
+ throw Format ();
+ }
+ }
+
+ return r;
+
+ }
+ }
+
+ string LexicalAnalyzer::
+ scan_string (string const& s) throw (Format)
+ {
+ string r;
+
+ char const* p = s.c_str ();
+
+ while (*p != '\0')
+ {
+ pair<char, size_t> c (scan_char (p));
+
+ if (c.first == '\0') throw Format ();
+
+ r += c.first;
+ p += c.second;
+ }
+
+ return r;
+ }
+
+
+ bool LexicalAnalyzer::
+ character_literal (Char c, TokenPtr& token)
+ {
+ if (c != '\'') return false;
+
+ unsigned long line (c.line ());
+ string lexeme;
+
+ Char prev (c);
+
+ while (true)
+ {
+ c = get ();
+
+ if (is_eos (c))
+ {
+ cerr << "error: end of file while reading character literal"
+ << endl;
+ break;
+ }
+
+ if (c == '\'' && prev != '\\') break;
+
+ lexeme += c;
+ prev = c;
+ }
+
+ try
+ {
+ pair<char, size_t> r (scan_char (lexeme.c_str ()));
+ if (r.second != lexeme.size ()) throw Format ();
+
+ token = TokenPtr (new CharacterLiteral (r.first, lexeme, line));
+ return true;
+ }
+ catch (Format const&)
+ {
+ cerr << "error: invalid character literal format" << endl;
+ return false;
+ }
+ }
+
+ bool LexicalAnalyzer::
+ string_literal (Char c, TokenPtr& token)
+ {
+ if (c != '\"') return false;
+
+ unsigned long line (c.line ());
+ string lexeme;
+ string value;
+
+ try
+ {
+ while (true)
+ {
+ string r (string_literal_trailer ());
+ value += scan_string (r);
+ lexeme += '\"' + r + '\"';
+
+ // Check if there are more strings.
+ //
+
+ c = skip_space (get ());
+
+ if (c != '\"')
+ {
+ ret (c); // put it back
+ break;
+ }
+
+
+ // Add single space as a string separator.
+ //
+ lexeme += " ";
+ }
+
+ // cerr << "string literal: <" << lexeme << ">/<" << value << ">"
+ // << endl;
+
+ token = TokenPtr (new StringLiteral (value, lexeme, line));
+ return true;
+ }
+ catch (Format const&)
+ {
+ cerr << "error: invalid string literal format" << endl;
+ return false;
+ }
+ }
+
+ string LexicalAnalyzer::
+ string_literal_trailer ()
+ {
+ string r;
+
+ Char prev ('\"', 0);
+
+ while (true)
+ {
+ Char c = get ();
+
+ if (is_eos (c))
+ {
+ cerr << "error: end of file while reading string literal" << endl;
+ break;
+ }
+
+ if (c == '\"' && prev != '\\') break;
+
+ r += c;
+ prev = c;
+ }
+
+ return r;
+ }
+
+ unsigned long long LexicalAnalyzer::
+ scan_integer (string const& s, unsigned short base)
+ throw (Format, Boundary)
+ {
+ unsigned long long const max (~0ULL);
+ unsigned long long bound (max / base);
+
+
+ char const* p (s.c_str ());
+
+ // Skip leading 0 if any.
+ //
+ while (*p != '\0' && *p == '0') ++p;
+
+
+ unsigned long long result (0);
+
+ while(*p != '\0')
+ {
+ unsigned short digit;
+
+ char c (to_upper (*p));
+
+ if (is_dec_digit (c))
+ {
+ digit = c - '0';
+ }
+ else if (is_hex_digit (c))
+ {
+ digit = c - 'A' + 10;
+ }
+ else
+ {
+ throw Format ();
+ }
+
+ if (digit > base) throw Format ();
+
+ if (result > bound)
+ {
+ // cerr << "boundary: base: " << base << "; bound: " << std::hex
+ // << bound << "; result: " << std::hex << result << endl;
+
+ throw Boundary ();
+ }
+
+
+ result *= base;
+ result += digit;
+
+ ++p;
+ }
+
+ return result;
+ }
+
+ //@@ need to return unparsed characters for recovery (like in
+ // integer_literal).
+ //
+ bool LexicalAnalyzer::
+ integer_literal (Char c, TokenPtr& token)
+ {
+ try
+ {
+ if (!is_dec_digit (c)) return false;
+
+ unsigned long line (c.line ());
+
+ ret (c); // Temporarily return the character.
+
+ string lexeme, number;
+
+ unsigned short base (10); // assume 10
+
+ // Determine base and get rid of its identifications.
+ //
+ //
+ if (c == '0')
+ {
+ lexeme += c;
+
+ get ();
+
+ Char pc (peek ());
+
+ if (!is_eos (pc))
+ {
+ if (pc == 'x' || pc == 'X')
+ {
+ get ();
+ base = 16;
+ lexeme += pc;
+
+ c = peek ();
+ }
+ else
+ {
+ base = 8;
+ if (!is_oct_digit (pc))
+ {
+ number += c; // this is needed to handle single 0
+ }
+
+ c = pc;
+ }
+ }
+ else
+ {
+ number += c; // this is needed to handle single 0
+ }
+ }
+
+ while (true)
+ {
+ // Make sure c is a legal character.
+ //
+
+ if (is_eos (c)) break;
+
+ if (base == 8 && !is_oct_digit (c))
+ {
+ break;
+ }
+ else if (base == 10 && !is_dec_digit (c))
+ {
+ break;
+ }
+ else if (!is_hex_digit (c))
+ {
+ break;
+ }
+
+ get ();
+
+ lexeme += c;
+ number += c;
+
+ c = peek ();
+ }
+
+ if (number.empty ()) throw Format ();
+
+ unsigned long long value (scan_integer (number, base));
+
+ //cerr << "integer literal: <" << lexeme << ">/<" << number << ">/<"
+ // << value << ">" << endl;
+
+ token = TokenPtr (new IntegerLiteral (value, lexeme, line));
+ return true;
+ }
+ catch (Format const&)
+ {
+ cerr << "error: invalid integer literal format" << endl;
+ return false;
+ }
+ catch (Boundary const&)
+ {
+ cerr << "error: integer literal is too big" << endl;
+ return false;
+ }
+ }
+ }
+}