| author | David Beazley <dave@dabeaz.com> | 2020-01-01 09:51:19 -0600 |
|---|---|---|
| committer | David Beazley <dave@dabeaz.com> | 2020-01-01 09:51:19 -0600 |
| commit | 1321375e013425958ea090b55aecae0a4b7face6 (patch) | |
| tree | 9569e16dde14372391142bc0d80ff0f90d72215b /example | |
| parent | f6d78006324079bfe824422b9a21dc91811ecfa2 (diff) | |
| download | ply-1321375e013425958ea090b55aecae0a4b7face6.tar.gz | |
initial year-end cleanup.
Diffstat (limited to 'example')
| -rw-r--r-- | example/cpp/cpp.py | 974 |
| -rw-r--r-- | example/ctokens/ctokens.py | 127 |
2 files changed, 1101 insertions, 0 deletions
diff --git a/example/cpp/cpp.py b/example/cpp/cpp.py
new file mode 100644
index 0000000..50a44a1
--- /dev/null
+++ b/example/cpp/cpp.py
@@ -0,0 +1,974 @@
+# -----------------------------------------------------------------------------
+# ply: cpp.py
+#
+# Copyright (C) 2001-2019
+# David M. Beazley (Dabeaz LLC)
+# All rights reserved.
+#
+# Latest version: https://github.com/dabeaz/ply
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * Neither the name of David Beazley or Dabeaz LLC may be used to
+#   endorse or promote products derived from this software without
+#   specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# -----------------------------------------------------------------------------
+
+# This module implements an ANSI-C style lexical preprocessor for PLY.
+# -----------------------------------------------------------------------------
+from __future__ import generators
+
+import sys
+
+# Some Python 3 compatibility shims
+if sys.version_info.major < 3:
+    STRING_TYPES = (str, unicode)
+else:
+    STRING_TYPES = str
+    xrange = range
+
+# -----------------------------------------------------------------------------
+# Default preprocessor lexer definitions.  These tokens are enough to get
+# a basic preprocessor working.  Other modules may import these if they want
+# -----------------------------------------------------------------------------
+
+tokens = (
+    'CPP_ID', 'CPP_INTEGER', 'CPP_FLOAT', 'CPP_STRING', 'CPP_CHAR', 'CPP_WS', 'CPP_COMMENT1', 'CPP_COMMENT2', 'CPP_POUND', 'CPP_DPOUND'
+)
+
+literals = "+-*/%|&~^<>=!?()[]{}.,;:\\\'\""
+
+# Whitespace
+def t_CPP_WS(t):
+    r'\s+'
+    t.lexer.lineno += t.value.count("\n")
+    return t
+
+t_CPP_POUND = r'\#'
+t_CPP_DPOUND = r'\#\#'
+
+# Identifier
+t_CPP_ID = r'[A-Za-z_][\w_]*'
+
+# Integer literal
+def CPP_INTEGER(t):
+    r'(((((0x)|(0X))[0-9a-fA-F]+)|(\d+))([uU][lL]|[lL][uU]|[uU]|[lL])?)'
+    return t
+
+t_CPP_INTEGER = CPP_INTEGER
+
+# Floating literal
+t_CPP_FLOAT = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'
+
+# String literal
+def t_CPP_STRING(t):
+    r'\"([^\\\n]|(\\(.|\n)))*?\"'
+    t.lexer.lineno += t.value.count("\n")
+    return t
+
+# Character constant 'c' or L'c'
+def t_CPP_CHAR(t):
+    r'(L)?\'([^\\\n]|(\\(.|\n)))*?\''
+    t.lexer.lineno += t.value.count("\n")
+    return t
+
+# Comment
+def t_CPP_COMMENT1(t):
+    r'(/\*(.|\n)*?\*/)'
+    ncr = t.value.count("\n")
+    t.lexer.lineno += ncr
+    # replace with one space or a number of '\n'
+    t.type = 'CPP_WS'; t.value = '\n' * ncr if ncr else ' '
+    return t
+
+# Line comment
+def t_CPP_COMMENT2(t):
+    r'(//.*?(\n|$))'
+    # replace with '\n'
+    t.type = 'CPP_WS'; t.value = '\n'
+    return t
+
+def t_error(t):
+    t.type = t.value[0]
+    t.value = t.value[0]
+    t.lexer.skip(1)
+    return t
+
+import re
+import copy
+import time
+import os.path
+
+# -----------------------------------------------------------------------------
+# trigraph()
+#
+# Given an input string, this function replaces all trigraph sequences.
+# The following mapping is used:
+#
+#     ??=    #
+#     ??/    \
+#     ??'    ^
+#     ??(    [
+#     ??)    ]
+#     ??!    |
+#     ??<    {
+#     ??>    }
+#     ??-    ~
+# -----------------------------------------------------------------------------
+
+_trigraph_pat = re.compile(r'''\?\?[=/\'\(\)\!<>\-]''')
+_trigraph_rep = {
+    '=':'#',
+    '/':'\\',
+    "'":'^',
+    '(':'[',
+    ')':']',
+    '!':'|',
+    '<':'{',
+    '>':'}',
+    '-':'~'
+}
+
+def trigraph(input):
+    return _trigraph_pat.sub(lambda g: _trigraph_rep[g.group()[-1]],input)
+
+# ------------------------------------------------------------------
+# Macro object
+#
+# This object holds information about preprocessor macros
+#
+#    .name      - Macro name (string)
+#    .value     - Macro value (a list of tokens)
+#    .arglist   - List of argument names
+#    .variadic  - Boolean indicating whether or not variadic macro
+#    .vararg    - Name of the variadic parameter
+#
+# When a macro is created, the macro replacement token sequence is
+# pre-scanned and used to create patch lists that are later used
+# during macro expansion
+# ------------------------------------------------------------------
+
+class Macro(object):
+    def __init__(self,name,value,arglist=None,variadic=False):
+        self.name = name
+        self.value = value
+        self.arglist = arglist
+        self.variadic = variadic
+        if variadic:
+            self.vararg = arglist[-1]
+        self.source = None
+
+# ------------------------------------------------------------------
+# Preprocessor object
+#
+# Object representing a preprocessor.  Contains macro definitions,
+# include directories, and other information
+# ------------------------------------------------------------------
+
+class Preprocessor(object):
+    def __init__(self,lexer=None):
+        if lexer is None:
+            import ply.lex as lex; lexer = lex.lexer   # fix: lex was referenced here but never imported at module level
+        self.lexer = lexer
+        self.macros = { }
+        self.path = []
+        self.temp_path = []
+
+        # Probe the lexer for selected tokens
+        self.lexprobe()
+
+        tm = time.localtime()
+        self.define("__DATE__ \"%s\"" % time.strftime("%b %d %Y",tm))
+        self.define("__TIME__ \"%s\"" % time.strftime("%H:%M:%S",tm))
+        self.parser = None
+
+    # -----------------------------------------------------------------------------
+    # tokenize()
+    #
+    # Utility function.  Given a string of text, tokenize it into a list of tokens
+    # -----------------------------------------------------------------------------
+
+    def tokenize(self,text):
+        tokens = []
+        self.lexer.input(text)
+        while True:
+            tok = self.lexer.token()
+            if not tok: break
+            tokens.append(tok)
+        return tokens
+
+    # ---------------------------------------------------------------------
+    # error()
+    #
+    # Report a preprocessor error/warning of some kind
+    # ----------------------------------------------------------------------
+
+    def error(self,file,line,msg):
+        print("%s:%d %s" % (file,line,msg))
+
+    # ----------------------------------------------------------------------
+    # lexprobe()
+    #
+    # This method probes the preprocessor lexer object to discover
+    # the token types of symbols that are important to the preprocessor.
+    # If this works right, the preprocessor will simply "work"
+    # with any suitable lexer regardless of how tokens have been named.
+    # ----------------------------------------------------------------------
+
+    def lexprobe(self):
+
+        # Determine the token type for identifiers
+        self.lexer.input("identifier")
+        tok = self.lexer.token()
+        if not tok or tok.value != "identifier":
+            print("Couldn't determine identifier type")
+        else:
+            self.t_ID = tok.type
+
+        # Determine the token type for integers
+        self.lexer.input("12345")
+        tok = self.lexer.token()
+        if not tok or int(tok.value) != 12345:
+            print("Couldn't determine integer type")
+        else:
+            self.t_INTEGER = tok.type
+            self.t_INTEGER_TYPE = type(tok.value)
+
+        # Determine the token type for strings enclosed in double quotes
+        self.lexer.input("\"filename\"")
+        tok = self.lexer.token()
+        if not tok or tok.value != "\"filename\"":
+            print("Couldn't determine string type")
+        else:
+            self.t_STRING = tok.type
+
+        # Determine the token type for whitespace--if any
+        self.lexer.input(" ")
+        tok = self.lexer.token()
+        if not tok or tok.value != " ":
+            self.t_SPACE = None
+        else:
+            self.t_SPACE = tok.type
+
+        # Determine the token type for newlines
+        self.lexer.input("\n")
+        tok = self.lexer.token()
+        if not tok or tok.value != "\n":
+            self.t_NEWLINE = None
+            print("Couldn't determine token for newlines")
+        else:
+            self.t_NEWLINE = tok.type
+
+        self.t_WS = (self.t_SPACE, self.t_NEWLINE)
+
+        # Check for other characters used by the preprocessor
+        chars = [ '<','>','#','##','\\','(',')',',','.']
+        for c in chars:
+            self.lexer.input(c)
+            tok = self.lexer.token()
+            if not tok or tok.value != c:
+                print("Unable to lex '%s' required for preprocessor" % c)
+
+    # ----------------------------------------------------------------------
+    # add_path()
+    #
+    # Adds a search path to the preprocessor.
+    # ----------------------------------------------------------------------
+
+    def add_path(self,path):
+        self.path.append(path)
+
+    # ----------------------------------------------------------------------
+    # group_lines()
+    #
+    # Given an input string, this function splits it into lines.  Trailing whitespace
+    # is removed.  Any line ending with \ is grouped with the next line.  This
+    # function forms the lowest level of the preprocessor---grouping text into
+    # a line-by-line format.
+    # ----------------------------------------------------------------------
+
+    def group_lines(self,input):
+        lex = self.lexer.clone()
+        lines = [x.rstrip() for x in input.splitlines()]
+        for i in xrange(len(lines)):
+            j = i+1
+            while lines[i].endswith('\\') and (j < len(lines)):
+                lines[i] = lines[i][:-1]+lines[j]
+                lines[j] = ""
+                j += 1
+
+        input = "\n".join(lines)
+        lex.input(input)
+        lex.lineno = 1
+
+        current_line = []
+        while True:
+            tok = lex.token()
+            if not tok:
+                break
+            current_line.append(tok)
+            if tok.type in self.t_WS and '\n' in tok.value:
+                yield current_line
+                current_line = []
+
+        if current_line:
+            yield current_line
+
+    # ----------------------------------------------------------------------
+    # tokenstrip()
+    #
+    # Remove leading/trailing whitespace tokens from a token list
+    # ----------------------------------------------------------------------
+
+    def tokenstrip(self,tokens):
+        i = 0
+        while i < len(tokens) and tokens[i].type in self.t_WS:
+            i += 1
+        del tokens[:i]
+        i = len(tokens)-1
+        while i >= 0 and tokens[i].type in self.t_WS:
+            i -= 1
+        del tokens[i+1:]
+        return tokens
+
+
+    # ----------------------------------------------------------------------
+    # collect_args()
+    #
+    # Collects comma separated arguments from a list of tokens.  The arguments
+    # must be enclosed in parentheses.  Returns a tuple (tokencount,args,positions)
+    # where tokencount is the number of tokens consumed, args is a list of arguments,
+    # and positions is a list of integers containing the starting index of each
+    # argument.  Each argument is represented by a list of tokens.
+    #
+    # When collecting arguments, leading and trailing whitespace is removed
+    # from each argument.
+    #
+    # This function properly handles nested parentheses and commas---these do not
+    # define new arguments.
+    # ----------------------------------------------------------------------
+
+    def collect_args(self,tokenlist):
+        args = []
+        positions = []
+        current_arg = []
+        nesting = 1
+        tokenlen = len(tokenlist)
+
+        # Search for the opening '('.
+        i = 0
+        while (i < tokenlen) and (tokenlist[i].type in self.t_WS):
+            i += 1
+
+        if (i < tokenlen) and (tokenlist[i].value == '('):
+            positions.append(i+1)
+        else:
+            self.error(self.source,tokenlist[0].lineno,"Missing '(' in macro arguments")
+            return 0, [], []
+
+        i += 1
+
+        while i < tokenlen:
+            t = tokenlist[i]
+            if t.value == '(':
+                current_arg.append(t)
+                nesting += 1
+            elif t.value == ')':
+                nesting -= 1
+                if nesting == 0:
+                    if current_arg:
+                        args.append(self.tokenstrip(current_arg))
+                        positions.append(i)
+                    return i+1,args,positions
+                current_arg.append(t)
+            elif t.value == ',' and nesting == 1:
+                args.append(self.tokenstrip(current_arg))
+                positions.append(i+1)
+                current_arg = []
+            else:
+                current_arg.append(t)
+            i += 1
+
+        # Missing end argument
+        self.error(self.source,tokenlist[-1].lineno,"Missing ')' in macro arguments")
+        return 0, [],[]
+
+    # ----------------------------------------------------------------------
+    # macro_prescan()
+    #
+    # Examine the macro value (token sequence) and identify patch points
+    # This is used to speed up macro expansion later on---we'll know
+    # right away where to apply patches to the value to form the expansion
+    # ----------------------------------------------------------------------
+
+    def macro_prescan(self,macro):
+        macro.patch = []              # Standard macro arguments
+        macro.str_patch = []          # String conversion expansion
+        macro.var_comma_patch = []    # Variadic macro comma patch
+        i = 0
+        while i < len(macro.value):
+            if macro.value[i].type == self.t_ID and macro.value[i].value in macro.arglist:
+                argnum = macro.arglist.index(macro.value[i].value)
+                # Conversion of argument to a string
+                if i > 0 and macro.value[i-1].value == '#':
+                    macro.value[i] = copy.copy(macro.value[i])
+                    macro.value[i].type = self.t_STRING
+                    del macro.value[i-1]
+                    macro.str_patch.append((argnum,i-1))
+                    continue
+                # Concatenation
+                elif (i > 0 and macro.value[i-1].value == '##'):
+                    macro.patch.append(('c',argnum,i-1))
+                    del macro.value[i-1]
+                    i -= 1
+                    continue
+                elif ((i+1) < len(macro.value) and macro.value[i+1].value == '##'):
+                    macro.patch.append(('c',argnum,i))
+                    del macro.value[i + 1]
+                    continue
+                # Standard expansion
+                else:
+                    macro.patch.append(('e',argnum,i))
+            elif macro.value[i].value == '##':
+                if macro.variadic and (i > 0) and (macro.value[i-1].value == ',') and \
+                        ((i+1) < len(macro.value)) and (macro.value[i+1].type == self.t_ID) and \
+                        (macro.value[i+1].value == macro.vararg):
+                    macro.var_comma_patch.append(i-1)
+            i += 1
+        macro.patch.sort(key=lambda x: x[2],reverse=True)
+
+    # ----------------------------------------------------------------------
+    # macro_expand_args()
+    #
+    # Given a Macro and list of arguments (each a token list), this method
+    # returns an expanded version of a macro.  The return value is a token sequence
+    # representing the replacement macro tokens
+    # ----------------------------------------------------------------------
+
+    def macro_expand_args(self,macro,args,expanded):
+        # Make a copy of the macro token sequence
+        rep = [copy.copy(_x) for _x in macro.value]
+
+        # Make string expansion patches.  These do not alter the length of the replacement sequence
+
+        str_expansion = {}
+        for argnum, i in macro.str_patch:
+            if argnum not in str_expansion:
+                str_expansion[argnum] = ('"%s"' % "".join([x.value for x in args[argnum]])).replace("\\","\\\\")
+            rep[i] = copy.copy(rep[i])
+            rep[i].value = str_expansion[argnum]
+
+        # Make the variadic macro comma patch.  If the variadic macro argument is empty, we get rid of the comma that precedes it.
+        comma_patch = False
+        if macro.variadic and not args[-1]:
+            for i in macro.var_comma_patch:
+                rep[i] = None
+                comma_patch = True
+
+        # Make all other patches.  The order of these matters.  It is assumed that the patch list
+        # has been sorted in reverse order of patch location since replacements will cause the
+        # size of the replacement sequence to expand from the patch point.
+
+        expanded_args = { }
+        for ptype, argnum, i in macro.patch:
+            # Concatenation.  Argument is left unexpanded
+            if ptype == 'c':
+                rep[i:i+1] = args[argnum]
+            # Normal expansion.  Argument is macro expanded first
+            elif ptype == 'e':
+                if argnum not in expanded_args:
+                    expanded_args[argnum] = self.expand_macros(args[argnum],expanded)
+                rep[i:i+1] = expanded_args[argnum]
+
+        # Get rid of removed comma if necessary
+        if comma_patch:
+            rep = [_i for _i in rep if _i]
+
+        return rep
+
+
+    # ----------------------------------------------------------------------
+    # expand_macros()
+    #
+    # Given a list of tokens, this function performs macro expansion.
+    # The expanded argument is a dictionary that contains macros already
+    # expanded.  This is used to prevent infinite recursion.
+    # ----------------------------------------------------------------------
+
+    def expand_macros(self,tokens,expanded=None):
+        if expanded is None:
+            expanded = {}
+        i = 0
+        while i < len(tokens):
+            t = tokens[i]
+            if t.type == self.t_ID:
+                if t.value in self.macros and t.value not in expanded:
+                    # Yes, we found a macro match
+                    expanded[t.value] = True
+
+                    m = self.macros[t.value]
+                    if not m.arglist:
+                        # A simple macro
+                        ex = self.expand_macros([copy.copy(_x) for _x in m.value],expanded)
+                        for e in ex:
+                            e.lineno = t.lineno
+                        tokens[i:i+1] = ex
+                        i += len(ex)
+                    else:
+                        # A macro with arguments
+                        j = i + 1
+                        while j < len(tokens) and tokens[j].type in self.t_WS:
+                            j += 1
+                        if j < len(tokens) and tokens[j].value == '(':
+                            tokcount,args,positions = self.collect_args(tokens[j:])
+                            if not m.variadic and len(args) != len(m.arglist):
+                                self.error(self.source,t.lineno,"Macro %s requires %d arguments" % (t.value,len(m.arglist)))
+                                i = j + tokcount
+                            elif m.variadic and len(args) < len(m.arglist)-1:
+                                if len(m.arglist) > 2:
+                                    self.error(self.source,t.lineno,"Macro %s must have at least %d arguments" % (t.value, len(m.arglist)-1))
+                                else:
+                                    self.error(self.source,t.lineno,"Macro %s must have at least %d argument" % (t.value, len(m.arglist)-1))
+                                i = j + tokcount
+                            else:
+                                if m.variadic:
+                                    if len(args) == len(m.arglist)-1:
+                                        args.append([])
+                                    else:
+                                        args[len(m.arglist)-1] = tokens[j+positions[len(m.arglist)-1]:j+tokcount-1]
+                                        del args[len(m.arglist):]
+
+                                # Get macro replacement text
+                                rep = self.macro_expand_args(m,args,expanded)
+                                rep = self.expand_macros(rep,expanded)
+                                for r in rep:
+                                    r.lineno = t.lineno
+                                tokens[i:j+tokcount] = rep
+                                i += len(rep)
+                        else:
+                            # This is not a macro.  It is just a word that
+                            # happens to match the name of a macro.  Hence, go to the
+                            # next token.
+                            i += 1
+
+                    del expanded[t.value]
+                    continue
+                elif t.value == '__LINE__':
+                    t.type = self.t_INTEGER
+                    t.value = self.t_INTEGER_TYPE(t.lineno)
+
+            i += 1
+        return tokens
+
+    # ----------------------------------------------------------------------
+    # evalexpr()
+    #
+    # Evaluate an expression token sequence for the purposes of evaluating
+    # integral expressions.
+    # ----------------------------------------------------------------------
+
+    def evalexpr(self,tokens):
+        # tokens = tokenize(line)
+        # Search for defined macros
+        i = 0
+        while i < len(tokens):
+            if tokens[i].type == self.t_ID and tokens[i].value == 'defined':
+                j = i + 1
+                needparen = False
+                result = "0L"
+                while j < len(tokens):
+                    if tokens[j].type in self.t_WS:
+                        j += 1
+                        continue
+                    elif tokens[j].type == self.t_ID:
+                        if tokens[j].value in self.macros:
+                            result = "1L"
+                        else:
+                            result = "0L"
+                        if not needparen: break
+                    elif tokens[j].value == '(':
+                        needparen = True
+                    elif tokens[j].value == ')':
+                        break
+                    else:
+                        self.error(self.source,tokens[i].lineno,"Malformed defined()")
+                    j += 1
+                tokens[i].type = self.t_INTEGER
+                tokens[i].value = self.t_INTEGER_TYPE(result)
+                del tokens[i+1:j+1]
+            i += 1
+        tokens = self.expand_macros(tokens)
+        return self.evalexpr_expanded(tokens)
+
+    # ----------------------------------------------------------------------
+    # evalexpr_expanded()
+    #
+    # Helper for evalexpr that evaluates the expression that had its macros
+    # and defined(...) expressions expanded by evalexpr
+    # ----------------------------------------------------------------------
+
+    def evalexpr_expanded(self, tokens):
+        for i,t in enumerate(tokens):
+            if t.type == self.t_ID:
+                tokens[i] = copy.copy(t)
+                tokens[i].type = self.t_INTEGER
+                tokens[i].value = self.t_INTEGER_TYPE("0")
+            elif t.type == self.t_INTEGER:
+                tokens[i] = copy.copy(t)
+                # Strip off any trailing suffixes
+                tokens[i].value = str(tokens[i].value)
+                while tokens[i].value[-1] not in "0123456789abcdefABCDEF":
+                    tokens[i].value = tokens[i].value[:-1]
+
+        return self.evalexpr_string("".join([str(x.value) for x in tokens]))
+
+    # ----------------------------------------------------------------------
+    # evalexpr_string()
+    #
+    # Helper for evalexpr that evaluates a string expression
+    # This implementation does basic C->python conversion and then uses eval()
+    # ----------------------------------------------------------------------
+    def evalexpr_string(self, expr):
+        expr = expr.replace("&&"," and ")
+        expr = expr.replace("||"," or ")
+        expr = expr.replace("!"," not ")
+        expr = expr.replace(" not ="," !=")
+        try:
+            result = eval(expr)
+        except Exception:
+            self.error(self.source,0,"Couldn't evaluate expression")   # fix: original referenced an undefined 'tokens' variable here
+            result = 0
+        return result
+
+    # ----------------------------------------------------------------------
+    # parsegen()
+    #
+    # Parse an input string.
+    # ----------------------------------------------------------------------
+    def parsegen(self,input,source=None):
+
+        # Replace trigraph sequences
+        t = trigraph(input)
+        lines = self.group_lines(t)
+
+        if not source:
+            source = ""
+
+        self.define("__FILE__ \"%s\"" % source)
+
+        self.source = source
+        chunk = []
+        enable = True
+        iftrigger = False
+        ifstack = []
+
+        for x in lines:
+            for i,tok in enumerate(x):
+                if tok.type not in self.t_WS: break
+            if tok.value == '#':
+                # Preprocessor directive
+
+                # insert necessary whitespace instead of eaten tokens
+                for tok in x:
+                    if tok.type in self.t_WS and '\n' in tok.value:
+                        chunk.append(tok)
+
+                dirtokens = self.tokenstrip(x[i+1:])
+                if dirtokens:
+                    name = dirtokens[0].value
+                    args = self.tokenstrip(dirtokens[1:])
+                else:
+                    name = ""
+                    args = []
+
+                if name == 'define':
+                    if enable:
+                        for tok in self.expand_macros(chunk):
+                            yield tok
+                        chunk = []
+                        self.define(args)
+                elif name == 'include':
+                    if enable:
+                        for tok in self.expand_macros(chunk):
+                            yield tok
+                        chunk = []
+                        oldfile = self.macros['__FILE__']
+                        for tok in self.include(args):
+                            yield tok
+                        self.macros['__FILE__'] = oldfile
+                        self.source = source
+                elif name == 'undef':
+                    if enable:
+                        for tok in self.expand_macros(chunk):
+                            yield tok
+                        chunk = []
+                        self.undef(args)
+                elif name == 'ifdef':
+                    ifstack.append((enable,iftrigger))
+                    if enable:
+                        if not args[0].value in self.macros:
+                            enable = False
+                            iftrigger = False
+                        else:
+                            iftrigger = True
+                elif name == 'ifndef':
+                    ifstack.append((enable,iftrigger))
+                    if enable:
+                        if args[0].value in self.macros:
+                            enable = False
+                            iftrigger = False
+                        else:
+                            iftrigger = True
+                elif name == 'if':
+                    ifstack.append((enable,iftrigger))
+                    if enable:
+                        result = self.evalexpr(args)
+                        if not result:
+                            enable = False
+                            iftrigger = False
+                        else:
+                            iftrigger = True
+                elif name == 'elif':
+                    if ifstack:
+                        if ifstack[-1][0]:            # We only pay attention if outer "if" allows this
+                            if enable:                # If already true, we flip enable False
+                                enable = False
+                            elif not iftrigger:       # If False, but not triggered yet, we'll check expression
+                                result = self.evalexpr(args)
+                                if result:
+                                    enable = True
+                                    iftrigger = True
+                    else:
+                        self.error(self.source,dirtokens[0].lineno,"Misplaced #elif")
+
+                elif name == 'else':
+                    if ifstack:
+                        if ifstack[-1][0]:
+                            if enable:
+                                enable = False
+                            elif not iftrigger:
+                                enable = True
+                                iftrigger = True
+                    else:
+                        self.error(self.source,dirtokens[0].lineno,"Misplaced #else")
+
+                elif name == 'endif':
+                    if ifstack:
+                        enable,iftrigger = ifstack.pop()
+                    else:
+                        self.error(self.source,dirtokens[0].lineno,"Misplaced #endif")
+                else:
+                    # Unknown preprocessor directive
+                    pass
+
+            else:
+                # Normal text
+                if enable:
+                    chunk.extend(x)
+
+        for tok in self.expand_macros(chunk):
+            yield tok
+        chunk = []
+
+    # ----------------------------------------------------------------------
+    # include()
+    #
+    # Implementation of file-inclusion
+    # ----------------------------------------------------------------------
+
+    def include(self,tokens):
+        # Try to extract the filename and then process an include file
+        if not tokens:
+            return
+        if tokens:
+            if tokens[0].value != '<' and tokens[0].type != self.t_STRING:
+                tokens = self.expand_macros(tokens)
+
+            if tokens[0].value == '<':
+                # Include <...>
+                i = 1
+                while i < len(tokens):
+                    if tokens[i].value == '>':
+                        break
+                    i += 1
+                else:
+                    print("Malformed #include <...>")
+                    return
+                filename = "".join([x.value for x in tokens[1:i]])
+                path = self.path + [""] + self.temp_path
+            elif tokens[0].type == self.t_STRING:
+                filename = tokens[0].value[1:-1]
+                path = self.temp_path + [""] + self.path
+            else:
+                print("Malformed #include statement")
+                return
+            for p in path:
+                iname = os.path.join(p,filename)
+                try:
+                    data = self.read_include_file(iname)
+                    dname = os.path.dirname(iname)
+                    if dname:
+                        self.temp_path.insert(0,dname)
+                    for tok in self.parsegen(data,filename):
+                        yield tok
+                    if dname:
+                        del self.temp_path[0]
+                    break
+                except IOError:
+                    pass
+            else:
+                print("Couldn't find '%s'" % filename)
+
+    # ----------------------------------------------------------------------
+    # read_include_file()
+    #
+    # Reads a source file for inclusion using #include
+    # Could be overridden to e.g. customize encoding, limit access to
+    # certain paths on the filesystem, or provide the contents of system
+    # include files
+    # ----------------------------------------------------------------------
+
+    def read_include_file(self, filepath):
+        with open(filepath, 'r', encoding='utf-8', errors='surrogateescape') as file:
+            return file.read()
+
+    # ----------------------------------------------------------------------
+    # define()
+    #
+    # Define a new macro
+    # ----------------------------------------------------------------------
+
+    def define(self,tokens):
+        if isinstance(tokens,STRING_TYPES):
+            tokens = self.tokenize(tokens)
+
+        linetok = tokens
+        try:
+            name = linetok[0]
+            if len(linetok) > 1:
+                mtype = linetok[1]
+            else:
+                mtype = None
+            if not mtype:
+                m = Macro(name.value,[])
+                self.macros[name.value] = m
+            elif mtype.type in self.t_WS:
+                # A normal macro
+                m = Macro(name.value,self.tokenstrip(linetok[2:]))
+                self.macros[name.value] = m
+            elif mtype.value == '(':
+                # A macro with arguments
+                tokcount, args, positions = self.collect_args(linetok[1:])
+                variadic = False
+                for a in args:
+                    if variadic:
+                        print("No more arguments may follow a variadic argument")
+                        break
+                    astr = "".join([str(_i.value) for _i in a])
+                    if astr == "...":
+                        variadic = True
+                        a[0].type = self.t_ID
+                        a[0].value = '__VA_ARGS__'
+                        variadic = True
+                        del a[1:]
+                        continue
+                    elif astr[-3:] == "..." and a[0].type == self.t_ID:
+                        variadic = True
+                        del a[1:]
+                        # If, for some reason, "." is part of the identifier, strip off the name for the purposes
+                        # of macro expansion
+                        if a[0].value[-3:] == '...':
+                            a[0].value = a[0].value[:-3]
+                        continue
+                    if len(a) > 1 or a[0].type != self.t_ID:
+                        print("Invalid macro argument")
+                        break
+                else:
+                    mvalue = self.tokenstrip(linetok[1+tokcount:])
+                    i = 0
+                    while i < len(mvalue):
+                        if i+1 < len(mvalue):
+                            if mvalue[i].type in self.t_WS and mvalue[i+1].value == '##':
+                                del mvalue[i]
+                                continue
+                            elif mvalue[i].value == '##' and mvalue[i+1].type in self.t_WS:
+                                del mvalue[i+1]
+                        i += 1
+                    m = Macro(name.value,mvalue,[x[0].value for x in args],variadic)
+                    self.macro_prescan(m)
+                    self.macros[name.value] = m
+            else:
+                print("Bad macro definition")
+        except LookupError:
+            print("Bad macro definition")
+
+    # ----------------------------------------------------------------------
+    # undef()
+    #
+    # Undefine a macro
+    # ----------------------------------------------------------------------
+
+    def undef(self,tokens):
+        id = tokens[0].value
+        try:
+            del self.macros[id]
+        except LookupError:
+            pass
+
+    # ----------------------------------------------------------------------
+    # parse()
+    #
+    # Parse input text.
+    # ----------------------------------------------------------------------
+    def parse(self,input,source=None,ignore={}):
+        self.ignore = ignore
+        self.parser = self.parsegen(input,source)
+
+    # ----------------------------------------------------------------------
+    # token()
+    #
+    # Method to return individual tokens
+    # ----------------------------------------------------------------------
+    def token(self):
+        try:
+            while True:
+                tok = next(self.parser)
+                if tok.type not in self.ignore: return tok
+        except StopIteration:
+            self.parser = None
+            return None
+
+if __name__ == '__main__':
+    import ply.lex as lex
+    lexer = lex.lex()
+
+    # Run a preprocessor
+    import sys
+    with open(sys.argv[1]) as f:
+        input = f.read()
+
+    p = Preprocessor(lexer)
+    p.parse(input,sys.argv[1])
+    while True:
+        tok = p.token()
+        if not tok: break
+        print(p.source, tok)
diff --git a/example/ctokens/ctokens.py b/example/ctokens/ctokens.py
new file mode 100644
index 0000000..b265e59
--- /dev/null
+++ b/example/ctokens/ctokens.py
@@ -0,0 +1,127 @@
+# ----------------------------------------------------------------------
+# ctokens.py
+#
+# Token specifications for symbols in ANSI C and C++.  This file is
+# meant to be used as a library in other tokenizers.
+# ----------------------------------------------------------------------
+
+# Reserved words
+
+tokens = [
+    # Literals (identifier, integer constant, float constant, string constant, char const)
+    'ID', 'TYPEID', 'INTEGER', 'FLOAT', 'STRING', 'CHARACTER',
+
+    # Operators (+,-,*,/,%,|,&,~,^,<<,>>, ||, &&, !, <, <=, >, >=, ==, !=)
+    'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MODULO',
+    'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
+    'LOR', 'LAND', 'LNOT',
+    'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',
+
+    # Assignment (=, *=, /=, %=, +=, -=, <<=, >>=, &=, ^=, |=)
+    'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL', 'PLUSEQUAL', 'MINUSEQUAL',
+    'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL', 'OREQUAL',
+
+    # Increment/decrement (++,--)
+    'INCREMENT', 'DECREMENT',
+
+    # Structure dereference (->)
+    'ARROW',
+
+    # Ternary operator (?)
+    'TERNARY',
+
+    # Delimiters ( ) [ ] { } , . ; :
+    'LPAREN', 'RPAREN',
+    'LBRACKET', 'RBRACKET',
+    'LBRACE', 'RBRACE',
+    'COMMA', 'PERIOD', 'SEMI', 'COLON',
+
+    # Ellipsis (...)
+    'ELLIPSIS',
+]
+
+# Operators
+t_PLUS = r'\+'
+t_MINUS = r'-'
+t_TIMES = r'\*'
+t_DIVIDE = r'/'
+t_MODULO = r'%'
+t_OR = r'\|'
+t_AND = r'&'
+t_NOT = r'~'
+t_XOR = r'\^'
+t_LSHIFT = r'<<'
+t_RSHIFT = r'>>'
+t_LOR = r'\|\|'
+t_LAND = r'&&'
+t_LNOT = r'!'
+t_LT = r'<'
+t_GT = r'>'
+t_LE = r'<='
+t_GE = r'>='
+t_EQ = r'=='
+t_NE = r'!='
+
+# Assignment operators
+
+t_EQUALS = r'='
+t_TIMESEQUAL = r'\*='
+t_DIVEQUAL = r'/='
+t_MODEQUAL = r'%='
+t_PLUSEQUAL = r'\+='
+t_MINUSEQUAL = r'-='
+t_LSHIFTEQUAL = r'<<='
+t_RSHIFTEQUAL = r'>>='
+t_ANDEQUAL = r'&='
+t_OREQUAL = r'\|='
+t_XOREQUAL = r'\^='
+
+# Increment/decrement
+t_INCREMENT = r'\+\+'
+t_DECREMENT = r'--'
+
+# ->
+t_ARROW = r'->'
+
+# ?
+t_TERNARY = r'\?'
+
+# Delimiters
+t_LPAREN = r'\('
+t_RPAREN = r'\)'
+t_LBRACKET = r'\['
+t_RBRACKET = r'\]'
+t_LBRACE = r'\{'
+t_RBRACE = r'\}'
+t_COMMA = r','
+t_PERIOD = r'\.'
+t_SEMI = r';'
+t_COLON = r':'
+t_ELLIPSIS = r'\.\.\.'
+
+# Identifiers
+t_ID = r'[A-Za-z_][A-Za-z0-9_]*'
+
+# Integer literal
+t_INTEGER = r'\d+([uU]|[lL]|[uU][lL]|[lL][uU])?'
+
+# Floating literal
+t_FLOAT = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'
+
+# String literal
+t_STRING = r'\"([^\\\n]|(\\.))*?\"'
+
+# Character constant 'c' or L'c'
+t_CHARACTER = r'(L)?\'([^\\\n]|(\\.))*?\''
+
+# Comment (C-Style)
+def t_COMMENT(t):
+    r'/\*(.|\n)*?\*/'
+    t.lexer.lineno += t.value.count('\n')
+    return t
+
+# Comment (C++-Style)
+def t_CPPCOMMENT(t):
+    r'//.*\n'
+    t.lexer.lineno += 1
+    return t
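For readers who want to drive the new preprocessor outside of its `__main__` block, here is a minimal sketch. It assumes `ply` is installed and that `example/cpp/cpp.py` is importable as `cpp` (e.g. run from inside `example/cpp/`); the macros and the `demo.c` source name are made up for illustration.

```python
import ply.lex as lex
import cpp   # example/cpp/cpp.py

# Build a lexer from cpp.py's module-level token rules, then feed the
# preprocessor a string containing directives and macro uses.
lexer = lex.lex(module=cpp)
p = cpp.Preprocessor(lexer)
p.add_path(".")   # directory searched for #include "..." files

p.parse("""
#define SQUARE(x) ((x)*(x))
#define WIDE 1
#if WIDE
int n = SQUARE(3);
#endif
""", source="demo.c")

# Drain the token stream; SQUARE(3) comes out as ((3)*(3)).
while True:
    tok = p.token()
    if not tok:
        break
    print(p.source, tok)
```

Note that the stream still contains `CPP_WS` whitespace tokens; passing `ignore={'CPP_WS'}` to `parse()` filters them out via the check in `token()`.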
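`ctokens.py` ships only token specifications, so a tokenizer that reuses it must supply its own glue. The sketch below shows one way to do that, under these assumptions: `ply` is installed, the file is importable as `ctokens`, and the `t_ignore`/`t_error` rules and the extra `COMMENT`/`CPPCOMMENT` entries are added by the importing module (they are not in `ctokens.py` itself, but the two comment rules need matching entries in `tokens`).

```python
import ply.lex as lex
from ctokens import *   # pull in the token list and all t_* specifications

# The comment rules return COMMENT/CPPCOMMENT tokens, so give them entries
# in the token list; also add whitespace skipping and an error rule, both
# of which ctokens.py leaves to the importing tokenizer.
tokens = tokens + ['COMMENT', 'CPPCOMMENT']

t_ignore = ' \t'   # skip spaces and tabs between tokens

def t_error(t):
    print("Illegal character %r" % t.value[0])
    t.lexer.skip(1)

lexer = lex.lex()   # builds the lexer from this module's globals

lexer.input("x += y->f(1, 2.0);  /* update */")
for tok in lexer:
    print(tok.type, tok.value)
```

This split is the point of the file: the operator and literal patterns stay in one shared module, while each consuming lexer decides how to handle whitespace, errors, and comments.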