diff options
Diffstat (limited to 'docutils/readers/python/moduleparser.py')
-rw-r--r-- | docutils/readers/python/moduleparser.py | 760 |
1 files changed, 760 insertions, 0 deletions
diff --git a/docutils/readers/python/moduleparser.py b/docutils/readers/python/moduleparser.py new file mode 100644 index 000000000..03d57c948 --- /dev/null +++ b/docutils/readers/python/moduleparser.py @@ -0,0 +1,760 @@ +# Author: David Goodger +# Contact: goodger@users.sourceforge.net +# Revision: $Revision$ +# Date: $Date$ +# Copyright: This module has been placed in the public domain. + +""" +Parser for Python modules. Requires Python 2.2 or higher. + +The `parse_module()` function takes a module's text and file name, +runs it through the module parser (using compiler.py and tokenize.py) +and produces a parse tree of the source code, using the nodes as found +in pynodes.py. For example, given this module (x.py):: + + # comment + + '''Docstring''' + + '''Additional docstring''' + + __docformat__ = 'reStructuredText' + + a = 1 + '''Attribute docstring''' + + class C(Super): + + '''C's docstring''' + + class_attribute = 1 + '''class_attribute's docstring''' + + def __init__(self, text=None): + '''__init__'s docstring''' + + self.instance_attribute = (text * 7 + + ' whaddyaknow') + '''instance_attribute's docstring''' + + + def f(x, # parameter x + y=a*5, # parameter y + *args): # parameter args + '''f's docstring''' + return [x + item for item in args] + + f.function_attribute = 1 + '''f.function_attribute's docstring''' + +The module parser will produce this module documentation tree:: + + <module_section filename="test data"> + <docstring> + Docstring + <docstring lineno="5"> + Additional docstring + <attribute lineno="7"> + <object_name> + __docformat__ + <expression_value lineno="7"> + 'reStructuredText' + <attribute lineno="9"> + <object_name> + a + <expression_value lineno="9"> + 1 + <docstring lineno="10"> + Attribute docstring + <class_section lineno="12"> + <object_name> + C + <class_base> + Super + <docstring lineno="12"> + C's docstring + <attribute lineno="16"> + <object_name> + class_attribute + <expression_value lineno="16"> + 1 + <docstring lineno="17"> + class_attribute's docstring + <method_section lineno="19"> + <object_name> + __init__ + <docstring lineno="19"> + __init__'s docstring + <parameter_list lineno="19"> + <parameter lineno="19"> + <object_name> + self + <parameter lineno="19"> + <object_name> + text + <parameter_default lineno="19"> + None + <attribute lineno="22"> + <object_name> + self.instance_attribute + <expression_value lineno="22"> + (text * 7 + ' whaddyaknow') + <docstring lineno="24"> + instance_attribute's docstring + <function_section lineno="27"> + <object_name> + f + <docstring lineno="27"> + f's docstring + <parameter_list lineno="27"> + <parameter lineno="27"> + <object_name> + x + <comment> + # parameter x + <parameter lineno="27"> + <object_name> + y + <parameter_default lineno="27"> + a * 5 + <comment> + # parameter y + <parameter excess_positional="1" lineno="27"> + <object_name> + args + <comment> + # parameter args + <attribute lineno="33"> + <object_name> + f.function_attribute + <expression_value lineno="33"> + 1 + <docstring lineno="34"> + f.function_attribute's docstring + +(Comments are not implemented yet.) + +compiler.parse() provides most of what's needed for this doctree, and +"tokenize" can be used to get the rest. We can determine the line +number from the compiler.parse() AST, and the TokenParser.rhs(lineno) +method provides the rest. + +The Docutils Python reader component will transform this module doctree into a +Python-specific Docutils doctree, and then a "stylist transform" will +further transform it into a generic doctree. Namespaces will have to be +compiled for each of the scopes, but I'm not certain at what stage of +processing. + +It's very important to keep all docstring processing out of this, so that it's +a completely generic and not tool-specific. + +:: + +> Why perform all of those transformations? Why not go from the AST to a +> generic doctree? Or, even from the AST to the final output? + +I want the docutils.readers.python.moduleparser.parse_module() function to +produce a standard documentation-oriented tree that can be used by any tool. +We can develop it together without having to compromise on the rest of our +design (i.e., HappyDoc doesn't have to be made to work like Docutils, and +vice-versa). It would be a higher-level version of what compiler.py provides. + +The Python reader component transforms this generic AST into a Python-specific +doctree (it knows about modules, classes, functions, etc.), but this is +specific to Docutils and cannot be used by HappyDoc or others. The stylist +transform does the final layout, converting Python-specific structures +("class" sections, etc.) into a generic doctree using primitives (tables, +sections, lists, etc.). This generic doctree does *not* know about Python +structures any more. The advantage is that this doctree can be handed off to +any of the output writers to create any output format we like. + +The latter two transforms are separate because I want to be able to have +multiple independent layout styles (multiple runtime-selectable "stylist +transforms"). Each of the existing tools (HappyDoc, pydoc, epydoc, Crystal, +etc.) has its own fixed format. I personally don't like the tables-based +format produced by these tools, and I'd like to be able to customize the +format easily. That's the goal of stylist transforms, which are independent +from the Reader component itself. One stylist transform could produce +HappyDoc-like output, another could produce output similar to module docs in +the Python library reference manual, and so on. + +It's for exactly this reason:: + +>> It's very important to keep all docstring processing out of this, so that +>> it's a completely generic and not tool-specific. + +... but it goes past docstring processing. It's also important to keep style +decisions and tool-specific data transforms out of this module parser. + + +Issues +====== + +* At what point should namespaces be computed? Should they be part of the + basic AST produced by the ASTVisitor walk, or generated by another tree + traversal? + +* At what point should a distinction be made between local variables & + instance attributes in __init__ methods? + +* Docstrings are getting their lineno from their parents. Should the + TokenParser find the real line no's? + +* Comments: include them? How and when? Only full-line comments, or + parameter comments too? (See function "f" above for an example.) + +* Module could use more docstrings & refactoring in places. + +""" + +__docformat__ = 'reStructuredText' + +import sys +import compiler +import compiler.ast +import tokenize +import token +from compiler.consts import OP_ASSIGN +from compiler.visitor import ASTVisitor +from types import StringType, UnicodeType, TupleType +from docutils.readers.python import pynodes +from docutils.nodes import Text + + +def parse_module(module_text, filename): + """Return a module documentation tree from `module_text`.""" + ast = compiler.parse(module_text) + token_parser = TokenParser(module_text) + visitor = ModuleVisitor(filename, token_parser) + compiler.walk(ast, visitor, walker=visitor) + return visitor.module + +class BaseVisitor(ASTVisitor): + + def __init__(self, token_parser): + ASTVisitor.__init__(self) + self.token_parser = token_parser + self.context = [] + self.documentable = None + + def default(self, node, *args): + self.documentable = None + #print 'in default (%s)' % node.__class__.__name__ + #ASTVisitor.default(self, node, *args) + + def default_visit(self, node, *args): + #print 'in default_visit (%s)' % node.__class__.__name__ + ASTVisitor.default(self, node, *args) + + +class DocstringVisitor(BaseVisitor): + + def visitDiscard(self, node): + if self.documentable: + self.visit(node.expr) + + def visitConst(self, node): + if self.documentable: + if type(node.value) in (StringType, UnicodeType): + self.documentable.append(make_docstring(node.value, node.lineno)) + else: + self.documentable = None + + def visitStmt(self, node): + self.default_visit(node) + + +class AssignmentVisitor(DocstringVisitor): + + def visitAssign(self, node): + visitor = AttributeVisitor(self.token_parser) + compiler.walk(node, visitor, walker=visitor) + if visitor.attributes: + self.context[-1].extend(visitor.attributes) + if len(visitor.attributes) == 1: + self.documentable = visitor.attributes[0] + else: + self.documentable = None + + +class ModuleVisitor(AssignmentVisitor): + + def __init__(self, filename, token_parser): + AssignmentVisitor.__init__(self, token_parser) + self.filename = filename + self.module = None + + def visitModule(self, node): + self.module = module = pynodes.module_section() + module['filename'] = self.filename + append_docstring(module, node.doc, node.lineno) + self.context.append(module) + self.documentable = module + self.visit(node.node) + self.context.pop() + + def visitImport(self, node): + self.context[-1] += make_import_group(names=node.names, + lineno=node.lineno) + self.documentable = None + + def visitFrom(self, node): + self.context[-1].append( + make_import_group(names=node.names, from_name=node.modname, + lineno=node.lineno)) + self.documentable = None + + def visitFunction(self, node): + visitor = FunctionVisitor(self.token_parser, + function_class=pynodes.function_section) + compiler.walk(node, visitor, walker=visitor) + self.context[-1].append(visitor.function) + + def visitClass(self, node): + visitor = ClassVisitor(self.token_parser) + compiler.walk(node, visitor, walker=visitor) + self.context[-1].append(visitor.klass) + + +class AttributeVisitor(BaseVisitor): + + def __init__(self, token_parser): + BaseVisitor.__init__(self, token_parser) + self.attributes = pynodes.class_attribute_section() + + def visitAssign(self, node): + # Don't visit the expression itself, just the attribute nodes: + for child in node.nodes: + self.dispatch(child) + expression_text = self.token_parser.rhs(node.lineno) + expression = pynodes.expression_value() + expression.append(Text(expression_text)) + for attribute in self.attributes: + attribute.append(expression) + + def visitAssName(self, node): + self.attributes.append(make_attribute(node.name, + lineno=node.lineno)) + + def visitAssTuple(self, node): + attributes = self.attributes + self.attributes = [] + self.default_visit(node) + n = pynodes.attribute_tuple() + n.extend(self.attributes) + n['lineno'] = self.attributes[0]['lineno'] + attributes.append(n) + self.attributes = attributes + #self.attributes.append(att_tuple) + + def visitAssAttr(self, node): + self.default_visit(node, node.attrname) + + def visitGetattr(self, node, suffix): + self.default_visit(node, node.attrname + '.' + suffix) + + def visitName(self, node, suffix): + self.attributes.append(make_attribute(node.name + '.' + suffix, + lineno=node.lineno)) + + +class FunctionVisitor(DocstringVisitor): + + in_function = 0 + + def __init__(self, token_parser, function_class): + DocstringVisitor.__init__(self, token_parser) + self.function_class = function_class + + def visitFunction(self, node): + if self.in_function: + self.documentable = None + # Don't bother with nested function definitions. + return + self.in_function = 1 + self.function = function = make_function_like_section( + name=node.name, + lineno=node.lineno, + doc=node.doc, + function_class=self.function_class) + self.context.append(function) + self.documentable = function + self.parse_parameter_list(node) + self.visit(node.code) + self.context.pop() + + def parse_parameter_list(self, node): + parameters = [] + special = [] + argnames = list(node.argnames) + if node.kwargs: + special.append(make_parameter(argnames[-1], excess_keyword=1)) + argnames.pop() + if node.varargs: + special.append(make_parameter(argnames[-1], + excess_positional=1)) + argnames.pop() + defaults = list(node.defaults) + defaults = [None] * (len(argnames) - len(defaults)) + defaults + function_parameters = self.token_parser.function_parameters( + node.lineno) + #print >>sys.stderr, function_parameters + for argname, default in zip(argnames, defaults): + if type(argname) is TupleType: + parameter = pynodes.parameter_tuple() + for tuplearg in argname: + parameter.append(make_parameter(tuplearg)) + argname = normalize_parameter_name(argname) + else: + parameter = make_parameter(argname) + if default: + n_default = pynodes.parameter_default() + n_default.append(Text(function_parameters[argname])) + parameter.append(n_default) + parameters.append(parameter) + if parameters or special: + special.reverse() + parameters.extend(special) + parameter_list = pynodes.parameter_list() + parameter_list.extend(parameters) + self.function.append(parameter_list) + + +class ClassVisitor(AssignmentVisitor): + + in_class = 0 + + def __init__(self, token_parser): + AssignmentVisitor.__init__(self, token_parser) + self.bases = [] + + def visitClass(self, node): + if self.in_class: + self.documentable = None + # Don't bother with nested class definitions. + return + self.in_class = 1 + #import mypdb as pdb + #pdb.set_trace() + for base in node.bases: + self.visit(base) + self.klass = klass = make_class_section(node.name, self.bases, + doc=node.doc, + lineno=node.lineno) + self.context.append(klass) + self.documentable = klass + self.visit(node.code) + self.context.pop() + + def visitGetattr(self, node, suffix=None): + if suffix: + name = node.attrname + '.' + suffix + else: + name = node.attrname + self.default_visit(node, name) + + def visitName(self, node, suffix=None): + if suffix: + name = node.name + '.' + suffix + else: + name = node.name + self.bases.append(name) + + def visitFunction(self, node): + if node.name == '__init__': + visitor = InitMethodVisitor(self.token_parser, + function_class=pynodes.method_section) + compiler.walk(node, visitor, walker=visitor) + else: + visitor = FunctionVisitor(self.token_parser, + function_class=pynodes.method_section) + compiler.walk(node, visitor, walker=visitor) + self.context[-1].append(visitor.function) + + +class InitMethodVisitor(FunctionVisitor, AssignmentVisitor): pass + + +class TokenParser: + + def __init__(self, text): + self.text = text + '\n\n' + self.lines = self.text.splitlines(1) + self.generator = tokenize.generate_tokens(iter(self.lines).next) + self.next() + + def __iter__(self): + return self + + def next(self): + self.token = self.generator.next() + self.type, self.string, self.start, self.end, self.line = self.token + return self.token + + def goto_line(self, lineno): + while self.start[0] < lineno: + self.next() + return token + + def rhs(self, lineno): + """ + Return a whitespace-normalized expression string from the right-hand + side of an assignment at line `lineno`. + """ + self.goto_line(lineno) + while self.string != '=': + self.next() + self.stack = None + while self.type != token.NEWLINE and self.string != ';': + if self.string == '=' and not self.stack: + self.tokens = [] + self.stack = [] + self._type = None + self._string = None + self._backquote = 0 + else: + self.note_token() + self.next() + self.next() + text = ''.join(self.tokens) + return text.strip() + + closers = {')': '(', ']': '[', '}': '{'} + openers = {'(': 1, '[': 1, '{': 1} + del_ws_prefix = {'.': 1, '=': 1, ')': 1, ']': 1, '}': 1, ':': 1, ',': 1} + no_ws_suffix = {'.': 1, '=': 1, '(': 1, '[': 1, '{': 1} + + def note_token(self): + if self.type == tokenize.NL: + return + del_ws = self.del_ws_prefix.has_key(self.string) + append_ws = not self.no_ws_suffix.has_key(self.string) + if self.openers.has_key(self.string): + self.stack.append(self.string) + if (self._type == token.NAME + or self.closers.has_key(self._string)): + del_ws = 1 + elif self.closers.has_key(self.string): + assert self.stack[-1] == self.closers[self.string] + self.stack.pop() + elif self.string == '`': + if self._backquote: + del_ws = 1 + assert self.stack[-1] == '`' + self.stack.pop() + else: + append_ws = 0 + self.stack.append('`') + self._backquote = not self._backquote + if del_ws and self.tokens and self.tokens[-1] == ' ': + del self.tokens[-1] + self.tokens.append(self.string) + self._type = self.type + self._string = self.string + if append_ws: + self.tokens.append(' ') + + def function_parameters(self, lineno): + """ + Return a dictionary mapping parameters to defaults + (whitespace-normalized strings). + """ + self.goto_line(lineno) + while self.string != 'def': + self.next() + while self.string != '(': + self.next() + name = None + default = None + parameter_tuple = None + self.tokens = [] + parameters = {} + self.stack = [self.string] + self.next() + while 1: + if len(self.stack) == 1: + if parameter_tuple: + # Just encountered ")". + #print >>sys.stderr, 'parameter_tuple: %r' % self.tokens + name = ''.join(self.tokens).strip() + self.tokens = [] + parameter_tuple = None + if self.string in (')', ','): + if name: + if self.tokens: + default_text = ''.join(self.tokens).strip() + else: + default_text = None + parameters[name] = default_text + self.tokens = [] + name = None + default = None + if self.string == ')': + break + elif self.type == token.NAME: + if name and default: + self.note_token() + else: + assert name is None, ( + 'token=%r name=%r parameters=%r stack=%r' + % (self.token, name, parameters, self.stack)) + name = self.string + #print >>sys.stderr, 'name=%r' % name + elif self.string == '=': + assert name is not None, 'token=%r' % (self.token,) + assert default is None, 'token=%r' % (self.token,) + assert self.tokens == [], 'token=%r' % (self.token,) + default = 1 + self._type = None + self._string = None + self._backquote = 0 + elif name: + self.note_token() + elif self.string == '(': + parameter_tuple = 1 + self._type = None + self._string = None + self._backquote = 0 + self.note_token() + else: # ignore these tokens: + assert (self.string in ('*', '**', '\n') + or self.type == tokenize.COMMENT), ( + 'token=%r' % (self.token,)) + else: + self.note_token() + self.next() + return parameters + + +def make_docstring(doc, lineno): + n = pynodes.docstring() + if lineno: + # Really, only module docstrings don't have a line + # (@@: but maybe they should) + n['lineno'] = lineno + n.append(Text(doc)) + return n + +def append_docstring(node, doc, lineno): + if doc: + node.append(make_docstring(doc, lineno)) + +def make_class_section(name, bases, lineno, doc): + n = pynodes.class_section() + n['lineno'] = lineno + n.append(make_object_name(name)) + for base in bases: + b = pynodes.class_base() + b.append(make_object_name(base)) + n.append(b) + append_docstring(n, doc, lineno) + return n + +def make_object_name(name): + n = pynodes.object_name() + n.append(Text(name)) + return n + +def make_function_like_section(name, lineno, doc, function_class): + n = function_class() + n['lineno'] = lineno + n.append(make_object_name(name)) + append_docstring(n, doc, lineno) + return n + +def make_import_group(names, lineno, from_name=None): + n = pynodes.import_group() + n['lineno'] = lineno + if from_name: + n_from = pynodes.import_from() + n_from.append(Text(from_name)) + n.append(n_from) + for name, alias in names: + n_name = pynodes.import_name() + n_name.append(Text(name)) + if alias: + n_alias = pynodes.import_alias() + n_alias.append(Text(alias)) + n_name.append(n_alias) + n.append(n_name) + return n + +def make_class_attribute(name, lineno): + n = pynodes.class_attribute() + n['lineno'] = lineno + n.append(Text(name)) + return n + +def make_attribute(name, lineno): + n = pynodes.attribute() + n['lineno'] = lineno + n.append(make_object_name(name)) + return n + +def make_parameter(name, excess_keyword=0, excess_positional=0): + """ + excess_keyword and excess_positional must be either 1 or 0, and + not both of them can be 1. + """ + n = pynodes.parameter() + n.append(make_object_name(name)) + assert not excess_keyword or not excess_positional + if excess_keyword: + n['excess_keyword'] = 1 + if excess_positional: + n['excess_positional'] = 1 + return n + +def trim_docstring(text): + """ + Trim indentation and blank lines from docstring text & return it. + + See PEP 257. + """ + if not text: + return text + # Convert tabs to spaces (following the normal Python rules) + # and split into a list of lines: + lines = text.expandtabs().splitlines() + # Determine minimum indentation (first line doesn't count): + indent = sys.maxint + for line in lines[1:]: + stripped = line.lstrip() + if stripped: + indent = min(indent, len(line) - len(stripped)) + # Remove indentation (first line is special): + trimmed = [lines[0].strip()] + if indent < sys.maxint: + for line in lines[1:]: + trimmed.append(line[indent:].rstrip()) + # Strip off trailing and leading blank lines: + while trimmed and not trimmed[-1]: + trimmed.pop() + while trimmed and not trimmed[0]: + trimmed.pop(0) + # Return a single string: + return '\n'.join(trimmed) + +def normalize_parameter_name(name): + """ + Converts a tuple like ``('a', ('b', 'c'), 'd')`` into ``'(a, (b, c), d)'`` + """ + if type(name) is TupleType: + return '(%s)' % ', '.join([normalize_parameter_name(n) for n in name]) + else: + return name + +if __name__ == '__main__': + import sys + args = sys.argv[1:] + if args[0] == '-v': + filename = args[1] + module_text = open(filename).read() + ast = compiler.parse(module_text) + visitor = compiler.visitor.ExampleASTVisitor() + compiler.walk(ast, visitor, walker=visitor, verbose=1) + else: + filename = args[0] + content = open(filename).read() + print parse_module(content, filename).pformat() + |