diff options
author | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2010-12-07 10:42:53 +0000 |
---|---|---|
committer | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2010-12-07 10:42:53 +0000 |
commit | 7524e3eb7861b3c7417cb003b98af51a67cce5b4 (patch) | |
tree | 08b564413ced28845bec03be36066efe35024b21 /sandbox/davidg/docutils/readers/python/moduleparser.py | |
parent | f233f08fe5ba13ed408123b01de5147543dd93fe (diff) | |
download | docutils-7524e3eb7861b3c7417cb003b98af51a67cce5b4.tar.gz |
Move the orphaned python source reader to the sandbox.
The "python" reader is no longer maintained and (in its current form)
does not work with Python 3.
Documentation from RST docstrings can be generated with the Sphinx
"autodoc" extension (http://sphinx.pocoo.org/latest/ext/autodoc.html).
git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk@6495 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
Diffstat (limited to 'sandbox/davidg/docutils/readers/python/moduleparser.py')
-rw-r--r-- | sandbox/davidg/docutils/readers/python/moduleparser.py | 757 |
1 files changed, 757 insertions, 0 deletions
diff --git a/sandbox/davidg/docutils/readers/python/moduleparser.py b/sandbox/davidg/docutils/readers/python/moduleparser.py new file mode 100644 index 000000000..93448d160 --- /dev/null +++ b/sandbox/davidg/docutils/readers/python/moduleparser.py @@ -0,0 +1,757 @@ +# $Id$ +# Author: David Goodger <goodger@python.org> +# Copyright: This module has been placed in the public domain. + +""" +Parser for Python modules. + +The `parse_module()` function takes a module's text and file name, +runs it through the module parser (using compiler.py and tokenize.py) +and produces a parse tree of the source code, using the nodes as found +in pynodes.py. For example, given this module (x.py):: + + # comment + + '''Docstring''' + + '''Additional docstring''' + + __docformat__ = 'reStructuredText' + + a = 1 + '''Attribute docstring''' + + class C(Super): + + '''C's docstring''' + + class_attribute = 1 + '''class_attribute's docstring''' + + def __init__(self, text=None): + '''__init__'s docstring''' + + self.instance_attribute = (text * 7 + + ' whaddyaknow') + '''instance_attribute's docstring''' + + + def f(x, # parameter x + y=a*5, # parameter y + *args): # parameter args + '''f's docstring''' + return [x + item for item in args] + + f.function_attribute = 1 + '''f.function_attribute's docstring''' + +The module parser will produce this module documentation tree:: + + <module_section filename="test data"> + <docstring> + Docstring + <docstring lineno="5"> + Additional docstring + <attribute lineno="7"> + <object_name> + __docformat__ + <expression_value lineno="7"> + 'reStructuredText' + <attribute lineno="9"> + <object_name> + a + <expression_value lineno="9"> + 1 + <docstring lineno="10"> + Attribute docstring + <class_section lineno="12"> + <object_name> + C + <class_base> + Super + <docstring lineno="12"> + C's docstring + <attribute lineno="16"> + <object_name> + class_attribute + <expression_value lineno="16"> + 1 + <docstring lineno="17"> + class_attribute's docstring + <method_section lineno="19"> + <object_name> + __init__ + <docstring lineno="19"> + __init__'s docstring + <parameter_list lineno="19"> + <parameter lineno="19"> + <object_name> + self + <parameter lineno="19"> + <object_name> + text + <parameter_default lineno="19"> + None + <attribute lineno="22"> + <object_name> + self.instance_attribute + <expression_value lineno="22"> + (text * 7 + ' whaddyaknow') + <docstring lineno="24"> + instance_attribute's docstring + <function_section lineno="27"> + <object_name> + f + <docstring lineno="27"> + f's docstring + <parameter_list lineno="27"> + <parameter lineno="27"> + <object_name> + x + <comment> + # parameter x + <parameter lineno="27"> + <object_name> + y + <parameter_default lineno="27"> + a * 5 + <comment> + # parameter y + <parameter excess_positional="1" lineno="27"> + <object_name> + args + <comment> + # parameter args + <attribute lineno="33"> + <object_name> + f.function_attribute + <expression_value lineno="33"> + 1 + <docstring lineno="34"> + f.function_attribute's docstring + +(Comments are not implemented yet.) + +compiler.parse() provides most of what's needed for this doctree, and +"tokenize" can be used to get the rest. We can determine the line +number from the compiler.parse() AST, and the TokenParser.rhs(lineno) +method provides the rest. + +The Docutils Python reader component will transform this module doctree into a +Python-specific Docutils doctree, and then a "stylist transform" will +further transform it into a generic doctree. Namespaces will have to be +compiled for each of the scopes, but I'm not certain at what stage of +processing. + +It's very important to keep all docstring processing out of this, so that it's +a completely generic and not tool-specific. + +:: + +> Why perform all of those transformations? Why not go from the AST to a +> generic doctree? Or, even from the AST to the final output? + +I want the docutils.readers.python.moduleparser.parse_module() function to +produce a standard documentation-oriented tree that can be used by any tool. +We can develop it together without having to compromise on the rest of our +design (i.e., HappyDoc doesn't have to be made to work like Docutils, and +vice-versa). It would be a higher-level version of what compiler.py provides. + +The Python reader component transforms this generic AST into a Python-specific +doctree (it knows about modules, classes, functions, etc.), but this is +specific to Docutils and cannot be used by HappyDoc or others. The stylist +transform does the final layout, converting Python-specific structures +("class" sections, etc.) into a generic doctree using primitives (tables, +sections, lists, etc.). This generic doctree does *not* know about Python +structures any more. The advantage is that this doctree can be handed off to +any of the output writers to create any output format we like. + +The latter two transforms are separate because I want to be able to have +multiple independent layout styles (multiple runtime-selectable "stylist +transforms"). Each of the existing tools (HappyDoc, pydoc, epydoc, Crystal, +etc.) has its own fixed format. I personally don't like the tables-based +format produced by these tools, and I'd like to be able to customize the +format easily. That's the goal of stylist transforms, which are independent +from the Reader component itself. One stylist transform could produce +HappyDoc-like output, another could produce output similar to module docs in +the Python library reference manual, and so on. + +It's for exactly this reason:: + +>> It's very important to keep all docstring processing out of this, so that +>> it's a completely generic and not tool-specific. + +... but it goes past docstring processing. It's also important to keep style +decisions and tool-specific data transforms out of this module parser. + + +Issues +====== + +* At what point should namespaces be computed? Should they be part of the + basic AST produced by the ASTVisitor walk, or generated by another tree + traversal? + +* At what point should a distinction be made between local variables & + instance attributes in __init__ methods? + +* Docstrings are getting their lineno from their parents. Should the + TokenParser find the real line no's? + +* Comments: include them? How and when? Only full-line comments, or + parameter comments too? (See function "f" above for an example.) + +* Module could use more docstrings & refactoring in places. + +""" + +__docformat__ = 'reStructuredText' + +import sys +import compiler +import compiler.ast +import tokenize +import token +from compiler.consts import OP_ASSIGN +from compiler.visitor import ASTVisitor +from docutils.readers.python import pynodes +from docutils.nodes import Text + + +def parse_module(module_text, filename): + """Return a module documentation tree from `module_text`.""" + ast = compiler.parse(module_text) + token_parser = TokenParser(module_text) + visitor = ModuleVisitor(filename, token_parser) + compiler.walk(ast, visitor, walker=visitor) + return visitor.module + +class BaseVisitor(ASTVisitor): + + def __init__(self, token_parser): + ASTVisitor.__init__(self) + self.token_parser = token_parser + self.context = [] + self.documentable = None + + def default(self, node, *args): + self.documentable = None + #print 'in default (%s)' % node.__class__.__name__ + #ASTVisitor.default(self, node, *args) + + def default_visit(self, node, *args): + #print 'in default_visit (%s)' % node.__class__.__name__ + ASTVisitor.default(self, node, *args) + + +class DocstringVisitor(BaseVisitor): + + def visitDiscard(self, node): + if self.documentable: + self.visit(node.expr) + + def visitConst(self, node): + if self.documentable: + if type(node.value) in (str, unicode): + self.documentable.append(make_docstring(node.value, node.lineno)) + else: + self.documentable = None + + def visitStmt(self, node): + self.default_visit(node) + + +class AssignmentVisitor(DocstringVisitor): + + def visitAssign(self, node): + visitor = AttributeVisitor(self.token_parser) + compiler.walk(node, visitor, walker=visitor) + if visitor.attributes: + self.context[-1].extend(visitor.attributes) + if len(visitor.attributes) == 1: + self.documentable = visitor.attributes[0] + else: + self.documentable = None + + +class ModuleVisitor(AssignmentVisitor): + + def __init__(self, filename, token_parser): + AssignmentVisitor.__init__(self, token_parser) + self.filename = filename + self.module = None + + def visitModule(self, node): + self.module = module = pynodes.module_section() + module['filename'] = self.filename + append_docstring(module, node.doc, node.lineno) + self.context.append(module) + self.documentable = module + self.visit(node.node) + self.context.pop() + + def visitImport(self, node): + self.context[-1] += make_import_group(names=node.names, + lineno=node.lineno) + self.documentable = None + + def visitFrom(self, node): + self.context[-1].append( + make_import_group(names=node.names, from_name=node.modname, + lineno=node.lineno)) + self.documentable = None + + def visitFunction(self, node): + visitor = FunctionVisitor(self.token_parser, + function_class=pynodes.function_section) + compiler.walk(node, visitor, walker=visitor) + self.context[-1].append(visitor.function) + + def visitClass(self, node): + visitor = ClassVisitor(self.token_parser) + compiler.walk(node, visitor, walker=visitor) + self.context[-1].append(visitor.klass) + + +class AttributeVisitor(BaseVisitor): + + def __init__(self, token_parser): + BaseVisitor.__init__(self, token_parser) + self.attributes = pynodes.class_attribute_section() + + def visitAssign(self, node): + # Don't visit the expression itself, just the attribute nodes: + for child in node.nodes: + self.dispatch(child) + expression_text = self.token_parser.rhs(node.lineno) + expression = pynodes.expression_value() + expression.append(Text(expression_text)) + for attribute in self.attributes: + attribute.append(expression) + + def visitAssName(self, node): + self.attributes.append(make_attribute(node.name, + lineno=node.lineno)) + + def visitAssTuple(self, node): + attributes = self.attributes + self.attributes = [] + self.default_visit(node) + n = pynodes.attribute_tuple() + n.extend(self.attributes) + n['lineno'] = self.attributes[0]['lineno'] + attributes.append(n) + self.attributes = attributes + #self.attributes.append(att_tuple) + + def visitAssAttr(self, node): + self.default_visit(node, node.attrname) + + def visitGetattr(self, node, suffix): + self.default_visit(node, node.attrname + '.' + suffix) + + def visitName(self, node, suffix): + self.attributes.append(make_attribute(node.name + '.' + suffix, + lineno=node.lineno)) + + +class FunctionVisitor(DocstringVisitor): + + in_function = 0 + + def __init__(self, token_parser, function_class): + DocstringVisitor.__init__(self, token_parser) + self.function_class = function_class + + def visitFunction(self, node): + if self.in_function: + self.documentable = None + # Don't bother with nested function definitions. + return + self.in_function = 1 + self.function = function = make_function_like_section( + name=node.name, + lineno=node.lineno, + doc=node.doc, + function_class=self.function_class) + self.context.append(function) + self.documentable = function + self.parse_parameter_list(node) + self.visit(node.code) + self.context.pop() + + def parse_parameter_list(self, node): + parameters = [] + special = [] + argnames = list(node.argnames) + if node.kwargs: + special.append(make_parameter(argnames[-1], excess_keyword=1)) + argnames.pop() + if node.varargs: + special.append(make_parameter(argnames[-1], + excess_positional=1)) + argnames.pop() + defaults = list(node.defaults) + defaults = [None] * (len(argnames) - len(defaults)) + defaults + function_parameters = self.token_parser.function_parameters( + node.lineno) + #print >>sys.stderr, function_parameters + for argname, default in zip(argnames, defaults): + if type(argname) is tuple: + parameter = pynodes.parameter_tuple() + for tuplearg in argname: + parameter.append(make_parameter(tuplearg)) + argname = normalize_parameter_name(argname) + else: + parameter = make_parameter(argname) + if default: + n_default = pynodes.parameter_default() + n_default.append(Text(function_parameters[argname])) + parameter.append(n_default) + parameters.append(parameter) + if parameters or special: + special.reverse() + parameters.extend(special) + parameter_list = pynodes.parameter_list() + parameter_list.extend(parameters) + self.function.append(parameter_list) + + +class ClassVisitor(AssignmentVisitor): + + in_class = 0 + + def __init__(self, token_parser): + AssignmentVisitor.__init__(self, token_parser) + self.bases = [] + + def visitClass(self, node): + if self.in_class: + self.documentable = None + # Don't bother with nested class definitions. + return + self.in_class = 1 + #import mypdb as pdb + #pdb.set_trace() + for base in node.bases: + self.visit(base) + self.klass = klass = make_class_section(node.name, self.bases, + doc=node.doc, + lineno=node.lineno) + self.context.append(klass) + self.documentable = klass + self.visit(node.code) + self.context.pop() + + def visitGetattr(self, node, suffix=None): + if suffix: + name = node.attrname + '.' + suffix + else: + name = node.attrname + self.default_visit(node, name) + + def visitName(self, node, suffix=None): + if suffix: + name = node.name + '.' + suffix + else: + name = node.name + self.bases.append(name) + + def visitFunction(self, node): + if node.name == '__init__': + visitor = InitMethodVisitor(self.token_parser, + function_class=pynodes.method_section) + compiler.walk(node, visitor, walker=visitor) + else: + visitor = FunctionVisitor(self.token_parser, + function_class=pynodes.method_section) + compiler.walk(node, visitor, walker=visitor) + self.context[-1].append(visitor.function) + + +class InitMethodVisitor(FunctionVisitor, AssignmentVisitor): pass + + +class TokenParser: + + def __init__(self, text): + self.text = text + '\n\n' + self.lines = self.text.splitlines(1) + self.generator = tokenize.generate_tokens(iter(self.lines).next) + self.next() + + def __iter__(self): + return self + + def next(self): + self.token = self.generator.next() + self.type, self.string, self.start, self.end, self.line = self.token + return self.token + + def goto_line(self, lineno): + while self.start[0] < lineno: + self.next() + return token + + def rhs(self, lineno): + """ + Return a whitespace-normalized expression string from the right-hand + side of an assignment at line `lineno`. + """ + self.goto_line(lineno) + while self.string != '=': + self.next() + self.stack = None + while self.type != token.NEWLINE and self.string != ';': + if self.string == '=' and not self.stack: + self.tokens = [] + self.stack = [] + self._type = None + self._string = None + self._backquote = 0 + else: + self.note_token() + self.next() + self.next() + text = ''.join(self.tokens) + return text.strip() + + closers = {')': '(', ']': '[', '}': '{'} + openers = {'(': 1, '[': 1, '{': 1} + del_ws_prefix = {'.': 1, '=': 1, ')': 1, ']': 1, '}': 1, ':': 1, ',': 1} + no_ws_suffix = {'.': 1, '=': 1, '(': 1, '[': 1, '{': 1} + + def note_token(self): + if self.type == tokenize.NL: + return + del_ws = self.string in self.del_ws_prefix + append_ws = self.string not in self.no_ws_suffix + if self.string in self.openers: + self.stack.append(self.string) + if (self._type == token.NAME + or self._string in self.closers): + del_ws = 1 + elif self.string in self.closers: + assert self.stack[-1] == self.closers[self.string] + self.stack.pop() + elif self.string == '`': + if self._backquote: + del_ws = 1 + assert self.stack[-1] == '`' + self.stack.pop() + else: + append_ws = 0 + self.stack.append('`') + self._backquote = not self._backquote + if del_ws and self.tokens and self.tokens[-1] == ' ': + del self.tokens[-1] + self.tokens.append(self.string) + self._type = self.type + self._string = self.string + if append_ws: + self.tokens.append(' ') + + def function_parameters(self, lineno): + """ + Return a dictionary mapping parameters to defaults + (whitespace-normalized strings). + """ + self.goto_line(lineno) + while self.string != 'def': + self.next() + while self.string != '(': + self.next() + name = None + default = None + parameter_tuple = None + self.tokens = [] + parameters = {} + self.stack = [self.string] + self.next() + while 1: + if len(self.stack) == 1: + if parameter_tuple: + # Just encountered ")". + #print >>sys.stderr, 'parameter_tuple: %r' % self.tokens + name = ''.join(self.tokens).strip() + self.tokens = [] + parameter_tuple = None + if self.string in (')', ','): + if name: + if self.tokens: + default_text = ''.join(self.tokens).strip() + else: + default_text = None + parameters[name] = default_text + self.tokens = [] + name = None + default = None + if self.string == ')': + break + elif self.type == token.NAME: + if name and default: + self.note_token() + else: + assert name is None, ( + 'token=%r name=%r parameters=%r stack=%r' + % (self.token, name, parameters, self.stack)) + name = self.string + #print >>sys.stderr, 'name=%r' % name + elif self.string == '=': + assert name is not None, 'token=%r' % (self.token,) + assert default is None, 'token=%r' % (self.token,) + assert self.tokens == [], 'token=%r' % (self.token,) + default = 1 + self._type = None + self._string = None + self._backquote = 0 + elif name: + self.note_token() + elif self.string == '(': + parameter_tuple = 1 + self._type = None + self._string = None + self._backquote = 0 + self.note_token() + else: # ignore these tokens: + assert (self.string in ('*', '**', '\n') + or self.type == tokenize.COMMENT), ( + 'token=%r' % (self.token,)) + else: + self.note_token() + self.next() + return parameters + + +def make_docstring(doc, lineno): + n = pynodes.docstring() + if lineno: + # Really, only module docstrings don't have a line + # (@@: but maybe they should) + n['lineno'] = lineno + n.append(Text(doc)) + return n + +def append_docstring(node, doc, lineno): + if doc: + node.append(make_docstring(doc, lineno)) + +def make_class_section(name, bases, lineno, doc): + n = pynodes.class_section() + n['lineno'] = lineno + n.append(make_object_name(name)) + for base in bases: + b = pynodes.class_base() + b.append(make_object_name(base)) + n.append(b) + append_docstring(n, doc, lineno) + return n + +def make_object_name(name): + n = pynodes.object_name() + n.append(Text(name)) + return n + +def make_function_like_section(name, lineno, doc, function_class): + n = function_class() + n['lineno'] = lineno + n.append(make_object_name(name)) + append_docstring(n, doc, lineno) + return n + +def make_import_group(names, lineno, from_name=None): + n = pynodes.import_group() + n['lineno'] = lineno + if from_name: + n_from = pynodes.import_from() + n_from.append(Text(from_name)) + n.append(n_from) + for name, alias in names: + n_name = pynodes.import_name() + n_name.append(Text(name)) + if alias: + n_alias = pynodes.import_alias() + n_alias.append(Text(alias)) + n_name.append(n_alias) + n.append(n_name) + return n + +def make_class_attribute(name, lineno): + n = pynodes.class_attribute() + n['lineno'] = lineno + n.append(Text(name)) + return n + +def make_attribute(name, lineno): + n = pynodes.attribute() + n['lineno'] = lineno + n.append(make_object_name(name)) + return n + +def make_parameter(name, excess_keyword=0, excess_positional=0): + """ + excess_keyword and excess_positional must be either 1 or 0, and + not both of them can be 1. + """ + n = pynodes.parameter() + n.append(make_object_name(name)) + assert not excess_keyword or not excess_positional + if excess_keyword: + n['excess_keyword'] = 1 + if excess_positional: + n['excess_positional'] = 1 + return n + +def trim_docstring(text): + """ + Trim indentation and blank lines from docstring text & return it. + + See PEP 257. + """ + if not text: + return text + # Convert tabs to spaces (following the normal Python rules) + # and split into a list of lines: + lines = text.expandtabs().splitlines() + # Determine minimum indentation (first line doesn't count): + indent = sys.maxint + for line in lines[1:]: + stripped = line.lstrip() + if stripped: + indent = min(indent, len(line) - len(stripped)) + # Remove indentation (first line is special): + trimmed = [lines[0].strip()] + if indent < sys.maxint: + for line in lines[1:]: + trimmed.append(line[indent:].rstrip()) + # Strip off trailing and leading blank lines: + while trimmed and not trimmed[-1]: + trimmed.pop() + while trimmed and not trimmed[0]: + trimmed.pop(0) + # Return a single string: + return '\n'.join(trimmed) + +def normalize_parameter_name(name): + """ + Converts a tuple like ``('a', ('b', 'c'), 'd')`` into ``'(a, (b, c), d)'`` + """ + if type(name) is tuple: + return '(%s)' % ', '.join([normalize_parameter_name(n) for n in name]) + else: + return name + +if __name__ == '__main__': + import sys + args = sys.argv[1:] + if args[0] == '-v': + filename = args[1] + module_text = open(filename).read() + ast = compiler.parse(module_text) + visitor = compiler.visitor.ExampleASTVisitor() + compiler.walk(ast, visitor, walker=visitor, verbose=1) + else: + filename = args[0] + content = open(filename).read() + print parse_module(content, filename).pformat() + |