3 files changed, 269 insertions, 173 deletions
diff --git a/pycco/compat.py b/pycco/compat.py
new file mode 100644
index 0000000..68233a4
--- /dev/null
+++ b/pycco/compat.py
@@ -0,0 +1,10 @@
+try:
+    pycco_unichr = unichr  # Python 2: builtin that maps a code point to a unicode character
+except NameError:
+    pycco_unichr = chr  # `unichr` is undefined (Python 3); `chr` takes its place
+
+def compat_items(d):
+    """Return the (key, value) pairs of mapping `d`, lazily where possible."""
+    # Prefer `iteritems` when the mapping has it (Python 2), else `items`.
+    pairs = getattr(d, 'iteritems', None) or d.items
+    return pairs()
diff --git a/pycco/generate_index.py b/pycco/generate_index.py
new file mode 100644
index 0000000..2ddf301
--- /dev/null
+++ b/pycco/generate_index.py
@@ -0,0 +1,77 @@
+"""
+This is the module responsible for automatically generating an HTML index of
+all documentation files generated by Pycco.
+"""
+import re
+from os import path
+
+from pycco.compat import compat_items
+from pycco_resources import pycco_template
+
+
+__all__ = ('generate_index',)
+
+
+def build_tree(file_paths, outdir):
+    """
+    Build a nested dict mirroring the directory layout of `file_paths`, with
+    one level per path component relative to `outdir`.
+    """
+    root = {}
+    for doc_path in file_paths:
+        rel = path.relpath(doc_path, outdir)
+        add_file({'path': doc_path, 'relpath': rel}, rel.split(path.sep), root)
+
+    return root
+
+
+def add_file(entry, path_steps, tree):
+    """
+    Insert `entry` into the nested-dict `tree`, modifying it in place.
+
+    :param entry: A dictionary containing a path to a documentation file, and a
+    relative path to the same file.
+    :param path_steps: A list of steps in a file path to look within.
+    :param tree: The (sub)tree dictionary to descend into; missing nodes are
+    created along the way.
+    """
+    # Iterative descent; `path_steps[0]` must exist, as in the recursive form.
+    branch = tree.setdefault(path_steps[0], {})
+    for step in path_steps[1:]:
+        branch = branch.setdefault(step, {})
+    branch['entry'] = entry
+
+
+def generate_tree_html(tree):
+    """
+    Render a tree of HTML file paths as nested lists of HTML-escaped links.
+    """
+    try:
+        from html import escape  # Python 3.2+
+    except ImportError:
+        from cgi import escape  # Python 2
+    items = []
+    for node, subtree in sorted(compat_items(tree)):
+        if 'entry' in subtree:
+            items.append('<li><a href="{}">{}</a></li>'.format(escape(subtree['entry']['relpath'], True), escape(node)))
+        else:
+            items.append('<dl><dt>{}</dt><dd><ul>{}</ul></dd></dl>'.format(escape(node), generate_tree_html(subtree)))
+    return ''.join(items)
+
+
+def generate_index(files, outdir):
+    """
+    Given a list of generated documentation files, render the HTML index of
+    all files and return it as UTF-8 encoded bytes.
+    """
+    # `index.html` is written into `outdir` right next to `pycco.css`, so the
+    # stylesheet link has to be relative to that directory instead of being
+    # prefixed with `outdir` a second time.
+    rendered = pycco_template({
+        "title": 'Index',
+        "stylesheet": "pycco.css",
+        "sections": {'docs_html': generate_tree_html(build_tree(files, outdir))},
+        "source": '',
+    })
+
+    return re.sub(r"__DOUBLE_OPEN_STACHE__", "{{", rendered).encode("utf-8")
diff --git a/pycco/main.py b/pycco/main.py
index 787a5d2..e95ad73 100644
--- a/pycco/main.py
+++ b/pycco/main.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python
+from __future__ import print_function
+
+# This module contains all of our static resources.
+from pycco_resources import pycco_template, css as pycco_css
 
 """
 "**Pycco**" is a Python port of [Docco](http://jashkenas.github.com/docco/):
@@ -17,7 +21,7 @@ If you install Pycco, you can run it from the command-line:
 This will generate linked HTML documentation for the named source files,
 saving it into a `docs` folder by default.
 
-The [source for Pycco](https://github.com/fitzgen/pycco) is available on GitHub,
+The [source for Pycco](https://github.com/pycco-docs/pycco) is available on GitHub,
 and released under the MIT license.
 
 To install Pycco, simply
@@ -26,15 +30,16 @@ To install Pycco, simply
 
 Or, to install the latest source
 
-    git clone git://github.com/fitzgen/pycco.git
+    git clone git://github.com/pycco-docs/pycco.git
     cd pycco
     python setup.py install
 """
 
 # === Main Documentation Generation Functions ===
 
+
 def generate_documentation(source, outdir=None, preserve_paths=True,
-                           language=None):
+                           language=None, encoding="utf8"):
     """
     Generate the documentation for a source file by reading it in, splitting it
     up into comment/code sections, highlighting them for the appropriate
@@ -43,13 +48,21 @@ def generate_documentation(source, outdir=None, preserve_paths=True,
 
     if not outdir:
         raise TypeError("Missing the required 'outdir' keyword argument.")
-    code = open(source, "r").read()
-    language = get_language(source, code, language=language)
-    sections = parse(source, code, language)
-    highlight(source, sections, language, preserve_paths=preserve_paths, outdir=outdir)
-    return generate_html(source, sections, preserve_paths=preserve_paths, outdir=outdir)
+    code = open(source, "rb").read().decode(encoding)
+    return _generate_documentation(source, code, outdir, preserve_paths, language)
 
-def parse(source, code, language):
+
+def _generate_documentation(file_path, code, outdir, preserve_paths, language):
+    """Run detect -> parse -> highlight -> render on already-decoded `code`."""
+    # The same two keyword options go to both the highlighter and the renderer.
+    path_opts = dict(preserve_paths=preserve_paths, outdir=outdir)
+    lang = get_language(file_path, code, language=language)
+    sections = parse(code, lang)
+    highlight(sections, lang, **path_opts)
+    return generate_html(file_path, sections, **path_opts)
+
+
+def parse(code, language):
     """
     Given a string of source code, parse out each comment and the code that
     follows it, and create an individual **section** for it.
@@ -76,7 +89,6 @@ def parse(source, code, language):
                 lines.pop(linenum)
                 break
 
-
     def save(docs, code):
         if docs or code:
             sections.append({
@@ -86,50 +98,67 @@ def parse(source, code, language):
 
     # Setup the variables to get ready to check for multiline comments
     multi_line = False
-    multi_line_delimiters = [language.get("multistart"), language.get("multiend")]
+    multi_string = False
+    multistart, multiend = language.get("multistart"), language.get("multiend")
+    comment_matcher = language['comment_matcher']
 
     for line in lines:
-
+        process_as_code = False
         # Only go into multiline comments section when one of the delimiters is
         # found to be at the start of a line
-        if all(multi_line_delimiters) and any([line.lstrip().startswith(delim) or line.rstrip().endswith(delim) for delim in multi_line_delimiters]):
-            if not multi_line:
-                multi_line = True
-
-            else:
+        if multistart and multiend \
+           and any(line.lstrip().startswith(delim) or line.rstrip().endswith(delim)
+                   for delim in (multistart, multiend)):
+            multi_line = not multi_line
+
+            if multi_line \
+               and line.strip().endswith(multiend) \
+               and len(line.strip()) > len(multiend):
                 multi_line = False
 
-            if (multi_line
-               and line.strip().endswith(language.get("multiend"))
-               and len(line.strip()) > len(language.get("multiend"))):
-                multi_line = False
+            if not line.strip().startswith(multistart) and not multi_line \
+               or multi_string:
 
-            # Get rid of the delimiters so that they aren't in the final docs
-            line = line.replace(language["multistart"], '')
-            line = line.replace(language["multiend"], '')
-            docs_text += line.strip() + '\n'
-            indent_level = re.match("\s*", line).group(0)
+                process_as_code = True
 
-            if has_code and docs_text.strip():
-                save(docs_text, code_text[:-1])
-                code_text = code_text.split('\n')[-1]
-                has_code = docs_text = ''
+                if multi_string:
+                    multi_line = False
+                    multi_string = False
+                else:
+                    multi_string = True
+
+            else:
+                # Get rid of the delimiters so that they aren't in the final
+                # docs
+                line = line.replace(multistart, '')
+                line = line.replace(multiend, '')
+                docs_text += line.strip() + '\n'
+                indent_level = re.match("\s*", line).group(0)
+
+                if has_code and docs_text.strip():
+                    save(docs_text, code_text[:-1])
+                    code_text = code_text.split('\n')[-1]
+                    has_code = docs_text = ''
 
         elif multi_line:
             # Remove leading spaces
-            if re.match(r' {%d}' % len(indent_level), line):
+            if re.match(r' {{{:d}}}'.format(len(indent_level)), line):
                 docs_text += line[len(indent_level):] + '\n'
             else:
                 docs_text += line + '\n'
 
-        elif re.match(language["comment_matcher"], line):
+        elif re.match(comment_matcher, line):
             if has_code:
                 save(docs_text, code_text)
                 has_code = docs_text = code_text = ''
-            docs_text += re.sub(language["comment_matcher"], "", line) + "\n"
+            docs_text += re.sub(comment_matcher, "", line) + "\n"
 
         else:
-            if code_text and any([line.lstrip().startswith(x) for x in ['class ', 'def ', '@']]):
+            process_as_code = True
+
+        if process_as_code:
+            if code_text and any(line.lstrip().startswith(x)
+                                 for x in ['class ', 'def ', '@']):
                 if not code_text.lstrip().startswith("@"):
                     save(docs_text, code_text)
                     code_text = has_code = docs_text = ''
@@ -137,14 +166,14 @@ def parse(source, code, language):
             has_code = True
             code_text += line + '\n'
 
-
     save(docs_text, code_text)
 
     return sections
 
 # === Preprocessing the comments ===
 
-def preprocess(comment, section_nr, preserve_paths=True, outdir=None):
+
+def preprocess(comment, preserve_paths=True, outdir=None):
     """
     Add cross-references before having the text processed by markdown.  It's
     possible to reference another file, like this : `[[main.py]]` which renders
@@ -157,6 +186,7 @@ def preprocess(comment, section_nr, preserve_paths=True, outdir=None):
 
     if not outdir:
         raise TypeError("Missing the required 'outdir' keyword argument.")
+
     def sanitize_section_name(name):
         return "-".join(name.lower().strip().split(" "))
 
@@ -164,33 +194,37 @@ def preprocess(comment, section_nr, preserve_paths=True, outdir=None):
         # Check if the match contains an anchor
         if '#' in match.group(1):
             name, anchor = match.group(1).split('#')
-            return " [%s](%s#%s)" % (name,
-                                     path.basename(destination(name,
-                                                               preserve_paths=preserve_paths,
-                                                               outdir=outdir)),
-                                     anchor)
+            return " [{}]({}#{})".format(name,
+                                         path.basename(destination(name,
+                                                                   preserve_paths=preserve_paths,
+                                                                   outdir=outdir)),
+                                         anchor)
 
         else:
-            return " [%s](%s)" % (match.group(1),
-                                  path.basename(destination(match.group(1),
-                                                            preserve_paths=preserve_paths,
-                                                            outdir=outdir)))
+            return " [{}]({})".format(match.group(1),
+                                      path.basename(destination(match.group(1),
+                                                                preserve_paths=preserve_paths,
+                                                                outdir=outdir)))
 
     def replace_section_name(match):
-        return '%(lvl)s <span id="%(id)s" href="%(id)s">%(name)s</span>' % {
-            "lvl"  : re.sub('=', '#', match.group(1)),
-            "id"   : sanitize_section_name(match.group(2)),
-            "name" : match.group(2)
-        }
+        """
+        Replace equals-sign-formatted section names with anchor links.
+        """
+        return '{lvl} <span id="{id}" href="{id}">{name}</span>'.format(
+            lvl=re.sub('=', '#', match.group(1)),
+            id=sanitize_section_name(match.group(2)),
+            name=match.group(2)
+        )
 
     comment = re.sub('^([=]+)([^=]+)[=]*\s*$', replace_section_name, comment)
-    comment = re.sub('[^`]\[\[(.+?)\]\]', replace_crossref, comment)
+    comment = re.sub('(?<!`)\[\[(.+?)\]\]', replace_crossref, comment)
 
     return comment
 
 # === Highlighting the source code ===
 
-def highlight(source, sections, language, preserve_paths=True, outdir=None):
+
+def highlight(sections, language, preserve_paths=True, outdir=None):
     """
     Highlights a single chunk of code using the **Pygments** module, and runs
     the text of its corresponding comment through **Markdown**.
@@ -215,14 +249,18 @@ def highlight(source, sections, language, preserve_paths=True, outdir=None):
             docs_text = unicode(section["docs_text"])
         except UnicodeError:
             docs_text = unicode(section["docs_text"].decode('utf-8'))
+        except NameError:
+            docs_text = section['docs_text']
         section["docs_html"] = markdown(preprocess(docs_text,
-                                                   i,
                                                    preserve_paths=preserve_paths,
                                                    outdir=outdir))
         section["num"] = i
 
+    return sections
+
 # === HTML Code generation ===
 
+
 def generate_html(source, sections, preserve_paths=True, outdir=None):
     """
     Once all of the code is finished highlighting, we can generate the HTML file
@@ -245,82 +283,25 @@ def generate_html(source, sections, preserve_paths=True, outdir=None):
         sect["code_html"] = re.sub(r"\{\{", r"__DOUBLE_OPEN_STACHE__", sect["code_html"])
 
     rendered = pycco_template({
-        "title"       : title,
-        "stylesheet"  : csspath,
-        "sections"    : sections,
-        "source"      : source,
-        "path"        : path,
-        "destination" : destination
-    })
-
-    return re.sub(r"__DOUBLE_OPEN_STACHE__", "{{", rendered).encode("utf-8")
-
-# === Sitemap Generation ===
-def generate_index(files, outdir):
-
-    css_path = path.join(outdir, "pycco.css")
-
-    sections = []
-
-    def add_file(entry, path, tree):
-        node, subpath = path[0], path[1:]
-        if not node in tree:
-            tree[node] = {}
-
-        if subpath:
-            add_file(entry, subpath, tree[node])
-
-        else:
-            tree[node]['entry'] = entry
-
-    tree = {}
-    for file_path in files:
-        entry = {
-            'path': file_path,
-            'relpath': path.relpath(file_path, outdir)
-        }
-
-        add_file(entry=entry, path=entry['relpath'].split(path.sep), tree=tree)
-
-    def generate_tree_html(tree):
-        items = []
-        for node, subtree in tree.items():
-            if 'entry' in subtree:
-                html = '<li><a href="%s">%s</a></li>' % (subtree['entry']['relpath'], node)
-
-            else:
-                html = '<dl><dt>%s</dt><dd><ul>%s</ul></dd></dl>' % (node, generate_tree_html(subtree))
-
-            items.append(html)
-
-        return ''.join(items)
-
-    sections.append({'docs_html': generate_tree_html(tree)})
-
-    rendered = pycco_template({
-        "title"       : 'Index',
-        "stylesheet"  : css_path,
-        "sections"    : sections,
-        "source"      : '',
-        "path"        : path,
-        "destination" : destination
+        "title": title,
+        "stylesheet": csspath,
+        "sections": sections,
+        "source": source,
     })
 
     return re.sub(r"__DOUBLE_OPEN_STACHE__", "{{", rendered).encode("utf-8")
 
 # === Helpers & Setup ===
 
-# This module contains all of our static resources.
-import pycco_resources
-
 # Import our external dependencies.
 import optparse
 import os
 import pygments
-import pystache
 import re
 import sys
 import time
+import pycco.generate_index as generate_index
+
 from markdown import markdown
 from os import path
 from pygments import lexers, formatters
@@ -329,45 +310,43 @@ from pygments import lexers, formatters
 # the name of the Pygments lexer and the symbol that indicates a comment. To
 # add another language to Pycco's repertoire, add it here.
 languages = {
-    ".coffee": { "name": "coffee-script", "symbol": "#",
-        "multistart": '###', "multiend": '###' },
-
-    ".pl":  { "name": "perl", "symbol": "#" },
+    ".coffee": {"name": "coffee-script", "symbol": "#",
+                "multistart": '###', "multiend": '###'},
 
-    ".sql": { "name": "sql", "symbol": "--" },
+    ".pl":  {"name": "perl", "symbol": "#"},
 
-    ".c":   { "name": "c", "symbol": "//",
-        "multistart": "/*", "multiend": "*/"},
+    ".sql": {"name": "sql", "symbol": "--"},
 
-    ".h":   { "name": "c", "symbol": "//",
-        "multistart": "/*", "multiend": "*/"},
+    ".c":   {"name": "c", "symbol": "//",
+             "multistart": "/*", "multiend": "*/"},
+    ".h":   {"name": "c", "symbol": "//",
+             "multistart": "/*", "multiend": "*/"},
 
-    ".cpp": { "name": "cpp", "symbol": "//"},
+    ".cpp": {"name": "cpp", "symbol": "//"},
 
-    ".cl":   { "name": "c", "symbol": "//",
-        "multistart": "/*", "multiend": "*/"},
+    ".cl":   {"name": "c", "symbol": "//",
+              "multistart": "/*", "multiend": "*/"},
 
-    ".js": { "name": "javascript", "symbol": "//",
-        "multistart": "/*", "multiend": "*/"},
+    ".js": {"name": "javascript", "symbol": "//",
+            "multistart": "/*", "multiend": "*/"},
+    ".rb": {"name": "ruby", "symbol": "#",
+            "multistart": "=begin", "multiend": "=end"},
 
-    ".rb": { "name": "ruby", "symbol": "#",
-        "multistart": "=begin", "multiend": "=end"},
+    ".py": {"name": "python", "symbol": "#",
+            "multistart": '"""', "multiend": '"""'},
 
-    ".py": { "name": "python", "symbol": "#",
-        "multistart": '"""', "multiend": '"""' },
+    ".scm": {"name": "scheme", "symbol": ";;",
+             "multistart": "#|", "multiend": "|#"},
 
-    ".scm": { "name": "scheme", "symbol": ";;",
-        "multistart": "#|", "multiend": "|#"},
+    ".lua": {"name": "lua", "symbol": "--",
+             "multistart": "--[[", "multiend": "--]]"},
 
-    ".lua": { "name": "lua", "symbol": "--",
-        "multistart": "--[[", "multiend": "--]]"},
+    ".erl": {"name": "erlang", "symbol": "%%"},
 
-    ".erl": { "name": "erlang", "symbol": "%%" },
+    ".tcl":  {"name": "tcl", "symbol": "#"},
 
-    ".tcl":  { "name": "tcl", "symbol": "#" },
-
-    ".hs": { "name": "haskell", "symbol": "--",
-        "multistart": "{-", "multiend": "-}"},
+    ".hs": {"name": "haskell", "symbol": "--",
+            "multistart": "{-", "multiend": "-}"},
 }
 
 # Build out the appropriate matchers and delimiters for each language.
@@ -386,6 +365,7 @@ for ext, l in languages.items():
     # Get the Pygments Lexer for this language.
     l["lexer"] = lexers.get_lexer_by_name(l["name"])
 
+
 def get_language(source, code, language=None):
     """Get the current language we're documenting, based on the extension."""
 
@@ -396,16 +376,23 @@ def get_language(source, code, language=None):
         else:
             raise ValueError("Unknown forced language: " + language)
 
-    m = re.match(r'.*(\..+)', os.path.basename(source))
+    m = re.match(r'.*(\..+)', os.path.basename(source)) if source else None
     if m and m.group(1) in languages:
         return languages[m.group(1)]
     else:
-        lang = lexers.guess_lexer(code).name.lower()
-        for l in languages.values():
-            if l["name"] == lang:
-                return l
-        else:
-            raise ValueError("Can't figure out the language! of %s" % source)
+        try:
+            lang = lexers.guess_lexer(code).name.lower()
+            for l in languages.values():
+                if l["name"] == lang:
+                    return l
+            else:
+                raise ValueError()
+        except ValueError:
+            # If pygments can't find any lexers, it will raise its own
+            # subclass of ValueError. We will catch it and raise ours
+            # for consistency.
+            raise ValueError("Can't figure out the language!")
+
 
 def destination(filepath, preserve_paths=True, outdir=None):
     """
@@ -422,7 +409,14 @@ def destination(filepath, preserve_paths=True, outdir=None):
         name = filename
     if preserve_paths:
         name = path.join(dirname, name)
-    return path.join(outdir, "%s.html" % name)
+    dest = path.join(outdir, u"{}.html".format(name))
+    # If `join` is passed an absolute path, it will ignore any earlier path
+    # elements. We will force outdir to the beginning of the path to avoid
+    # writing outside our destination.
+    if not dest.startswith(outdir):
+        dest = outdir + os.sep + dest
+    return dest
+
 
 def shift(list, default):
     """
@@ -435,20 +429,26 @@ def shift(list, default):
     except IndexError:
         return default
 
-def ensure_directory(directory):
-    """Ensure that the destination directory exists."""
 
+def remove_control_chars(s):
+    """Return `s` with control characters (code points 0-31, 127-159) removed."""
+    # Same character set as the sanitization regexp from
+    # http://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
+    # written as two literal ranges instead of an escaped list of characters.
+    control_char_re = re.compile(u'[\\x00-\\x1f\\x7f-\\x9f]')
+    return control_char_re.sub('', s)
+
+
+def ensure_directory(directory):
+    """
+    Sanitize directory string and ensure that the destination directory exists.
+    """
+    directory = remove_control_chars(directory)
     if not os.path.isdir(directory):
         os.makedirs(directory)
 
-def template(source):
-    return lambda context: pystache.render(source, context)
-
-# Create the template that we will use to generate the Pycco HTML page.
-pycco_template = template(pycco_resources.html)
+    return directory
 
-# The CSS styles we'd like to apply to the documentation.
-pycco_styles = pycco_resources.css
 
 # The start of each Pygments highlight block.
 highlight_start = "<div class=\"highlight\"><pre>"
@@ -456,11 +456,12 @@ highlight_start = "<div class=\"highlight\"><pre>"
 # The end of each Pygments highlight block.
 highlight_end = "</pre></div>"
 
-def process(sources, preserve_paths=True, outdir=None, language=None, index=False):
+
+def process(sources, preserve_paths=True, outdir=None, language=None, encoding="utf8", index=False):
     """For each source file passed as argument, generate the documentation."""
 
     if not outdir:
-        raise TypeError("Missing the required 'outdir' keyword argument.")
+        raise TypeError("Missing the required 'outdir' keyword argument.")
 
     # Make a copy of sources given on the command line. `main()` needs the
     # original list when monitoring for changed files.
@@ -468,16 +469,15 @@ def process(sources, preserve_paths=True, outdir=None, language=None, index=Fals
 
     # Proceed to generating the documentation.
     if sources:
-        ensure_directory(outdir)
-        css = open(path.join(outdir, "pycco.css"), "w")
-        css.write(pycco_styles)
+        outdir = ensure_directory(outdir)
+        css = open(path.join(outdir, "pycco.css"), "wb")
+        css.write(pycco_css.encode(encoding))
         css.close()
 
         generated_files = []
 
         def next_file():
             s = sources.pop(0)
-            print "pycco = %s ->" % s,
             dest = destination(s, preserve_paths=preserve_paths, outdir=outdir)
 
             try:
@@ -485,10 +485,13 @@ def process(sources, preserve_paths=True, outdir=None, language=None, index=Fals
             except OSError:
                 pass
 
-            with open(dest, "w") as f:
-                f.write(generate_documentation(s, preserve_paths=preserve_paths, outdir=outdir,
-                                               language=language))
-            print dest
+            with open(dest, "wb") as f:
+                f.write(generate_documentation(s, preserve_paths=preserve_paths,
+                                               outdir=outdir,
+                                               language=language,
+                                               encoding=encoding))
+
+            print("pycco: {} -> {}".format(s, dest))
             generated_files.append(dest)
 
             if sources:
@@ -496,8 +499,8 @@ def process(sources, preserve_paths=True, outdir=None, language=None, index=Fals
         next_file()
 
         if index:
-            with open(path.join(outdir, "index.html"), "w") as f:
-                f.write(generate_index(generated_files, outdir))
+            with open(path.join(outdir, "index.html"), "wb") as f:
+                f.write(generate_index.generate_index(generated_files, outdir))
 
 __all__ = ("process", "generate_documentation")
 
@@ -516,7 +519,9 @@ def monitor(sources, opts):
                             for source in sources)
 
     class RegenerateHandler(watchdog.events.FileSystemEventHandler):
+
         """A handler for recompiling files which triggered watchdog events"""
+
         def on_modified(self, event):
             """Regenerate documentation for a file which triggered an event"""
             # Re-generate documentation from a source file if it was listed on
@@ -567,8 +572,12 @@ def main():
                       help='Generate an index.html document with sitemap content')
 
     opts, sources = parser.parse_args()
+    if opts.outdir == '':
+        outdir = '.'
+    else:
+        outdir = opts.outdir
 
-    process(sources, outdir=opts.outdir, preserve_paths=opts.paths,
+    process(sources, outdir=outdir, preserve_paths=opts.paths,
             language=opts.language, index=opts.generate_index)
 
     # If the -w / --watch option was present, monitor the source directories