diff options
Diffstat (limited to 'src/contrib/markdown/__init__.py')
-rw-r--r-- | src/contrib/markdown/__init__.py | 603 |
1 files changed, 603 insertions, 0 deletions
"""
Python Markdown
===============

Python Markdown converts Markdown to HTML and can be used as a library or
called from the command line.

## Basic usage as a module:

    import markdown
    md = markdown.Markdown()
    html = md.convert(your_text_string)

## Basic use from the command line:

    python markdown.py source.txt > destination.html

Run "python markdown.py --help" to see more options.

## Extensions

See <http://www.freewisdom.org/projects/python-markdown/> for more
information and instructions on how to extend the functionality of
Python Markdown.  Read that before you try modifying this file.

## Authors and License

Started by [Manfred Stienstra](http://www.dwerg.net/).  Continued and
maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).

Contact: markdown@freewisdom.org

Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
Copyright 200? Django Software Foundation (OrderedDict implementation)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)

License: BSD (see docs/LICENSE for details).
"""

version = "2.0-rc2"
version_info = (2,0,0, "rc2")

import re
import codecs
import sys
import warnings
import logging
from logging import DEBUG, INFO, WARN, ERROR, CRITICAL


"""
CONSTANTS
=============================================================================
"""

"""
Constants you might want to modify
-----------------------------------------------------------------------------
"""

# default logging level for command-line use
COMMAND_LINE_LOGGING_LEVEL = CRITICAL
TAB_LENGTH = 4               # expand tabs to this many spaces
ENABLE_ATTRIBUTES = True     # @id = xyz -> <... id="xyz">
SMART_EMPHASIS = True        # this_or_that does not become this<i>or</i>that
DEFAULT_OUTPUT_FORMAT = 'xhtml1'     # xhtml or html4 output
HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
                                  "|script|noscript|form|fieldset|iframe|math"
                                  "|ins|del|hr|hr/|style|li|dt|dd|thead|tbody"
                                  "|tr|th|td")
DOC_TAG = "div"     # Element used to wrap document - later removed

# Placeholders
STX = u'\u0002'  # Use STX ("Start of text") for start-of-placeholder
ETX = u'\u0003'  # Use ETX ("End of text") for end-of-placeholder
INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
AMP_SUBSTITUTE = STX+"amp"+ETX


"""
Constants you probably do not need to change
-----------------------------------------------------------------------------
"""

RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
                     # Hebrew (0590-05FF), Arabic (0600-06FF),
                     # Syriac (0700-074F), Arabic supplement (0750-077F),
                     # Thaana (0780-07BF), Nko (07C0-07FF).
                    (u'\u2D30', u'\u2D7F'), # Tifinagh
                    )


"""
AUXILIARY GLOBAL FUNCTIONS
=============================================================================
"""


def message(level, text):
    """Report a diagnostic through logging, an exception or a warning.

    Keyword arguments:

    * level: One of the `logging` level constants (DEBUG ... CRITICAL).
    * text: The message text.

    Behaviour depends on whether the 'MARKDOWN' logger has handlers:

    * Configured logger: the message is logged; anything above WARN then
      terminates the process with a non-zero exit status.
    * Unconfigured logger: anything above WARN raises `MarkdownException`;
      everything else is issued as a `MarkdownWarning`.

    """
    logger = logging.getLogger('MARKDOWN')
    if logger.handlers:
        # The logger is configured
        logger.log(level, text)
        if level > WARN:
            # Exit with a failure status: this path is only reached for
            # ERROR/CRITICAL messages, so reporting success (0) would
            # mislead shells and scripts.
            sys.exit(1)
    elif level > WARN:
        # Call form is valid on both Python 2 and Python 3.
        raise MarkdownException(text)
    else:
        warnings.warn(text, MarkdownWarning)


def isBlockLevel(tag):
    """Check if the tag is a block level HTML tag.

    Returns the match object (truthy) or None.

    NOTE(review): the pattern is unanchored and `match()` only tests a
    prefix, so names such as "link" or "param" are also reported as block
    level.  Confirm against callers before anchoring the pattern.

    """
    return BLOCK_LEVEL_ELEMENTS.match(tag)

"""
MISC AUXILIARY CLASSES
=============================================================================
"""

class AtomicString(unicode):
    """A string which should not be further processed."""
    pass


class MarkdownException(Exception):
    """ A Markdown Exception. """
    pass


class MarkdownWarning(Warning):
    """ A Markdown Warning. """
    pass


"""
OVERALL DESIGN
=============================================================================

Markdown processing takes place in five steps:

1. A bunch of "preprocessors" munge the input text.
2. BlockParser() parses the high-level structural elements of the
   pre-processed text into an ElementTree.
3. A bunch of "treeprocessors" are run against the ElementTree. One such
   treeprocessor runs InlinePatterns against the ElementTree, detecting inline
   markup.
4. Some post-processors are run against the text after the ElementTree has
   been serialized into text.
5. The output is written to a string.

Those steps are put together by the Markdown() class.

"""

import preprocessors
import blockprocessors
import treeprocessors
import inlinepatterns
import postprocessors
import blockparser
import etree_loader
import odict

# Extensions should use "markdown.etree" instead of "etree" (or do `from
# markdown import etree`).  Do not import it by yourself.
+ +etree = etree_loader.importETree() + +# Adds the ability to output html4 +import html4 + + +class Markdown: + """Convert Markdown to HTML.""" + + def __init__(self, + extensions=[], + extension_configs={}, + safe_mode = False, + output_format=DEFAULT_OUTPUT_FORMAT): + """ + Creates a new Markdown instance. + + Keyword arguments: + + * extensions: A list of extensions. + If they are of type string, the module mdx_name.py will be loaded. + If they are a subclass of markdown.Extension, they will be used + as-is. + * extension-configs: Configuration setting for extensions. + * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". + * output_format: Format of output. Supported formats are: + * "xhtml1": Outputs XHTML 1.x. Default. + * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1). + * "html4": Outputs HTML 4 + * "html": Outputs latest supported version of HTML (currently HTML 4). + Note that it is suggested that the more specific formats ("xhtml1" + and "html4") be used as "xhtml" or "html" may change in the future + if it makes sense at that time. 
+ + """ + + self.safeMode = safe_mode + self.registeredExtensions = [] + self.docType = "" + self.stripTopLevelTags = True + + # Preprocessors + self.preprocessors = odict.OrderedDict() + self.preprocessors["html_block"] = \ + preprocessors.HtmlBlockPreprocessor(self) + self.preprocessors["reference"] = \ + preprocessors.ReferencePreprocessor(self) + # footnote preprocessor will be inserted with "<reference" + + # Block processors - ran by the parser + self.parser = blockparser.BlockParser() + self.parser.blockprocessors['empty'] = \ + blockprocessors.EmptyBlockProcessor(self.parser) + self.parser.blockprocessors['indent'] = \ + blockprocessors.ListIndentProcessor(self.parser) + self.parser.blockprocessors['code'] = \ + blockprocessors.CodeBlockProcessor(self.parser) + self.parser.blockprocessors['hashheader'] = \ + blockprocessors.HashHeaderProcessor(self.parser) + self.parser.blockprocessors['setextheader'] = \ + blockprocessors.SetextHeaderProcessor(self.parser) + self.parser.blockprocessors['hr'] = \ + blockprocessors.HRProcessor(self.parser) + self.parser.blockprocessors['olist'] = \ + blockprocessors.OListProcessor(self.parser) + self.parser.blockprocessors['ulist'] = \ + blockprocessors.UListProcessor(self.parser) + self.parser.blockprocessors['quote'] = \ + blockprocessors.BlockQuoteProcessor(self.parser) + self.parser.blockprocessors['paragraph'] = \ + blockprocessors.ParagraphProcessor(self.parser) + + + #self.prePatterns = [] + + # Inline patterns - Run on the tree + self.inlinePatterns = odict.OrderedDict() + self.inlinePatterns["backtick"] = \ + inlinepatterns.BacktickPattern(inlinepatterns.BACKTICK_RE) + self.inlinePatterns["escape"] = \ + inlinepatterns.SimpleTextPattern(inlinepatterns.ESCAPE_RE) + self.inlinePatterns["reference"] = \ + inlinepatterns.ReferencePattern(inlinepatterns.REFERENCE_RE, self) + self.inlinePatterns["link"] = \ + inlinepatterns.LinkPattern(inlinepatterns.LINK_RE, self) + self.inlinePatterns["image_link"] = \ + 
inlinepatterns.ImagePattern(inlinepatterns.IMAGE_LINK_RE, self) + self.inlinePatterns["image_reference"] = \ + inlinepatterns.ImageReferencePattern(inlinepatterns.IMAGE_REFERENCE_RE, self) + self.inlinePatterns["autolink"] = \ + inlinepatterns.AutolinkPattern(inlinepatterns.AUTOLINK_RE, self) + self.inlinePatterns["automail"] = \ + inlinepatterns.AutomailPattern(inlinepatterns.AUTOMAIL_RE, self) + self.inlinePatterns["linebreak2"] = \ + inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_2_RE, 'br') + self.inlinePatterns["linebreak"] = \ + inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_RE, 'br') + self.inlinePatterns["html"] = \ + inlinepatterns.HtmlPattern(inlinepatterns.HTML_RE, self) + self.inlinePatterns["entity"] = \ + inlinepatterns.HtmlPattern(inlinepatterns.ENTITY_RE, self) + self.inlinePatterns["not_strong"] = \ + inlinepatterns.SimpleTextPattern(inlinepatterns.NOT_STRONG_RE) + self.inlinePatterns["strong_em"] = \ + inlinepatterns.DoubleTagPattern(inlinepatterns.STRONG_EM_RE, 'strong,em') + self.inlinePatterns["strong"] = \ + inlinepatterns.SimpleTagPattern(inlinepatterns.STRONG_RE, 'strong') + self.inlinePatterns["emphasis"] = \ + inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_RE, 'em') + self.inlinePatterns["emphasis2"] = \ + inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_2_RE, 'em') + # The order of the handlers matters!!! + + + # Tree processors - run once we have a basic parse. + self.treeprocessors = odict.OrderedDict() + self.treeprocessors["inline"] = treeprocessors.InlineProcessor(self) + self.treeprocessors["prettify"] = \ + treeprocessors.PrettifyTreeprocessor(self) + + # Postprocessors - finishing touches. 
+ self.postprocessors = odict.OrderedDict() + self.postprocessors["raw_html"] = \ + postprocessors.RawHtmlPostprocessor(self) + self.postprocessors["amp_substitute"] = \ + postprocessors.AndSubstitutePostprocessor() + # footnote postprocessor will be inserted with ">amp_substitute" + + # Map format keys to serializers + self.output_formats = { + 'html' : html4.to_html_string, + 'html4' : html4.to_html_string, + 'xhtml' : etree.tostring, + 'xhtml1': etree.tostring, + } + + self.references = {} + self.htmlStash = preprocessors.HtmlStash() + self.registerExtensions(extensions = extensions, + configs = extension_configs) + self.set_output_format(output_format) + self.reset() + + def registerExtensions(self, extensions, configs): + """ + Register extensions with this instance of Markdown. + + Keyword aurguments: + + * extensions: A list of extensions, which can either + be strings or objects. See the docstring on Markdown. + * configs: A dictionary mapping module names to config options. + + """ + for ext in extensions: + if isinstance(ext, basestring): + ext = load_extension(ext, configs.get(ext, [])) + try: + ext.extendMarkdown(self, globals()) + except AttributeError: + message(ERROR, "Incorrect type! Extension '%s' is " + "neither a string or an Extension." %(repr(ext))) + + + def registerExtension(self, extension): + """ This gets called by the extension """ + self.registeredExtensions.append(extension) + + def reset(self): + """ + Resets all state variables so that we can start with a new text. + """ + self.htmlStash.reset() + self.references.clear() + + for extension in self.registeredExtensions: + extension.reset() + + def set_output_format(self, format): + """ Set the output format for the class instance. """ + try: + self.serializer = self.output_formats[format.lower()] + except KeyError: + message(CRITICAL, 'Invalid Output Format: "%s". Use one of %s.' 
\ + % (format, self.output_formats.keys())) + + def convert(self, source): + """ + Convert markdown to serialized XHTML or HTML. + + Keyword arguments: + + * source: Source text as a Unicode string. + + """ + + # Fixup the source text + if not source.strip(): + return u"" # a blank unicode string + try: + source = unicode(source) + except UnicodeDecodeError: + message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.') + return u"" + + source = source.replace(STX, "").replace(ETX, "") + source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n" + source = re.sub(r'\n\s+\n', '\n\n', source) + source = source.expandtabs(TAB_LENGTH) + + # Split into lines and run the line preprocessors. + self.lines = source.split("\n") + for prep in self.preprocessors.values(): + self.lines = prep.run(self.lines) + + # Parse the high-level elements. + root = self.parser.parseDocument(self.lines).getroot() + + # Run the tree-processors + for treeprocessor in self.treeprocessors.values(): + newRoot = treeprocessor.run(root) + if newRoot: + root = newRoot + + # Serialize _properly_. Strip top-level tags. + output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf8")) + if self.stripTopLevelTags: + start = output.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2 + end = output.rindex('</%s>'%DOC_TAG) + output = output[start:end].strip() + + # Run the text post-processors + for pp in self.postprocessors.values(): + output = pp.run(output) + + return output.strip() + + def convertFile(self, input=None, output=None, encoding=None): + """Converts a markdown file and returns the HTML as a unicode string. + + Decodes the file using the provided encoding (defaults to utf-8), + passes the file content to markdown, and outputs the html to either + the provided stream or the file with provided name, using the same + encoding as the source file. + + **Note:** This is the only place that decoding and encoding of unicode + takes place in Python-Markdown. 
(All other code is unicode-in / + unicode-out.) + + Keyword arguments: + + * input: Name of source text file. + * output: Name of output file. Writes to stdout if `None`. + * encoding: Encoding of input and output files. Defaults to utf-8. + + """ + + encoding = encoding or "utf-8" + + # Read the source + input_file = codecs.open(input, mode="r", encoding=encoding) + text = input_file.read() + input_file.close() + text = text.lstrip(u'\ufeff') # remove the byte-order mark + + # Convert + html = self.convert(text) + + # Write to file or stdout + if isinstance(output, (str, unicode)): + output_file = codecs.open(output, "w", encoding=encoding) + output_file.write(html) + output_file.close() + else: + output.write(html.encode(encoding)) + + +""" +Extensions +----------------------------------------------------------------------------- +""" + +class Extension: + """ Base class for extensions to subclass. """ + def __init__(self, configs = {}): + """Create an instance of an Extention. + + Keyword arguments: + + * configs: A dict of configuration setting used by an Extension. + """ + self.config = configs + + def getConfig(self, key): + """ Return a setting for the given key or an empty string. """ + if key in self.config: + return self.config[key][0] + else: + return "" + + def getConfigInfo(self): + """ Return all config settings as a list of tuples. """ + return [(key, self.config[key][1]) for key in self.config.keys()] + + def setConfig(self, key, value): + """ Set a config setting for `key` with the given `value`. """ + self.config[key][0] = value + + def extendMarkdown(self, md, md_globals): + """ + Add the various proccesors and patterns to the Markdown Instance. + + This method must be overriden by every extension. + + Keyword arguments: + + * md: The Markdown instance. + + * md_globals: Global variables in the markdown module namespace. + + """ + pass + + +def load_extension(ext_name, configs = []): + """Load extension by name, then return the module. 
+ + The extension name may contain arguments as part of the string in the + following format: "extname(key1=value1,key2=value2)" + + """ + + # Parse extensions config params (ignore the order) + configs = dict(configs) + pos = ext_name.find("(") # find the first "(" + if pos > 0: + ext_args = ext_name[pos+1:-1] + ext_name = ext_name[:pos] + pairs = [x.split("=") for x in ext_args.split(",")] + configs.update([(x.strip(), y.strip()) for (x, y) in pairs]) + + # Setup the module names + ext_module = 'markdown.extensions' + module_name_new_style = '.'.join([ext_module, ext_name]) + module_name_old_style = '_'.join(['mdx', ext_name]) + + # Try loading the extention first from one place, then another + try: # New style (markdown.extensons.<extension>) + module = __import__(module_name_new_style, {}, {}, [ext_module]) + except ImportError: + try: # Old style (mdx.<extension>) + module = __import__(module_name_old_style) + except ImportError: + message(WARN, "Failed loading extension '%s' from '%s' or '%s'" + % (ext_name, module_name_new_style, module_name_old_style)) + # Return None so we don't try to initiate none-existant extension + return None + + # If the module is loaded successfully, we expect it to define a + # function called makeExtension() + try: + return module.makeExtension(configs.items()) + except AttributeError: + message(CRITICAL, "Failed to initiate extension '%s'" % ext_name) + + +def load_extensions(ext_names): + """Loads multiple extensions""" + extensions = [] + for ext_name in ext_names: + extension = load_extension(ext_name) + if extension: + extensions.append(extension) + return extensions + + +""" +EXPORTED FUNCTIONS +============================================================================= + +Those are the two functions we really mean to export: markdown() and +markdownFromFile(). 
+""" + +def markdown(text, + extensions = [], + safe_mode = False, + output_format = DEFAULT_OUTPUT_FORMAT): + """Convert a markdown string to HTML and return HTML as a unicode string. + + This is a shortcut function for `Markdown` class to cover the most + basic use case. It initializes an instance of Markdown, loads the + necessary extensions and runs the parser on the given text. + + Keyword arguments: + + * text: Markdown formatted text as Unicode or ASCII string. + * extensions: A list of extensions or extension names (may contain config args). + * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". + * output_format: Format of output. Supported formats are: + * "xhtml1": Outputs XHTML 1.x. Default. + * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1). + * "html4": Outputs HTML 4 + * "html": Outputs latest supported version of HTML (currently HTML 4). + Note that it is suggested that the more specific formats ("xhtml1" + and "html4") be used as "xhtml" or "html" may change in the future + if it makes sense at that time. + + Returns: An HTML document as a string. + + """ + md = Markdown(extensions=load_extensions(extensions), + safe_mode=safe_mode, + output_format=output_format) + return md.convert(text) + + +def markdownFromFile(input = None, + output = None, + extensions = [], + encoding = None, + safe_mode = False, + output_format = DEFAULT_OUTPUT_FORMAT): + """Read markdown code from a file and write it to a file or a stream.""" + md = Markdown(extensions=load_extensions(extensions), + safe_mode=safe_mode, + output_format=output_format) + md.convertFile(input, output, encoding) + + + |