diff options
Diffstat (limited to 'src/contrib/markdown/blockprocessors.py')
-rw-r--r-- | src/contrib/markdown/blockprocessors.py | 460 |
1 files changed, 460 insertions, 0 deletions
"""
CORE MARKDOWN BLOCKPARSER
=============================================================================

This parser handles basic parsing of Markdown blocks.  It doesn't concern itself
with inline elements such as **bold** or *italics*, but rather just catches
blocks, lists, quotes, etc.

The BlockParser is made up of a bunch of BlockProcessors, each handling a
different type of block. Extensions may add/replace/remove BlockProcessors
as they need to alter how markdown blocks are parsed.

"""

import logging
import re

import markdown

logger = logging.getLogger(__name__)

# NOTE: Throughout this module an etree element is never tested for truth
# directly.  Historically ``bool(element)`` meant "the element has children",
# and truth-testing elements is deprecated, so that meaning is spelled out
# as ``element is not None and len(element)`` wherever it is relied upon.


class BlockProcessor:
    """ Base class for block processors.

    Each subclass will provide the methods below to work with the source and
    tree. Each processor will need to define its own ``test`` and ``run``
    methods. The ``test`` method should return True or False, to indicate
    whether the current block should be processed by this processor. If the
    test passes, the parser will call the processor's ``run`` method.

    """

    def __init__(self, parser=None):
        # The owning block parser.  Subclasses use ``parser.state``,
        # ``parser.parseBlocks`` and ``parser.parseChunk``.
        self.parser = parser

    def lastChild(self, parent):
        """ Return the last child of an etree element, or None if empty. """
        if len(parent):
            return parent[-1]
        else:
            return None

    def detab(self, text):
        """ Remove a tab from the front of each line of the given text.

        Processing stops at the first non-blank line that is not indented.
        Returns a tuple ``(detabbed, rest)`` where ``detabbed`` holds the
        leading lines with one level of indent removed (blank lines become
        empty strings) and ``rest`` holds the remaining, untouched lines.

        """
        newtext = []
        lines = text.split('\n')
        for line in lines:
            if line.startswith(' '*markdown.TAB_LENGTH):
                newtext.append(line[markdown.TAB_LENGTH:])
            elif not line.strip():
                newtext.append('')
            else:
                break
        return '\n'.join(newtext), '\n'.join(lines[len(newtext):])

    def looseDetab(self, text, level=1):
        """ Remove a tab from front of lines but allowing dedented lines.

        ``level`` is the number of indent levels to strip.  Lines that are
        indented less than that are left as they are.

        """
        width = markdown.TAB_LENGTH*level
        indent = ' '*width
        lines = text.split('\n')
        for i, line in enumerate(lines):
            if line.startswith(indent):
                lines[i] = line[width:]
        return '\n'.join(lines)

    def test(self, parent, block):
        """ Test for block type. Must be overridden by subclasses.

        As the parser loops through processors, it will call the ``test`` method
        on each to determine if the given block of text is of that type. This
        method must return a boolean ``True`` or ``False``. The actual method of
        testing is left to the needs of that particular block type. It could
        be as simple as ``block.startswith(some_string)`` or a complex regular
        expression. As the block type may be different depending on the parent
        of the block (i.e. inside a list), the parent etree element is also
        provided and may be used as part of the test.

        Keywords:

        * ``parent``: An etree element which will be the parent of the block.
        * ``block``: A block of text from the source which has been split at
            blank lines.
        """
        pass

    def run(self, parent, blocks):
        """ Run processor. Must be overridden by subclasses.

        When the parser determines the appropriate type of a block, the parser
        will call the corresponding processor's ``run`` method. This method
        should parse the individual lines of the block and append them to
        the etree.

        Note that both the ``parent`` and ``blocks`` keywords are pointers
        to instances of the objects which should be edited in place. Each
        processor must make changes to the existing objects as there is no
        mechanism to return new/different objects to replace them.

        This means that this method should be adding SubElements or adding text
        to the parent, and should remove (``pop``) or add (``insert``) items to
        the list of blocks.

        Keywords:

        * ``parent``: An etree element which is the parent of the current block.
        * ``blocks``: A list of all remaining blocks of the document.
        """
        pass


class ListIndentProcessor(BlockProcessor):
    """ Process children of list items.

    Example:
        * a list item
            process this part

            or this part

    """

    INDENT_RE = re.compile(r'^(([ ]{%s})+)'% markdown.TAB_LENGTH)
    ITEM_TYPES = ['li']
    LIST_TYPES = ['ul', 'ol']

    def test(self, parent, block):
        """ Return True for an indented block that belongs to a list.

        The block must start with a full indent, the parser must not already
        be in the ``detabbed`` state, and either the parent is a list item or
        the parent's last child is a list which itself has children.

        """
        if not block.startswith(' '*markdown.TAB_LENGTH):
            return False
        if self.parser.state.isstate('detabbed'):
            return False
        if parent.tag in self.ITEM_TYPES:
            return True
        return bool(len(parent) and len(parent[-1]) and
                    parent[-1].tag in self.LIST_TYPES)

    def run(self, parent, blocks):
        """ Detab the block and parse it as a child of the right list item. """
        block = blocks.pop(0)
        level, sibling = self.get_level(parent, block)
        block = self.looseDetab(block, level)

        # Flag the state so ``test`` does not claim the detabbed block again
        # while it is being parsed recursively.
        self.parser.state.set('detabbed')
        if parent.tag in self.ITEM_TYPES:
            # The parent is already a li. Just parse the child block.
            self.parser.parseBlocks(parent, [block])
        elif sibling.tag in self.ITEM_TYPES:
            # The sibling is a li. Use it as parent.
            self.parser.parseBlocks(sibling, [block])
        elif len(sibling) and sibling[-1].tag in self.ITEM_TYPES:
            # The parent is a list (``ol`` or ``ul``) which has children.
            # Assume the last child li is the parent of this block.
            if sibling[-1].text:
                # If the parent li has text, that text needs to be moved to a p
                block = '%s\n\n%s' % (sibling[-1].text, block)
                sibling[-1].text = ''
            self.parser.parseChunk(sibling[-1], block)
        else:
            self.create_item(sibling, block)
        self.parser.state.reset()

    def create_item(self, parent, block):
        """ Create a new li and parse the block with it as the parent. """
        li = markdown.etree.SubElement(parent, 'li')
        self.parser.parseBlocks(li, [block])

    def get_level(self, parent, block):
        """ Get level of indent based on list level.

        Returns a tuple ``(level, element)``: the list nesting level that was
        reached and the element found at that level.

        """
        # Get indent level
        m = self.INDENT_RE.match(block)
        if m:
            # Floor division keeps the level an integer on every Python
            # version (``/`` returns a float on Python 3).
            indent_level = len(m.group(1))//markdown.TAB_LENGTH
        else:
            indent_level = 0
        if self.parser.state.isstate('list'):
            # We're in a tightlist - so we already are at correct parent.
            level = 1
        else:
            # We're in a looselist - so we need to find parent.
            level = 0
        # Step through children of tree to find matching indent level.
        while indent_level > level:
            child = self.lastChild(parent)
            # Only descend into a list or list item that has children.
            if child is not None and len(child) and \
                    (child.tag in self.LIST_TYPES or
                     child.tag in self.ITEM_TYPES):
                if child.tag in self.LIST_TYPES:
                    level += 1
                parent = child
            else:
                # No more child levels. If we're short of indent_level,
                # we have a code block. So we stop here.
                break
        return level, parent


class CodeBlockProcessor(BlockProcessor):
    """ Process code blocks. """

    def test(self, parent, block):
        """ Return True if the block starts with a full indent. """
        return block.startswith(' '*markdown.TAB_LENGTH)

    def run(self, parent, blocks):
        """ Add the block to a new or the preceding ``<pre><code>`` element. """
        sibling = self.lastChild(parent)
        block = blocks.pop(0)
        # ``theRest`` holds any unindented lines following the code lines.
        block, theRest = self.detab(block)
        if sibling is not None and sibling.tag == "pre" and len(sibling) \
                    and sibling[0].tag == "code":
            # The previous block was a code block. As blank lines do not start
            # new code blocks, append this block to the previous, adding back
            # linebreaks removed from the split into a list.
            code = sibling[0]
            code.text = markdown.AtomicString('%s\n%s\n' % (code.text, block.rstrip()))
        else:
            # This is a new codeblock. Create the elements and insert text.
            pre = markdown.etree.SubElement(parent, 'pre')
            code = markdown.etree.SubElement(pre, 'code')
            code.text = markdown.AtomicString('%s\n' % block.rstrip())
        if theRest:
            # This block contained unindented line(s) after the first indented
            # line. Insert these lines as the first block of the master blocks
            # list for future processing.
            blocks.insert(0, theRest)


class BlockQuoteProcessor(BlockProcessor):
    """ Process blockquotes. """

    # Detect a ``>`` marker at the start of any line of a block.
    RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')

    def test(self, parent, block):
        """ Return True if any line of the block starts a blockquote. """
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Parse the quoted lines into a new or the preceding blockquote. """
        block = blocks.pop(0)
        m = self.RE.search(block)
        if m:
            before = block[:m.start()] # Lines before blockquote
            # Pass lines before blockquote in recursively for parsing first.
            self.parser.parseBlocks(parent, [before])
            # Remove ``> `` from beginning of each line.
            block = '\n'.join([self.clean(line) for line in
                            block[m.start():].split('\n')])
        sibling = self.lastChild(parent)
        if sibling is not None and len(sibling) and \
                sibling.tag == "blockquote":
            # Previous block was a blockquote so set that as this block's parent
            quote = sibling
        else:
            # This is a new blockquote. Create a new parent element.
            quote = markdown.etree.SubElement(parent, 'blockquote')
        # Recursively parse block with blockquote as parent.
        self.parser.parseChunk(quote, block)

    def clean(self, line):
        """ Remove ``>`` from beginning of a line. """
        if line.strip() == ">":
            return ""
        m = self.RE.match(line)
        if m:
            return m.group(2)
        return line


class OListProcessor(BlockProcessor):
    """ Process ordered list blocks. """

    TAG = 'ol'
    # Detect an item (``1. item``). ``group(1)`` contains contents of item.
    RE = re.compile(r'^[ ]{0,3}\d+\.[ ](.*)')
    # Detect items on secondary lines. They can be of either list type.
    CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ](.*)')
    # Detect indented (nested) items of either type
    INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ].*')

    def test(self, parent, block):
        """ Return True if the block starts with a list item of this type. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Add the items of the block to a new or the preceding list. """
        # Check for multiple items in one block.
        items = self.get_items(blocks.pop(0))
        sibling = self.lastChild(parent)
        if sibling is not None and len(sibling) and \
                sibling.tag in ['ol', 'ul']:
            # Previous block was a list, so set that as parent
            lst = sibling
            # make sure previous item is in a p.
            last = lst[-1]
            if last.text and not len(last):
                p = markdown.etree.SubElement(last, 'p')
                p.text = last.text
                last.text = ''
            # parse first block differently as it gets wrapped in a p.
            li = markdown.etree.SubElement(lst, 'li')
            self.parser.state.set('looselist')
            firstitem = items.pop(0)
            self.parser.parseBlocks(li, [firstitem])
            self.parser.state.reset()
        else:
            # This is a new list so create parent with appropriate tag.
            lst = markdown.etree.SubElement(parent, self.TAG)
        self.parser.state.set('list')
        # Loop through items in block, recursively parsing each with the
        # appropriate parent.
        for item in items:
            if item.startswith(' '*markdown.TAB_LENGTH):
                # Item is indented. Parse with last item as parent
                self.parser.parseBlocks(lst[-1], [item])
            else:
                # New item. Create li and parse with it as parent
                li = markdown.etree.SubElement(lst, 'li')
                self.parser.parseBlocks(li, [item])
        self.parser.state.reset()

    def get_items(self, block):
        """ Break a block into list items.

        Returns a list of strings.  A new item holds the text following its
        marker; indented (nested) items keep their indent and marker.

        """
        items = []
        for line in block.split('\n'):
            m = self.CHILD_RE.match(line)
            if m:
                # This is a new item. Append
                items.append(m.group(3))
            elif not items:
                # No item has been started yet, so there is nothing to append
                # to. Keep the line as an item rather than failing on
                # ``items[-1]``.
                items.append(line)
            elif self.INDENT_RE.match(line):
                # This is an indented (possibly nested) item.
                if items[-1].startswith(' '*markdown.TAB_LENGTH):
                    # Previous item was indented. Append to that item.
                    items[-1] = '%s\n%s' % (items[-1], line)
                else:
                    items.append(line)
            else:
                # This is another line of previous item. Append to that item.
                items[-1] = '%s\n%s' % (items[-1], line)
        return items


class UListProcessor(OListProcessor):
    """ Process unordered list blocks. """

    TAG = 'ul'
    # Detect an item (``* item``). ``group(1)`` contains contents of item.
    RE = re.compile(r'^[ ]{0,3}[*+-][ ](.*)')


class HashHeaderProcessor(BlockProcessor):
    """ Process Hash Headers. """

    # Detect a header at start of any line in block
    RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')

    def test(self, parent, block):
        """ Return True if any line of the block is a hash header. """
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Create the header element for the first header in the block. """
        block = blocks.pop(0)
        m = self.RE.search(block)
        if m:
            before = block[:m.start()] # All lines before header
            after = block[m.end():]    # All lines after header
            if before:
                # As the header was not the first line of the block and the
                # lines before the header must be parsed first,
                # recursively parse these lines as a block.
                self.parser.parseBlocks(parent, [before])
            # Create header using named groups from RE
            h = markdown.etree.SubElement(parent, 'h%d' % len(m.group('level')))
            h.text = m.group('header').strip()
            if after:
                # Insert remaining lines as first block for future parsing.
                blocks.insert(0, after)
        else:
            # This should never happen, as ``test`` searches with the same
            # RE, but just in case...
            logger.critical("We've got a problem header!")


class SetextHeaderProcessor(BlockProcessor):
    """ Process Setext-style Headers. """

    # Detect Setext-style header. Must be first 2 lines of block.
    RE = re.compile(r'^.*?\n[=-]{3,}', re.MULTILINE)

    def test(self, parent, block):
        """ Return True if the block starts with a Setext-style header. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Create the header element from the first two lines of the block. """
        lines = blocks.pop(0).split('\n')
        # Determine level. ``=`` is 1 and ``-`` is 2.
        if lines[1].startswith('='):
            level = 1
        else:
            level = 2
        h = markdown.etree.SubElement(parent, 'h%d' % level)
        h.text = lines[0].strip()
        if len(lines) > 2:
            # Block contains additional lines. Add to master blocks for later.
            blocks.insert(0, '\n'.join(lines[2:]))


class HRProcessor(BlockProcessor):
    """ Process Horizontal Rules. """

    RE = r'[ ]{0,3}(?P<ch>[*_-])[ ]?((?P=ch)[ ]?){2,}[ ]*'
    # Detect hr on any line of a block.
    SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE)
    # Match a hr on a single line of text.
    MATCH_RE = re.compile(r'^%s$' % RE)

    def test(self, parent, block):
        """ Return True if any line of the block is a horizontal rule. """
        return bool(self.SEARCH_RE.search(block))

    def run(self, parent, blocks):
        """ Create an hr element for the first rule found in the block. """
        lines = blocks.pop(0).split('\n')
        prelines = []
        # Check for lines in block before hr.
        for line in lines:
            if self.MATCH_RE.match(line):
                break
            prelines.append(line)
        if prelines:
            # Recursively parse lines before hr so they get parsed first.
            self.parser.parseBlocks(parent, ['\n'.join(prelines)])
        # create hr
        markdown.etree.SubElement(parent, 'hr')
        # check for lines in block after hr.
        lines = lines[len(prelines)+1:]
        if lines:
            # Add lines after hr to master blocks for later parsing.
            blocks.insert(0, '\n'.join(lines))


class EmptyBlockProcessor(BlockProcessor):
    """ Process blocks that start with an empty line. """

    # Detect a block that only contains whitespace
    # or only whitespace on the first line.
    RE = re.compile(r'^\s*\n')

    def test(self, parent, block):
        """ Return True if the block starts with whitespace-only line(s). """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Strip the leading blank lines, keeping them inside code blocks. """
        block = blocks.pop(0)
        m = self.RE.match(block)
        if m:
            # Add remaining line to master blocks for later.
            blocks.insert(0, block[m.end():])
            sibling = self.lastChild(parent)
            if sibling is not None and sibling.tag == 'pre' and \
                    len(sibling) and sibling[0].tag == 'code':
                # Last block is a codeblock. Append to preserve whitespace:
                # one linebreak for every blank line that was just consumed.
                filler = '\n' * m.group().count('\n')
                sibling[0].text = markdown.AtomicString(
                    '%s%s' % (sibling[0].text, filler))


class ParagraphProcessor(BlockProcessor):
    """ Process Paragraph blocks. """

    def test(self, parent, block):
        """ Always return True: any block can be treated as a paragraph. """
        return True

    def run(self, parent, blocks):
        """ Add the block as a paragraph, or as plain text in a tight list. """
        block = blocks.pop(0)
        if block.strip():
            # Not a blank block. Add to parent, otherwise throw it away.
            if self.parser.state.isstate('list'):
                # The parent is a tight-list. Append to parent.text
                if parent.text:
                    parent.text = '%s\n%s' % (parent.text, block)
                else:
                    parent.text = block.lstrip()
            else:
                # Create a regular paragraph
                p = markdown.etree.SubElement(parent, 'p')
                p.text = block.lstrip()