added github flavoured markdown parser

author: Kurt Neufeld <kneufeld@burgundywall.com> 2016-06-02 15:53:59 -0700
committer: Kurt Neufeld <kneufeld@burgundywall.com> 2016-06-02 15:53:59 -0700
commit: 549f98f0896731e80efee22ac44d105fe77039e6 (patch)
tree: 3393e267af2631b07581649dface8cd0c204458f
parent: 6bd42c3ccd9ee14b0268592761a0aa0b39bd2b9e (diff)
download: pygments-549f98f0896731e80efee22ac44d105fe77039e6.tar.gz
4 files changed, 157 insertions, 1 deletions
diff --git a/AUTHORS b/AUTHORS
index 3b3f0502..e26fa6e9 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -212,5 +212,6 @@ Other contributors, listed alphabetically, are:
 * Alex Zimin -- Nemerle lexer
 * Rob Zimmerman -- Kal lexer
 * Vincent Zurczak -- Roboconf lexer
+* Kurt Neufeld -- Markdown lexer
 
 Many thanks for all contributions!
diff --git a/pygments/lexers/_mapping.py b/pygments/lexers/_mapping.py
index b52f2823..80cdeee0 100644
--- a/pygments/lexers/_mapping.py
+++ b/pygments/lexers/_mapping.py
@@ -252,6 +252,7 @@ LEXERS = {
     'MakoLexer': ('pygments.lexers.templates', 'Mako', ('mako',), ('*.mao',), ('application/x-mako',)),
     'MakoXmlLexer': ('pygments.lexers.templates', 'XML+Mako', ('xml+mako',), (), ('application/xml+mako',)),
     'MaqlLexer': ('pygments.lexers.business', 'MAQL', ('maql',), ('*.maql',), ('text/x-gooddata-maql', 'application/x-gooddata-maql')),
+    'MarkdownLexer': ('pygments.lexers.markup', 'markdown', ('md',), ('*.md',), ('text/x-markdown',)),
     'MaskLexer': ('pygments.lexers.javascript', 'Mask', ('mask',), ('*.mask',), ('text/x-mask',)),
     'MasonLexer': ('pygments.lexers.templates', 'Mason', ('mason',), ('*.m', '*.mhtml', '*.mc', '*.mi', 'autohandler', 'dhandler'), ('application/x-mason',)),
     'MathematicaLexer': ('pygments.lexers.algebra', 'Mathematica', ('mathematica', 'mma', 'nb'), ('*.nb', '*.cdf', '*.nbp', '*.ma'), ('application/mathematica', 'application/vnd.wolfram.mathematica', 'application/vnd.wolfram.mathematica.package', 'application/vnd.wolfram.cdf')),
diff --git a/pygments/lexers/markup.py b/pygments/lexers/markup.py
index aac8d27e..bb4ae6c5 100644
--- a/pygments/lexers/markup.py
+++ b/pygments/lexers/markup.py
@@ -24,7 +24,7 @@ from pygments.util import get_bool_opt, ClassNotFound
 __all__ = ['BBCodeLexer', 'MoinWikiLexer', 'RstLexer', 'TexLexer', 'GroffLexer',
            'MozPreprocHashLexer', 'MozPreprocPercentLexer',
            'MozPreprocXulLexer', 'MozPreprocJavascriptLexer',
-           'MozPreprocCssLexer']
+           'MozPreprocCssLexer', 'MarkdownLexer']
 
 
 class BBCodeLexer(RegexLexer):
@@ -500,3 +500,96 @@ class MozPreprocCssLexer(DelegatingLexer):
         super(MozPreprocCssLexer, self).__init__(
             CssLexer, MozPreprocPercentLexer, **options)
 
+
+class MarkdownLexer(RegexLexer):
+    """
+    For `Markdown <https://help.github.com/categories/writing-on-github/>`_ markup.
+
+    .. versionadded:: 2.2
+    """
+    name = 'markdown'
+    aliases = ['md']
+    filenames = ['*.md']
+    mimetypes = ["text/x-markdown"]
+    flags = re.MULTILINE
+
+    def _handle_codeblock(self, match):
+        """
+        Yield ``(index, tokentype, value)`` for a fenced block with a language.
+
+        match args: 1:backticks, 2:lang_name, 3:newline, 4:code, 5:backticks
+        """
+        from pygments.lexers import get_lexer_by_name
+
+        # section header
+        yield match.start(1), String, match.group(1)
+        yield match.start(2), String, match.group(2)
+        yield match.start(3), Text, match.group(3)
+        # lookup lexer if wanted and existing
+        lexer = None
+        if self.handlecodeblocks:
+            try:
+                lexer = get_lexer_by_name(match.group(2).strip())
+            except ClassNotFound:
+                pass  # unknown language: fall back to a plain String token
+        code, offset = match.group(4), match.start(4)
+        if lexer is None:
+            # no lexer for this language. handle it like it was a text block
+            yield offset, String, code
+        else:
+            # sub-lexer indices are relative to `code`; make them absolute
+            for index, token, value in lexer.get_tokens_unprocessed(code):
+                yield offset + index, token, value
+        # closing fence: emitted on both paths so no input text is dropped
+        yield match.start(5), String, match.group(5)
+
+    tokens = {
+        'root': [
+            # heading with pound prefix
+            (r'^(#)([^#].+\n)', bygroups(Generic.Heading, Text)),
+            (r'^(#{2,6})(.+\n)', bygroups(Generic.Subheading, Text)),
+            # task list
+            (r'^(\s*)([*-] )(\[[ xX]\])( .+\n)',
+            bygroups(Text, Keyword, Keyword, using(this, state='inline'))),
+            # bulleted lists
+            (r'^(\s*)([*-])(\s)(.+\n)',
+            bygroups(Text, Keyword, Text, using(this, state='inline'))),
+            # numbered lists
+            (r'^(\s*)([0-9]+\.)( .+\n)',
+            bygroups(Text, Keyword, using(this, state='inline'))),
+            # quote
+            (r'^(\s*>\s)(.+\n)', bygroups(Keyword, Generic.Emph)),
+            # text block
+            (r'^(```\n)([\w\W]*?)(^```$)', bygroups(String, Text, String)),
+            # code block with language
+            (r'^(```)(\w+)(\n)([\w\W]*?)(^```$)', _handle_codeblock),
+
+            include('inline'),
+        ],
+        'inline': [
+            # escape
+            (r'\\.', Text),
+            # italics
+            (r'(\s)([*_][^*_]+[*_])(\W|\n)', bygroups(Text, Generic.Emph, Text)),
+            # bold
+            # warning: the following rule eats internal tags. eg. **foo _bar_ baz** bar is not italics
+            (r'(\s)((\*\*|__).*\3)((?=\W|\n))', bygroups(Text, Generic.Strong, None, Text)),
+            # "proper way" (r'(\s)([*_]{2}[^*_]+[*_]{2})((?=\W|\n))', bygroups(Text, Generic.Strong, Text)),
+            # strikethrough
+            (r'(\s)(~~[^~]+~~)((?=\W|\n))', bygroups(Text, Generic.Deleted, Text)),
+            # inline code
+            (r'`[^`]+`', String.Backtick),
+            # mentions and topics (twitter and github stuff)
+            (r'[@#][\w/:]+', Name.Entity),
+            # (image?) links eg: ![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)
+            (r'(!?\[)([^]]+)(\])(\()([^)]+)(\))', bygroups(Text, Name.Tag, Text, Text, Name.Attribute, Text)),
+
+            # general text, must come last!
+            (r'[^\\\s]+', Text),
+            (r'.', Text),
+        ],
+    }
+
+    def __init__(self, **options):
+        self.handlecodeblocks = get_bool_opt(options, 'handlecodeblocks', True)
+        RegexLexer.__init__(self, **options)
diff --git a/tests/examplefiles/example.md b/tests/examplefiles/example.md
new file mode 100644
index 00000000..2befb107
--- /dev/null
+++ b/tests/examplefiles/example.md
@@ -0,0 +1,61 @@
+# this is a header
+
+## this is a 2nd level header
+
+* list item 1
+  * list item 1.1
+* list item 2
+- list item 3
+
+1. numbered list item 1
+1. numbered list item 2
+
+- [ ] todo
+- [x] done
+- [X] done
+
+The following is italic: *italic*
+The following is italic: _italic_
+
+The following is not italic: \*italic\*
+The following is not italic: \_italic\_
+
+The following is not italic: snake*case*word
+The following is not italic: snake_case_word
+
+The following is bold: **bold** **two or more words**
+The following is bold: __bold__ __two or more words__
+
+The following is not bold: snake**case**word
+The following is not bold: snake__case__word
+
+The following is strikethrough: ~~bold~~
+The following is not strikethrough: snake~~case~~word
+
+The following is bold with italics inside: **the next _word_ should have been italics**
+
+> this is a quote
+
+> this is a multiline
+> quote string thing
+
+this sentence `has monospace` in it
+
+this sentence @tweets a person about a #topic.
+
+[google](https://google.com/some/path.html)
+![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)
+
+```
+  * this is just unformated
+      __text__
+```
+
+some other text
+
+```python
+from pygments import token
+# comment
+```
+
+some more text
author	Kurt Neufeld <kneufeld@burgundywall.com>	2016-06-02 15:53:59 -0700
committer	Kurt Neufeld <kneufeld@burgundywall.com>	2016-06-02 15:53:59 -0700
commit	549f98f0896731e80efee22ac44d105fe77039e6 (patch)
tree	3393e267af2631b07581649dface8cd0c204458f
parent	6bd42c3ccd9ee14b0268592761a0aa0b39bd2b9e (diff)
download	pygments-549f98f0896731e80efee22ac44d105fe77039e6.tar.gz