summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKurt Neufeld <kneufeld@burgundywall.com>2016-06-02 15:53:59 -0700
committerKurt Neufeld <kneufeld@burgundywall.com>2016-06-02 15:53:59 -0700
commit549f98f0896731e80efee22ac44d105fe77039e6 (patch)
tree3393e267af2631b07581649dface8cd0c204458f
parent6bd42c3ccd9ee14b0268592761a0aa0b39bd2b9e (diff)
downloadpygments-549f98f0896731e80efee22ac44d105fe77039e6.tar.gz
added github flavoured markdown parser
-rw-r--r--AUTHORS1
-rw-r--r--pygments/lexers/_mapping.py1
-rw-r--r--pygments/lexers/markup.py95
-rw-r--r--tests/examplefiles/example.md61
4 files changed, 157 insertions, 1 deletions
diff --git a/AUTHORS b/AUTHORS
index 3b3f0502..e26fa6e9 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -212,5 +212,6 @@ Other contributors, listed alphabetically, are:
* Alex Zimin -- Nemerle lexer
* Rob Zimmerman -- Kal lexer
* Vincent Zurczak -- Roboconf lexer
+* Kurt Neufeld -- Markdown lexer
Many thanks for all contributions!
diff --git a/pygments/lexers/_mapping.py b/pygments/lexers/_mapping.py
index b52f2823..80cdeee0 100644
--- a/pygments/lexers/_mapping.py
+++ b/pygments/lexers/_mapping.py
@@ -252,6 +252,7 @@ LEXERS = {
'MakoLexer': ('pygments.lexers.templates', 'Mako', ('mako',), ('*.mao',), ('application/x-mako',)),
'MakoXmlLexer': ('pygments.lexers.templates', 'XML+Mako', ('xml+mako',), (), ('application/xml+mako',)),
'MaqlLexer': ('pygments.lexers.business', 'MAQL', ('maql',), ('*.maql',), ('text/x-gooddata-maql', 'application/x-gooddata-maql')),
+ 'MarkdownLexer': ('pygments.lexers.markup', 'markdown', ('md',), ('*.md',), ('text/x-markdown',)),
'MaskLexer': ('pygments.lexers.javascript', 'Mask', ('mask',), ('*.mask',), ('text/x-mask',)),
'MasonLexer': ('pygments.lexers.templates', 'Mason', ('mason',), ('*.m', '*.mhtml', '*.mc', '*.mi', 'autohandler', 'dhandler'), ('application/x-mason',)),
'MathematicaLexer': ('pygments.lexers.algebra', 'Mathematica', ('mathematica', 'mma', 'nb'), ('*.nb', '*.cdf', '*.nbp', '*.ma'), ('application/mathematica', 'application/vnd.wolfram.mathematica', 'application/vnd.wolfram.mathematica.package', 'application/vnd.wolfram.cdf')),
diff --git a/pygments/lexers/markup.py b/pygments/lexers/markup.py
index aac8d27e..bb4ae6c5 100644
--- a/pygments/lexers/markup.py
+++ b/pygments/lexers/markup.py
@@ -24,7 +24,7 @@ from pygments.util import get_bool_opt, ClassNotFound
__all__ = ['BBCodeLexer', 'MoinWikiLexer', 'RstLexer', 'TexLexer', 'GroffLexer',
'MozPreprocHashLexer', 'MozPreprocPercentLexer',
'MozPreprocXulLexer', 'MozPreprocJavascriptLexer',
- 'MozPreprocCssLexer']
+ 'MozPreprocCssLexer', 'MarkdownLexer']
class BBCodeLexer(RegexLexer):
@@ -500,3 +500,96 @@ class MozPreprocCssLexer(DelegatingLexer):
super(MozPreprocCssLexer, self).__init__(
CssLexer, MozPreprocPercentLexer, **options)
+
+class MarkdownLexer(RegexLexer):
+    """
+    For `Markdown <https://help.github.com/categories/writing-on-github/>`_ markup.
+
+    Additional options accepted:
+
+    `handlecodeblocks`
+        Highlight the contents of fenced code blocks that name a language
+        with the lexer registered under that name (default: ``True``).
+
+    .. versionadded:: 2.2
+    """
+    name = 'markdown'
+    aliases = ['md']
+    filenames = ['*.md']
+    mimetypes = ["text/x-markdown"]
+    flags = re.MULTILINE
+
+    def _handle_codeblock(self, match):
+        """
+        Callback for a fenced code block that names a language.
+
+        match groups: 1:backticks, 2:lang_name, 3:newline, 4:code, 5:backticks
+
+        Yields ``(index, tokentype, value)`` tuples for every group of the
+        match, in order. The code body is delegated to the lexer registered
+        for ``lang_name`` when `handlecodeblocks` is enabled and such a lexer
+        exists; otherwise it is emitted as a single ``String`` token.
+        """
+        from pygments.lexers import get_lexer_by_name
+
+        # opening fence, language name and the newline ending the fence line
+        yield match.start(1), String, match.group(1)
+        yield match.start(2), String, match.group(2)
+        yield match.start(3), Text, match.group(3)
+
+        # lookup lexer if wanted and existing
+        lexer = None
+        if self.handlecodeblocks:
+            try:
+                lexer = get_lexer_by_name(match.group(2).strip())
+            except ClassNotFound:
+                # unknown language name: fall back to plain string output
+                pass
+        code = match.group(4)
+        code_start = match.start(4)
+
+        if lexer is None:
+            # no lexer for this language. handle it like it was a code block
+            yield code_start, String, code
+        else:
+            # The delegated lexer counts positions from the start of ``code``;
+            # shift them so they refer to positions in the whole input.
+            for index, token, value in lexer.get_tokens_unprocessed(code):
+                yield code_start + index, token, value
+
+        # closing fence: emitted on both paths so no matched text is dropped
+        yield match.start(5), String, match.group(5)
+
+    tokens = {
+        'root': [
+            # heading with pound prefix
+            (r'^(#)([^#].+\n)', bygroups(Generic.Heading, Text)),
+            (r'^(#{2,6})(.+\n)', bygroups(Generic.Subheading, Text)),
+            # task list
+            (r'^(\s*)([*-] )(\[[ xX]\])( .+\n)',
+             bygroups(Text, Keyword, Keyword, using(this, state='inline'))),
+            # bulleted lists
+            (r'^(\s*)([*-])(\s)(.+\n)',
+             bygroups(Text, Keyword, Text, using(this, state='inline'))),
+            # numbered lists
+            (r'^(\s*)([0-9]+\.)( .+\n)',
+             bygroups(Text, Keyword, using(this, state='inline'))),
+            # quote
+            (r'^(\s*>\s)(.+\n)', bygroups(Keyword, Generic.Emph)),
+            # text block (fence without a language name)
+            (r'^(```\n)([\w\W]*?)(^```$)', bygroups(String, Text, String)),
+            # code block with language
+            (r'^(```)(\w+)(\n)([\w\W]*?)(^```$)', _handle_codeblock),
+
+            include('inline'),
+        ],
+        'inline': [
+            # escape
+            (r'\\.', Text),
+            # italics
+            (r'(\s)([*_][^*_]+[*_])(\W|\n)', bygroups(Text, Generic.Emph, Text)),
+            # bold
+            # warning: the following rule eats internal tags. eg. **foo _bar_ baz** bar is not italics
+            (r'(\s)((\*\*|__).*\3)((?=\W|\n))', bygroups(Text, Generic.Strong, None, Text)),
+            # "proper way" (r'(\s)([*_]{2}[^*_]+[*_]{2})((?=\W|\n))', bygroups(Text, Generic.Strong, Text)),
+            # strikethrough
+            (r'(\s)(~~[^~]+~~)((?=\W|\n))', bygroups(Text, Generic.Deleted, Text)),
+            # inline code
+            (r'`[^`]+`', String.Backtick),
+            # mentions and topics (twitter and github stuff)
+            (r'[@#][\w/:]+', Name.Entity),
+            # (image?) links eg: ![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)
+            (r'(!?\[)([^]]+)(\])(\()([^)]+)(\))', bygroups(Text, Name.Tag, Text, Text, Name.Attribute, Text)),
+
+            # general text, must come last!
+            (r'[^\\\s]+', Text),
+            (r'.', Text),
+        ],
+    }
+
+    def __init__(self, **options):
+        # Read the lexer-specific option before handing the rest to the base class.
+        self.handlecodeblocks = get_bool_opt(options, 'handlecodeblocks', True)
+        RegexLexer.__init__(self, **options)
diff --git a/tests/examplefiles/example.md b/tests/examplefiles/example.md
new file mode 100644
index 00000000..2befb107
--- /dev/null
+++ b/tests/examplefiles/example.md
@@ -0,0 +1,61 @@
+# this is a header
+
+## this is a 2nd level header
+
+* list item 1
+ * list item 1.1
+* list item 2
+- list item 3
+
+1. numbered list item 1
+1. numbered list item 2
+
+- [ ] todo
+- [x] done
+- [X] done
+
+The following is italic: *italic*
+The following is italic: _italic_
+
+The following is not italic: \*italic\*
+The following is not italic: \_italic\_
+
+The following is not italic: snake*case*word
+The following is not italic: snake_case_word
+
+The following is bold: **bold** **two or more words**
+The following is bold: __bold__ __two or more words__
+
+The following is not bold: snake**case**word
+The following is not bold: snake__case__word
+
+The following is strikethrough: ~~bold~~
+The following is not strikethrough: snake~~case~~word
+
+The following is bold with italics inside: **the next _word_ should have been italics**
+
+> this is a quote
+
+> this is a multiline
+> quote string thing
+
+this sentence `has monospace` in it
+
+this sentence @tweets a person about a #topic.
+
+[google](https://google.com/some/path.html)
+![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)
+
+```
+ * this is just unformatted
+ __text__
+```
+
+some other text
+
+```python
+from pygments import token
+# comment
+```
+
+some more text