diff options
author | Kurt Neufeld <kneufeld@burgundywall.com> | 2016-06-02 15:53:59 -0700 |
---|---|---|
committer | Kurt Neufeld <kneufeld@burgundywall.com> | 2016-06-02 15:53:59 -0700 |
commit | 549f98f0896731e80efee22ac44d105fe77039e6 (patch) | |
tree | 3393e267af2631b07581649dface8cd0c204458f | |
parent | 6bd42c3ccd9ee14b0268592761a0aa0b39bd2b9e (diff) | |
download | pygments-549f98f0896731e80efee22ac44d105fe77039e6.tar.gz |
added github flavoured markdown parser
-rw-r--r-- | AUTHORS | 1 | ||||
-rw-r--r-- | pygments/lexers/_mapping.py | 1 | ||||
-rw-r--r-- | pygments/lexers/markup.py | 95 | ||||
-rw-r--r-- | tests/examplefiles/example.md | 61 |
4 files changed, 157 insertions, 1 deletions
@@ -212,5 +212,6 @@ Other contributors, listed alphabetically, are: * Alex Zimin -- Nemerle lexer * Rob Zimmerman -- Kal lexer * Vincent Zurczak -- Roboconf lexer +* Kurt Neufeld -- Markdown lexer Many thanks for all contributions! diff --git a/pygments/lexers/_mapping.py b/pygments/lexers/_mapping.py index b52f2823..80cdeee0 100644 --- a/pygments/lexers/_mapping.py +++ b/pygments/lexers/_mapping.py @@ -252,6 +252,7 @@ LEXERS = { 'MakoLexer': ('pygments.lexers.templates', 'Mako', ('mako',), ('*.mao',), ('application/x-mako',)), 'MakoXmlLexer': ('pygments.lexers.templates', 'XML+Mako', ('xml+mako',), (), ('application/xml+mako',)), 'MaqlLexer': ('pygments.lexers.business', 'MAQL', ('maql',), ('*.maql',), ('text/x-gooddata-maql', 'application/x-gooddata-maql')), + 'MarkdownLexer': ('pygments.lexers.markup', 'markdown', ('md',), ('*.md',), ('text/x-markdown',)), 'MaskLexer': ('pygments.lexers.javascript', 'Mask', ('mask',), ('*.mask',), ('text/x-mask',)), 'MasonLexer': ('pygments.lexers.templates', 'Mason', ('mason',), ('*.m', '*.mhtml', '*.mc', '*.mi', 'autohandler', 'dhandler'), ('application/x-mason',)), 'MathematicaLexer': ('pygments.lexers.algebra', 'Mathematica', ('mathematica', 'mma', 'nb'), ('*.nb', '*.cdf', '*.nbp', '*.ma'), ('application/mathematica', 'application/vnd.wolfram.mathematica', 'application/vnd.wolfram.mathematica.package', 'application/vnd.wolfram.cdf')), diff --git a/pygments/lexers/markup.py b/pygments/lexers/markup.py index aac8d27e..bb4ae6c5 100644 --- a/pygments/lexers/markup.py +++ b/pygments/lexers/markup.py @@ -24,7 +24,7 @@ from pygments.util import get_bool_opt, ClassNotFound __all__ = ['BBCodeLexer', 'MoinWikiLexer', 'RstLexer', 'TexLexer', 'GroffLexer', 'MozPreprocHashLexer', 'MozPreprocPercentLexer', 'MozPreprocXulLexer', 'MozPreprocJavascriptLexer', - 'MozPreprocCssLexer'] + 'MozPreprocCssLexer', 'MarkdownLexer'] class BBCodeLexer(RegexLexer): @@ -500,3 +500,96 @@ class MozPreprocCssLexer(DelegatingLexer): super(MozPreprocCssLexer, 
class MarkdownLexer(RegexLexer):
    """
    For `Markdown <https://help.github.com/categories/writing-on-github/>`_ markup.

    Block-level constructs (headings, lists, quotes, fenced code blocks) are
    matched in the ``root`` state; span-level constructs (emphasis, inline
    code, mentions, links) live in the ``inline`` state, which ``root`` both
    includes and delegates list-item bodies to.

    Additional options accepted:

    `handlecodeblocks`
        If true, the body of a fenced code block that names a language
        (e.g. three backticks followed by ``python``) is highlighted with
        the lexer registered under that name. If false, or if no such lexer
        exists, the body is emitted as a plain string (default: ``True``).

    .. versionadded:: 2.2
    """
    name = 'markdown'
    aliases = ['md']
    filenames = ['*.md']
    mimetypes = ["text/x-markdown"]
    flags = re.MULTILINE

    def _handle_codeblock(self, match):
        """
        Token callback for a fenced code block that names a language.

        match args: 1:backticks, 2:lang_name, 3:newline, 4:code, 5:backticks

        Yields ``(index, tokentype, value)`` tuples covering the whole match:
        the opening fence, the language name, the newline, the code body and
        the closing fence. All indices are absolute offsets into the text
        being lexed.
        """
        # Imported here rather than at module level: pygments.lexers imports
        # the lexer modules itself, so a top-level import would be circular.
        from pygments.lexers import get_lexer_by_name

        # section header
        yield match.start(1), String, match.group(1)
        yield match.start(2), String, match.group(2)
        yield match.start(3), Text, match.group(3)

        # lookup lexer if wanted and existing
        lexer = None
        if self.handlecodeblocks:
            try:
                lexer = get_lexer_by_name(match.group(2).strip())
            except ClassNotFound:
                # Unknown language name: fall through to the plain-string path.
                pass

        code = match.group(4)
        code_start = match.start(4)

        if lexer is None:
            # no lexer for this language. handle it like it was a code block
            yield code_start, String, code
        else:
            # The sub-lexer reports offsets relative to `code`; shift them so
            # they are relative to the whole document like every other token.
            for index, tokentype, value in lexer.get_tokens_unprocessed(code):
                yield code_start + index, tokentype, value

        # The closing fence is emitted on both paths. Returning before this
        # point would silently drop the fence from the token stream, because
        # the lexer resumes after the end of the match.
        yield match.start(5), String, match.group(5)

    tokens = {
        'root': [
            # heading with pound prefix
            (r'^(#)([^#].+\n)', bygroups(Generic.Heading, Text)),
            (r'^(#{2,6})(.+\n)', bygroups(Generic.Subheading, Text)),
            # task list
            (r'^(\s*)([*-] )(\[[ xX]\])( .+\n)',
             bygroups(Text, Keyword, Keyword, using(this, state='inline'))),
            # bulleted lists
            (r'^(\s*)([*-])(\s)(.+\n)',
             bygroups(Text, Keyword, Text, using(this, state='inline'))),
            # numbered lists
            (r'^(\s*)([0-9]+\.)( .+\n)',
             bygroups(Text, Keyword, using(this, state='inline'))),
            # quote
            (r'^(\s*>\s)(.+\n)', bygroups(Keyword, Generic.Emph)),
            # text block
            (r'^(```\n)([\w\W]*?)(^```$)', bygroups(String, Text, String)),
            # code block with language
            (r'^(```)(\w+)(\n)([\w\W]*?)(^```$)', _handle_codeblock),

            include('inline'),
        ],
        'inline': [
            # escape
            (r'\\.', Text),
            # italics
            (r'(\s)([*_][^*_]+[*_])(\W|\n)', bygroups(Text, Generic.Emph, Text)),
            # bold
            # warning: the following rule eats internal tags. eg. **foo _bar_ baz** bar is not italics
            # The body is matched lazily so that two bold spans on one line
            # ("**a** plain **b**") are not merged into a single span that
            # would also mark the text between them as bold.
            (r'(\s)((\*\*|__).*?\3)((?=\W|\n))', bygroups(Text, Generic.Strong, None, Text)),
            # "proper way" (r'(\s)([*_]{2}[^*_]+[*_]{2})((?=\W|\n))', bygroups(Text, Generic.Strong, Text)),
            # strikethrough
            (r'(\s)(~~[^~]+~~)((?=\W|\n))', bygroups(Text, Generic.Deleted, Text)),
            # inline code
            (r'`[^`]+`', String.Backtick),
            # mentions and topics (twitter and github stuff)
            (r'[@#][\w/:]+', Name.Entity),
            # (image?) links eg: ![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)
            (r'(!?\[)([^]]+)(\])(\()([^)]+)(\))', bygroups(Text, Name.Tag, Text, Text, Name.Attribute, Text)),

            # general text, must come last!
            (r'[^\\\s]+', Text),
            (r'.', Text),
        ],
    }

    def __init__(self, **options):
        """
        Create the lexer.

        Reads the boolean ``handlecodeblocks`` option (default ``True``) and
        passes all options on to the base lexer.
        """
        self.handlecodeblocks = get_bool_opt(options, 'handlecodeblocks', True)
        super(MarkdownLexer, self).__init__(**options)
numbered list item 2 + +- [ ] todo +- [x] done +- [X] done + +The following is italic: *italic* +The following is italic: _italic_ + +The following is not italic: \*italic\* +The following is not italic: \_italic\_ + +The following is not italic: snake*case*word +The following is not italic: snake_case_word + +The following is bold: **bold** **two or more words** +The following is bold: __bold__ __two or more words__ + +The following is not bold: snake**case**word +The following is not bold: snake__case__word + +The following is strikethrough: ~~bold~~ +The following is not strikethrough: snake~~case~~word + +The following is bold with italics inside: **the next _word_ should have been italics** + +> this is a quote + +> this is a multiline +> quote string thing + +this sentence `has monospace` in it + +this sentence @tweets a person about a #topic. + +[google](https://google.com/some/path.html) +![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png) + +``` + * this is just unformated + __text__ +``` + +some other text + +```python +from pygments import token +# comment +``` + +some more text |