diff options
author | Waylan Limberg <waylan.limberg@icloud.com> | 2020-10-20 14:06:48 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-10-20 14:06:48 -0400 |
commit | 6b6cd8bc2f0a870ed309f8b8036492af535e75a1 (patch) | |
tree | 4641036a9df302c211f51a07971c4b483b777b8d | |
parent | 56b03b21f50d2b28b7ab87df7d8015e1f1b62184 (diff) | |
download | python-markdown-6b6cd8bc2f0a870ed309f8b8036492af535e75a1.tar.gz |
Unify all block-level tags. (#1048)
Have the md_in_html extension use the list of tags defined in the core.
This ensures that the lists do not diverge and allows users and/or
extensions to expand the list in the core and have that change affect
the extension. Fixes #1047.
-rw-r--r-- | docs/change_log/index.md | 4 | ||||
-rw-r--r-- | docs/extensions/md_in_html.md | 8 | ||||
-rw-r--r-- | markdown/core.py | 9 | ||||
-rw-r--r-- | markdown/extensions/md_in_html.py | 43 | ||||
-rw-r--r-- | markdown/util.py | 9 |
5 files changed, 37 insertions, 36 deletions
diff --git a/docs/change_log/index.md b/docs/change_log/index.md index 47e8f9e..994e9a2 100644 --- a/docs/change_log/index.md +++ b/docs/change_log/index.md @@ -3,6 +3,10 @@ title: Change Log Python-Markdown Change Log ========================= +Under development: version 3.3.3 (a bug-fix release). + +* Unify all block-level tags (#1047). + Oct 19, 2020: version 3.3.2 (a bug-fix release). * Properly parse inline HTML in md_in_html (#1040 & #1045). diff --git a/docs/extensions/md_in_html.md b/docs/extensions/md_in_html.md index ba4424b..978f5c3 100644 --- a/docs/extensions/md_in_html.md +++ b/docs/extensions/md_in_html.md @@ -25,10 +25,10 @@ The `markdown` attribute can be assigned one of three values: [`"1"`](#1), [`"bl When the `markdown` attribute is set to `"1"`, then the parser will use the default behavior for that specific tag. -The following tags have the `block` behavior by default: `address`, `article`, `aside`, `blockquote`, `body`, -`colgroup`, `details`, `div`, `dl`, `fieldset`, `figcaption`, `figure`, `footer`, `form`, `iframe`, `header`, `hr`, -`main`, `menu`, `nav`, `map`, `noscript`, `object`, `ol`, `section`, `table`, `tbody`, `thead`, `tfoot`, `tr`, and -`ul`. +The following tags have the `block` behavior by default: `article`, `aside`, `blockquote`, `body`, `colgroup`, +`details`, `div`, `dl`, `fieldset`, `figcaption`, `figure`, `footer`, `form`, `group`, `header`, `hgroup`, `hr`, +`iframe`, `main`, `map`, `menu`, `nav`, `noscript`, `object`, `ol`, `output`, `progress`, `section`, `table`, +`tbody`, `tfoot`, `thead`, `tr`, `ul` and `video`. 
For example, the following: diff --git a/markdown/core.py b/markdown/core.py index 79ca3f3..2f7f2d5 100644 --- a/markdown/core.py +++ b/markdown/core.py @@ -77,11 +77,12 @@ class Markdown: # See https://w3c.github.io/html/grouping-content.html#the-p-element 'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', - 'h4', 'h5', 'h6', 'header', 'hr', 'main', 'menu', 'nav', 'ol', 'p', 'pre', - 'section', 'table', 'ul', + 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol', + 'p', 'pre', 'section', 'table', 'ul', # Other elements which Markdown should not be mucking up the contents of. - 'canvas', 'dd', 'dt', 'group', 'iframe', 'li', 'math', 'noscript', 'output', - 'progress', 'script', 'style', 'tbody', 'td', 'th', 'thead', 'tr', 'video' + 'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend', + 'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script', + 'style', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video' ] self.registeredExtensions = [] diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index f635563..489c3fe 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -23,27 +23,22 @@ from ..htmlparser import HTMLExtractor import xml.etree.ElementTree as etree -# Block-level tags in which the content only gets span level parsing -span_tags = ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th'] - -# Block-level tags in which the content gets parsed as blocks -block_tags = [ - 'address', 'article', 'aside', 'blockquote', 'body', 'colgroup', 'details', 'div', 'dl', 'fieldset', - 'figcaption', 'figure', 'footer', 'form', 'iframe', 'header', 'hr', 'main', 'menu', 'nav', 'map', - 'noscript', 'object', 'ol', 'section', 'table', 'tbody', 'thead', 'tfoot', 'tr', 'ul' -] - -# Block-level tags which never get 
their content parsed. -raw_tags = ['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea'] - -block_level_tags = span_tags + block_tags + raw_tags - - class HTMLExtractorExtra(HTMLExtractor): """ Override HTMLExtractor and create etree Elements for any elements which should have content parsed as Markdown. """ + def __init__(self, md, *args, **kwargs): + # All block-level tags. + self.block_level_tags = md.block_level_elements.copy() + # Block-level tags in which the content only gets span level parsing + self.span_tags = ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th'] + # Block-level tags which never get their content parsed. + self.raw_tags = ['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea'] + # Block-level tags in which the content gets parsed as blocks + self.block_tags = [tag for tag in self.block_level_tags if tag not in self.span_tags + self.raw_tags] + super().__init__(md, *args, **kwargs) + def reset(self): """Reset this instance. Loses all unprocessed data.""" self.mdstack = [] # When markdown=1, stack contains a list of tags @@ -75,13 +70,13 @@ class HTMLExtractorExtra(HTMLExtractor): if parent_state == 'off' or (parent_state == 'span' and md_attr != '0'): # Only use the parent state if it is more restrictive than the markdown attribute. 
md_attr = parent_state - if ((md_attr == '1' and tag in block_tags) or - (md_attr == 'block' and tag in span_tags + block_tags)): + if ((md_attr == '1' and tag in self.block_tags) or + (md_attr == 'block' and tag in self.span_tags + self.block_tags)): return 'block' - elif ((md_attr == '1' and tag in span_tags) or - (md_attr == 'span' and tag in span_tags + block_tags)): + elif ((md_attr == '1' and tag in self.span_tags) or + (md_attr == 'span' and tag in self.span_tags + self.block_tags)): return 'span' - elif tag in block_level_tags: + elif tag in self.block_level_tags: return 'off' else: # pragma: no cover return None @@ -95,7 +90,7 @@ class HTMLExtractorExtra(HTMLExtractor): return value def handle_starttag(self, tag, attrs): - if tag in block_level_tags: + if tag in self.block_level_tags: # Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`. # Convert to `{'checked': 'checked'}`. attrs = {key: value if value is not None else key for key, value in attrs} @@ -106,7 +101,7 @@ class HTMLExtractorExtra(HTMLExtractor): attrs.pop('markdown', None) super().handle_starttag(tag, attrs) else: - if 'p' in self.mdstack and tag in block_level_tags: + if 'p' in self.mdstack and tag in self.block_level_tags: # Close unclosed 'p' tag self.handle_endtag('p') self.mdstate.append(state) @@ -125,7 +120,7 @@ class HTMLExtractorExtra(HTMLExtractor): self.handle_data(text) def handle_endtag(self, tag): - if tag in block_level_tags: + if tag in self.block_level_tags: if self.inraw: super().handle_endtag(tag) elif tag in self.mdstack: diff --git a/markdown/util.py b/markdown/util.py index a49486b..2cb2317 100644 --- a/markdown/util.py +++ b/markdown/util.py @@ -58,11 +58,12 @@ BLOCK_LEVEL_ELEMENTS = [ # See https://w3c.github.io/html/grouping-content.html#the-p-element 'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', - 'h4', 'h5', 'h6', 'header', 'hr', 'main', 'menu', 'nav', 
'ol', 'p', 'pre', - 'section', 'table', 'ul', + 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol', + 'p', 'pre', 'section', 'table', 'ul', # Other elements which Markdown should not be mucking up the contents of. - 'canvas', 'dd', 'dt', 'group', 'iframe', 'li', 'math', 'noscript', 'output', - 'progress', 'script', 'style', 'tbody', 'td', 'th', 'thead', 'tr', 'video' + 'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend', + 'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script', + 'style', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video' ] # Placeholders |