Unify all block-level tags. (#1048)

The md_in_html extension now uses the list of block-level tags defined in the core instead of maintaining its own. This ensures that the lists do not diverge and allows users and/or extensions to expand the list in the core and have that change affect the extension. Fixes #1047.
author: Waylan Limberg <waylan.limberg@icloud.com> 2020-10-20 14:06:48 -0400
committer: GitHub <noreply@github.com> 2020-10-20 14:06:48 -0400
commit: 6b6cd8bc2f0a870ed309f8b8036492af535e75a1 (patch)
tree: 4641036a9df302c211f51a07971c4b483b777b8d
parent: 56b03b21f50d2b28b7ab87df7d8015e1f1b62184 (diff)
download: python-markdown-6b6cd8bc2f0a870ed309f8b8036492af535e75a1.tar.gz
5 files changed, 37 insertions, 36 deletions
diff --git a/docs/change_log/index.md b/docs/change_log/index.md
index 47e8f9e..994e9a2 100644
--- a/docs/change_log/index.md
+++ b/docs/change_log/index.md
@@ -3,6 +3,10 @@ title: Change Log
 Python-Markdown Change Log
 =========================
 
+Under development: version 3.3.3 (a bug-fix release).
+
+* Unify all block-level tags (#1047).
+
 Oct 19, 2020: version 3.3.2 (a bug-fix release).
 
 * Properly parse inline HTML in md_in_html (#1040 & #1045).
diff --git a/docs/extensions/md_in_html.md b/docs/extensions/md_in_html.md
index ba4424b..978f5c3 100644
--- a/docs/extensions/md_in_html.md
+++ b/docs/extensions/md_in_html.md
@@ -25,10 +25,10 @@ The `markdown` attribute can be assigned one of three values: [`"1"`](#1), [`"bl
 
 When the `markdown` attribute is set to `"1"`, then the parser will use the default behavior for that specific tag.
 
-The following tags have the `block` behavior by default: `address`, `article`, `aside`, `blockquote`, `body`,
-`colgroup`, `details`, `div`, `dl`, `fieldset`, `figcaption`, `figure`, `footer`, `form`, `iframe`, `header`, `hr`,
-`main`, `menu`, `nav`,  `map`, `noscript`, `object`, `ol`, `section`, `table`, `tbody`, `thead`, `tfoot`, `tr`, and
-`ul`.
+The following tags have the `block` behavior by default: `article`, `aside`, `blockquote`, `body`, `colgroup`,
+`details`, `div`, `dl`, `fieldset`, `figcaption`, `figure`, `footer`, `form`, `group`, `header`, `hgroup`, `hr`,
+`iframe`, `main`, `map`, `menu`, `nav`, `noscript`, `object`, `ol`, `output`, `progress`, `section`, `table`,
+`tbody`, `tfoot`, `thead`, `tr`, `ul`, and `video`.
 
 For example, the following:
 
diff --git a/markdown/core.py b/markdown/core.py
index 79ca3f3..2f7f2d5 100644
--- a/markdown/core.py
+++ b/markdown/core.py
@@ -77,11 +77,12 @@ class Markdown:
             # See https://w3c.github.io/html/grouping-content.html#the-p-element
             'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
             'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3',
-            'h4', 'h5', 'h6', 'header', 'hr', 'main', 'menu', 'nav', 'ol', 'p', 'pre',
-            'section', 'table', 'ul',
+            'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol',
+            'p', 'pre', 'section', 'table', 'ul',
             # Other elements which Markdown should not be mucking up the contents of.
-            'canvas', 'dd', 'dt', 'group', 'iframe', 'li', 'math', 'noscript', 'output',
-            'progress', 'script', 'style', 'tbody', 'td', 'th', 'thead', 'tr', 'video'
+            'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend',
+            'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script',
+            'style', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video'
         ]
 
         self.registeredExtensions = []
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
index f635563..489c3fe 100644
--- a/markdown/extensions/md_in_html.py
+++ b/markdown/extensions/md_in_html.py
@@ -23,27 +23,22 @@ from ..htmlparser import HTMLExtractor
 import xml.etree.ElementTree as etree
 
 
-# Block-level tags in which the content only gets span level parsing
-span_tags = ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
-
-# Block-level tags in which the content gets parsed as blocks
-block_tags = [
-    'address', 'article', 'aside', 'blockquote', 'body', 'colgroup', 'details', 'div', 'dl', 'fieldset',
-    'figcaption', 'figure', 'footer', 'form', 'iframe', 'header', 'hr', 'main', 'menu', 'nav',  'map',
-    'noscript', 'object', 'ol', 'section', 'table', 'tbody', 'thead', 'tfoot', 'tr', 'ul'
-]
-
-# Block-level tags which never get their content parsed.
-raw_tags = ['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea']
-
-block_level_tags = span_tags + block_tags + raw_tags
-
-
 class HTMLExtractorExtra(HTMLExtractor):
     """
     Override HTMLExtractor and create etree Elements for any elements which should have content parsed as Markdown.
     """
 
+    def __init__(self, md, *args, **kwargs):
+        # All block-level tags.
+        self.block_level_tags = md.block_level_elements.copy()
+        # Block-level tags in which the content only gets span level parsing
+        self.span_tags = ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
+        # Block-level tags which never get their content parsed.
+        self.raw_tags = ['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea']
+        # Block-level tags in which the content gets parsed as blocks
+        self.block_tags = [tag for tag in self.block_level_tags if tag not in self.span_tags + self.raw_tags]
+        super().__init__(md, *args, **kwargs)
+
     def reset(self):
         """Reset this instance.  Loses all unprocessed data."""
         self.mdstack = []  # When markdown=1, stack contains a list of tags
@@ -75,13 +70,13 @@ class HTMLExtractorExtra(HTMLExtractor):
         if parent_state == 'off' or (parent_state == 'span' and md_attr != '0'):
             # Only use the parent state if it is more restrictive than the markdown attribute.
             md_attr = parent_state
-        if ((md_attr == '1' and tag in block_tags) or
-                (md_attr == 'block' and tag in span_tags + block_tags)):
+        if ((md_attr == '1' and tag in self.block_tags) or
+                (md_attr == 'block' and tag in self.span_tags + self.block_tags)):
             return 'block'
-        elif ((md_attr == '1' and tag in span_tags) or
-              (md_attr == 'span' and tag in span_tags + block_tags)):
+        elif ((md_attr == '1' and tag in self.span_tags) or
+              (md_attr == 'span' and tag in self.span_tags + self.block_tags)):
             return 'span'
-        elif tag in block_level_tags:
+        elif tag in self.block_level_tags:
             return 'off'
         else:  # pragma: no cover
             return None
@@ -95,7 +90,7 @@ class HTMLExtractorExtra(HTMLExtractor):
         return value
 
     def handle_starttag(self, tag, attrs):
-        if tag in block_level_tags:
+        if tag in self.block_level_tags:
             # Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`.
             # Convert to `{'checked': 'checked'}`.
             attrs = {key: value if value is not None else key for key, value in attrs}
@@ -106,7 +101,7 @@ class HTMLExtractorExtra(HTMLExtractor):
                 attrs.pop('markdown', None)
                 super().handle_starttag(tag, attrs)
             else:
-                if 'p' in self.mdstack and tag in block_level_tags:
+                if 'p' in self.mdstack and tag in self.block_level_tags:
                     # Close unclosed 'p' tag
                     self.handle_endtag('p')
                 self.mdstate.append(state)
@@ -125,7 +120,7 @@ class HTMLExtractorExtra(HTMLExtractor):
                     self.handle_data(text)
 
     def handle_endtag(self, tag):
-        if tag in block_level_tags:
+        if tag in self.block_level_tags:
             if self.inraw:
                 super().handle_endtag(tag)
             elif tag in self.mdstack:
diff --git a/markdown/util.py b/markdown/util.py
index a49486b..2cb2317 100644
--- a/markdown/util.py
+++ b/markdown/util.py
@@ -58,11 +58,12 @@ BLOCK_LEVEL_ELEMENTS = [
     # See https://w3c.github.io/html/grouping-content.html#the-p-element
     'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
     'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3',
-    'h4', 'h5', 'h6', 'header', 'hr', 'main', 'menu', 'nav', 'ol', 'p', 'pre',
-    'section', 'table', 'ul',
+    'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol',
+    'p', 'pre', 'section', 'table', 'ul',
     # Other elements which Markdown should not be mucking up the contents of.
-    'canvas', 'dd', 'dt', 'group', 'iframe', 'li', 'math', 'noscript', 'output',
-    'progress', 'script', 'style', 'tbody', 'td', 'th', 'thead', 'tr', 'video'
+    'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend',
+    'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script',
+    'style', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video'
 ]
 
 # Placeholders
author	Waylan Limberg <waylan.limberg@icloud.com>	2020-10-20 14:06:48 -0400
committer	GitHub <noreply@github.com>	2020-10-20 14:06:48 -0400
commit	6b6cd8bc2f0a870ed309f8b8036492af535e75a1 (patch)
tree	4641036a9df302c211f51a07971c4b483b777b8d
parent	56b03b21f50d2b28b7ab87df7d8015e1f1b62184 (diff)
download	python-markdown-6b6cd8bc2f0a870ed309f8b8036492af535e75a1.tar.gz