markdown/htmlparser.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86

# -*- coding: utf-8 -*-
"""
Python Markdown

A Python implementation of John Gruber's Markdown.

Documentation: https://python-markdown.github.io/
GitHub: https://github.com/Python-Markdown/markdown/
PyPI: https://pypi.org/project/Markdown/

Started by Manfred Stienstra (http://www.dwerg.net/).
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
Currently maintained by Waylan Limberg (https://github.com/waylan),
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

Copyright 2007-2019 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)

License: BSD (see LICENSE.md for details).
"""

try:
    from HTMLParser import HTMLParser
    PY2 = True
except ImportError:
    from html.parser import HTMLParser
    PY2 = False


class HTMLExtractor(HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the `htmlStash` of the Markdown instance passed
    to `md` and the remaining text is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md):
        if PY2:
            # In PY2 HTMLParser is an old style class :(
            HTMLParser.__init__(self)
        else:
            super(HTMLExtractor, self).__init__()
        self.md = md
        self.inraw = False
        self.stack = []  # When inraw==True, stack contains a list of tags
        self._cache = []
        self.cleandoc = []

    def handle_starttag(self, tag, attrs):
        self.stack.append(tag)

        line, col = self.getpos()
        if col < 4 and self.md.is_block_level(tag) and not self.inraw:
            # Started a new raw block
            self.inraw = True

        text = self.get_starttag_text()
        if self.inraw:
            self._cache.append(text)
        else:
            self.cleandoc.append(text)

    def handle_endtag(self, tag):
        text = '<{0}/>'.format(tag)
        if tag in self.stack:
            while self.stack:
                if self.stack.pop() == tag:
                    break
        if self.inraw and len(self.stack) == 0:
            # End of raw block
            self.inraw = False
            self._cache.append(text)
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []
        elif self.inraw:
            self._cache.append(text)
        else:
            self.cleandoc.append(text)

    def handle_data(self, data):
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)