summaryrefslogtreecommitdiff
path: root/mako/filters.py
blob: 0ae33ff48649ba2b1d6a88e50989e0d9dc74ebf4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# mako/filters.py
# Copyright 2006-2020 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php


import codecs
import re

from mako import compat
from mako.compat import codepoint2name
from mako.compat import name2codepoint
from mako.compat import quote_plus
from mako.compat import unquote_plus

xml_escapes = {
    "&": "&amp;",
    ">": "&gt;",
    "<": "&lt;",
    '"': "&#34;",  # also &quot; in html-only
    "'": "&#39;",  # also &apos; in html-only
}

# XXX: &quot; is valid in HTML and XML
#      &apos; is not valid HTML, but is valid XML


def legacy_html_escape(s):
    """legacy HTML escape for non-unicode mode."""
    s = s.replace("&", "&amp;")
    s = s.replace(">", "&gt;")
    s = s.replace("<", "&lt;")
    s = s.replace('"', "&#34;")
    s = s.replace("'", "&#39;")
    return s


try:
    import markupsafe

    html_escape = markupsafe.escape
except ImportError:
    html_escape = legacy_html_escape


def xml_escape(string):
    return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string)


def url_escape(string):
    # convert into a list of octets
    string = string.encode("utf8")
    return quote_plus(string)


def legacy_url_escape(string):
    # convert into a list of octets
    return quote_plus(string)


def url_unescape(string):
    text = unquote_plus(string)
    if not is_ascii_str(text):
        text = text.decode("utf8")
    return text


def trim(string):
    return string.strip()


class Decode(object):
    def __getattr__(self, key):
        def decode(x):
            if isinstance(x, compat.text_type):
                return x
            elif not isinstance(x, compat.binary_type):
                return decode(str(x))
            else:
                return compat.text_type(x, encoding=key)

        return decode


decode = Decode()


_ASCII_re = re.compile(r"\A[\x00-\x7f]*\Z")


def is_ascii_str(text):
    return isinstance(text, str) and _ASCII_re.match(text)


################################################################


class XMLEntityEscaper(object):
    def __init__(self, codepoint2name, name2codepoint):
        self.codepoint2entity = dict(
            [
                (c, compat.text_type("&%s;" % n))
                for c, n in codepoint2name.items()
            ]
        )
        self.name2codepoint = name2codepoint

    def escape_entities(self, text):
        """Replace characters with their character entity references.

        Only characters corresponding to a named entity are replaced.
        """
        return compat.text_type(text).translate(self.codepoint2entity)

    def __escape(self, m):
        codepoint = ord(m.group())
        try:
            return self.codepoint2entity[codepoint]
        except (KeyError, IndexError):
            return "&#x%X;" % codepoint

    __escapable = re.compile(r'["&<>]|[^\x00-\x7f]')

    def escape(self, text):
        """Replace characters with their character references.

        Replace characters by their named entity references.
        Non-ASCII characters, if they do not have a named entity reference,
        are replaced by numerical character references.

        The return value is guaranteed to be ASCII.
        """
        return self.__escapable.sub(
            self.__escape, compat.text_type(text)
        ).encode("ascii")

    # XXX: This regexp will not match all valid XML entity names__.
    # (It punts on details involving involving CombiningChars and Extenders.)
    #
    # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
    __characterrefs = re.compile(
        r"""& (?:
                                          \#(\d+)
                                          | \#x([\da-f]+)
                                          | ( (?!\d) [:\w] [-.:\w]+ )
                                          ) ;""",
        re.X | re.UNICODE,
    )

    def __unescape(self, m):
        dval, hval, name = m.groups()
        if dval:
            codepoint = int(dval)
        elif hval:
            codepoint = int(hval, 16)
        else:
            codepoint = self.name2codepoint.get(name, 0xFFFD)
            # U+FFFD = "REPLACEMENT CHARACTER"
        if codepoint < 128:
            return chr(codepoint)
        return chr(codepoint)

    def unescape(self, text):
        """Unescape character references.

        All character references (both entity references and numerical
        character references) are unescaped.
        """
        return self.__characterrefs.sub(self.__unescape, text)


_html_entities_escaper = XMLEntityEscaper(codepoint2name, name2codepoint)

html_entities_escape = _html_entities_escaper.escape_entities
html_entities_unescape = _html_entities_escaper.unescape


def htmlentityreplace_errors(ex):
    """An encoding error handler.

    This python codecs error handler replaces unencodable
    characters with HTML entities, or, if no HTML entity exists for
    the character, XML character references::

        >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace')
        'The cost was &euro;12.'
    """
    if isinstance(ex, UnicodeEncodeError):
        # Handle encoding errors
        bad_text = ex.object[ex.start : ex.end]
        text = _html_entities_escaper.escape(bad_text)
        return (compat.text_type(text), ex.end)
    raise ex


codecs.register_error("htmlentityreplace", htmlentityreplace_errors)


# TODO: options to make this dynamic per-compilation will be added in a later
# release
DEFAULT_ESCAPES = {
    "x": "filters.xml_escape",
    "h": "filters.html_escape",
    "u": "filters.url_escape",
    "trim": "filters.trim",
    "entity": "filters.html_entities_escape",
    "unicode": "unicode",
    "decode": "decode",
    "str": "str",
    "n": "n",
}

if compat.py3k:
    DEFAULT_ESCAPES.update({"unicode": "str"})

NON_UNICODE_ESCAPES = DEFAULT_ESCAPES.copy()
NON_UNICODE_ESCAPES["h"] = "filters.legacy_html_escape"
NON_UNICODE_ESCAPES["u"] = "filters.legacy_url_escape"