summaryrefslogtreecommitdiff
path: root/mako/filters.py
blob: b255aaf8d59a900f7099058c9731b057946a8ad9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# mako/filters.py
# Copyright 2006-2023 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php


import codecs
from html.entities import codepoint2name
from html.entities import name2codepoint
import re
from urllib.parse import quote_plus

import markupsafe

html_escape = markupsafe.escape

xml_escapes = {
    "&": "&amp;",
    ">": "&gt;",
    "<": "&lt;",
    '"': "&#34;",  # also &quot; in html-only
    "'": "&#39;",  # also &apos; in html-only
}


def xml_escape(string):
    return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string)


def url_escape(string):
    # convert into a list of octets
    string = string.encode("utf8")
    return quote_plus(string)


def trim(string):
    return string.strip()


class Decode:
    def __getattr__(self, key):
        def decode(x):
            if isinstance(x, str):
                return x
            elif not isinstance(x, bytes):
                return decode(str(x))
            else:
                return str(x, encoding=key)

        return decode


decode = Decode()


class XMLEntityEscaper:
    def __init__(self, codepoint2name, name2codepoint):
        self.codepoint2entity = {
            c: str("&%s;" % n) for c, n in codepoint2name.items()
        }
        self.name2codepoint = name2codepoint

    def escape_entities(self, text):
        """Replace characters with their character entity references.

        Only characters corresponding to a named entity are replaced.
        """
        return str(text).translate(self.codepoint2entity)

    def __escape(self, m):
        codepoint = ord(m.group())
        try:
            return self.codepoint2entity[codepoint]
        except (KeyError, IndexError):
            return "&#x%X;" % codepoint

    __escapable = re.compile(r'["&<>]|[^\x00-\x7f]')

    def escape(self, text):
        """Replace characters with their character references.

        Replace characters by their named entity references.
        Non-ASCII characters, if they do not have a named entity reference,
        are replaced by numerical character references.

        The return value is guaranteed to be ASCII.
        """
        return self.__escapable.sub(self.__escape, str(text)).encode("ascii")

    # XXX: This regexp will not match all valid XML entity names__.
    # (It punts on details involving involving CombiningChars and Extenders.)
    #
    # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
    __characterrefs = re.compile(
        r"""& (?:
                                          \#(\d+)
                                          | \#x([\da-f]+)
                                          | ( (?!\d) [:\w] [-.:\w]+ )
                                          ) ;""",
        re.X | re.UNICODE,
    )

    def __unescape(self, m):
        dval, hval, name = m.groups()
        if dval:
            codepoint = int(dval)
        elif hval:
            codepoint = int(hval, 16)
        else:
            codepoint = self.name2codepoint.get(name, 0xFFFD)
            # U+FFFD = "REPLACEMENT CHARACTER"
        if codepoint < 128:
            return chr(codepoint)
        return chr(codepoint)

    def unescape(self, text):
        """Unescape character references.

        All character references (both entity references and numerical
        character references) are unescaped.
        """
        return self.__characterrefs.sub(self.__unescape, text)


_html_entities_escaper = XMLEntityEscaper(codepoint2name, name2codepoint)

html_entities_escape = _html_entities_escaper.escape_entities
html_entities_unescape = _html_entities_escaper.unescape


def htmlentityreplace_errors(ex):
    """An encoding error handler.

    This python codecs error handler replaces unencodable
    characters with HTML entities, or, if no HTML entity exists for
    the character, XML character references::

        >>> 'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace')
        'The cost was &euro;12.'
    """
    if isinstance(ex, UnicodeEncodeError):
        # Handle encoding errors
        bad_text = ex.object[ex.start : ex.end]
        text = _html_entities_escaper.escape(bad_text)
        return (str(text), ex.end)
    raise ex


codecs.register_error("htmlentityreplace", htmlentityreplace_errors)


DEFAULT_ESCAPES = {
    "x": "filters.xml_escape",
    "h": "filters.html_escape",
    "u": "filters.url_escape",
    "trim": "filters.trim",
    "entity": "filters.html_entities_escape",
    "unicode": "str",
    "decode": "decode",
    "str": "str",
    "n": "n",
}