Support PEP 393 new Unicode APIs

author: INADA Naoki <songofacandy@gmail.com> 2017-02-12 23:56:37 +0900
committer: David Lord <davidism@gmail.com> 2018-05-04 08:07:32 -0700
commit: a03ecd5efc3d5a2d12a3bc72d7ce9e776cccfc66 (patch)
tree: 4564aa15cb084c7d36baab4ad0a452ae8af239b1
parent: ce443dd4ab6453607166e75211396cae0f811309 (diff)
download: markupsafe-a03ecd5efc3d5a2d12a3bc72d7ce9e776cccfc66.tar.gz
2 files changed, 446 insertions, 2 deletions
diff --git a/markupsafe/_speedups.c b/markupsafe/_speedups.c
index fb4a03e..8362490 100644
--- a/markupsafe/_speedups.c
+++ b/markupsafe/_speedups.c
@@ -8,20 +8,24 @@
  * :copyright: © 2010 by the Pallets team.
  * :license: BSD, see LICENSE for more details.
  */
-
 #include <Python.h>
 
+#if PY_MAJOR_VERSION < 3
 #define ESCAPED_CHARS_TABLE_SIZE 63
 #define UNICHR(x) (PyUnicode_AS_UNICODE((PyUnicodeObject*)PyUnicode_DecodeASCII(x, strlen(x), NULL)));
 
-static PyObject* markup;
 static Py_ssize_t escaped_chars_delta_len[ESCAPED_CHARS_TABLE_SIZE];
 static Py_UNICODE *escaped_chars_repl[ESCAPED_CHARS_TABLE_SIZE];
+#endif
+
+static PyObject* markup;
 
 static int
 init_constants(void)
 {
 	PyObject *module;
+
+#if PY_MAJOR_VERSION < 3
 	/* mapping of characters to replace */
 	escaped_chars_repl['"'] = UNICHR("&#34;");
 	escaped_chars_repl['\''] = UNICHR("&#39;");
@@ -34,6 +38,7 @@ init_constants(void)
 	escaped_chars_delta_len['"'] = escaped_chars_delta_len['\''] = \
 		escaped_chars_delta_len['&'] = 4;
 	escaped_chars_delta_len['<'] = escaped_chars_delta_len['>'] = 3;
+#endif
 
 	/* import markup type so that we can mark the return value */
 	module = PyImport_ImportModule("markupsafe");
@@ -45,6 +50,7 @@ init_constants(void)
 	return 1;
 }
 
+#if PY_MAJOR_VERSION < 3
 static PyObject*
 escape_unicode(PyUnicodeObject *in)
 {
@@ -105,7 +111,174 @@ escape_unicode(PyUnicodeObject *in)
 
 	return (PyObject*)out;
 }
+#else /* PY_MAJOR_VERSION >= 3 */
+/* GET_DELTA: scan [inp, inp_end) and add to delta the extra characters escaping needs (4 per double quote, single quote or ampersand; 3 per '<' or '>'). Leaves inp == inp_end. */
+#define GET_DELTA(inp, inp_end, delta) \
+	while (inp < inp_end) {	 \
+		switch (*inp++) {	   \
+		case '"':			   \
+		case '\'':			  \
+		case '&':			   \
+			delta += 4;		 \
+			break;			  \
+		case '<':			   \
+		case '>':			   \
+			delta += 3;		 \
+			break;			  \
+		}					   \
+	}
+/* DO_ESCAPE: copy [inp, inp_end) to outp, writing &#34; &#39; &amp; &lt; &gt; for the five special characters. Runs of other characters are counted in ncopy and flushed with one memcpy before each replacement and once at the end. outp is not bounds-checked, and the memcpy size is taken from *outp, so inp and outp must share an element type. */
+#define DO_ESCAPE(inp, inp_end, outp) \
+	{  \
+		Py_ssize_t ncopy = 0;  \
+		while (inp < inp_end) {  \
+			switch (*inp) {  \
+			case '"':  \
+				memcpy(outp, inp-ncopy, sizeof(*outp)*ncopy); \
+				outp += ncopy; ncopy = 0; \
+				*outp++ = '&';  \
+				*outp++ = '#';  \
+				*outp++ = '3';  \
+				*outp++ = '4';  \
+				*outp++ = ';';  \
+				break;  \
+			case '\'':  \
+				memcpy(outp, inp-ncopy, sizeof(*outp)*ncopy); \
+				outp += ncopy; ncopy = 0; \
+				*outp++ = '&';  \
+				*outp++ = '#';  \
+				*outp++ = '3';  \
+				*outp++ = '9';  \
+				*outp++ = ';';  \
+				break;  \
+			case '&':  \
+				memcpy(outp, inp-ncopy, sizeof(*outp)*ncopy); \
+				outp += ncopy; ncopy = 0; \
+				*outp++ = '&';  \
+				*outp++ = 'a';  \
+				*outp++ = 'm';  \
+				*outp++ = 'p';  \
+				*outp++ = ';';  \
+				break;  \
+			case '<':  \
+				memcpy(outp, inp-ncopy, sizeof(*outp)*ncopy); \
+				outp += ncopy; ncopy = 0; \
+				*outp++ = '&';  \
+				*outp++ = 'l';  \
+				*outp++ = 't';  \
+				*outp++ = ';';  \
+				break;  \
+			case '>':  \
+				memcpy(outp, inp-ncopy, sizeof(*outp)*ncopy); \
+				outp += ncopy; ncopy = 0; \
+				*outp++ = '&';  \
+				*outp++ = 'g';  \
+				*outp++ = 't';  \
+				*outp++ = ';';  \
+				break;  \
+			default:  \
+				ncopy++; \
+			}  \
+            inp++; \
+		}  \
+		memcpy(outp, inp-ncopy, sizeof(*outp)*ncopy); \
+	}
+/* Escape a string stored as Py_UCS1. Returns a new reference: the input itself when nothing needs escaping, otherwise a new string; NULL if PyUnicode_New fails. */
+static PyObject*
+escape_unicode_kind1(PyUnicodeObject *in)
+{
+	Py_UCS1 *inp = PyUnicode_1BYTE_DATA(in);
+	Py_UCS1 *inp_end = inp + PyUnicode_GET_LENGTH(in);
+	Py_UCS1 *outp;
+	PyObject *out;
+	Py_ssize_t delta = 0;
+	/* first pass: count how many extra characters escaping adds */
+	GET_DELTA(inp, inp_end, delta);
+	if (!delta) {
+		Py_INCREF(in);
+		return (PyObject*)in;
+	}
+	/* the replacement text is ASCII-only, so an ASCII input keeps maxchar 127 */
+	out = PyUnicode_New(PyUnicode_GET_LENGTH(in) + delta,
+						PyUnicode_IS_ASCII(in) ? 127 : 255);
+	if (!out)
+		return NULL;
+	/* second pass: rewind inp (GET_DELTA advanced it to inp_end) and write the result */
+	inp = PyUnicode_1BYTE_DATA(in);
+	outp = PyUnicode_1BYTE_DATA(out);
+	DO_ESCAPE(inp, inp_end, outp);
+	return out;
+}
+/* Escape a string stored as Py_UCS2. Returns a new reference: the input itself when nothing needs escaping, otherwise a new string; NULL if PyUnicode_New fails. */
+static PyObject*
+escape_unicode_kind2(PyUnicodeObject *in)
+{
+	Py_UCS2 *inp = PyUnicode_2BYTE_DATA(in);
+	Py_UCS2 *inp_end = inp + PyUnicode_GET_LENGTH(in);
+	Py_UCS2 *outp;
+	PyObject *out;
+	Py_ssize_t delta = 0;
+	/* first pass: count how many extra characters escaping adds */
+	GET_DELTA(inp, inp_end, delta);
+	if (!delta) {
+		Py_INCREF(in);
+		return (PyObject*)in;
+	}
+	/* maxchar 65535 requests 2-byte storage, matching the 2BYTE_DATA access below */
+	out = PyUnicode_New(PyUnicode_GET_LENGTH(in) + delta, 65535);
+	if (!out)
+		return NULL;
+	/* second pass: rewind inp (GET_DELTA advanced it to inp_end) and write the result */
+	inp = PyUnicode_2BYTE_DATA(in);
+	outp = PyUnicode_2BYTE_DATA(out);
+	DO_ESCAPE(inp, inp_end, outp);
+	return out;
+}
+
 
+static PyObject*
+escape_unicode_kind4(PyUnicodeObject *in)
+{
+	Py_UCS4 *inp = PyUnicode_4BYTE_DATA(in);
+	Py_UCS4 *inp_end = inp + PyUnicode_GET_LENGTH(in);
+	Py_UCS4 *outp;
+	PyObject *out;
+	Py_ssize_t delta = 0;
+	/* Py_UCS4 variant of the escaper; first pass: count how many extra characters escaping adds */
+	GET_DELTA(inp, inp_end, delta);
+	if (!delta) {
+		Py_INCREF(in);
+		return (PyObject*)in;
+	}
+	/* maxchar 1114111 requests 4-byte storage, matching the 4BYTE_DATA access below */
+	out = PyUnicode_New(PyUnicode_GET_LENGTH(in) + delta, 1114111);
+	if (!out)
+		return NULL;
+	/* second pass: rewind inp (GET_DELTA advanced it to inp_end) and write the result */
+	inp = PyUnicode_4BYTE_DATA(in);
+	outp = PyUnicode_4BYTE_DATA(out);
+	DO_ESCAPE(inp, inp_end, outp);
+	return out;
+}
+/* Python 3 entry point: dispatch on the string's storage kind to the matching escape_unicode_kindN. Returns NULL when PyUnicode_READY reports failure. */
+static PyObject*
+escape_unicode(PyUnicodeObject *in)
+{
+	if (PyUnicode_READY(in))
+		return NULL;
+	/* the kind and data macros are only valid on a ready string, hence the check above */
+	switch (PyUnicode_KIND(in)) {
+	case PyUnicode_1BYTE_KIND:
+		return escape_unicode_kind1(in);
+	case PyUnicode_2BYTE_KIND:
+		return escape_unicode_kind2(in);
+	case PyUnicode_4BYTE_KIND:
+		return escape_unicode_kind4(in);
+	}
+	assert(0);  /* shouldn't happen */
+	return NULL;
+}
+#endif /* PY_MAJOR_VERSION < 3 */
 
 static PyObject*
 escape(PyObject *self, PyObject *text)
diff --git a/tests.py b/tests.py
new file mode 100755
index 0000000..da4b486
--- /dev/null
+++ b/tests.py
@@ -0,0 +1,271 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import gc
+import sys
+import unittest
+from markupsafe import Markup, escape, escape_silent
+from markupsafe._compat import text_type, PY2
+from markupsafe import _native
+try:
+    from markupsafe import _speedups
+    have_speedups = True
+except ImportError:
+    have_speedups = False
+
+
+class MarkupTestCase(unittest.TestCase):
+    """Tests for Markup: operators, %-interpolation, str.format, escape/unescape, splitting."""
+    def test_adding(self):
+        # adding two strings should escape the unsafe one
+        unsafe = '<script type="application/x-some-script">alert("foo");</script>'
+        safe = Markup('<em>username</em>')
+        assert unsafe + safe == text_type(escape(unsafe)) + text_type(safe)
+
+    def test_string_interpolation(self):
+        # string interpolations are safe to use too
+        assert Markup('<em>%s</em>') % '<bad user>' == \
+               '<em>&lt;bad user&gt;</em>'
+        assert Markup('<em>%(username)s</em>') % {
+            'username': '<bad user>'
+        } == '<em>&lt;bad user&gt;</em>'
+
+        assert Markup('%i') % 3.14 == '3'
+        assert Markup('%.2f') % 3.14 == '3.14'
+
+    def test_type_behavior(self):
+        # an escaped object is markup too
+        assert type(Markup('foo') + 'bar') is Markup
+
+        # and it implements __html__ by returning itself
+        x = Markup("foo")
+        assert x.__html__() is x
+
+    def test_html_interop(self):
+        # it also knows how to treat __html__ objects
+        class Foo(object):
+            def __html__(self):
+                return '<em>awesome</em>'
+            def __unicode__(self):
+                return 'awesome'
+            __str__ = __unicode__
+        assert Markup(Foo()) == '<em>awesome</em>'
+        assert Markup('<strong>%s</strong>') % Foo() == \
+            '<strong><em>awesome</em></strong>'
+
+    def test_tuple_interpol(self):
+        self.assertEqual(Markup('<em>%s:%s</em>') % (
+            '<foo>',
+            '<bar>',
+        ), Markup(u'<em>&lt;foo&gt;:&lt;bar&gt;</em>'))
+
+    def test_dict_interpol(self):
+        self.assertEqual(Markup('<em>%(foo)s</em>') % {
+            'foo': '<foo>',
+        }, Markup(u'<em>&lt;foo&gt;</em>'))
+        self.assertEqual(Markup('<em>%(foo)s:%(bar)s</em>') % {
+            'foo': '<foo>',
+            'bar': '<bar>',
+        }, Markup(u'<em>&lt;foo&gt;:&lt;bar&gt;</em>'))
+
+    def test_escaping(self):
+        # escaping
+        assert escape('"<>&\'') == '&#34;&lt;&gt;&amp;&#39;'
+        assert Markup("<em>Foo &amp; Bar</em>").striptags() == "Foo & Bar"
+
+    def test_unescape(self):
+        assert Markup("&lt;test&gt;").unescape() == "<test>"
+        assert "jack & tavi are cooler than mike & russ" == \
+            Markup("jack & tavi are cooler than mike &amp; russ").unescape(), \
+            Markup("jack & tavi are cooler than mike &amp; russ").unescape()
+
+        # Test that unescape is idempotent
+        original = '&foo&#x3b;'
+        once = Markup(original).unescape()
+        twice = Markup(once).unescape()
+        expected = "&foo;"
+        assert expected == once == twice, (once, twice)
+
+    def test_formatting(self):
+        for actual, expected in (
+            (Markup('%i') % 3.14, '3'),
+            (Markup('%.2f') % 3.14159, '3.14'),
+            (Markup('%s %s %s') % ('<', 123, '>'), '&lt; 123 &gt;'),
+            (Markup('<em>{awesome}</em>').format(awesome='<awesome>'),
+             '<em>&lt;awesome&gt;</em>'),
+            (Markup('{0[1][bar]}').format([0, {'bar': '<bar/>'}]),
+             '&lt;bar/&gt;'),
+            (Markup('{0[1][bar]}').format([0, {'bar': Markup('<bar/>')}]),
+             '<bar/>')):
+            assert actual == expected, "%r should be %r!" % (actual, expected)
+
+    # Auto-numbered '{}' format fields are new in Python 2.7
+    if sys.version_info >= (2, 7):
+        def test_formatting_empty(self):
+            formatted = Markup('{}').format(0)
+            assert formatted == Markup('0')
+
+    def test_custom_formatting(self):
+        class HasHTMLOnly(object):
+            def __html__(self):
+                return Markup('<foo>')
+
+        class HasHTMLAndFormat(object):
+            def __html__(self):
+                return Markup('<foo>')
+            def __html_format__(self, spec):
+                return Markup('<FORMAT>')
+
+        assert Markup('{0}').format(HasHTMLOnly()) == Markup('<foo>')
+        assert Markup('{0}').format(HasHTMLAndFormat()) == Markup('<FORMAT>')
+
+    def test_complex_custom_formatting(self):
+        class User(object):
+            def __init__(self, id, username):
+                self.id = id
+                self.username = username
+            def __html_format__(self, format_spec):
+                if format_spec == 'link':
+                    return Markup('<a href="/user/{0}">{1}</a>').format(
+                        self.id,
+                        self.__html__(),
+                    )
+                elif format_spec:
+                    raise ValueError('Invalid format spec')
+                return self.__html__()
+            def __html__(self):
+                return Markup('<span class=user>{0}</span>').format(self.username)
+
+        user = User(1, 'foo')
+        assert Markup('<p>User: {0:link}').format(user) == \
+            Markup('<p>User: <a href="/user/1"><span class=user>foo</span></a>')
+
+    def test_formatting_with_objects(self):
+        class Stringable(object):
+            def __unicode__(self):
+                return u'строка'
+            if PY2:
+                def __str__(self):
+                    return 'some other value'
+            else:
+                __str__ = __unicode__
+
+        assert Markup('{s}').format(s=Stringable()) == \
+            Markup(u'строка')
+
+    def test_all_set(self):
+        import markupsafe as markup
+        for item in markup.__all__:
+            getattr(markup, item)
+
+    def test_escape_silent(self):
+        assert escape_silent(None) == Markup()
+        assert escape(None) == Markup(None)
+        assert escape_silent('<foo>') == Markup(u'&lt;foo&gt;')
+
+    def test_splitting(self):
+        self.assertEqual(Markup('a b').split(), [
+            Markup('a'),
+            Markup('b')
+        ])
+        self.assertEqual(Markup('a b').rsplit(), [
+            Markup('a'),
+            Markup('b')
+        ])
+        self.assertEqual(Markup('a\nb').splitlines(), [
+            Markup('a'),
+            Markup('b')
+        ])
+
+    def test_mul(self):
+        self.assertEqual(Markup('a') * 3, Markup('aaa'))
+
+    def test_escape_return_type(self):
+        self.assertTrue(isinstance(escape('a'), Markup))
+        self.assertTrue(isinstance(escape(Markup('a')), Markup))
+        class Foo:
+            def __html__(self):
+                return '<strong>Foo</strong>'
+        self.assertTrue(isinstance(escape(Foo()), Markup))
+
+
+class MarkupLeakTestCase(unittest.TestCase):
+    """Checks that repeated escape() calls leave the number of gc-tracked objects unchanged."""
+    def test_markup_leaks(self):
+        counts = set()
+        for count in range(20):
+            for item in range(1000):
+                escape("foo")
+                escape("<foo>")
+                escape(u"foo")
+                escape(u"<foo>")
+            if hasattr(sys, 'pypy_version_info'):
+                gc.collect()
+            counts.add(len(gc.get_objects()))
+        assert len(counts) == 1, 'ouch, c extension seems to ' \
+            'leak objects, got: ' + str(len(counts))
+
+
+class NativeEscapeTestCase(unittest.TestCase):
+    """escape() checks on empty, ASCII, Japanese (BMP) and emoji (non-BMP) text, using _native.escape."""
+    escape = staticmethod(_native.escape)  # staticmethod keeps the function from being bound to self
+
+    def test_empty(self):
+        self.assertEqual(Markup(u''), self.escape(u''))
+
+    def test_ascii(self):
+        self.assertEqual(
+                Markup(u'abcd&amp;&gt;&lt;&#39;&#34;efgh'),
+                self.escape(u'abcd&><\'"efgh'))
+        self.assertEqual(
+                Markup(u'&amp;&gt;&lt;&#39;&#34;efgh'),
+                self.escape(u'&><\'"efgh'))
+        self.assertEqual(
+                Markup(u'abcd&amp;&gt;&lt;&#39;&#34;'),
+                self.escape(u'abcd&><\'"'))
+
+    def test_2byte(self):
+        self.assertEqual(
+                Markup(u'こんにちは&amp;&gt;&lt;&#39;&#34;こんばんは'),
+                self.escape(u'こんにちは&><\'"こんばんは'))
+        self.assertEqual(
+                Markup(u'&amp;&gt;&lt;&#39;&#34;こんばんは'),
+                self.escape(u'&><\'"こんばんは'))
+        self.assertEqual(
+                Markup(u'こんにちは&amp;&gt;&lt;&#39;&#34;'),
+                self.escape(u'こんにちは&><\'"'))
+
+    def test_4byte(self):
+        self.assertEqual(
+                Markup(u'\U0001F363\U0001F362&amp;&gt;&lt;&#39;&#34;\U0001F37A xyz'),
+                self.escape(u'\U0001F363\U0001F362&><\'"\U0001F37A xyz'))
+        self.assertEqual(
+                Markup(u'&amp;&gt;&lt;&#39;&#34;\U0001F37A xyz'),
+                self.escape(u'&><\'"\U0001F37A xyz'))
+        self.assertEqual(
+                Markup(u'\U0001F363\U0001F362&amp;&gt;&lt;&#39;&#34;'),
+                self.escape(u'\U0001F363\U0001F362&><\'"'))
+
+if have_speedups:  # only defined when markupsafe._speedups imported successfully
+    class SpeedupEscapeTestCase(NativeEscapeTestCase):  # reruns the inherited tests against _speedups.escape
+        escape = _speedups.escape
+
+
+def suite():
+    """Build the suite run by ``unittest.main(defaultTest='suite')``."""
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(MarkupTestCase))
+    # The leak test only targets the C extension. Pure-Python functions have
+    # ``__code__`` on Python 2.6+ and 3, while ``func_code`` exists only on
+    # Python 2, so ``__code__`` identifies the fallback on every version.
+    if not hasattr(escape, '__code__'):
+        suite.addTest(unittest.makeSuite(MarkupLeakTestCase))
+    suite.addTest(unittest.makeSuite(NativeEscapeTestCase))
+    if have_speedups:
+        suite.addTest(unittest.makeSuite(SpeedupEscapeTestCase))
+    return suite
+
+
+if __name__ == '__main__':
+    unittest.main(defaultTest='suite')
+
+# vim:sts=4:sw=4:et:
author	INADA Naoki <songofacandy@gmail.com>	2017-02-12 23:56:37 +0900
committer	David Lord <davidism@gmail.com>	2018-05-04 08:07:32 -0700
commit	a03ecd5efc3d5a2d12a3bc72d7ce9e776cccfc66 (patch)
tree	4564aa15cb084c7d36baab4ad0a452ae8af239b1
parent	ce443dd4ab6453607166e75211396cae0f811309 (diff)
download	markupsafe-a03ecd5efc3d5a2d12a3bc72d7ce9e776cccfc66.tar.gz