idempotent unescape

If we examine the XML spec for entity references, we find that ampersand and space are not allowed characters in an entity name, so the entity regex no longer accepts them inside a name. I've also modified the unescape function to leave unexpected inputs (such as &foo;) unmodified instead of dropping them. This is a common best practice when dealing with layered systems.

Reference: http://www.w3.org/TR/REC-xml/#sec-references

    EntityRef ::= '&' Name ';'
    Name ::= NameStartChar (NameChar)*
    NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
    NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
author: Buck Golemon <buck@yelp.com> 2012-07-06 08:33:23 -0700
committer: Buck Golemon <workitharder@gmail.com> 2014-04-17 08:33:46 -0700
commit: 810bae60461fd7c00c853b91c8e03dce3103b020 (patch)
tree: f2c838936ce985fec4b8e308f376a2eb535eb93f
parent: 3257d6c7e6ae26098ed5e1ada041235a3a18a957 (diff)
download: markupsafe-810bae60461fd7c00c853b91c8e03dce3103b020.tar.gz
2 files changed, 16 insertions, 3 deletions
diff --git a/markupsafe/__init__.py b/markupsafe/__init__.py
index d6c2ef4..d3d9ac9 100644
--- a/markupsafe/__init__.py
+++ b/markupsafe/__init__.py
@@ -18,7 +18,7 @@ __all__ = ['Markup', 'soft_unicode', 'escape', 'escape_silent']
 
 
 _striptags_re = re.compile(r'(<!--.*?-->|<[^>]*>)')
-_entity_re = re.compile(r'&([^;]+);')
+_entity_re = re.compile(r'&([^& ;]+);')
 
 
 class Markup(text_type):
@@ -140,7 +140,8 @@ class Markup(text_type):
                     return unichr(int(name[1:]))
             except ValueError:
                 pass
-            return u''
+            # Don't modify unexpected input.
+            return m.group()
         return _entity_re.sub(handle_match, text_type(self))
 
     def striptags(self):
diff --git a/markupsafe/tests.py b/markupsafe/tests.py
index 13e8b8c..9431767 100644
--- a/markupsafe/tests.py
+++ b/markupsafe/tests.py
@@ -60,10 +60,22 @@ class MarkupTestCase(unittest.TestCase):
         }, Markup(u'<em>&lt;foo&gt;:&lt;bar&gt;</em>'))
 
     def test_escaping(self):
-        # escaping and unescaping
+        # escaping
         assert escape('"<>&\'') == '&#34;&lt;&gt;&amp;&#39;'
         assert Markup("<em>Foo &amp; Bar</em>").striptags() == "Foo & Bar"
+
+    def test_unescape(self):
         assert Markup("&lt;test&gt;").unescape() == "<test>"
+        assert "jack & tavi are cooler than mike & russ" == \
+                Markup("jack & tavi are cooler than mike &amp; russ").unescape(), \
+                Markup("jack & tavi are cooler than mike &amp; russ").unescape()
+
+        # Test that unescape is idempotent
+        original = '&foo&#x3b;'
+        once = Markup(original).unescape()
+        twice = Markup(once).unescape()
+        expected = "&foo;"
+        assert expected == once == twice, (once, twice)
 
     def test_formatting(self):
         for actual, expected in (
author	Buck Golemon <buck@yelp.com>	2012-07-06 08:33:23 -0700
committer	Buck Golemon <workitharder@gmail.com>	2014-04-17 08:33:46 -0700
commit	810bae60461fd7c00c853b91c8e03dce3103b020 (patch)
tree	f2c838936ce985fec4b8e308f376a2eb535eb93f
parent	3257d6c7e6ae26098ed5e1ada041235a3a18a957 (diff)
download	markupsafe-810bae60461fd7c00c853b91c8e03dce3103b020.tar.gz