Do not unescape text when generating Text nodes.

Store the null-escaped text in the doctree and unescape on export. Enables transforms to account for escaped markup.

git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk@8233 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
author: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2018-11-20 23:55:23 +0000
committer: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2018-11-20 23:55:23 +0000
commit: 68406f5b1612ed128449330a7ed0418bb2ae6787 (patch)
tree: 4ed7108b501c006df57b640a3ebe9129b2432491
parent: e3464eabc52829aeecad3db352a02e8f0e29a4a8 (diff)
download: docutils-68406f5b1612ed128449330a7ed0418bb2ae6787.tar.gz
2 files changed, 42 insertions, 38 deletions
diff --git a/docutils/docutils/parsers/rst/roles.py b/docutils/docutils/parsers/rst/roles.py
index 35227e6d1..ef690c5fa 100644
--- a/docutils/docutils/parsers/rst/roles.py
+++ b/docutils/docutils/parsers/rst/roles.py
@@ -195,7 +195,7 @@ class GenericRole:
     def __call__(self, role, rawtext, text, lineno, inliner,
                  options={}, content=[]):
         set_classes(options)
-        return [self.node_class(rawtext, utils.unescape(text), **options)], []
+        return [self.node_class(rawtext, text, **options)], []
 
 
 class CustomRole:
@@ -234,7 +234,7 @@ def generic_custom_role(role, rawtext, text, lineno, inliner,
     # Once nested inline markup is implemented, this and other methods should
     # recursively call inliner.nested_parse().
     set_classes(options)
-    return [nodes.inline(rawtext, utils.unescape(text), **options)], []
+    return [nodes.inline(rawtext, text, **options)], []
 
 generic_custom_role.options = {'class': directives.class_option}
 
@@ -255,7 +255,7 @@ register_generic_role('title-reference', nodes.title_reference)
 def pep_reference_role(role, rawtext, text, lineno, inliner,
                        options={}, content=[]):
     try:
-        pepnum = int(text)
+        pepnum = int(utils.unescape(text))
         if pepnum < 0 or pepnum > 9999:
             raise ValueError
     except ValueError:
@@ -268,7 +268,7 @@ def pep_reference_role(role, rawtext, text, lineno, inliner,
     ref = (inliner.document.settings.pep_base_url
            + inliner.document.settings.pep_file_url_template % pepnum)
     set_classes(options)
-    return [nodes.reference(rawtext, 'PEP ' + utils.unescape(text), refuri=ref,
+    return [nodes.reference(rawtext, 'PEP ' + text, refuri=ref,
                             **options)], []
 
 register_canonical_role('pep-reference', pep_reference_role)
@@ -276,7 +276,7 @@ register_canonical_role('pep-reference', pep_reference_role)
 def rfc_reference_role(role, rawtext, text, lineno, inliner,
                        options={}, content=[]):
     try:
-        rfcnum = int(text)
+        rfcnum = int(utils.unescape(text))
         if rfcnum <= 0:
             raise ValueError
     except ValueError:
@@ -288,7 +288,7 @@ def rfc_reference_role(role, rawtext, text, lineno, inliner,
     # Base URL mainly used by inliner.rfc_reference, so this is correct:
     ref = inliner.document.settings.rfc_base_url + inliner.rfc_url % rfcnum
     set_classes(options)
-    node = nodes.reference(rawtext, 'RFC ' + utils.unescape(text), refuri=ref,
+    node = nodes.reference(rawtext, 'RFC ' + text, refuri=ref,
                            **options)
     return [node], []
 
diff --git a/docutils/docutils/parsers/rst/states.py b/docutils/docutils/parsers/rst/states.py
index c7ee06f66..bfa07314c 100644
--- a/docutils/docutils/parsers/rst/states.py
+++ b/docutils/docutils/parsers/rst/states.py
@@ -713,20 +713,20 @@ class Inliner:
             return (string[:matchend], [], string[matchend:], [], '')
         endmatch = end_pattern.search(string[matchend:])
         if endmatch and endmatch.start(1):  # 1 or more chars
-            _text = endmatch.string[:endmatch.start(1)]
-            text = unescape(_text, restore_backslashes)
+            text = endmatch.string[:endmatch.start(1)]
+            if restore_backslashes:
+                text = unescape(text, True)
             textend = matchend + endmatch.end(1)
             rawsource = unescape(string[matchstart:textend], True)
             node = nodeclass(rawsource, text)
-            node[0].rawsource = unescape(_text, True)
+            node[0].rawsource = unescape(text, True)
             return (string[:matchstart], [node],
                     string[textend:], [], endmatch.group(1))
         msg = self.reporter.warning(
               'Inline %s start-string without end-string.'
               % nodeclass.__name__, line=lineno)
         text = unescape(string[matchstart:matchend], True)
-        rawsource = unescape(string[matchstart:matchend], True)
-        prb = self.problematic(text, rawsource, msg)
+        prb = self.problematic(text, text, msg)
         return string[:matchstart], [prb], string[matchend:], [msg], ''
 
     def problematic(self, text, rawsource, message):
@@ -784,7 +784,7 @@ class Inliner:
                     prb = self.problematic(text, text, msg)
                     return string[:rolestart], [prb], string[textend:], [msg]
                 return self.phrase_ref(string[:matchstart], string[textend:],
-                                       rawsource, escaped, unescape(escaped))
+                                       rawsource, escaped)
             else:
                 rawsource = unescape(string[rolestart:textend], True)
                 nodelist, messages = self.interpreted(rawsource, escaped, role,
@@ -798,26 +798,30 @@ class Inliner:
         prb = self.problematic(text, text, msg)
         return string[:matchstart], [prb], string[matchend:], [msg]
 
-    def phrase_ref(self, before, after, rawsource, escaped, text):
+    def phrase_ref(self, before, after, rawsource, escaped, text=None):
+        # `text` is ignored (since 0.15dev)
         match = self.patterns.embedded_link.search(escaped)
         if match: # embedded <URI> or <alias_>
-            text = unescape(escaped[:match.start(0)])
-            rawtext = unescape(escaped[:match.start(0)], True)
-            aliastext = unescape(match.group(2))
-            rawaliastext = unescape(match.group(2), True)
+            text = escaped[:match.start(0)]
+            unescaped = unescape(text)
+            rawtext = unescape(text, True)
+            aliastext = match.group(2)
+            rawaliastext = unescape(aliastext, True)
             underscore_escaped = rawaliastext.endswith(r'\_')
             if aliastext.endswith('_') and not (underscore_escaped
                                         or self.patterns.uri.match(aliastext)):
                 aliastype = 'name'
-                alias = normalize_name(aliastext[:-1])
+                alias = normalize_name(unescape(aliastext[:-1]))
                 target = nodes.target(match.group(1), refname=alias)
-                target.indirect_reference_name = aliastext[:-1]
+                target.indirect_reference_name = whitespace_normalize_name(
+                                                    unescape(aliastext[:-1]))
             else:
                 aliastype = 'uri'
+                # remove unescaped whitespace
                 alias_parts = split_escaped_whitespace(match.group(2))
-                alias = ' '.join(''.join(unescape(part).split())
+                alias = ' '.join(''.join(part.split())
                                  for part in alias_parts)
-                alias = self.adjust_uri(alias)
+                alias = self.adjust_uri(unescape(alias))
                 if alias.endswith(r'\_'):
                     alias = alias[:-2] + '_'
                 target = nodes.target(match.group(1), refuri=alias)
@@ -827,14 +831,17 @@ class Inliner:
                                        % aliastext)
             if not text:
                 text = alias
+                unescaped = unescape(text)
                 rawtext = rawaliastext
         else:
+            text = escaped
+            unescaped = unescape(text)
             target = None
             rawtext = unescape(escaped, True)
 
-        refname = normalize_name(text)
+        refname = normalize_name(unescaped)
         reference = nodes.reference(rawsource, text,
-                                    name=whitespace_normalize_name(text))
+                                    name=whitespace_normalize_name(unescaped))
         reference[0].rawsource = rawtext
 
         node_list = [reference]
@@ -991,11 +998,9 @@ class Inliner:
             else:
                 addscheme = ''
             text = match.group('whole')
-            unescaped = unescape(text)
-            rawsource = unescape(text, True)
-            reference = nodes.reference(rawsource, unescaped,
-                                        refuri=addscheme + unescaped)
-            reference[0].rawsource = rawsource
+            refuri = addscheme + unescape(text)
+            reference = nodes.reference(unescape(text, True), text,
+                                        refuri=refuri)
             return [reference]
         else:                   # not a valid scheme
             raise MarkupMismatch
@@ -1003,27 +1008,25 @@ class Inliner:
     def pep_reference(self, match, lineno):
         text = match.group(0)
         if text.startswith('pep-'):
-            pepnum = int(match.group('pepnum1'))
+            pepnum = int(unescape(match.group('pepnum1')))
         elif text.startswith('PEP'):
-            pepnum = int(match.group('pepnum2'))
+            pepnum = int(unescape(match.group('pepnum2')))
         else:
             raise MarkupMismatch
         ref = (self.document.settings.pep_base_url
                + self.document.settings.pep_file_url_template % pepnum)
-        unescaped = unescape(text)
-        return [nodes.reference(unescape(text, True), unescaped, refuri=ref)]
+        return [nodes.reference(unescape(text, True), text, refuri=ref)]
 
     rfc_url = 'rfc%d.html'
 
     def rfc_reference(self, match, lineno):
         text = match.group(0)
         if text.startswith('RFC'):
-            rfcnum = int(match.group('rfcnum'))
+            rfcnum = int(unescape(match.group('rfcnum')))
             ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
         else:
             raise MarkupMismatch
-        unescaped = unescape(text)
-        return [nodes.reference(unescape(text, True), unescaped, refuri=ref)]
+        return [nodes.reference(unescape(text, True), text, refuri=ref)]
 
     def implicit_inline(self, text, lineno):
         """
@@ -1045,7 +1048,7 @@ class Inliner:
                             self.implicit_inline(text[match.end():], lineno))
                 except MarkupMismatch:
                     pass
-        return [nodes.Text(unescape(text), rawsource=unescape(text, True))]
+        return [nodes.Text(text, unescape(text, True))]
 
     dispatch = {'*': emphasis,
                 '**': strong,
@@ -2842,6 +2845,7 @@ class Text(RSTState):
         self.nested_parse(indented, input_offset=line_offset, node=definition)
         return itemnode, blank_finish
 
+    #@ TODO ignore null-escaped delimiter
     classifier_delimiter = re.compile(' +: +')
 
     def term(self, lines, lineno):
@@ -2855,12 +2859,12 @@ class Text(RSTState):
         for i in range(len(text_nodes)):
             node = text_nodes[i]
             if isinstance(node, nodes.Text):
-                parts = self.classifier_delimiter.split(node.rawsource)
+                parts = self.classifier_delimiter.split(node)
                 if len(parts) == 1:
                     node_list[-1] += node
                 else:
                     text = parts[0].rstrip()
-                    textnode = nodes.Text(utils.unescape(text, True))
+                    textnode = nodes.Text(text, unescape(text, True))
                     node_list[-1] += textnode
                     for part in parts[1:]:
                         classifier_node = nodes.classifier(
author	milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2018-11-20 23:55:23 +0000
committer	milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2018-11-20 23:55:23 +0000
commit	68406f5b1612ed128449330a7ed0418bb2ae6787 (patch)
tree	4ed7108b501c006df57b640a3ebe9129b2432491
parent	e3464eabc52829aeecad3db352a02e8f0e29a4a8 (diff)
download	docutils-68406f5b1612ed128449330a7ed0418bb2ae6787.tar.gz