Pass all known tests

git-svn-id: http://svn.code.sf.net/p/docutils/code/branches/nesting@1822 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
author: david_abrahams <david_abrahams@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2004-01-30 03:39:41 +0000
committer: david_abrahams <david_abrahams@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2004-01-30 03:39:41 +0000
commit: da3677954688a874ab00524586b7896675f03f7a (patch)
tree: a70ec0fdf9e71179d7f2c0912c8ba95bf380963d
parent: ebaa35c82398d98beb3916d921b1d7161839752d (diff)
download: docutils-da3677954688a874ab00524586b7896675f03f7a.tar.gz
3 files changed, 259 insertions, 56 deletions
diff --git a/docutils/docutils/parsers/rst/states.py b/docutils/docutils/parsers/rst/states.py
index 734b3fccf..eb95da7b0 100644
--- a/docutils/docutils/parsers/rst/states.py
+++ b/docutils/docutils/parsers/rst/states.py
@@ -591,6 +591,25 @@ class Inliner:
             remaining, lineno, self.patterns.initial.search)
         return nodes, messages
 
+    def debug_match(self, msg, match, group):
+        """
+        Show where the given group of the given sre match object
+        falls in the text that was searched, and where the match
+        overall falls
+        """
+        if match and self._debug:
+            searched = match.string
+            self %= msg + ' ' + repr(searched)
+            x0 = len(repr(searched[:match.start(0)]))
+            x1 = len(repr(searched[:match.start(group)]))
+            x2 = len(repr(searched[:match.end(group)]))
+            x3 = len(repr(searched[:match.end(0)]))
+            self %= (
+                (x0 + len(msg)) * ' '
+                + (x1-x0) * '-'
+                + (x2-x1)*'^' 
+                + (x3-x2) * '-')
+        
     def inner_parse(self, remaining, lineno, token_search):
         """The guts of the parse method"""
         nodes = []    # Buffer for result nodes 
@@ -598,27 +617,36 @@ class Inliner:
         prefixes = [] # Buffer for un-marked text preceding first
                       # recognized explicit inline markup
         self.indent += '  '
-        match = None
+        token_match = None
         while remaining:
             # !!! all "self %=" lines for debugging; 2Bcut
             self %= 'inner_parse of: %r' % remaining
-            match = token_search(remaining)
-            if not match:
+            token_match = token_search(remaining)
+            if not token_match:
                 break
-            groups = match.groupdict()
-            # self %= 'with: %r' % match.re.pattern
-            self %= 'inner_parse, groupdict', groups
-            self %= 'inner_parse, groups:', match.groups()
+            groups = token_match.groupdict()
+            # self %= 'with: %r' % token_match.re.pattern
+            # self %= 'inner_parse, groupdict', groups
+            # self %= 'inner_parse, groups:', token_match.groups()
+            
             opener = groups['start'] or groups['backquote'] \
                      or groups['refend'] or groups['fnend']
+
             if not opener: # either the end-string was found or we're done
-                self %= 'inner_parse, dropping: %r' % remaining[match.start(1):]
-                remaining = remaining[:match.start(1)]
+                self %= 'inner_parse, dropping: %r' % remaining[token_match.start(1):]
+                remaining = remaining[:token_match.start(1)]
                 break
-            self %= 'inner_parse, opener: %r' % opener
+            
+            # self %= 'inner_parse, opener: %r' % opener
+            if self._debug:
+                opengrp = [
+                    x for x in ('start', 'backquote', 'refend', 'fnend') if groups[x]][0]
+                self.debug_match('inner_parse:', token_match, opengrp)
+            
             method = self.dispatch[opener]
             before, inlines, remaining, sysmessages = method(
-                self, match, lineno)
+                self, token_match, lineno)
+            
             prefixes.append(before)
             messages += sysmessages
             if inlines:
@@ -629,7 +657,7 @@ class Inliner:
         if remaining:
             nodes += self.implicit_inline(remaining, lineno)
         self.indent = self.indent[:-2]
-        return nodes, messages, match
+        return nodes, messages, token_match
         
     openers = '\'"([{<'
     closers = '\'")]}>'
@@ -687,9 +715,9 @@ class Inliner:
                '',
                non_whitespace_after,
                [ ('backquote',
-                  '(?P<role>(:%s:)?)' % simplename,    # optional role
+                  '(?P<role>(:%s:)?)' % simplename,   # optional role
                   '',
-                  ['`(?!`)']                             # backquote but not literal
+                  ['`(?!`)']                          # backquote but not literal
                   )
                  ]         
                )
@@ -707,12 +735,20 @@ class Inliner:
         )
     interpreted_or_phrase_ref_end = (
         r'(?:' + embedded_uri + ')?'
+        
+        # The assumption here is that the only excuse for escaped
+        # whitespace immediately following an inline markup start
+        # string is to prevent the backquote from being interpreted as
+        # part of the literal token (``).  This negative lookbehind is
+        # required for cases such as:
+        #
+        #  `\ `\ suffix roles`:emphasis: aren't worth the trouble`:strong:
+        #
+        # (hint, hint)
+        + r'(?<!^\x00[ \n])'
         + '`(?P<rolesuffix>:' + simplename + ':)?(?:__?)?'
         )
-    interpreted_or_phrase_ref_end2 = (
-        r'(?:' + embedded_uri + ')?'
-        + '`(?:(?P<rolesuffix>:' + simplename + ':)(?:__?)?|(?:__?))'
-        )
+
     _end_pattern = non_whitespace_escape_before + '(%s)' + end_string_suffix
     
     def _compile_end_pattern(endpattern, _end_pattern = _end_pattern):
@@ -720,6 +756,7 @@ class Inliner:
     
     _re_map = {}
 
+    # Matches named groups in regexps
     _groupname = re.compile(r'\(\?P\<[^>]+\>', re.UNICODE)
     
     def _push_end_string(self, startmatch, endpattern):
@@ -729,13 +766,14 @@ class Inliner:
         end-of-string in nested markup
         """
         oldpattern = startmatch.re.pattern
-        self %= '_push_end_string, endpattern=', endpattern
+        # self %= '_push_end_string, endpattern=', endpattern
         # self %= '_push_end_string, oldpattern=', oldpattern
         
         X = '(?##)'
         parts = oldpattern.split(X)
         
         if len(parts) == 2:
+            # no end strings being sought yet
             part1 = (
                 '(?:' + self.non_whitespace_escape_before
                 + '(?:'
@@ -744,11 +782,13 @@ class Inliner:
             allends = '(' + endpattern + ')'
             part2 = ')' + self.end_string_suffix + ')|' + parts[0]
             part3 = parts[1]
+            
         else:
             part1,oldends,part2,part3 = parts
-            groupnames = self._groupname.findall(endpattern)
-            for n in groupnames:
-                oldends = oldends.replace(n, '(?:')
+
+            # No group name can appear twice in a regexp.
+            oldends = self._groupname.sub('(?:',oldends)
+            
             allends = '(%s)(?:%s)?|%s' % (endpattern,oldends,oldends)
             
         newpattern = ''.join((
@@ -759,7 +799,7 @@ class Inliner:
             part3
             ))
 
-        self %= "_push_end_string ->      r'" + newpattern + "'"
+        # self %= "_push_end_string ->      r'" + newpattern + "'"
         return self._compile_re(newpattern)
 
     def _compile_re(self, pattern):
@@ -779,8 +819,6 @@ class Inliner:
         end-of-string in nested markup
         """
         oldpattern = startmatch.re.pattern
-        self %= '_push_end_string, endpattern=', endpattern
-        # self %= '_push_end_string, oldpattern=', oldpattern
         
         X = '(?##)'
         parts = oldpattern.split(X)
@@ -921,7 +959,7 @@ class Inliner:
               % nodeclass.__name__, line=lineno)
         rawsource = null2escape(start)
         prb = self.problematic(rawsource, rawsource, msg)
-        return before, [prb], remaining, [msg], ''
+        return (before, [prb], remaining, [msg], '')
 
     def problematic(self, text, rawsource, message):
         msgid = self.document.set_id(message, self.parent)
@@ -944,8 +982,9 @@ class Inliner:
         return self.nested_inline_obj(match, lineno, r'\*\*', nodes.strong)
 
     def interpreted_or_phrase_ref(self, startmatch, lineno):
+        # self %= 'interpreted_or_phrase_ref:'
         before,start,remaining = _split_match(startmatch,'interpreted_or_phrase_ref')
-        
+
         role = startmatch.group('role')
                 
         roleposition = ''
@@ -955,31 +994,23 @@ class Inliner:
             role = role[1:-1] # drop the leading and trailing colons
             roleposition = 'prefix'
         elif self.quoted_start(startmatch):
-            return (remaining, [], remaining, [])
+            return (before+start, [], remaining, [])
 
         # Look for the end
-        if role:
-            endpattern = self.interpreted_or_phrase_ref_end
-        else:
-            endpattern = self.interpreted_or_phrase_ref_end2
-            
-        pattern = self._push_end_string(startmatch, endpattern)
+        newpattern = self._push_end_string(startmatch, self.interpreted_or_phrase_ref_end)
 
-        children,msgs,endmatch = self.inner_parse(remaining,lineno,pattern.search)
-        self %= 'interpreted_or_phrase_ref:'
+        children,msgs,endmatch = self.inner_parse(remaining,lineno,newpattern.search)
         self %= '  searched = %r' % remaining
         self %= '  children =', children
         
         if endmatch and endmatch.group(1):   
-            # self %= '  groups = ', endmatch.groups()
-            # self %= '  named groups = ', endmatch.groupdict()
 
             mid, end, after = _split_match(endmatch, 1)
             # length absorbed by inner_parse
             innerlen = len(remaining)-len(endmatch.string)
             between = remaining[:innerlen]+mid
 
-            # self %= '  split=', (before, start, between, end, after)
+            rawsource = null2escape(start+between+end)
             
             if endmatch.group('rolesuffix'):
                 
@@ -994,8 +1025,6 @@ class Inliner:
                 role = endmatch.group('rolesuffix')[1:-1] # drop the colons
                 roleposition = 'suffix'
                 
-            rawsource = null2escape(start+between+end)
-            
             if rawsource[-1:] == '_':
                 
                 if not role:
@@ -1338,7 +1367,7 @@ class Inliner:
             prb = self.problematic(between, between, msg)
             return [prb], [msg]
         ref = self.pep_url % pepnum
-        return [nodes.reference(rawsource, 'PEP ' + between, refuri=ref, *children)], []
+        return [nodes.reference(rawsource, 'PEP ' + between, refuri=ref)], []
 
     def rfc_reference_role(self, role, rawsource, between, children, lineno):
         try:
@@ -1352,7 +1381,7 @@ class Inliner:
             prb = self.problematic(between, between, msg)
             return [prb], [msg]
         ref = self.rfc_url % rfcnum
-        return [nodes.reference(rawsource, 'RFC ' + between, refuri=ref, *children)], []
+        return [nodes.reference(rawsource, 'RFC ' + between, refuri=ref)], []
 
 
 class Body(RSTState):
diff --git a/docutils/test/test_parsers/test_rst/test_inline_markup.py b/docutils/test/test_parsers/test_rst/test_inline_markup.py
index 10893f069..c90b34db2 100755
--- a/docutils/test/test_parsers/test_rst/test_inline_markup.py
+++ b/docutils/test/test_parsers/test_rst/test_inline_markup.py
@@ -56,8 +56,7 @@ across lines*
 """],
 ["""\
 '*emphasis*' and 1/*emphasis*/2 and 3-*emphasis*-4 and 5:*emphasis*:6
-but not '*' or '"*"' or  x*2* or 2*x* or \\*args or *
-or *the\\* *stars\\\\\\* *inside*
+but not '*' or '"*"' or  x*2* or 2*x* or \\*args or * .
 
 (however, '*args' will trigger a warning and may be problematic)
 
@@ -79,16 +78,13 @@ what about *this**?
         <emphasis>
             emphasis
         :6
-        but not '*' or '"*"' or  x*2* or 2*x* or *args or *
-        or \n\
-        <emphasis>
-            the* *stars\* *inside
+        but not '*' or '"*"' or  x*2* or 2*x* or *args or * .
     <paragraph>
         (however, '
         <problematic id="id2" refid="id1">
             *
         args' will trigger a warning and may be problematic)
-    <system_message backrefs="id2" id="id1" level="2" line="5" source="test data" type="WARNING">
+    <system_message backrefs="id2" id="id1" level="2" line="4" source="test data" type="WARNING">
         <paragraph>
             Inline emphasis start-string without end-string.
     <paragraph>
@@ -147,9 +143,9 @@ totest['strong'] = [
             Inline strong start-string without end-string.
 """],
 ["""\
-Strong asterisk: *****
+Strong asterisk: **\***
 
-Strong double asterisk: ******
+Strong double asterisk: **\*\***
 """,
 """\
 <document source="test data">
@@ -505,18 +501,30 @@ See `HTML Anchors: \\<a>`_.
 <document source="test data">
     <paragraph>
         <reference anonymous="1">
-            embedded URI with too much whitespace < http://example.com/
+            embedded URI with too much whitespace < 
+            <reference refuri="http://example.com/">
+                http://example.com/
+            \n\
             long/path /and  /whitespace >
     <paragraph>
         <reference anonymous="1">
-            embedded URI with too much whitespace at end <http://example.com/
+            embedded URI with too much whitespace at end <
+            <reference refuri="http://example.com/">
+                http://example.com/
+            \n\
             long/path /and  /whitespace >
     <paragraph>
         <reference anonymous="1">
-            embedded URI with no preceding whitespace<http://example.com>
+            embedded URI with no preceding whitespace<
+            <reference refuri="http://example.com">
+                http://example.com
+            >
     <paragraph>
         <reference anonymous="1">
-            escaped URI <http://example.com>
+            escaped URI <
+            <reference refuri="http://example.com">
+                http://example.com
+            >
     <paragraph>
         See \n\
         <reference refname="html anchors: <a>">
diff --git a/docutils/test/test_parsers/test_rst/test_nested_inline_markup.py b/docutils/test/test_parsers/test_rst/test_nested_inline_markup.py
new file mode 100755
index 000000000..3e6a3f052
--- /dev/null
+++ b/docutils/test/test_parsers/test_rst/test_nested_inline_markup.py
@@ -0,0 +1,166 @@
+#! /usr/bin/env python
+
+# Copy this file to docutils/test/test_parsers/test_rst/ and do
+# ``chmod +x test_inline_markup.py``, then execute this file to test.
+
+# To be added (later) to
+# docutils/test/test_parsers/test_rst/test_inline_markup.py?
+
+from __init__ import DocutilsTestSupport
+
+def suite():
+    s = DocutilsTestSupport.ParserTestSuite()
+    s.generateTests(totest)
+    return s
+
+totest = {}
+
+
+totest['nested'] = [
+["""\
+*emphasis **strong***
+""",
+"""\
+<document source="test data">
+    <paragraph>
+        <emphasis>
+            emphasis \n\
+            <strong>
+                strong
+"""],
+["""\
+**strong *emphasis***
+""",
+"""\
+<document source="test data">
+    <paragraph>
+        <strong>
+            strong \n\
+            <emphasis>
+                emphasis
+"""],
+["""\
+*emphasis ``literal``*
+""",
+"""\
+<document source="test data">
+    <paragraph>
+        <emphasis>
+            emphasis \n\
+            <literal>
+                literal
+"""],
+["""\
+*emphasis **strong*
+""",
+"""\
+<document source="test data">
+    <paragraph>
+        <emphasis>
+            emphasis \n\
+            <problematic id="id2" refid="id1">
+                **
+            strong
+    <system_message backrefs="id2" id="id1" level="2" line="1" source="test data" type="WARNING">
+        <paragraph>
+            Inline strong start-string without end-string.
+"""],
+["""\
+:emphasis:`interpreted :strong:`text``
+""",
+"""\
+<document source="test data">
+    <paragraph>
+        <emphasis>
+            interpreted 
+            <strong>
+                text
+"""
+],
+["""\
+*emphasis **strong**
+""",
+"""\
+<document source="test data">
+    <paragraph>
+        <problematic id="id2" refid="id1">
+            *
+        emphasis \n\
+        <strong>
+            strong
+    <system_message backrefs="id2" id="id1" level="2" line="1" source="test data" type="WARNING">
+        <paragraph>
+            Inline emphasis start-string without end-string.
+"""],
+["""\
+*emphasized ``literal`` and |substitution ref| and ref_*
+""",
+"""\
+<document source="test data">
+    <paragraph>
+        <emphasis>
+            emphasized \n\
+            <literal>
+                literal
+             and \n\
+            <substitution_reference refname="substitution ref">
+                substitution ref
+             and \n\
+            <reference refname="ref">
+                ref
+"""],
+["""\
+Explicit roles for standard inline markup:
+
+:emphasis:`emphasis including :strong:`strong
+including :literal:`inline literal text```.
+""",
+"""\
+<document source="test data">
+    <paragraph>
+        Explicit roles for standard inline markup:
+    <paragraph>
+        <emphasis>
+            emphasis including \n\
+            <strong>
+                strong
+                including \n\
+                <literal>
+                    inline literal text
+        .
+"""],
+["""\
+Suffix-based nested explicit roles:
+
+`\ `\ `inline literal text`:literal: inside
+strong`:strong: within emphasis`:emphasis:.
+""",
+"""\
+<document source="test data">
+    <paragraph>
+        Suffix-based nested explicit roles:
+    <paragraph>
+        <emphasis>
+            <strong>
+                <literal>
+                    inline literal text
+                 inside
+                strong
+             within emphasis
+        .
+"""],
+["""\
+``literal *doesn't* **get** `parsed```
+""",
+"""\
+<document source="test data">
+    <paragraph>
+        <literal>
+            literal *doesn't* **get** `parsed`
+"""],
+]
+
+
+if __name__ == '__main__':
+    import unittest
+    unittest.main(defaultTest='suite')
author	david_abrahams <david_abrahams@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2004-01-30 03:39:41 +0000
committer	david_abrahams <david_abrahams@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2004-01-30 03:39:41 +0000
commit	da3677954688a874ab00524586b7896675f03f7a (patch)
tree	a70ec0fdf9e71179d7f2c0912c8ba95bf380963d
parent	ebaa35c82398d98beb3916d921b1d7161839752d (diff)
download	docutils-da3677954688a874ab00524586b7896675f03f7a.tar.gz