diff options
author | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2018-11-21 13:58:51 +0000 |
---|---|---|
committer | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2018-11-21 13:58:51 +0000 |
commit | 556aec900cdaccbec3fce287e7af69e82d80ea53 (patch) | |
tree | e57f39d78e6a9af01eb5fc0f63237f85abe48476 | |
parent | b95bd7805b9b5f859410758ce3c1ef6177e1a8cc (diff) | |
download | docutils-556aec900cdaccbec3fce287e7af69e82d80ea53.tar.gz |
Revert the fix for backslash escaping in transforms.
The fix is still waiting for review.
This reverts the last three commits from a local "feature branch"
that were unintentionally applied to trunk with `git svn`.
git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk@8235 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
-rw-r--r-- | docutils/HISTORY.txt | 1 | ||||
-rw-r--r-- | docutils/docs/user/smartquotes.txt | 43 | ||||
-rw-r--r-- | docutils/docutils/parsers/rst/roles.py | 12 | ||||
-rw-r--r-- | docutils/docutils/parsers/rst/states.py | 68 | ||||
-rw-r--r-- | docutils/docutils/transforms/universal.py | 21 | ||||
-rw-r--r-- | docutils/test/test_transforms/test_smartquotes.py | 117 |
6 files changed, 76 insertions, 186 deletions
diff --git a/docutils/HISTORY.txt b/docutils/HISTORY.txt index 17d110226..b4820af63 100644 --- a/docutils/HISTORY.txt +++ b/docutils/HISTORY.txt @@ -67,7 +67,6 @@ Changes Since 0.14 * docutils/utils/smartquotes.py: - Fix bug #332: use open quote after whitespace, ZWSP, and ZWNJ. - - Use single backslashes for escaping. * docutils/writers/html5_polyglot/ diff --git a/docutils/docs/user/smartquotes.txt b/docutils/docs/user/smartquotes.txt index 0cdfbe27b..92bbafd2d 100644 --- a/docutils/docs/user/smartquotes.txt +++ b/docutils/docs/user/smartquotes.txt @@ -5,8 +5,8 @@ Smart Quotes for Docutils :Author: Günter Milde, based on SmartyPants by John Gruber, Brad Choate, and Chad Miller :Contact: docutils-develop@lists.sourceforge.net -:Revision: $Revision: 8112 $ -:Date: $Date: 2017-06-14 16:20:20 +0200 (Mi, 14. Jun 2017) $ +:Revision: $Revision$ +:Date: $Date$ :License: Released under the terms of the `2-Clause BSD license`_ :Abstract: This document describes the Docutils `smartquotes` module. @@ -25,19 +25,19 @@ transformation on Text nodes that includes the following steps: - three consecutive dots (``...`` or ``. . .``) into an ellipsis entity. This means you can write, edit, and save your documents using plain old -ASCII -- straight quotes, plain dashes, and plain dots -- while Docutils +ASCII---straight quotes, plain dashes, and plain dots---while Docutils generates documents with typographical quotes, dashes, and ellipses. Advantages: * typing speed (especially when blind-typing), * the possibility to change the quoting style of the - complete document with just one configuration option, -* restriction to 7-bit ASCII characters in the source. + complete document with just one configuration option, and +* restriction to 7-bit characters in the source. 
However, there are `algorithmic shortcomings`_ for 2 reasons: -* dual use of the "ASCII-apostrophe" (') as single quote and apostrophe, +* Dual use of the "ASCII-apostrophe" (') as single quote and apostrophe, * languages that do not use whitespace around words. So, please consider also @@ -54,25 +54,22 @@ The `SmartQuotes` transform does not modify characters in literal text such as source code, maths, or literal blocks. If you need literal straight quotes (or plain hyphens and periods) in normal -text, you can `backslash escape`_ the characters to preserve -ASCII-punctuation. - -.. class:: booktabs - -========= ========= == ======== ========== -Input Output Input Output -========= ========= == ======== ========== -``\\`` \\ ``\...`` \... -``\"`` \" ``\--`` \-- -``\'`` \' ``\``` \` -========= ========= == ======== ========== +text, you can backslash escape the characters to preserve +ASCII-punctuation. You need two backslashes as one backslash is removed by +the reStructuredText `escaping mechanism`_. + +======== ========= ======== ========= +Escape Character Escape Character +======== ========= ======== ========= +``\\`` \\ ``\\.`` \\. +``\\"`` \\" ``\\-`` \\- +``\\'`` \\' ``\\``` \\` +======== ========= ======== ========= This is useful, for example, when you want to use straight quotes as -foot and inch marks: - - 6\'2\" tall; a 17\" monitor. +foot and inch marks: 6\\'2\\" tall; a 17\\" iMac. -.. _backslash escape: ../ref/rst/restructuredtext.html#escaping-mechanism +.. _escaping mechanism: ../ref/rst/restructuredtext.html#escaping-mechanism Localisation @@ -85,7 +82,7 @@ __ https://en.wikipedia.org/wiki/Quotation_mark#Summary_table `SmartQuotes` inserts quotation marks depending on the language of the current block element and the value of the `"smart_quotes" setting`_.\ -[#x-altquot]_ +[#x-altquot]_ There is built-in support for the following languages:\ [#smartquotes-locales]_ :af: .. 
class:: language-af diff --git a/docutils/docutils/parsers/rst/roles.py b/docutils/docutils/parsers/rst/roles.py index ef690c5fa..35227e6d1 100644 --- a/docutils/docutils/parsers/rst/roles.py +++ b/docutils/docutils/parsers/rst/roles.py @@ -195,7 +195,7 @@ class GenericRole: def __call__(self, role, rawtext, text, lineno, inliner, options={}, content=[]): set_classes(options) - return [self.node_class(rawtext, text, **options)], [] + return [self.node_class(rawtext, utils.unescape(text), **options)], [] class CustomRole: @@ -234,7 +234,7 @@ def generic_custom_role(role, rawtext, text, lineno, inliner, # Once nested inline markup is implemented, this and other methods should # recursively call inliner.nested_parse(). set_classes(options) - return [nodes.inline(rawtext, text, **options)], [] + return [nodes.inline(rawtext, utils.unescape(text), **options)], [] generic_custom_role.options = {'class': directives.class_option} @@ -255,7 +255,7 @@ register_generic_role('title-reference', nodes.title_reference) def pep_reference_role(role, rawtext, text, lineno, inliner, options={}, content=[]): try: - pepnum = int(utils.unescape(text)) + pepnum = int(text) if pepnum < 0 or pepnum > 9999: raise ValueError except ValueError: @@ -268,7 +268,7 @@ def pep_reference_role(role, rawtext, text, lineno, inliner, ref = (inliner.document.settings.pep_base_url + inliner.document.settings.pep_file_url_template % pepnum) set_classes(options) - return [nodes.reference(rawtext, 'PEP ' + text, refuri=ref, + return [nodes.reference(rawtext, 'PEP ' + utils.unescape(text), refuri=ref, **options)], [] register_canonical_role('pep-reference', pep_reference_role) @@ -276,7 +276,7 @@ register_canonical_role('pep-reference', pep_reference_role) def rfc_reference_role(role, rawtext, text, lineno, inliner, options={}, content=[]): try: - rfcnum = int(utils.unescape(text)) + rfcnum = int(text) if rfcnum <= 0: raise ValueError except ValueError: @@ -288,7 +288,7 @@ def rfc_reference_role(role, 
rawtext, text, lineno, inliner, # Base URL mainly used by inliner.rfc_reference, so this is correct: ref = inliner.document.settings.rfc_base_url + inliner.rfc_url % rfcnum set_classes(options) - node = nodes.reference(rawtext, 'RFC ' + text, refuri=ref, + node = nodes.reference(rawtext, 'RFC ' + utils.unescape(text), refuri=ref, **options) return [node], [] diff --git a/docutils/docutils/parsers/rst/states.py b/docutils/docutils/parsers/rst/states.py index bfa07314c..c7ee06f66 100644 --- a/docutils/docutils/parsers/rst/states.py +++ b/docutils/docutils/parsers/rst/states.py @@ -713,20 +713,20 @@ class Inliner: return (string[:matchend], [], string[matchend:], [], '') endmatch = end_pattern.search(string[matchend:]) if endmatch and endmatch.start(1): # 1 or more chars - text = endmatch.string[:endmatch.start(1)] - if restore_backslashes: - text = unescape(text, True) + _text = endmatch.string[:endmatch.start(1)] + text = unescape(_text, restore_backslashes) textend = matchend + endmatch.end(1) rawsource = unescape(string[matchstart:textend], True) node = nodeclass(rawsource, text) - node[0].rawsource = unescape(text, True) + node[0].rawsource = unescape(_text, True) return (string[:matchstart], [node], string[textend:], [], endmatch.group(1)) msg = self.reporter.warning( 'Inline %s start-string without end-string.' 
% nodeclass.__name__, line=lineno) text = unescape(string[matchstart:matchend], True) - prb = self.problematic(text, text, msg) + rawsource = unescape(string[matchstart:matchend], True) + prb = self.problematic(text, rawsource, msg) return string[:matchstart], [prb], string[matchend:], [msg], '' def problematic(self, text, rawsource, message): @@ -784,7 +784,7 @@ class Inliner: prb = self.problematic(text, text, msg) return string[:rolestart], [prb], string[textend:], [msg] return self.phrase_ref(string[:matchstart], string[textend:], - rawsource, escaped) + rawsource, escaped, unescape(escaped)) else: rawsource = unescape(string[rolestart:textend], True) nodelist, messages = self.interpreted(rawsource, escaped, role, @@ -798,30 +798,26 @@ class Inliner: prb = self.problematic(text, text, msg) return string[:matchstart], [prb], string[matchend:], [msg] - def phrase_ref(self, before, after, rawsource, escaped, text=None): - # `text` is ignored (since 0.15dev) + def phrase_ref(self, before, after, rawsource, escaped, text): match = self.patterns.embedded_link.search(escaped) if match: # embedded <URI> or <alias_> - text = escaped[:match.start(0)] - unescaped = unescape(text) - rawtext = unescape(text, True) - aliastext = match.group(2) - rawaliastext = unescape(aliastext, True) + text = unescape(escaped[:match.start(0)]) + rawtext = unescape(escaped[:match.start(0)], True) + aliastext = unescape(match.group(2)) + rawaliastext = unescape(match.group(2), True) underscore_escaped = rawaliastext.endswith(r'\_') if aliastext.endswith('_') and not (underscore_escaped or self.patterns.uri.match(aliastext)): aliastype = 'name' - alias = normalize_name(unescape(aliastext[:-1])) + alias = normalize_name(aliastext[:-1]) target = nodes.target(match.group(1), refname=alias) - target.indirect_reference_name = whitespace_normalize_name( - unescape(aliastext[:-1])) + target.indirect_reference_name = aliastext[:-1] else: aliastype = 'uri' - # remove unescaped whitespace alias_parts = 
split_escaped_whitespace(match.group(2)) - alias = ' '.join(''.join(part.split()) + alias = ' '.join(''.join(unescape(part).split()) for part in alias_parts) - alias = self.adjust_uri(unescape(alias)) + alias = self.adjust_uri(alias) if alias.endswith(r'\_'): alias = alias[:-2] + '_' target = nodes.target(match.group(1), refuri=alias) @@ -831,17 +827,14 @@ class Inliner: % aliastext) if not text: text = alias - unescaped = unescape(text) rawtext = rawaliastext else: - text = escaped - unescaped = unescape(text) target = None rawtext = unescape(escaped, True) - refname = normalize_name(unescaped) + refname = normalize_name(text) reference = nodes.reference(rawsource, text, - name=whitespace_normalize_name(unescaped)) + name=whitespace_normalize_name(text)) reference[0].rawsource = rawtext node_list = [reference] @@ -998,9 +991,11 @@ class Inliner: else: addscheme = '' text = match.group('whole') - refuri = addscheme + unescape(text) - reference = nodes.reference(unescape(text, True), text, - refuri=refuri) + unescaped = unescape(text) + rawsource = unescape(text, True) + reference = nodes.reference(rawsource, unescaped, + refuri=addscheme + unescaped) + reference[0].rawsource = rawsource return [reference] else: # not a valid scheme raise MarkupMismatch @@ -1008,25 +1003,27 @@ class Inliner: def pep_reference(self, match, lineno): text = match.group(0) if text.startswith('pep-'): - pepnum = int(unescape(match.group('pepnum1'))) + pepnum = int(match.group('pepnum1')) elif text.startswith('PEP'): - pepnum = int(unescape(match.group('pepnum2'))) + pepnum = int(match.group('pepnum2')) else: raise MarkupMismatch ref = (self.document.settings.pep_base_url + self.document.settings.pep_file_url_template % pepnum) - return [nodes.reference(unescape(text, True), text, refuri=ref)] + unescaped = unescape(text) + return [nodes.reference(unescape(text, True), unescaped, refuri=ref)] rfc_url = 'rfc%d.html' def rfc_reference(self, match, lineno): text = match.group(0) if 
text.startswith('RFC'): - rfcnum = int(unescape(match.group('rfcnum'))) + rfcnum = int(match.group('rfcnum')) ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum else: raise MarkupMismatch - return [nodes.reference(unescape(text, True), text, refuri=ref)] + unescaped = unescape(text) + return [nodes.reference(unescape(text, True), unescaped, refuri=ref)] def implicit_inline(self, text, lineno): """ @@ -1048,7 +1045,7 @@ class Inliner: self.implicit_inline(text[match.end():], lineno)) except MarkupMismatch: pass - return [nodes.Text(text, unescape(text, True))] + return [nodes.Text(unescape(text), rawsource=unescape(text, True))] dispatch = {'*': emphasis, '**': strong, @@ -2845,7 +2842,6 @@ class Text(RSTState): self.nested_parse(indented, input_offset=line_offset, node=definition) return itemnode, blank_finish - #@ TODO ignore null-escaped delimiter classifier_delimiter = re.compile(' +: +') def term(self, lines, lineno): @@ -2859,12 +2855,12 @@ class Text(RSTState): for i in range(len(text_nodes)): node = text_nodes[i] if isinstance(node, nodes.Text): - parts = self.classifier_delimiter.split(node) + parts = self.classifier_delimiter.split(node.rawsource) if len(parts) == 1: node_list[-1] += node else: text = parts[0].rstrip() - textnode = nodes.Text(text, unescape(text, True)) + textnode = nodes.Text(utils.unescape(text, True)) node_list[-1] += textnode for part in parts[1:]: classifier_node = nodes.classifier( diff --git a/docutils/docutils/transforms/universal.py b/docutils/docutils/transforms/universal.py index 1b42d854e..0e7f305e6 100644 --- a/docutils/docutils/transforms/universal.py +++ b/docutils/docutils/transforms/universal.py @@ -222,10 +222,9 @@ class SmartQuotes(Transform): nodes_to_skip = (nodes.FixedTextElement, nodes.Special) """Do not apply "smartquotes" to instances of these block-level nodes.""" - literal_nodes = (nodes.FixedTextElement, nodes.Special, - nodes.image, nodes.literal, nodes.math, + literal_nodes = (nodes.image, 
nodes.literal, nodes.math, nodes.raw, nodes.problematic) - """Do apply smartquotes to instances of these inline nodes.""" + """Do not change quotes in instances of these inline nodes.""" smartquotes_action = 'qDe' """Setting to select smartquote transformations. @@ -241,14 +240,14 @@ class SmartQuotes(Transform): def get_tokens(self, txtnodes): # A generator that yields ``(texttype, nodetext)`` tuples for a list # of "Text" nodes (interface to ``smartquotes.educate_tokens()``). - for node in txtnodes: - if (isinstance(node.parent, self.literal_nodes) - or isinstance(node.parent.parent, self.literal_nodes)): - yield ('literal', unicode(node)) - else: - # SmartQuotes uses backslash escapes instead of null-escapes - txt = re.sub('(?<=\x00)([-\\\'".`])', r'\\\1', unicode(node)) - yield ('plain', txt) + + texttype = {True: 'literal', # "literal" text is not changed: + False: 'plain'} + for txtnode in txtnodes: + nodetype = texttype[isinstance(txtnode.parent, + self.literal_nodes)] + yield (nodetype, txtnode.astext()) + def apply(self): smart_quotes = self.document.settings.smart_quotes diff --git a/docutils/test/test_transforms/test_smartquotes.py b/docutils/test/test_transforms/test_smartquotes.py index 2604a1e40..7e9613b78 100644 --- a/docutils/test/test_transforms/test_smartquotes.py +++ b/docutils/test/test_transforms/test_smartquotes.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# $Id: test_smartquotes.py 8190 2017-10-25 13:57:27Z milde $ +# $Id$ # # :Copyright: © 2011 Günter Milde. 
# :Maintainer: docutils-develop@lists.sourceforge.net @@ -24,8 +24,7 @@ from docutils.parsers.rst import Parser def suite(): parser = Parser() - settings = {'smart_quotes': True, - 'trim_footnote_ref_space': True} + settings = {'smart_quotes': True} s = DocutilsTestSupport.TransformTestSuite( parser, suite_settings=settings) s.generateTests(totest) @@ -44,7 +43,7 @@ totest_de = {} totest_de_alt = {} totest_locales = {} -totest['smartquotes'] = ((SmartQuotes,), [ +totest['transitions'] = ((SmartQuotes,), [ ["""\ Test "smart quotes", 'secondary smart quotes', "'nested' smart" quotes @@ -57,7 +56,7 @@ u"""\ “‘nested’ smart” quotes – and —also long— dashes. """], -[r"""Escaped \"ASCII quotes\" and \'secondary ASCII quotes\'. +[r"""Escaped \\"ASCII quotes\\" and \\'secondary ASCII quotes\\'. """, u"""\ <document source="test data"> @@ -114,7 +113,6 @@ em space "a" 'a', NBSP "a" 'a', ZWSP\u200B"a" and\u200B'a', ZWNJ\u200C"a" and\u200C'a', -escaped space\ "a" and\ 'a', —"a",—'a' en dash–"a"–'a', @@ -133,7 +131,6 @@ u"""\ NBSP “a” ‘a’, ZWSP\u200B“a” and\u200B‘a’, ZWNJ\u200C“a” and\u200C‘a’, - escaped space“a” and‘a’, <paragraph> —“a”,—‘a’ en dash–“a”–‘a’, @@ -199,7 +196,7 @@ Do not convert context-character at inline-tag boundaries and links to "targets_". Inside *"emphasized"* or other `inline "roles"`: - (``"string"``), (``'string'``), *\"betont\"*, \"*betont*". + (``"string"``), (``'string'``), *\\"betont\\"*, \\"*betont*". Do not drop characters from intra-word inline markup like *re*\ ``Structured``\ *Text*. @@ -255,104 +252,6 @@ u"""\ Text . """], -[r""" -Docutils escape mechanism uses the backslash: - -\Remove \non-escaped \backslashes\: -\item \newline \tab \" \' \*. - -\ Remove-\ escaped-\ white\ space-\ -including-\ newlines. - -\\Keep\\escaped\\backslashes\\ -(but\\only\\one). - -\\ Keep \\ space\\ around \\ backslashes. - -Keep backslashes ``\in\ literal``, :math:`in \mathrm{math}`, -and :code:`in\ code`. 
- -Test around inline elements:\ [*]_ - -*emphasized*, H\ :sub:`2`\ O and :math:`x^2` - -*emphasized*, H\ :sub:`2`\ O and :math:`x^2` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. [*] and footnotes -""", -u"""\ -<document source="test data"> - <paragraph> - Docutils escape mechanism uses the backslash: - <paragraph> - Remove non-escaped backslashes: - item newline tab " \' *. - <paragraph> - Remove-escaped-whitespace-including-newlines. - <paragraph> - \\Keep\\escaped\\backslashes\\ - (but\\only\\one). - <paragraph> - \\ Keep \\ space\\ around \\ backslashes. - <paragraph> - Keep backslashes \n\ - <literal> - \\in\\ literal - , \n\ - <math> - in \\mathrm{math} - , - and \n\ - <literal classes="code"> - in\\ code - . - <paragraph> - Test around inline elements: - <footnote_reference auto="*" ids="id1"> - <paragraph> - <emphasis> - emphasized - , H - <subscript> - 2 - O and \n\ - <math> - x^2 - <section ids="emphasized-h2o-and-x-2" names="emphasized,\\ h2o\\ and\\ x^2"> - <title> - <emphasis> - emphasized - , H - <subscript> - 2 - O and \n\ - <math> - x^2 - <footnote auto="*" ids="id2"> - <paragraph> - and footnotes -"""], -[r""" -Character-level m\ *a*\ **r**\ ``k``\ `u`:title:\p -with backslash-escaped whitespace, including new\ -lines. -""", -"""\ -<document source="test data"> - <paragraph> - Character-level m - <emphasis> - a - <strong> - r - <literal> - k - <title_reference> - u - p - with backslash-escaped whitespace, including newlines. -"""], ["""\ .. class:: language-de @@ -388,7 +287,7 @@ u"""\ """], ]) -totest_de['smartquotes'] = ((SmartQuotes,), [ +totest_de['transitions'] = ((SmartQuotes,), [ ["""\ German "smart quotes" and 'secondary smart quotes'. @@ -405,7 +304,7 @@ u"""\ """], ]) -totest_de_alt['smartquotes'] = ((SmartQuotes,), [ +totest_de_alt['transitions'] = ((SmartQuotes,), [ ["""\ Alternative German "smart quotes" and 'secondary smart quotes'. 
@@ -434,7 +333,7 @@ u"""\ """], ]) -totest_locales['smartquotes'] = ((SmartQuotes,), [ +totest_locales['transitions'] = ((SmartQuotes,), [ ["""\ German "smart quotes" and 'secondary smart quotes'. |