Fix DeprecationWarning on a2x regexes (#247)

author: Matthew Peveler <matt.peveler@gmail.com> 2022-03-01 10:35:21 -0500
committer: GitHub <noreply@github.com> 2022-03-01 10:35:21 -0500
commit: f4cf5a07e324160895872f093692d8387094c866 (patch)
tree: bf067f57c97bd3def78622573259c9830e778700
parent: 9b494802b25b60a5d21976f45e21380ed7b78766 (diff)
download: asciidoc-py3-f4cf5a07e324160895872f093692d8387094c866.tar.gz
2 files changed, 97 insertions, 42 deletions
diff --git a/asciidoc/a2x.py b/asciidoc/a2x.py
index 4348f1e..7407d05 100644
--- a/asciidoc/a2x.py
+++ b/asciidoc/a2x.py
@@ -253,6 +253,17 @@ def shell(cmd, raise_error=True):
     return (stdoutdata, stderrdata, popen.returncode)
 
 
+def get_encoding(string: bytes) -> str:
+    """
+    Given a byte representation of an XML or HTML document, find and return the encoding set
+    as an attribute, or return utf-8 if none could be found.
+    """
+    mo = re.search(br'^<\?xml.* encoding="(.*?)"', string)
+    if mo is None:
+        mo = re.search(br'<meta http\-equiv="Content\-Type" content="text\/html; charset=(.*?)">', string)
+    return mo.group(1).decode('utf-8') if mo else 'utf-8'
+
+
 def find_resources(files, tagname, attrname, filter=None):
     '''
     Search all files and return a list of local URIs from attrname attribute
@@ -286,10 +297,7 @@ def find_resources(files, tagname, attrname, filter=None):
         parser = FindResources()
         with open(filename, 'rb') as open_file:
             contents = open_file.read()
-        mo = re.search(b'\A<\?xml.* encoding="(.*?)"', contents)
-        if mo is None:
-            mo = re.search(br'<meta http\-equiv="Content\-Type" content="text\/html; charset=(.*?)">', contents)
-        contents = contents.decode(mo.group(1).decode('utf-8') if mo else 'utf-8')
+        contents = contents.decode(get_encoding(contents))
         parser.feed(contents)
         parser.close()
     result = list(set(result))   # Drop duplicate values.
@@ -318,51 +326,53 @@ def exec_xsltproc(xsl_file, xml_file, dst_dir, opts=''):
         shell_cd(cwd)
 
 
-def get_source_options(asciidoc_file):
+def get_source_options(asciidoc_file: str) -> List[str]:
     '''
     Look for a2x command options in AsciiDoc source file.
     Limitation: options cannot contain double-quote characters.
     '''
-    def parse_options():
-        # Parse options to result sequence.
-        inquotes = False
-        opt = ''
-        for c in options:
-            if c == '"':
-                if inquotes:
-                    result.append(opt)
-                    opt = ''
-                    inquotes = False
-                else:
-                    inquotes = True
-            elif c == ' ':
-                if inquotes:
-                    opt += c
-                elif opt:
-                    result.append(opt)
-                    opt = ''
+    result = []
+
+    if not os.path.isfile(asciidoc_file):
+        return result
+
+    options = ''
+    with open(asciidoc_file, 'rb') as f:
+        line_number = 0
+        for line in f:
+            line_number += 1
+            mo = re.search(br'^//\s*a2x:', line)
+            if mo:
+                try:
+                    options += ' ' + line[mo.end():].strip().decode('ascii')
+                except UnicodeDecodeError as e:
+                    warning(
+                        "Could not decode option to %s " % e.encoding +
+                        "on line %s in %s" % (line_number, asciidoc_file)
+                    )
+
+    # Parse options to result sequence.
+    inquotes = False
+    opt = ''
+    for c in options:
+        if c == '"':
+            if inquotes:
+                result.append(opt)
+                opt = ''
+                inquotes = False
             else:
+                inquotes = True
+        elif c == ' ':
+            if inquotes:
                 opt += c
-        if opt:
-            result.append(opt)
+            elif opt:
+                result.append(opt)
+                opt = ''
+        else:
+            opt += c
+    if opt:
+        result.append(opt)
 
-    result = []
-    if os.path.isfile(asciidoc_file):
-        options = ''
-        with open(asciidoc_file, 'rb') as f:
-            line_number = 0
-            for line in f:
-                line_number += 1
-                mo = re.search(b'^//\s*a2x:', line)
-                if mo:
-                    try:
-                        options += ' ' + line[mo.end():].strip().decode('ascii')
-                    except UnicodeDecodeError as e:
-                        warning(
-                            "Could not decode option to %s " % e.encoding +
-                            "on line %s in %s" % (line_number, asciidoc_file)
-                        )
-        parse_options()
     return result
 
 
diff --git a/tests/test_a2x.py b/tests/test_a2x.py
index 6546636..bb16346 100644
--- a/tests/test_a2x.py
+++ b/tests/test_a2x.py
@@ -32,3 +32,48 @@ def test_parse_args(input, expected_argv, expected_opts, expected_args):
     assert argv == expected_argv
     assert opts == expected_opts
     assert args == expected_args
+
+
+@pytest.mark.parametrize(
+    "input,expected",
+    (
+        (b'', 'utf-8'),
+        (b'<?xml version="1.0" encoding="ASCII"?>', 'ASCII'),
+        (b'<?xml version="1.0"?>', 'utf-8'),
+        (
+            b'<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\n<html>\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=ASCII">',
+            'ASCII',
+        ),
+        (
+            b'<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\n<html>\n<head>\n<title>Foo</title>\n</head>',
+            'utf-8',
+        ),
+    ),
+)
+def test_get_encoding(input, expected):
+    assert a2x.get_encoding(input) == expected
+
+
+@pytest.mark.parametrize(
+    "input,expected",
+    (
+        ('', []),
+        (
+            """
+// a2x: --foo --bar
+//      a2x: --baz
+// a2x: "--foo --bar
+// a2x: --baz" --qux
+            """,
+            ['--foo', '--bar', '--baz', "--foo --bar --baz", '--qux'],
+        ),
+    )
+)
+def test_get_source_options(tmp_path, input, expected):
+    test_file = tmp_path / 'test_file.adoc'
+    test_file.write_text(input)
+    assert a2x.get_source_options(str(test_file)) == expected
+
+
+def test_get_source_options_non_existing_file():
+    assert a2x.get_source_options('/some/non/existing/file') == []
author	Matthew Peveler <matt.peveler@gmail.com>	2022-03-01 10:35:21 -0500
committer	GitHub <noreply@github.com>	2022-03-01 10:35:21 -0500
commit	f4cf5a07e324160895872f093692d8387094c866 (patch)
tree	bf067f57c97bd3def78622573259c9830e778700
parent	9b494802b25b60a5d21976f45e21380ed7b78766 (diff)
download	asciidoc-py3-f4cf5a07e324160895872f093692d8387094c866.tar.gz