summary refs log tree commit diff
diff options
context:
space:
mode:
author Matthew Peveler <matt.peveler@gmail.com> 2022-03-01 10:35:21 -0500
committer GitHub <noreply@github.com> 2022-03-01 10:35:21 -0500
commit f4cf5a07e324160895872f093692d8387094c866 (patch)
tree bf067f57c97bd3def78622573259c9830e778700
parent 9b494802b25b60a5d21976f45e21380ed7b78766 (diff)
download asciidoc-py3-f4cf5a07e324160895872f093692d8387094c866.tar.gz
Fix DeprecationWarning on a2x regexes (#247)
-rw-r--r-- asciidoc/a2x.py 94
-rw-r--r-- tests/test_a2x.py 45
2 files changed, 97 insertions, 42 deletions
diff --git a/asciidoc/a2x.py b/asciidoc/a2x.py
index 4348f1e..7407d05 100644
--- a/asciidoc/a2x.py
+++ b/asciidoc/a2x.py
@@ -253,6 +253,17 @@ def shell(cmd, raise_error=True):
return (stdoutdata, stderrdata, popen.returncode)
+def get_encoding(string: bytes) -> str:
+ """
+ Given a byte representation of a XML or HTML document, find and return the encoding set
+ as an attribute, or return utf-8 if none could be found.
+ """
+ mo = re.search(br'^<\?xml.* encoding="(.*?)"', string)
+ if mo is None:
+ mo = re.search(br'<meta http\-equiv="Content\-Type" content="text\/html; charset=(.*?)">', string)
+ return mo.group(1).decode('utf-8') if mo else 'utf-8'
+
+
def find_resources(files, tagname, attrname, filter=None):
'''
Search all files and return a list of local URIs from attrname attribute
@@ -286,10 +297,7 @@ def find_resources(files, tagname, attrname, filter=None):
parser = FindResources()
with open(filename, 'rb') as open_file:
contents = open_file.read()
- mo = re.search(b'\A<\?xml.* encoding="(.*?)"', contents)
- if mo is None:
- mo = re.search(br'<meta http\-equiv="Content\-Type" content="text\/html; charset=(.*?)">', contents)
- contents = contents.decode(mo.group(1).decode('utf-8') if mo else 'utf-8')
+ contents = contents.decode(get_encoding(contents))
parser.feed(contents)
parser.close()
result = list(set(result)) # Drop duplicate values.
@@ -318,51 +326,53 @@ def exec_xsltproc(xsl_file, xml_file, dst_dir, opts=''):
shell_cd(cwd)
-def get_source_options(asciidoc_file):
+def get_source_options(asciidoc_file: str) -> List[str]:
'''
Look for a2x command options in AsciiDoc source file.
Limitation: options cannot contain double-quote characters.
'''
- def parse_options():
- # Parse options to result sequence.
- inquotes = False
- opt = ''
- for c in options:
- if c == '"':
- if inquotes:
- result.append(opt)
- opt = ''
- inquotes = False
- else:
- inquotes = True
- elif c == ' ':
- if inquotes:
- opt += c
- elif opt:
- result.append(opt)
- opt = ''
+ result = []
+
+ if not os.path.isfile(asciidoc_file):
+ return result
+
+ options = ''
+ with open(asciidoc_file, 'rb') as f:
+ line_number = 0
+ for line in f:
+ line_number += 1
+ mo = re.search(br'^//\s*a2x:', line)
+ if mo:
+ try:
+ options += ' ' + line[mo.end():].strip().decode('ascii')
+ except UnicodeDecodeError as e:
+ warning(
+ "Could not decode option to %s " % e.encoding +
+ "on line %s in %s" % (line_number, asciidoc_file)
+ )
+
+ # Parse options to result sequence.
+ inquotes = False
+ opt = ''
+ for c in options:
+ if c == '"':
+ if inquotes:
+ result.append(opt)
+ opt = ''
+ inquotes = False
else:
+ inquotes = True
+ elif c == ' ':
+ if inquotes:
opt += c
- if opt:
- result.append(opt)
+ elif opt:
+ result.append(opt)
+ opt = ''
+ else:
+ opt += c
+ if opt:
+ result.append(opt)
- result = []
- if os.path.isfile(asciidoc_file):
- options = ''
- with open(asciidoc_file, 'rb') as f:
- line_number = 0
- for line in f:
- line_number += 1
- mo = re.search(b'^//\s*a2x:', line)
- if mo:
- try:
- options += ' ' + line[mo.end():].strip().decode('ascii')
- except UnicodeDecodeError as e:
- warning(
- "Could not decode option to %s " % e.encoding +
- "on line %s in %s" % (line_number, asciidoc_file)
- )
- parse_options()
return result
diff --git a/tests/test_a2x.py b/tests/test_a2x.py
index 6546636..bb16346 100644
--- a/tests/test_a2x.py
+++ b/tests/test_a2x.py
@@ -32,3 +32,48 @@ def test_parse_args(input, expected_argv, expected_opts, expected_args):
assert argv == expected_argv
assert opts == expected_opts
assert args == expected_args
+
+
+@pytest.mark.parametrize(
+ "input,expected",
+ (
+ (b'', 'utf-8'),
+ (b'<?xml version="1.0" encoding="ASCII"?>', 'ASCII'),
+ (b'<?xml version="1.0"?>', 'utf-8'),
+ (
+ b'<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\n<html>\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=ASCII">',
+ 'ASCII',
+ ),
+ (
+ b'<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\n<html>\n<head>\n<title>Foo</title>\n</head>',
+ 'utf-8',
+ ),
+ ),
+)
+def test_get_encoding(input, expected):
+ assert a2x.get_encoding(input) == expected
+
+
+@pytest.mark.parametrize(
+ "input,expected",
+ (
+ ('', []),
+ (
+ """
+// a2x: --foo --bar
+// a2x: --baz
+// a2x: "--foo --bar
+// a2x: --baz" --qux
+ """,
+ ['--foo', '--bar', '--baz', "--foo --bar --baz", '--qux'],
+ ),
+ )
+)
+def test_get_source_options(tmp_path, input, expected):
+ test_file = tmp_path / 'test_file.adoc'
+ test_file.write_text(input)
+ assert a2x.get_source_options(str(test_file)) == expected
+
+
+def test_get_source_options_non_existing_file():
+ assert a2x.get_source_options('/some/non/existing/file') == []