From f4cf5a07e324160895872f093692d8387094c866 Mon Sep 17 00:00:00 2001 From: Matthew Peveler Date: Tue, 1 Mar 2022 10:35:21 -0500 Subject: Fix DeprecationWarning on a2x regexes (#247) --- asciidoc/a2x.py | 94 ++++++++++++++++++++++++++++++------------------------- tests/test_a2x.py | 45 ++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 42 deletions(-) diff --git a/asciidoc/a2x.py b/asciidoc/a2x.py index 4348f1e..7407d05 100644 --- a/asciidoc/a2x.py +++ b/asciidoc/a2x.py @@ -253,6 +253,17 @@ def shell(cmd, raise_error=True): return (stdoutdata, stderrdata, popen.returncode) +def get_encoding(string: bytes) -> str: + """ + Given a byte representation of a XML or HTML document, find and return the encoding set + as an attribute, or return utf-8 if none could be found. + """ + mo = re.search(br'^<\?xml.* encoding="(.*?)"', string) + if mo is None: + mo = re.search(br'<meta http-equiv="Content-Type" content="text/html; charset=(.*?)">', string) + return mo.group(1).decode('utf-8') if mo else 'utf-8' + + def find_resources(files, tagname, attrname, filter=None): ''' Search all files and return a list of local URIs from attrname attribute @@ -286,10 +297,7 @@ def find_resources(files, tagname, attrname, filter=None): parser = FindResources() with open(filename, 'rb') as open_file: contents = open_file.read() - mo = re.search(b'\A<\?xml.* encoding="(.*?)"', contents) - if mo is None: - mo = re.search(br'<meta http-equiv="Content-Type" content="text/html; charset=(.*?)">', contents) - contents = contents.decode(mo.group(1).decode('utf-8') if mo else 'utf-8') + contents = contents.decode(get_encoding(contents)) parser.feed(contents) parser.close() result = list(set(result)) # Drop duplicate values. @@ -318,51 +326,53 @@ def exec_xsltproc(xsl_file, xml_file, dst_dir, opts=''): shell_cd(cwd) -def get_source_options(asciidoc_file): +def get_source_options(asciidoc_file: str) -> List[str]: ''' Look for a2x command options in AsciiDoc source file. Limitation: options cannot contain double-quote characters. ''' - def parse_options(): - # Parse options to result sequence. 
- inquotes = False - opt = '' - for c in options: - if c == '"': - if inquotes: - result.append(opt) - opt = '' - inquotes = False - else: - inquotes = True - elif c == ' ': - if inquotes: - opt += c - elif opt: - result.append(opt) - opt = '' + result = [] + + if not os.path.isfile(asciidoc_file): + return result + + options = '' + with open(asciidoc_file, 'rb') as f: + line_number = 0 + for line in f: + line_number += 1 + mo = re.search(br'^//\s*a2x:', line) + if mo: + try: + options += ' ' + line[mo.end():].strip().decode('ascii') + except UnicodeDecodeError as e: + warning( + "Could not decode option to %s " % e.encoding + + "on line %s in %s" % (line_number, asciidoc_file) + ) + + # Parse options to result sequence. + inquotes = False + opt = '' + for c in options: + if c == '"': + if inquotes: + result.append(opt) + opt = '' + inquotes = False else: + inquotes = True + elif c == ' ': + if inquotes: opt += c - if opt: - result.append(opt) + elif opt: + result.append(opt) + opt = '' + else: + opt += c + if opt: + result.append(opt) - result = [] - if os.path.isfile(asciidoc_file): - options = '' - with open(asciidoc_file, 'rb') as f: - line_number = 0 - for line in f: - line_number += 1 - mo = re.search(b'^//\s*a2x:', line) - if mo: - try: - options += ' ' + line[mo.end():].strip().decode('ascii') - except UnicodeDecodeError as e: - warning( - "Could not decode option to %s " % e.encoding + - "on line %s in %s" % (line_number, asciidoc_file) - ) - parse_options() return result diff --git a/tests/test_a2x.py b/tests/test_a2x.py index 6546636..bb16346 100644 --- a/tests/test_a2x.py +++ b/tests/test_a2x.py @@ -32,3 +32,48 @@ def test_parse_args(input, expected_argv, expected_opts, expected_args): assert argv == expected_argv assert opts == expected_opts assert args == expected_args + + +@pytest.mark.parametrize( + "input,expected", + ( + (b'', 'utf-8'), + (b'', 'ASCII'), + (b'', 'utf-8'), + ( + b'\n\n\n', + 'ASCII', + ), + ( + b'\n\n\nFoo\n', + 'utf-8', + ), + ), 
+) +def test_get_encoding(input, expected): + assert a2x.get_encoding(input) == expected + + +@pytest.mark.parametrize( + "input,expected", + ( + ('', []), + ( + """ +// a2x: --foo --bar +// a2x: --baz +// a2x: "--foo --bar +// a2x: --baz" --qux + """, + ['--foo', '--bar', '--baz', "--foo --bar --baz", '--qux'], + ), + ) +) +def test_get_source_options(tmp_path, input, expected): + test_file = tmp_path / 'test_file.adoc' + test_file.write_text(input) + assert a2x.get_source_options(str(test_file)) == expected + + +def test_get_source_options_non_existing_file(): + assert a2x.get_source_options('/some/non/existing/file') == [] -- cgit v1.2.1