tests: Port gen-casefold-txt.pl and gen-casemap-txt.pl to Python 3. See #1332

I've tried to keep the code structure roughly the same.
author: Christoph Reiter <creiter@src.gnome.org> 2018-05-29 11:31:49 +0200
committer: Christoph Reiter <creiter@src.gnome.org> 2018-06-12 22:18:03 +0200
commit: a580185cdca0de76cf71bd6b3c01230d281f5a05 (patch)
tree: 37e714142bf43a61a62fc3bbb72d8a87ac7498c4
parent: 603d40467c98ad56880021c804ce1f42a1663cc9 (diff)
download: glib-a580185cdca0de76cf71bd6b3c01230d281f5a05.tar.gz
8 files changed, 283 insertions, 343 deletions
diff --git a/tests/Makefile.am b/tests/Makefile.am
index de3ddb49e..fba18655d 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -146,8 +146,8 @@ endif
 
 EXTRA_DIST += \
 	$(test_scripts)				\
-	gen-casefold-txt.pl			\
-	gen-casemap-txt.pl			\
+	gen-casefold-txt.py			\
+	gen-casemap-txt.py			\
 	iochannel-test-infile			\
 	timeloop-basic.c			\
 	assert-msg-test.gdb
diff --git a/tests/casefold.txt b/tests/casefold.txt
index f7b47abd2..6043c1201 100644
--- a/tests/casefold.txt
+++ b/tests/casefold.txt
@@ -1,5 +1,5 @@
 # Test cases generated from Unicode 10.0.0 data
-# by gen-casefold-test.pl. Do not edit.
+# by gen-casefold-txt.py. Do not edit.
 #
 # Some special hand crafted tests
 #
diff --git a/tests/casemap.txt b/tests/casemap.txt
index 5e983f70f..6533e8dd9 100644
--- a/tests/casemap.txt
+++ b/tests/casemap.txt
@@ -1,5 +1,5 @@
 # Test cases generated from Unicode 10.0.0 data
-# by gen-case-tests.pl. Do not edit.
+# by gen-casemap-txt.py. Do not edit.
 #
 # Some special hand crafted tests
 #
diff --git a/tests/gen-casefold-txt.pl b/tests/gen-casefold-txt.pl
deleted file mode 100755
index 2a6a0d4b1..000000000
--- a/tests/gen-casefold-txt.pl
+++ /dev/null
@@ -1,82 +0,0 @@
-#! /usr/bin/perl -w
-
-#    Copyright (C) 1998, 1999 Tom Tromey
-#    Copyright (C) 2001 Red Hat Software
-
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2, or (at your option)
-#    any later version.
-
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, see <http://www.gnu.org/licenses/>.
-
-# gen-casefold-test.pl - Generate test cases for casefolding from Unicode data.
-# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
-# Usage: 
-# I consider the output of this program to be unrestricted.  Use it as
-# you will.
-
-require 5.006;
-
-# Names of fields in the CaseFolding table
-$FOLDING_CODE = 0;
-$FOLDING_STATUS = 1;
-$FOLDING_MAPPING = 2;
-
-my $casefoldlen = 0;
-my @casefold;
-
-if (@ARGV != 2) {
-    $0 =~ s@.*/@@;
-    die "Usage: $0 UNICODE-VERSION  CaseFolding.txt\n";
-}
- 
-print <<EOT;
-# Test cases generated from Unicode $ARGV[0] data
-# by gen-casefold-test.pl. Do not edit.
-#
-# Some special hand crafted tests
-#
-AaBbCc@@\taabbcc@@
-#
-# Now the automatic tests
-#
-EOT
-
-binmode STDOUT, ":utf8";
-open (INPUT, "< $ARGV[1]") || exit 1;
-
-while (<INPUT>)
-{
-    chop;
-
-    next if /^#/;
-    next if /^\s*$/;
-
-    s/\s*#.*//;
-
-    my @fields = split ('\s*;\s*', $_, 30);
-
-    my $raw_code = $fields[$FOLDING_CODE];
-    my $code = hex ($raw_code);
-
-    if ($#fields != 3)
-    {
-	printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
-	next;
-    }
-
-    # skip simple and Turkic mappings
-    next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/);
-
-    @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];
-    printf ("%s\t%s\n", pack ("U", $code), pack ("U*", @values));
-}
-
-close INPUT;
diff --git a/tests/gen-casefold-txt.py b/tests/gen-casefold-txt.py
new file mode 100755
index 000000000..3c55828d3
--- /dev/null
+++ b/tests/gen-casefold-txt.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+# Copyright (C) 1998, 1999 Tom Tromey
+# Copyright (C) 2001 Red Hat Software
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+"""
+gen-casefold-txt.py - Generate test cases for casefolding from Unicode data.
+See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
+Usage: gen-casefold-txt.py UNICODE-VERSION CaseFolding.txt
+
+I consider the output of this program to be unrestricted.  Use it as you will.
+"""
+
+import sys
+import argparse
+
+
+def main(argv):
+    """Print casefolding test cases to stdout; argv[0] is the program name."""
+    parser = argparse.ArgumentParser(
+        description="Generate test cases for casefolding from Unicode data")
+    parser.add_argument("UNICODE-VERSION")
+    parser.add_argument("CaseFolding.txt")
+    args = parser.parse_args(argv[1:])
+    version = getattr(args, "UNICODE-VERSION")
+    filename = getattr(args, "CaseFolding.txt")
+
+    print("""\
+# Test cases generated from Unicode {} data
+# by gen-casefold-txt.py. Do not edit.
+#
+# Some special hand crafted tests
+#
+AaBbCc@@\taabbcc@@
+#
+# Now the automatic tests
+#""".format(version))
+
+    # Names of fields in the CaseFolding table
+    CODE, STATUS, MAPPING = range(3)
+
+    with open(filename, encoding="utf-8") as fileobj:
+        for line in fileobj:
+            # strip comments and skip empty lines
+            line = line.split("#", 1)[0].strip()
+            if not line:
+                continue
+
+            fields = [f.strip() for f in line.split(";", 3)[:3]]
+            if len(fields) != 3:
+                raise SystemExit(
+                    "Entry for %s has wrong number of fields (%d)" % (
+                        fields[CODE], len(fields)))
+
+            # skip simple (S) and Turkic (T) mappings.  Compare whole values:
+            # a substring test against "ST" would also match "" and "ST".
+            if fields[STATUS] in ("S", "T"):
+                continue
+
+            code = chr(int(fields[CODE], 16))
+            values = "".join(chr(int(v, 16)) for v in fields[MAPPING].split())
+            print("{}\t{}".format(code, values))
+
+
+if __name__ == "__main__":  # only when run as a script, not on import
+    sys.exit(main(sys.argv))  # main() returns None, i.e. exit status 0
diff --git a/tests/gen-casemap-txt.pl b/tests/gen-casemap-txt.pl
deleted file mode 100755
index 3ae419ca9..000000000
--- a/tests/gen-casemap-txt.pl
+++ /dev/null
@@ -1,256 +0,0 @@
-#! /usr/bin/perl -w
-
-#    Copyright (C) 1998, 1999 Tom Tromey
-#    Copyright (C) 2001 Red Hat Software
-
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2, or (at your option)
-#    any later version.
-
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, see <http://www.gnu.org/licenses/>.
-
-# gen-casemap-test.pl - Generate test cases for case mapping from Unicode data.
-# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
-# I consider the output of this program to be unrestricted.  Use it as
-# you will.
-
-require 5.006;
-use utf8;
-
-if (@ARGV != 3) {
-    $0 =~ s@.*/@@;
-    die "Usage: $0 UNICODE-VERSION UnicodeData.txt SpecialCasing.txt\n";
-}
- 
-use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION);
-
-# Names of fields in Unicode data table.
-$CODE = 0;
-$NAME = 1;
-$CATEGORY = 2;
-$COMBINING_CLASSES = 3;
-$BIDI_CATEGORY = 4;
-$DECOMPOSITION = 5;
-$DECIMAL_VALUE = 6;
-$DIGIT_VALUE = 7;
-$NUMERIC_VALUE = 8;
-$MIRRORED = 9;
-$OLD_NAME = 10;
-$COMMENT = 11;
-$UPPER = 12;
-$LOWER = 13;
-$TITLE = 14;
-
-# Names of fields in the SpecialCasing table
-$CASE_CODE = 0;
-$CASE_LOWER = 1;
-$CASE_TITLE = 2;
-$CASE_UPPER = 3;
-$CASE_CONDITION = 4;
-
-my @upper;
-my @title;
-my @lower;
-
-binmode STDOUT, ":utf8";
-open (INPUT, "< $ARGV[1]") || exit 1;
-
-$last_code = -1;
-while (<INPUT>)
-{
-    chop;
-    @fields = split (';', $_, 30);
-    if ($#fields != 14)
-    {
-	printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
-    }
-
-    $code = hex ($fields[$CODE]);
-
-    if ($code > $last_code + 1)
-    {
-	# Found a gap.
-	if ($fields[$NAME] =~ /Last>/)
-	{
-	    # Fill the gap with the last character read,
-            # since this was a range specified in the char database
-	    @gfields = @fields;
-	}
-	else
-	{
-	    # The gap represents undefined characters.  Only the type
-	    # matters.
-	    @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
-			'', '', '', '');
-	}
-	for (++$last_code; $last_code < $code; ++$last_code)
-	{
-	    $gfields{$CODE} = sprintf ("%04x", $last_code);
-	    &process_one ($last_code, @gfields);
-	}
-    }
-    &process_one ($code, @fields);
-    $last_code = $code;
-}
-
-close INPUT;
-
-open (INPUT, "< $ARGV[2]") || exit 1;
-
-while (<INPUT>)
-{
-    my $code;
-    
-    chop;
-
-    next if /^#/;
-    next if /^\s*$/;
-
-    s/\s*#.*//;
-
-    @fields = split ('\s*;\s*', $_, 30);
-
-    $raw_code = $fields[$CASE_CODE];
-    $code = hex ($raw_code);
-
-    if ($#fields != 4 && $#fields != 5)
-    {
-	printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
-	next;
-    }
-
-    if (defined $fields[5]) {
-	# Ignore conditional special cases - we'll handle them manually
-	next;
-    }
-
-    $upper[$code] = &make_hex ($fields[$CASE_UPPER]);
-    $lower[$code] = &make_hex ($fields[$CASE_LOWER]);
-    $title[$code] = &make_hex ($fields[$CASE_TITLE]);
-}
-
-close INPUT;
-
-print <<EOT;
-# Test cases generated from Unicode $ARGV[0] data
-# by gen-case-tests.pl. Do not edit.
-#
-# Some special hand crafted tests
-#
-tr_TR\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
-tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
-tr_TR\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I
-tr_TR.UTF-8\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
-tr_TR.UTF-8\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
-tr_TR.UTF-8\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I
-# Test reordering of YPOGEGRAMMENI across other accents
-\t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t
-\t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t
-# Handling of final and nonfinal sigma
-	ΜΆΙΟΣ 	μάιος 	Μάιος 	ΜΆΙΟΣ 	
-	ΜΆΙΟΣ	μάιος	Μάιος	ΜΆΙΟΣ	
-	ΣΙΓΜΑ	σιγμα	Σιγμα	ΣΙΓΜΑ	
-# Lithuanian rule of i followed by letter with dot. Not at all sure
-# about the titlecase part here
-lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t
-lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t
-lt_LT\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
-lt_LT\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
-lt_LT\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
-lt_LT\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
-lt_LT\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
-lt_LT\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
-lt_LT\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
-lt_LT\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
-lt_LT\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
-lt_LT.UTF-8\ti\x{117}\ti\x{117}\tIe\tIE\t
-lt_LT.UTF-8\tie\x{307}\tie\x{307}\tIe\tIE\t
-lt_LT.UTF-8\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
-lt_LT.UTF-8\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
-lt_LT.UTF-8\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
-lt_LT.UTF-8\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
-lt_LT.UTF-8\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
-lt_LT.UTF-8\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
-lt_LT.UTF-8\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
-lt_LT.UTF-8\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
-lt_LT.UTF-8\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
-# Special case not at initial position
-\ta\x{fb04}\ta\x{fb04}\tAffl\tAFFL\t# FB04
-#
-# Now the automatic tests
-#
-EOT
-&print_tests;
-
-exit 0;
-
-# Process a single character.
-sub process_one
-{
-    my ($code, @fields) = @_;
-
-    my $type =  $fields[$CATEGORY];
-    if ($type eq 'Ll')
-    {
-	$upper[$code] = make_hex ($fields[$UPPER]);
-	$lower[$code] = pack ("U", $code);
-	$title[$code] = make_hex ($fields[$TITLE]);
-    }
-    elsif ($type eq 'Lu')
-    {
-	$lower[$code] = make_hex ($fields[$LOWER]);
-	$upper[$code] = pack ("U", $code);
-	$title[$code] = make_hex ($fields[$TITLE]);
-    }
-
-    if ($type eq 'Lt')
-    {
-	$upper[$code] = make_hex ($fields[$UPPER]);
-	$lower[$code] = pack ("U", hex ($fields[$LOWER]));
-	$title[$code] = make_hex ($fields[$LOWER]);
-    }
-}
-
-sub print_tests
-{
-    for ($i = 0; $i < 0x10ffff; $i++) {
-	if ($i == 0x3A3) {
-	    # Greek sigma needs special tests
-	    next;
-	}
-	
-	my $lower = $lower[$i];
-	my $title = $title[$i];
-	my $upper = $upper[$i];
-
-	if (defined $upper || defined $lower || defined $title) {
-	    printf "\t%s\t%s\t%s\t%s\t# %4X\n",
-		    pack ("U", $i),
-		    (defined $lower ? $lower : ""),
-		    (defined $title ? $title : ""),
-		    (defined $upper ? $upper : ""),
-                    $i;
-	}
-    }
-}
-
-sub make_hex
-{
-    my $codes = shift;
-
-    $codes =~ s/^\s+//;
-    $codes =~ s/\s+$//;
-
-    if ($codes eq "0" || $codes eq "") {
-	return "";
-    } else {
-	return pack ("U*", map { hex ($_) } split /\s+/, $codes);
-    }
-}
diff --git a/tests/gen-casemap-txt.py b/tests/gen-casemap-txt.py
new file mode 100755
index 000000000..98f6bc969
--- /dev/null
+++ b/tests/gen-casemap-txt.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+# Copyright (C) 1998, 1999 Tom Tromey
+# Copyright (C) 2001 Red Hat Software
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+"""
+gen-casemap-txt.py - Generate test cases for case mapping from Unicode data.
+See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
+Usage: gen-casemap-txt.py UNICODE-VERSION UnicodeData.txt SpecialCasing.txt
+
+I consider the output of this program to be unrestricted.  Use it as you will.
+"""
+
+import sys
+import argparse
+
+
+def main(argv):
+    """Build case maps from the two data files in argv and print the tests."""
+    parser = argparse.ArgumentParser(
+        description="Generate test cases for case mapping from Unicode data")
+    parser.add_argument("UNICODE-VERSION")
+    parser.add_argument("UnicodeData.txt")
+    parser.add_argument("SpecialCasing.txt")
+    args = parser.parse_args(argv[1:])
+    version = getattr(args, "UNICODE-VERSION")
+    filename_udata = getattr(args, "UnicodeData.txt")
+    filename_casing = getattr(args, "SpecialCasing.txt")
+
+    # Names of fields in Unicode data table.
+    CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY, DECOMPOSITION, \
+        DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED, OLD_NAME, \
+        COMMENT, UPPER, LOWER, TITLE = range(15)
+
+    # Names of fields in the SpecialCasing table
+    CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5)
+
+    upper = {}
+    title = {}
+    lower = {}
+
+    def make_hex(codes):
+        """Converts a string of white space separated code points encoded as
+        hex values to a Unicode string. Any extra white space is ignored.
+        """
+        return "".join([chr(int(c, 16)) for c in codes.split()])
+
+    def process_one(code, fields):
+        """Record upper/lower/title strings for an Ll, Lu or Lt entry."""
+        type_ = fields[CATEGORY]
+        if type_ == "Ll":
+            upper[code] = make_hex(fields[UPPER])
+            lower[code] = chr(code)
+            title[code] = make_hex(fields[TITLE])
+        elif type_ == "Lu":
+            lower[code] = make_hex(fields[LOWER])
+            upper[code] = chr(code)
+            title[code] = make_hex(fields[TITLE])
+        elif type_ == "Lt":
+            upper[code] = make_hex(fields[UPPER])
+            lower[code] = make_hex(fields[LOWER])
+            title[code] = make_hex(fields[LOWER])  # sic: same as the .pl
+
+    with open(filename_udata, encoding="utf-8") as fileobj:
+        last_code = -1
+        for line in fileobj:
+            line = line.strip()
+            fields = [f.strip() for f in line.split(";")]
+            if len(fields) != 15:
+                raise SystemExit(
+                    "Entry for %s has wrong number of fields (%d)" % (
+                        fields[CODE], len(fields)))
+
+            code = int(fields[CODE], 16)
+
+            if code > last_code + 1:
+                # Found a gap
+                if fields[NAME].endswith("Last>"):
+                    # Fill the gap with the last character read,
+                    # since this was a range specified in the char database.
+                    # Copy, so that patching CODE below leaves fields intact.
+                    gfields = list(fields)
+                else:
+                    # The gap represents undefined characters.  Only the type
+                    # matters.
+                    gfields = ['', '', 'Cn', '0', '', '', '', '', '', '', '',
+                               '', '', '', '']
+
+                for gap_code in range(last_code + 1, code):
+                    gfields[CODE] = "%04x" % gap_code
+                    process_one(gap_code, gfields)
+
+            process_one(code, fields)
+            last_code = code
+
+    with open(filename_casing, encoding="utf-8") as fileobj:
+        for line in fileobj:
+            # strip comments and skip empty lines
+            line = line.split("#", 1)[0].strip()
+            if not line:
+                continue
+
+            # all lines end with ";" so just remove it
+            line = line.rstrip(";").rstrip()
+            fields = [f.strip() for f in line.split(";")]
+            if len(fields) not in (4, 5):
+                raise SystemExit(
+                    "Entry for %s has wrong number of fields (%d)" % (
+                        fields[CASE_CODE], len(fields)))
+
+            if len(fields) == 5:
+                # Ignore conditional special cases - we'll handle them manually
+                continue
+
+            code = int(fields[CASE_CODE], 16)
+
+            upper[code] = make_hex(fields[CASE_UPPER])
+            lower[code] = make_hex(fields[CASE_LOWER])
+            title[code] = make_hex(fields[CASE_TITLE])
+
+    print_tests(version, upper, title, lower)
+
+
+def print_tests(version, upper, title, lower):
+    """Print the fixed header, then a test row for every mapped code point."""
+    print("""\
+# Test cases generated from Unicode {} data
+# by gen-casemap-txt.py. Do not edit.
+#
+# Some special hand crafted tests
+#
+tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
+tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
+tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
+tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
+tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
+tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
+# Test reordering of YPOGEGRAMMENI across other accents
+\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
+\t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
+# Handling of final and nonfinal sigma
+\tΜΆΙΟΣ 	μάιος 	Μάιος 	ΜΆΙΟΣ 	
+\tΜΆΙΟΣ	μάιος	Μάιος	ΜΆΙΟΣ	
+\tΣΙΓΜΑ	σιγμα	Σιγμα	ΣΙΓΜΑ	
+# Lithuanian rule of i followed by letter with dot. Not at all sure
+# about the titlecase part here
+lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
+lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
+lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
+lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
+lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
+lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
+lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
+lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
+lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
+lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
+lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
+lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
+lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
+lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
+lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
+lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
+lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
+lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
+lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
+lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
+lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
+lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
+# Special case not at initial position
+\ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
+#
+# Now the automatic tests
+#""".format(version))
+
+    for cp in range(0x10ffff):
+        # Greek sigma needs special tests
+        if cp != 0x3A3:
+            lo_str = lower.get(cp, "")
+            ti_str = title.get(cp, "")
+            up_str = upper.get(cp, "")
+            # emit a row only when at least one mapping is non-empty
+            if up_str or lo_str or ti_str:
+                row = (chr(cp), lo_str, ti_str, up_str, cp)
+                print("\t%s\t%s\t%s\t%s\t# %4X" % row)
+
+
+if __name__ == "__main__":  # only when run as a script, not on import
+    sys.exit(main(sys.argv))  # main() returns None, i.e. exit status 0
diff --git a/tests/unicode-caseconv.c b/tests/unicode-caseconv.c
index affb55888..c124633d1 100644
--- a/tests/unicode-caseconv.c
+++ b/tests/unicode-caseconv.c
@@ -57,7 +57,7 @@ int main (int argc, char **argv)
       
       test = strings[1];
 
-      /* gen-casemap-txt.pl uses an empty string when a single character
+      /* gen-casemap-txt.py uses an empty string when a single character
        * doesn't have an equivalent in a particular case; since that behavior
        * is nonsense for multicharacter strings, it would make more sense
        * to put the expected result .. the original character unchanged. But
author	Christoph Reiter <creiter@src.gnome.org>	2018-05-29 11:31:49 +0200
committer	Christoph Reiter <creiter@src.gnome.org>	2018-06-12 22:18:03 +0200
commit	a580185cdca0de76cf71bd6b3c01230d281f5a05 (patch)
tree	37e714142bf43a61a62fc3bbb72d8a87ac7498c4
parent	603d40467c98ad56880021c804ce1f42a1663cc9 (diff)
download	glib-a580185cdca0de76cf71bd6b3c01230d281f5a05.tar.gz