diff options
author | Christoph Reiter <creiter@src.gnome.org> | 2018-05-29 11:31:49 +0200 |
---|---|---|
committer | Christoph Reiter <creiter@src.gnome.org> | 2018-06-12 22:18:03 +0200 |
commit | a580185cdca0de76cf71bd6b3c01230d281f5a05 (patch) | |
tree | 37e714142bf43a61a62fc3bbb72d8a87ac7498c4 | |
parent | 603d40467c98ad56880021c804ce1f42a1663cc9 (diff) | |
download | glib-a580185cdca0de76cf71bd6b3c01230d281f5a05.tar.gz |
tests: Port gen-casefold-txt.pl and gen-casemap-txt.pl to Python 3. See #1332
I've tried to keep the code structure roughly the same.
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | tests/Makefile.am | 4 |
| -rw-r--r-- | tests/casefold.txt | 2 |
| -rw-r--r-- | tests/casemap.txt | 2 |
| -rwxr-xr-x | tests/gen-casefold-txt.pl | 82 |
| -rwxr-xr-x | tests/gen-casefold-txt.py | 78 |
| -rwxr-xr-x | tests/gen-casemap-txt.pl | 256 |
| -rwxr-xr-x | tests/gen-casemap-txt.py | 200 |
| -rw-r--r-- | tests/unicode-caseconv.c | 2 |
8 files changed, 283 insertions, 343 deletions
diff --git a/tests/Makefile.am b/tests/Makefile.am index de3ddb49e..fba18655d 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -146,8 +146,8 @@ endif EXTRA_DIST += \ $(test_scripts) \ - gen-casefold-txt.pl \ - gen-casemap-txt.pl \ + gen-casefold-txt.py \ + gen-casemap-txt.py \ iochannel-test-infile \ timeloop-basic.c \ assert-msg-test.gdb diff --git a/tests/casefold.txt b/tests/casefold.txt index f7b47abd2..6043c1201 100644 --- a/tests/casefold.txt +++ b/tests/casefold.txt @@ -1,5 +1,5 @@ # Test cases generated from Unicode 10.0.0 data -# by gen-casefold-test.pl. Do not edit. +# by gen-casefold-txt.py. Do not edit. # # Some special hand crafted tests # diff --git a/tests/casemap.txt b/tests/casemap.txt index 5e983f70f..6533e8dd9 100644 --- a/tests/casemap.txt +++ b/tests/casemap.txt @@ -1,5 +1,5 @@ # Test cases generated from Unicode 10.0.0 data -# by gen-case-tests.pl. Do not edit. +# by gen-casemap-txt.py. Do not edit. # # Some special hand crafted tests # diff --git a/tests/gen-casefold-txt.pl b/tests/gen-casefold-txt.pl deleted file mode 100755 index 2a6a0d4b1..000000000 --- a/tests/gen-casefold-txt.pl +++ /dev/null @@ -1,82 +0,0 @@ -#! /usr/bin/perl -w - -# Copyright (C) 1998, 1999 Tom Tromey -# Copyright (C) 2001 Red Hat Software - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, see <http://www.gnu.org/licenses/>. - -# gen-casefold-test.pl - Generate test cases for casefolding from Unicode data. 
-# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html -# Usage: -# I consider the output of this program to be unrestricted. Use it as -# you will. - -require 5.006; - -# Names of fields in the CaseFolding table -$FOLDING_CODE = 0; -$FOLDING_STATUS = 1; -$FOLDING_MAPPING = 2; - -my $casefoldlen = 0; -my @casefold; - -if (@ARGV != 2) { - $0 =~ s@.*/@@; - die "Usage: $0 UNICODE-VERSION CaseFolding.txt\n"; -} - -print <<EOT; -# Test cases generated from Unicode $ARGV[0] data -# by gen-casefold-test.pl. Do not edit. -# -# Some special hand crafted tests -# -AaBbCc@@\taabbcc@@ -# -# Now the automatic tests -# -EOT - -binmode STDOUT, ":utf8"; -open (INPUT, "< $ARGV[1]") || exit 1; - -while (<INPUT>) -{ - chop; - - next if /^#/; - next if /^\s*$/; - - s/\s*#.*//; - - my @fields = split ('\s*;\s*', $_, 30); - - my $raw_code = $fields[$FOLDING_CODE]; - my $code = hex ($raw_code); - - if ($#fields != 3) - { - printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields); - next; - } - - # skip simple and Turkic mappings - next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/); - - @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING]; - printf ("%s\t%s\n", pack ("U", $code), pack ("U*", @values)); -} - -close INPUT; diff --git a/tests/gen-casefold-txt.py b/tests/gen-casefold-txt.py new file mode 100755 index 000000000..3c55828d3 --- /dev/null +++ b/tests/gen-casefold-txt.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# Copyright (C) 1998, 1999 Tom Tromey +# Copyright (C) 2001 Red Hat Software +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see <http://www.gnu.org/licenses/>. + +""" +gen-casefold-txt.py - Generate test cases for casefolding from Unicode data. +See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html +Usage: + I consider the output of this program to be unrestricted. + Use it as you will. +""" + +import sys +import argparse + + +def main(argv): + parser = argparse.ArgumentParser( + description="Generate test cases for casefolding from Unicode data") + parser.add_argument("UNICODE-VERSION") + parser.add_argument("CaseFolding.txt") + args = parser.parse_args(argv[1:]) + version = getattr(args, "UNICODE-VERSION") + filename = getattr(args, "CaseFolding.txt") + + print("""\ +# Test cases generated from Unicode {} data +# by gen-casefold-txt.py. Do not edit. +# +# Some special hand crafted tests +# +AaBbCc@@\taabbcc@@ +# +# Now the automatic tests +#""".format(version)) + + # Names of fields in the CaseFolding table + CODE, STATUS, MAPPING = range(3) + + with open(filename, encoding="utf-8") as fileobj: + for line in fileobj: + # strip comments and skip empty lines + line = line.split("#", 1)[0].strip() + if not line: + continue + + fields = [f.strip() for f in line.split(";", 3)[:3]] + if len(fields) != 3: + raise SystemExit( + "Entry for %s has wrong number of fields (%d)" % ( + fields[CODE], len(fields))) + + status = fields[STATUS] + # skip simple and Turkic mappings + if status in "ST": + continue + + code = chr(int(fields[CODE], 16)) + values = "".join( + [chr(int(v, 16)) for v in fields[MAPPING].split()]) + print("{}\t{}".format(code, values)) + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/tests/gen-casemap-txt.pl b/tests/gen-casemap-txt.pl deleted file mode 100755 index 3ae419ca9..000000000 --- a/tests/gen-casemap-txt.pl +++ /dev/null @@ -1,256 +0,0 @@ -#! 
/usr/bin/perl -w - -# Copyright (C) 1998, 1999 Tom Tromey -# Copyright (C) 2001 Red Hat Software - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, see <http://www.gnu.org/licenses/>. - -# gen-casemap-test.pl - Generate test cases for case mapping from Unicode data. -# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html -# I consider the output of this program to be unrestricted. Use it as -# you will. - -require 5.006; -use utf8; - -if (@ARGV != 3) { - $0 =~ s@.*/@@; - die "Usage: $0 UNICODE-VERSION UnicodeData.txt SpecialCasing.txt\n"; -} - -use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION); - -# Names of fields in Unicode data table. 
-$CODE = 0; -$NAME = 1; -$CATEGORY = 2; -$COMBINING_CLASSES = 3; -$BIDI_CATEGORY = 4; -$DECOMPOSITION = 5; -$DECIMAL_VALUE = 6; -$DIGIT_VALUE = 7; -$NUMERIC_VALUE = 8; -$MIRRORED = 9; -$OLD_NAME = 10; -$COMMENT = 11; -$UPPER = 12; -$LOWER = 13; -$TITLE = 14; - -# Names of fields in the SpecialCasing table -$CASE_CODE = 0; -$CASE_LOWER = 1; -$CASE_TITLE = 2; -$CASE_UPPER = 3; -$CASE_CONDITION = 4; - -my @upper; -my @title; -my @lower; - -binmode STDOUT, ":utf8"; -open (INPUT, "< $ARGV[1]") || exit 1; - -$last_code = -1; -while (<INPUT>) -{ - chop; - @fields = split (';', $_, 30); - if ($#fields != 14) - { - printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields); - } - - $code = hex ($fields[$CODE]); - - if ($code > $last_code + 1) - { - # Found a gap. - if ($fields[$NAME] =~ /Last>/) - { - # Fill the gap with the last character read, - # since this was a range specified in the char database - @gfields = @fields; - } - else - { - # The gap represents undefined characters. Only the type - # matters. 
- @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '', - '', '', '', ''); - } - for (++$last_code; $last_code < $code; ++$last_code) - { - $gfields{$CODE} = sprintf ("%04x", $last_code); - &process_one ($last_code, @gfields); - } - } - &process_one ($code, @fields); - $last_code = $code; -} - -close INPUT; - -open (INPUT, "< $ARGV[2]") || exit 1; - -while (<INPUT>) -{ - my $code; - - chop; - - next if /^#/; - next if /^\s*$/; - - s/\s*#.*//; - - @fields = split ('\s*;\s*', $_, 30); - - $raw_code = $fields[$CASE_CODE]; - $code = hex ($raw_code); - - if ($#fields != 4 && $#fields != 5) - { - printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields); - next; - } - - if (defined $fields[5]) { - # Ignore conditional special cases - we'll handle them manually - next; - } - - $upper[$code] = &make_hex ($fields[$CASE_UPPER]); - $lower[$code] = &make_hex ($fields[$CASE_LOWER]); - $title[$code] = &make_hex ($fields[$CASE_TITLE]); -} - -close INPUT; - -print <<EOT; -# Test cases generated from Unicode $ARGV[0] data -# by gen-case-tests.pl. Do not edit. 
-# -# Some special hand crafted tests -# -tr_TR\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE -tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I -tr_TR\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I -tr_TR.UTF-8\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE -tr_TR.UTF-8\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I -tr_TR.UTF-8\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I -# Test reordering of YPOGEGRAMMENI across other accents -\t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t -\t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t -# Handling of final and nonfinal sigma - ΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ - ΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ - ΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ -# Lithuanian rule of i followed by letter with dot. Not at all sure -# about the titlecase part here -lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t -lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t -lt_LT\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE -lt_LT\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE -lt_LT\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE -lt_LT\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent) -lt_LT\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent) -lt_LT\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above) -lt_LT\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent) -lt_LT\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent) 
-lt_LT\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) -lt_LT.UTF-8\ti\x{117}\ti\x{117}\tIe\tIE\t -lt_LT.UTF-8\tie\x{307}\tie\x{307}\tIe\tIE\t -lt_LT.UTF-8\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE -lt_LT.UTF-8\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE -lt_LT.UTF-8\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE -lt_LT.UTF-8\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent) -lt_LT.UTF-8\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent) -lt_LT.UTF-8\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above) -lt_LT.UTF-8\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent) -lt_LT.UTF-8\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent) -lt_LT.UTF-8\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) -# Special case not at initial position -\ta\x{fb04}\ta\x{fb04}\tAffl\tAFFL\t# FB04 -# -# Now the automatic tests -# -EOT -&print_tests; - -exit 0; - -# Process a single character. 
-sub process_one -{ - my ($code, @fields) = @_; - - my $type = $fields[$CATEGORY]; - if ($type eq 'Ll') - { - $upper[$code] = make_hex ($fields[$UPPER]); - $lower[$code] = pack ("U", $code); - $title[$code] = make_hex ($fields[$TITLE]); - } - elsif ($type eq 'Lu') - { - $lower[$code] = make_hex ($fields[$LOWER]); - $upper[$code] = pack ("U", $code); - $title[$code] = make_hex ($fields[$TITLE]); - } - - if ($type eq 'Lt') - { - $upper[$code] = make_hex ($fields[$UPPER]); - $lower[$code] = pack ("U", hex ($fields[$LOWER])); - $title[$code] = make_hex ($fields[$LOWER]); - } -} - -sub print_tests -{ - for ($i = 0; $i < 0x10ffff; $i++) { - if ($i == 0x3A3) { - # Greek sigma needs special tests - next; - } - - my $lower = $lower[$i]; - my $title = $title[$i]; - my $upper = $upper[$i]; - - if (defined $upper || defined $lower || defined $title) { - printf "\t%s\t%s\t%s\t%s\t# %4X\n", - pack ("U", $i), - (defined $lower ? $lower : ""), - (defined $title ? $title : ""), - (defined $upper ? $upper : ""), - $i; - } - } -} - -sub make_hex -{ - my $codes = shift; - - $codes =~ s/^\s+//; - $codes =~ s/\s+$//; - - if ($codes eq "0" || $codes eq "") { - return ""; - } else { - return pack ("U*", map { hex ($_) } split /\s+/, $codes); - } -} diff --git a/tests/gen-casemap-txt.py b/tests/gen-casemap-txt.py new file mode 100755 index 000000000..98f6bc969 --- /dev/null +++ b/tests/gen-casemap-txt.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +# Copyright (C) 1998, 1999 Tom Tromey +# Copyright (C) 2001 Red Hat Software +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see <http://www.gnu.org/licenses/>. + +""" +gen-casemap-txt.py - Generate test cases for case mapping from Unicode data. +See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html +Usage: + I consider the output of this program to be unrestricted. + Use it as you will. +""" + +import sys +import argparse + + +def main(argv): + parser = argparse.ArgumentParser( + description="Generate test cases for case mapping from Unicode data") + parser.add_argument("UNICODE-VERSION") + parser.add_argument("UnicodeData.txt") + parser.add_argument("SpecialCasing.txt") + args = parser.parse_args(argv[1:]) + version = getattr(args, "UNICODE-VERSION") + filename_udata = getattr(args, "UnicodeData.txt") + filename_casing = getattr(args, "SpecialCasing.txt") + + # Names of fields in Unicode data table. + CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY, DECOMPOSITION, \ + DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED, OLD_NAME, \ + COMMENT, UPPER, LOWER, TITLE = range(15) + + # Names of fields in the SpecialCasing table + CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5) + + upper = {} + title = {} + lower = {} + + def make_hex(codes): + """Converts a string of white space separated code points encoded as + hex values to a Unicode string. Any extra white space is ignored. 
+ """ + return "".join([chr(int(c, 16)) for c in codes.split()]) + + def process_one(code, fields): + type_ = fields[CATEGORY] + if type_ == "Ll": + upper[code] = make_hex(fields[UPPER]) + lower[code] = chr(code) + title[code] = make_hex(fields[TITLE]) + elif type_ == "Lu": + lower[code] = make_hex(fields[LOWER]) + upper[code] = chr(code) + title[code] = make_hex(fields[TITLE]) + elif type_ == "Lt": + upper[code] = make_hex(fields[UPPER]) + lower[code] = make_hex(fields[LOWER]) + title[code] = make_hex(fields[LOWER]) + + with open(filename_udata, encoding="utf-8") as fileobj: + last_code = -1 + for line in fileobj: + line = line.strip() + fields = [f.strip() for f in line.split(";")] + if len(fields) != 15: + raise SystemExit( + "Entry for %s has wrong number of fields (%d)" % ( + fields[CODE], len(fields))) + + code = int(fields[CODE], 16) + + if code > last_code + 1: + # Found a gap + if fields[NAME].endswith("Last>"): + # Fill the gap with the last character read, + # since this was a range specified in the char database + gfields = fields + else: + # The gap represents undefined characters. Only the type + # matters. 
+ gfields = ['', '', 'Cn', '0', '', '', '', '', '', '', '', + '', '', '', ''] + + last_code += 1 + while last_code < code: + gfields[CODE] = "%04x" % last_code + process_one(last_code, gfields) + last_code += 1 + + process_one(code, fields) + last_code = code + + with open(filename_casing, encoding="utf-8") as fileobj: + last_code = -1 + for line in fileobj: + # strip comments and skip empty lines + line = line.split("#", 1)[0].strip() + if not line: + continue + + # all lines end with ";" so just remove it + line = line.rstrip(";").rstrip() + fields = [f.strip() for f in line.split(";")] + if len(fields) not in (4, 5): + raise SystemExit( + "Entry for %s has wrong number of fields (%d)" % ( + fields[CASE_CODE], len(fields))) + + if len(fields) == 5: + # Ignore conditional special cases - we'll handle them manually + continue + + code = int(fields[CASE_CODE], 16) + + upper[code] = make_hex(fields[CASE_UPPER]) + lower[code] = make_hex(fields[CASE_LOWER]) + title[code] = make_hex(fields[CASE_TITLE]) + + print_tests(version, upper, title, lower) + + +def print_tests(version, upper, title, lower): + print("""\ +# Test cases generated from Unicode {} data +# by gen-casemap-txt.py. Do not edit. 
+# +# Some special hand crafted tests +# +tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE +tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I +tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I +tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE +tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I +tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I +# Test reordering of YPOGEGRAMMENI across other accents +\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t +\t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t +# Handling of final and nonfinal sigma +\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ +\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ +\tΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ +# Lithuanian rule of i followed by letter with dot. Not at all sure +# about the titlecase part here +lt_LT\ti\u0117\ti\u0117\tIe\tIE\t +lt_LT\tie\u0307\tie\u0307\tIe\tIE\t +lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE +lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE +lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE +lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent) +lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent) +lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above) +lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent) +lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent) +lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) +lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t +lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t 
+lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE +lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE +lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE +lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent) +lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent) +lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above) +lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent) +lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent) +lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) +# Special case not at initial position +\ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04 +# +# Now the automatic tests +#""".format(version)) + + for i in range(0x10ffff): + if i == 0x3A3: + # Greek sigma needs special tests + continue + + up = upper.get(i, "") + lo = lower.get(i, "") + ti = title.get(i, "") + + if any([up, lo, ti]): + print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i)) + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/tests/unicode-caseconv.c b/tests/unicode-caseconv.c index affb55888..c124633d1 100644 --- a/tests/unicode-caseconv.c +++ b/tests/unicode-caseconv.c @@ -57,7 +57,7 @@ int main (int argc, char **argv) test = strings[1]; - /* gen-casemap-txt.pl uses an empty string when a single character + /* gen-casemap-txt.py uses an empty string when a single character * doesn't have an equivalent in a particular case; since that behavior * is nonsense for multicharacter strings, it would make more sense * to put the expected result .. the original character unchanged. But |