summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2020-11-12 14:40:19 -0700
committerKarl Williamson <khw@cpan.org>2020-11-18 09:50:32 -0700
commit3dcca105f68f9a3c4474da8390e439dd6bc34a74 (patch)
treec2289b456a393cfa3b0a3aef0ea47f6a9ca53fe0
parentbcf3564c67eb142f6b534cb04acdf34604567910 (diff)
downloadperl-3dcca105f68f9a3c4474da8390e439dd6bc34a74.tar.gz
re/fold_grind.pl: Test a couple more code points
This adds tests to check that the revised folding coming in a future commit works in some edge cases that previously weren't an issue.
-rw-r--r--t/re/fold_grind.pl16
1 files changed, 15 insertions, 1 deletions
diff --git a/t/re/fold_grind.pl b/t/re/fold_grind.pl
index fb0d3620e8..a5ae6fd0fd 100644
--- a/t/re/fold_grind.pl
+++ b/t/re/fold_grind.pl
@@ -45,9 +45,23 @@ if ($charset eq 'T') {
# Special-cased characters in the .c's that we want to make sure get tested.
my %be_sure_to_test = (
chr utf8::unicode_to_native(0xDF) => 1, # LATIN_SMALL_LETTER_SHARP_S
- "\x{1E9E}" => 1, # LATIN_CAPITAL_LETTER_SHARP_S
+
+ # This is included because the uppercase occupies more bytes, but the
+ # first two bytes of the two representations differ in only one bit,
+ # which could lead the code looking for shortcuts astray; you can't do
+ # certain shortcuts if the lengths differ
+ "\x{29E}" => 1, # LATIN SMALL LETTER TURNED K
+
"\x{390}" => 1, # GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
"\x{3B0}" => 1, # GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
+
+ # This is included because the uppercase and lowercase differ by only
+ # a single bit, and it is in the first byte of their two-byte
+ # representations. This exposed an error in a previous way of
+ # calculating whether initial substrings were closely related bit-wise.
+ "\x{3CC}" => 1, # GREEK SMALL LETTER OMICRON WITH TONOS
+
+ "\x{1E9E}" => 1, # LATIN_CAPITAL_LETTER_SHARP_S
"\x{1FD3}" => 1, # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
"\x{1FE3}" => 1, # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
"I" => 1,