diff options
author | Karl Williamson <khw@cpan.org> | 2019-09-18 13:12:51 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2019-11-17 20:34:24 -0700 |
commit | 1a9f5e2a7e9a7e40d7728d318182a61d39976d95 (patch) | |
tree | 42d34d093cb5311c01a11e42c1cfbe3e75ddf5b2 /t | |
parent | 5d7580af4b14229eafb27db9b7a34b8b918876b4 (diff) | |
download | perl-1a9f5e2a7e9a7e40d7728d318182a61d39976d95.tar.gz |
Prefer EXACTish regnodes to ANYOFH nodes
ANYOFH nodes (which match code points above 255) are smaller than regular
ANYOF nodes because they don't have a 256-bit bitmap. But their
disadvantage compared to EXACT nodes is that each character encountered
must first be converted from UTF-8 to a code point to see if it matches
the ANYOFH. (The difference is less clear-cut with /i, because typically,
currently, the UTF-8 must be converted to a code point anyway in order to
fold it.) Even so, the EXACTFish node doesn't have an inversion list to do
a lookup in, and occupies less space, because it doesn't have inversion
list data attached to it.
Also, there is a bug in using ANYOFH under /l: wide character warnings
should be emitted if the locale isn't a UTF-8 one, but they aren't.
The reason this change hasn't been made before (by me anyway) is that
the old way avoided upgrading the pattern to UTF-8. But having thought
about this for a long time, I've concluded that to match this node, the
target string must be in UTF-8 anyway, and that having a UTF8ness
mismatch slows down pattern matching, as things have to be continually
converted, and reconverted after backtracking.
Diffstat (limited to 't')
-rw-r--r-- | t/re/anyof.t | 61 | ||||
-rw-r--r-- | t/re/pat_advanced.t | 2 |
2 files changed, 38 insertions, 25 deletions
diff --git a/t/re/anyof.t b/t/re/anyof.t index 5c7dfaf157..629dfe5ddb 100644 --- a/t/re/anyof.t +++ b/t/re/anyof.t @@ -140,7 +140,9 @@ my @tests = ( '[\xA0[:^blank:]]' => 'ANYOF[^\t ][0100-167F 1681-1FFF 200B-202E 2030-205E 2060-2FFF 3001-INFTY]', '(?d:[_[:^blank:]])' => 'NPOSIXD[:blank:]', '[\x{07}-\x{0B}]' => 'ANYOF[\a\b\t\n\x0B]', - '(?il)[\x{212A}]' => 'ANYOFL{i}[{utf8 locale}Kk][212A]', + '(?l)[\x{2029}]' => 'EXACTL <\x{2029}>', + '(?l)(?[\x{2029}])' => 'ANYOFL{utf8-locale-reqd}[2029]', # regex sets requires utf8 locale for /l + '(?il)[\x{212A}]' => 'EXACTFL <\\x{212a}>', '(?il)(?[\x{212A}])' => 'ANYOFL{utf8-locale-reqd}[Kk][212A]', '(?i)b[s]\xe0' => 'ANYOFM[Bb]', # The s goes into a 2nd node @@ -461,7 +463,7 @@ my @tests = ( '(?i)(?u)[\D\w]' => 'SANY', '(?i)(?a)[\d\w]' => 'POSIXA[\w]', '(?i)(?a)[\D\w]' => 'SANY', - '(?l:[\x{212A}])' => 'ANYOFL[212A]', + '(?l:[\x{212A}])' => 'EXACTL <\x{212a}>', '(?l:[\s\x{212A}])' => 'ANYOFPOSIXL[\s][1680 2000-200A 2028-2029 202F 205F 212A 3000]', '(?l:[^\S\x{202F}])' => 'ANYOFPOSIXL[^\\S][1680 2000-200A 2028-2029 205F 3000]', '(?li:[a-z])' => 'ANYOFL{i}[a-z{utf8 locale}\x{017F}\x{212A}]', @@ -579,7 +581,7 @@ my @tests = ( '[\x{102}-\x{104}\x{108}-\x{10A}\x{109}]' => 'ANYOFHb[0102-0104 0108-010A]', '[\x{102}-\x{104}\x{108}-\x{10A}\x{10A}]' => 'ANYOFHb[0102-0104 0108-010A]', '[\x{102}-\x{104}\x{108}-\x{10A}\x{10B}]' => 'ANYOFHb[0102-0104 0108-010B]', - '[\x{103}\x{102}]' => 'ANYOFHb[0102-0103]', + '[\x{103}\x{102}]' => 'EXACTFU_REQ8 <\x{103}>', '[\x{104}\x{102}]' => 'ANYOFHb[0102 0104]', '[\x{104}\x{102}\x{103}]' => 'ANYOFHb[0102-0104]', '[\x{106}-{INFTY}\x{104}]' => 'ANYOFH[0104 0106-INFTY]', @@ -708,12 +710,12 @@ my @tests = ( '[\x{10C}-{INFTY}\x{103}\x{102}]' => 'ANYOFH[0102-0103 010C-INFTY]', '[\x{10C}-{INFTY}\x{104}\x{102}]' => 'ANYOFH[0102 0104 010C-INFTY]', '[\x{10C}-{INFTY}\x{104}\x{102}\x{103}]' => 'ANYOFH[0102-0104 010C-INFTY]', - '[{HIGHEST_CP}]' => 'ANYOFHb[HIGHEST_CP]', + '[{HIGHEST_CP}]' => 'EXACT_REQ8 
<\x{HIGHEST_CP}>', - '(?8)(?i)[\x{100}]' => 'EXACTFU_REQ8 <\x{101}>', + '(?8)(?i)[\x{410}]' => 'EXACTFU_REQ8 <\x{430}>', '(?8)(?i)[\x{399}]' => 'EXACTFU_REQ8 <\x{3b9}>', '(?8)(?i)[\x{345}\x{399}\x{3B9}\x{1FBE}]' => 'EXACTFU_REQ8 <\x{3b9}>', - '(?i)[\x{2b9}]' => 'ANYOFHb[02B9]', # Doesn't participate in a fold + '(?i)[\x{2b9}]' => 'EXACT_REQ8 <\x{2b9}>', # Doesn't participate in a fold '(?8)(?i)[\x{2b9}]' => 'EXACT_REQ8 <\x{2b9}>', '(?i)[\x{2bc}]' => 'EXACTFU_REQ8 <\x{2bc}>', # Part of a multi-char fold, ASCII component '(?i)[\x{390}]' => 'EXACTFU_REQ8 <\x{3b9}\x{308}\x{301}>', # Part of a multi-char fold, no ASCII component @@ -721,7 +723,7 @@ my @tests = ( '(?i)[\x{1E9E}]' => 'EXACTFU <ss>', '(?iaa)[\x{1E9E}]' => 'EXACTFAA <\x{17f}\x{17f}>', '(?i)[\x{FB00}]' => 'EXACTFU <ff>', - '(?iaa)[\x{FB00}]' => 'ANYOFHb[FB00]', + '(?iaa)[\x{FB00}]' => 'EXACT_REQ8 <\x{fb00}>', '(?i)[\x{FB00}]' => 'EXACTFU <ff>', '(?i)[\x{FB01}]' => 'EXACTFU <fi>', '(?i)[\x{FB02}]' => 'EXACTFU <fl>', @@ -820,29 +822,40 @@ for my $char (@single_chars_to_test) { push @single_tests, get_compiled("$upgrade$modifiers\\x{$hex}"); } else { - my $interior = ""; - my @list = $cp; + use feature 'fc'; + + my %list = ( sprintf("%X", $cp) => 1 ); if ($fold) { - if (lc $char ne $char) { - push @list, ord lc $char; - } - elsif (uc $char ne $char) { - push @list, ord uc $char; + for my $op (qw(fc lc uc)) { + my $result = eval "$op(\"$char\")"; + $list{sprintf "%X", ord $result} = 1; } } - @list = sort { $a <=> $b } @list; - if (@list == 1) { - $interior = sprintf "%04X", $list[0]; - } - elsif (@list == 2) { - my $separator = ($list[1] == $list[0] + 1) ? '-' : ', '; - $interior = sprintf "%04X$separator%04X", $list[0], $list[1]; + + my $mod_cp = $cp; + my $op; + + if (! $fold || scalar keys %list == 1) { + $op = ($charset eq 'l') + ? 'EXACTL' + : ($cp < 256) + ? 'EXACT' + : 'EXACT_REQ8'; } else { - die join ", ", @list; + $op = ($charset eq 'aa') + ? 'EXACTFAA' + : ($charset eq 'l') + ? (($cp < 256) + ? 
'EXACTFL' + : 'EXACTFLU8') + : ($cp < 256) + ? 'EXACTFU' + : 'EXACTFU_REQ8'; + $mod_cp = ord fc $char; } - my $anyof = ($charset eq "l") ? "ANYOFL" : "ANYOFHb"; - push @single_tests, "$anyof\[$interior\]"; + + push @single_tests, sprintf "$op <\\x{%X}>", $mod_cp; } } } diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t index 59f2987015..8f265494fc 100644 --- a/t/re/pat_advanced.t +++ b/t/re/pat_advanced.t @@ -2424,7 +2424,7 @@ EOF like(chr(0x7FFF_FFFF_FFFF_FFFF), qr/^\p{Is_Portable_Super}$/, "chr(0x7FFF_FFFF_FFFF_FFFF) can match a Unicode property"); - my $p = qr/^[\x{7FFF_FFFF_FFFF_FFFF}]$/; + my $p = eval 'qr/^\x{7FFF_FFFF_FFFF_FFFF}$/'; like(chr(0x7FFF_FFFF_FFFF_FFFF), qr/$p/, "chr(0x7FFF_FFFF_FFFF_FFFF) can match itself in a [class]"); like(chr(0x7FFF_FFFF_FFFF_FFFF), qr/$p/, # Tests any caching |