Use lookup table for /\b{gcb}/ instead of switch stmt

This changes the handling of Grapheme Cluster Breaks to be entirely via a lookup table generated by regen/mk_invlists.pl. This is easier to maintain and follow, as the generation of the table follows the text of Unicode's UAX29 precisely, and loops can be used to set every class up instead of having to name each explicitly, so it will be easier to add new rules. And the runtime switch statement is replaced by a single line. My gcc compiler optimized the previous version to an array lookup, but this commit does it for not so clever compilers.
author: Karl Williamson <khw@cpan.org> 2016-01-18 23:14:10 -0700
committer: Karl Williamson <khw@cpan.org> 2016-01-19 15:09:00 -0700
commit: 973a28ed6e42449520d66bc69f677a120861ccb3 (patch)
tree: 6e4a8ec3625bada7aadac9a9adf2f08db3a11395 /regen
parent: 3571e9a754834338c4e5b8c1349920062570f2a0 (diff)
download: perl-973a28ed6e42449520d66bc69f677a120861ccb3.tar.gz
1 files changed, 116 insertions, 7 deletions
diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl
index c72fc0b1ec..974800cc04 100644
--- a/regen/mk_invlists.pl
+++ b/regen/mk_invlists.pl
@@ -155,6 +155,8 @@ my %hard_coded_enums =
         ],
 );
 
+my %gcb_enums;
+my @gcb_short_enums;
 my %lb_enums;
 my @lb_short_enums;
 
@@ -322,9 +324,17 @@ sub output_invmap ($$$$$$$) {
                     $enums{$enum} = $enum_val++ unless exists $enums{$enum};
                 }
 
-                # Calculate the enum values for property _Perl_LB because we
-                # output a special table for that
-                if ($name eq '_Perl_LB' && ! %lb_enums) {
+                # Calculate the enum values for properties _Perl_GCB and
+                # _Perl_LB because we output special tables for them
+                if ($name eq '_Perl_GCB' && ! %gcb_enums) {
+                    while (my ($enum, $value) = each %enums) {
+                        my ($short) = prop_value_aliases('GCB', $enum);
+                        $short = lc $enum unless defined $short;
+                        $gcb_enums{$short} = $value;
+                        @gcb_short_enums[$value] = $short;
+                    }
+                }
+                elsif ($name eq '_Perl_LB' && ! %lb_enums) {
                     while (my ($enum, $value) = each %enums) {
                         my ($short) = prop_value_aliases('LB', $enum);
                         $short = substr(lc $enum, 0, 2) unless defined $short;
@@ -455,6 +465,104 @@ sub UpperLatin1 {
     return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
 }
 
+sub output_GCB_table() {
+
+    # Create and output the pair table for use in determining Grapheme Cluster
+    # Breaks, given in http://www.unicode.org/reports/tr29/.
+
+    # The table is constructed in reverse order of the rules, to make the
+    # lower-numbered, higher priority ones override the later ones, as the
+    # algorithm stops at the earliest matching rule
+
+    my @gcb_table;
+    my $table_size = @gcb_short_enums;
+
+    # Otherwise, break everywhere.
+    # GB10 	Any ÷  Any
+    for my $i (0 .. $table_size - 1) {
+        for my $j (0 .. $table_size - 1) {
+            $gcb_table[$i][$j] = 1;
+        }
+    }
+
+    # Do not break before extending characters.
+    # Do not break before SpacingMarks, or after Prepend characters.
+    # GB9   ×  Extend
+    # GB9a  × SpacingMark
+    # GB9b  Prepend  ×
+    for my $i (0 .. @gcb_table - 1) {
+        $gcb_table[$i][$gcb_enums{'EX'}] = 0;
+        $gcb_table[$i][$gcb_enums{'SM'}] = 0;
+        $gcb_table[$gcb_enums{'PP'}][$i] = 0;
+    }
+
+    # Do not break between regional indicator symbols.
+    # GB8a  Regional_Indicator  ×  Regional_Indicator
+    $gcb_table[$gcb_enums{'RI'}][$gcb_enums{'RI'}] = 0;
+
+    # Do not break Hangul syllable sequences.
+    # GB8  ( LVT | T)  ×  T
+    $gcb_table[$gcb_enums{'LVT'}][$gcb_enums{'T'}] = 0;
+    $gcb_table[$gcb_enums{'T'}][$gcb_enums{'T'}] = 0;
+
+    # GB7  ( LV | V )  ×  ( V | T )
+    $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'V'}] = 0;
+    $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'T'}] = 0;
+    $gcb_table[$gcb_enums{'V'}][$gcb_enums{'V'}] = 0;
+    $gcb_table[$gcb_enums{'V'}][$gcb_enums{'T'}] = 0;
+
+    # GB6  L  ×  ( L | V | LV | LVT )
+    $gcb_table[$gcb_enums{'L'}][$gcb_enums{'L'}] = 0;
+    $gcb_table[$gcb_enums{'L'}][$gcb_enums{'V'}] = 0;
+    $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0;
+    $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0;
+
+    # Do not break between a CR and LF. Otherwise, break before and after controls.
+    # GB5   ÷  ( Control | CR | LF )
+    # GB4  ( Control | CR | LF )  ÷
+    for my $i (0 .. @gcb_table - 1) {
+        $gcb_table[$i][$gcb_enums{'CN'}] = 1;
+        $gcb_table[$i][$gcb_enums{'CR'}] = 1;
+        $gcb_table[$i][$gcb_enums{'LF'}] = 1;
+        $gcb_table[$gcb_enums{'CN'}][$i] = 1;
+        $gcb_table[$gcb_enums{'CR'}][$i] = 1;
+        $gcb_table[$gcb_enums{'LF'}][$i] = 1;
+    }
+
+    # GB3  CR  ×  LF
+    $gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0;
+
+    # Break at the start and end of text.
+    # GB1  sot  ÷
+    # GB2   ÷  eot
+    for my $i (0 .. @gcb_table - 1) {
+        $gcb_table[$i][$gcb_enums{'edge'}] = 1;
+        $gcb_table[$gcb_enums{'edge'}][$i] = 1;
+    }
+
+    # But, unspecified by Unicode, we shouldn't break on an empty string.
+    $gcb_table[$gcb_enums{'edge'}][$gcb_enums{'edge'}] = 0;
+
+    print $out_fh "\nstatic const bool GCB_table[$table_size][$table_size] = {\n";
+    print $out_fh "/*       ";
+    for my $i (0 .. @gcb_table - 1) {
+        printf $out_fh "%5s", $gcb_short_enums[$i];
+    }
+    print $out_fh " */\n";
+
+    for my $i (0 .. @gcb_table - 1) {
+        printf $out_fh "/*%5s */ ", $gcb_short_enums[$i];
+        for my $j (0 .. @gcb_table - 1) {
+            printf $out_fh "%3d", $gcb_table[$i][$j];
+            print $out_fh "," if $i < @gcb_table - 1 || $j < @gcb_table - 1;
+            print $out_fh " " if $j < @gcb_table - 1;
+        }
+        print $out_fh "\n";
+    }
+
+    print $out_fh "};\n";
+}
+
 sub output_LB_table() {
 
     # Create and output the enums, #defines, and pair table for use in
@@ -462,8 +570,6 @@ sub output_LB_table() {
     # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
     # in that page, as the Unicode-furnished tests assume that tailoring.
 
-    switch_pound_if('LB_table', 'PERL_IN_REGEXEC_C');
-
     # The result is really just true or false.  But we follow along with tr14,
     # creating a rule which is false for something like X SP* X.  That gets
     # encoding 2.  The rest of the actions are synthetic ones that indicate
@@ -1000,8 +1106,6 @@ sub output_LB_table() {
     }
 
     print $out_fh "};\n";
-
-    end_file_pound_if;
 }
 
 output_invlist("Latin1", [ 0, 256 ]);
@@ -1420,8 +1524,13 @@ for my $charset (get_supported_code_pages()) {
     print $out_fh "\n" . get_conditional_compile_line_end();
 }
 
+switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
+
+output_GCB_table();
 output_LB_table();
 
+end_file_pound_if;
+
 my $sources_list = "lib/unicore/mktables.lst";
 my @sources = ($0, qw(lib/unicore/mktables
                       lib/Unicode/UCD.pm
author	Karl Williamson <khw@cpan.org>	2016-01-18 23:14:10 -0700
committer	Karl Williamson <khw@cpan.org>	2016-01-19 15:09:00 -0700
commit	973a28ed6e42449520d66bc69f677a120861ccb3 (patch)
tree	6e4a8ec3625bada7aadac9a9adf2f08db3a11395 /regen
parent	3571e9a754834338c4e5b8c1349920062570f2a0 (diff)
download	perl-973a28ed6e42449520d66bc69f677a120861ccb3.tar.gz