diff options
author | Karl Williamson <public@khwilliamson.com> | 2010-09-23 20:47:03 -0600 |
---|---|---|
committer | Steffen Mueller <smueller@cpan.org> | 2010-09-25 11:15:31 +0200 |
commit | 9c68f0abf07d540a547ec774305849d8cf1f3dfe (patch) | |
tree | e5fbd1eee63fa150db700a9e9411607207a31417 /Porting | |
parent | 271b5cb31d498243788ba704c149a346f82b8e04 (diff) | |
download | perl-9c68f0abf07d540a547ec774305849d8cf1f3dfe.tar.gz |
Add 256 word bit table of character classes
This patch adds a table for looking up character classes. It is 256
words long, in l1_char_class_tab.h, with each word corresponding to the
ordinal of a Latin1 character, and each word contains a bit map of all
the properties that character matches. Each property has a bit or two.
Ones named _CC_property_A are true only if the character is also in the
ASCII character set. Ones named CC_property_L1 do not have this
restriction. (L1 stands for Latin1.)
Also added is a script that generates the table. It is not anticipated
that this will need to be used often.
(This commit was changed from its original form by Steffen.)
Diffstat (limited to 'Porting')
-rw-r--r-- | Porting/mk_PL_charclass.pl | 200 |
1 files changed, 200 insertions, 0 deletions
diff --git a/Porting/mk_PL_charclass.pl b/Porting/mk_PL_charclass.pl new file mode 100644 index 0000000000..a23d611751 --- /dev/null +++ b/Porting/mk_PL_charclass.pl @@ -0,0 +1,200 @@ +#!perl -w +use 5.012; +use strict; +use warnings; + +# This program outputs the 256 lines that form the guts of the PL_charclass +# table. The output should be used to manually replace the table contents in +# perl.h. Each line is a bit map of properties that the Unicode code point at +# the corresponding position in the table array has. The first line +# corresponds to code point U+0000, NULL, the last line to U=00FF. For an +# application to see if the code point "i" has a particular property, it just +# does +# 'PL_charclass[i] & BIT' +# The bit names are of the form '_CC_property_suffix', where 'CC' stands for +# character class, and 'property' is the corresponding property, and 'suffix' +# is one of '_A' to mean the property is true only if the corresponding code +# point is ASCII, and '_L1' means that the range includes any Latin1 +# character (ISO-8859-1 including the C0 and C1 controls). A property without +# these suffixes does not have different forms for both ranges. + +# The data in the table is pretty well set in stone, so that this program need +# be run only when adding new properties to it. + +my @properties = qw( + ALNUMC_A + ALNUMC_L1 + ALPHA_A + ALPHA_L1 + BLANK_A + BLANK_L1 + CHARNAME_CONT + CNTRL_A + CNTRL_L1 + DIGIT_A + GRAPH_A + GRAPH_L1 + IDFIRST_A + IDFIRST_L1 + LOWER_A + LOWER_L1 + OCTAL_A + PRINT_A + PRINT_L1 + PSXSPC_A + PSXSPC_L1 + PUNCT_A + PUNCT_L1 + SPACE_A + SPACE_L1 + UPPER_A + UPPER_L1 + WORDCHAR_A + WORDCHAR_L1 + XDIGIT_A +); + +my @bits; # Bit map for each code point + +for my $ord (0..255) { + my $char = chr($ord); + utf8::upgrade($char); # Important to use Unicode semantics! + for my $property (@properties) { + my $name = $property; + + # The property name that corresponds to this doesn't have a suffix. + # If is a latin1 version, no further checking is needed. + if (! ($name =~ s/_L1$//)) { + + # Here, isn't an L1. It's either a special one or the suffix ends + # in _A. In the latter case, it's automatically false for + # non-ascii. The one current special is valid over the whole range. + next if $name =~ s/_A$// && $ord >= 128; + + } + my $re; + if ($name eq 'PUNCT') {; + + # Sadly, this is inconsistent: \pP and \pS for the ascii range, + # just \pP outside it. + $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/; + } elsif ($name eq 'CHARNAME_CONT') {; + $re = qr/[-\w ():\xa0]/; + } elsif ($name eq 'SPACE') {; + $re = qr/\s/; + } elsif ($name eq 'IDFIRST') { + $re = qr/[_\p{Alpha}]/; + } elsif ($name eq 'PSXSPC') { + $re = qr/[\v\p{Space}]/; + } elsif ($name eq 'WORDCHAR') { + $re = qr/\w/; + } elsif ($name eq 'ALNUMC') { + # Like \w, but no underscore + $re = qr/[^_\W]/; + } elsif ($name eq 'OCTAL') { + $re = qr/[0-7]/; + } else { # The remainder have the same name and values as Unicode + $re = eval "qr/\\p{$name}/"; + use Carp; + carp $@ if ! defined $re; + } + #print "$ord, $name $property, $re\n"; + if ($char =~ $re) { # Add this property if matches + $bits[$ord] .= '|' if $bits[$ord]; + $bits[$ord] .= "_CC_$property"; + } + } + #print __LINE__, " $ord $char $bits[$ord]\n"; +} + +# Names of C0 controls +my @C0 = qw ( + NUL + SOH + STX + ETX + EOT + ENQ + ACK + BEL + BS + HT + LF + VT + FF + CR + SO + SI + DLE + DC1 + DC2 + DC3 + DC4 + NAK + SYN + ETB + CAN + EOM + SUB + ESC + FS + GS + RS + US + ); + +# Names of C1 controls, plus the adjacent DEL +my @C1 = qw( + DEL + PAD + HOP + BPH + NBH + IND + NEL + SSA + ESA + HTS + HTJ + VTS + PLD + PLU + RI + SS2 + SS3 + DCS + PU1 + PU2 + STS + CCH + MW + SPA + EPA + SOS + SGC + SCI + CSI + ST + OSC + PM + APC + ); + +# Output the table using fairly short names for each char. +for my $ord (0..255) { + my $name; + if ($ord < 32) { # A C0 control + $name = $C0[$ord]; + } elsif ($ord > 32 && $ord < 127) { # Graphic + $name = "'" . chr($ord) . "'"; + } elsif ($ord >= 127 && $ord <= 0x9f) { + $name = $C1[$ord - 127]; # A C1 control + DEL + } else { # SPACE, or, if Latin1, shorten the name */ + use charnames(); + $name = charnames::viacode($ord); + $name =~ s/LATIN CAPITAL LETTER // + || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/; + } + printf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord]; +} + |