diff options
Diffstat (limited to 'Porting')
-rw-r--r-- | Porting/mk_PL_charclass.pl | 200 |
1 files changed, 200 insertions, 0 deletions
diff --git a/Porting/mk_PL_charclass.pl b/Porting/mk_PL_charclass.pl new file mode 100644 index 0000000000..a23d611751 --- /dev/null +++ b/Porting/mk_PL_charclass.pl @@ -0,0 +1,200 @@ +#!perl -w +use 5.012; +use strict; +use warnings; + +# This program outputs the 256 lines that form the guts of the PL_charclass +# table. The output should be used to manually replace the table contents in +# perl.h. Each line is a bit map of properties that the Unicode code point at +# the corresponding position in the table array has. The first line +# corresponds to code point U+0000, NULL, the last line to U=00FF. For an +# application to see if the code point "i" has a particular property, it just +# does +# 'PL_charclass[i] & BIT' +# The bit names are of the form '_CC_property_suffix', where 'CC' stands for +# character class, and 'property' is the corresponding property, and 'suffix' +# is one of '_A' to mean the property is true only if the corresponding code +# point is ASCII, and '_L1' means that the range includes any Latin1 +# character (ISO-8859-1 including the C0 and C1 controls). A property without +# these suffixes does not have different forms for both ranges. + +# The data in the table is pretty well set in stone, so that this program need +# be run only when adding new properties to it. + +my @properties = qw( + ALNUMC_A + ALNUMC_L1 + ALPHA_A + ALPHA_L1 + BLANK_A + BLANK_L1 + CHARNAME_CONT + CNTRL_A + CNTRL_L1 + DIGIT_A + GRAPH_A + GRAPH_L1 + IDFIRST_A + IDFIRST_L1 + LOWER_A + LOWER_L1 + OCTAL_A + PRINT_A + PRINT_L1 + PSXSPC_A + PSXSPC_L1 + PUNCT_A + PUNCT_L1 + SPACE_A + SPACE_L1 + UPPER_A + UPPER_L1 + WORDCHAR_A + WORDCHAR_L1 + XDIGIT_A +); + +my @bits; # Bit map for each code point + +for my $ord (0..255) { + my $char = chr($ord); + utf8::upgrade($char); # Important to use Unicode semantics! + for my $property (@properties) { + my $name = $property; + + # The property name that corresponds to this doesn't have a suffix. + # If is a latin1 version, no further checking is needed. + if (! ($name =~ s/_L1$//)) { + + # Here, isn't an L1. It's either a special one or the suffix ends + # in _A. In the latter case, it's automatically false for + # non-ascii. The one current special is valid over the whole range. + next if $name =~ s/_A$// && $ord >= 128; + + } + my $re; + if ($name eq 'PUNCT') {; + + # Sadly, this is inconsistent: \pP and \pS for the ascii range, + # just \pP outside it. + $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/; + } elsif ($name eq 'CHARNAME_CONT') {; + $re = qr/[-\w ():\xa0]/; + } elsif ($name eq 'SPACE') {; + $re = qr/\s/; + } elsif ($name eq 'IDFIRST') { + $re = qr/[_\p{Alpha}]/; + } elsif ($name eq 'PSXSPC') { + $re = qr/[\v\p{Space}]/; + } elsif ($name eq 'WORDCHAR') { + $re = qr/\w/; + } elsif ($name eq 'ALNUMC') { + # Like \w, but no underscore + $re = qr/[^_\W]/; + } elsif ($name eq 'OCTAL') { + $re = qr/[0-7]/; + } else { # The remainder have the same name and values as Unicode + $re = eval "qr/\\p{$name}/"; + use Carp; + carp $@ if ! defined $re; + } + #print "$ord, $name $property, $re\n"; + if ($char =~ $re) { # Add this property if matches + $bits[$ord] .= '|' if $bits[$ord]; + $bits[$ord] .= "_CC_$property"; + } + } + #print __LINE__, " $ord $char $bits[$ord]\n"; +} + +# Names of C0 controls +my @C0 = qw ( + NUL + SOH + STX + ETX + EOT + ENQ + ACK + BEL + BS + HT + LF + VT + FF + CR + SO + SI + DLE + DC1 + DC2 + DC3 + DC4 + NAK + SYN + ETB + CAN + EOM + SUB + ESC + FS + GS + RS + US + ); + +# Names of C1 controls, plus the adjacent DEL +my @C1 = qw( + DEL + PAD + HOP + BPH + NBH + IND + NEL + SSA + ESA + HTS + HTJ + VTS + PLD + PLU + RI + SS2 + SS3 + DCS + PU1 + PU2 + STS + CCH + MW + SPA + EPA + SOS + SGC + SCI + CSI + ST + OSC + PM + APC + ); + +# Output the table using fairly short names for each char. +for my $ord (0..255) { + my $name; + if ($ord < 32) { # A C0 control + $name = $C0[$ord]; + } elsif ($ord > 32 && $ord < 127) { # Graphic + $name = "'" . chr($ord) . "'"; + } elsif ($ord >= 127 && $ord <= 0x9f) { + $name = $C1[$ord - 127]; # A C1 control + DEL + } else { # SPACE, or, if Latin1, shorten the name */ + use charnames(); + $name = charnames::viacode($ord); + $name =~ s/LATIN CAPITAL LETTER // + || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/; + } + printf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord]; +} + |