summaryrefslogtreecommitdiff
path: root/lib/unicode/mktables.PL
diff options
context:
space:
mode:
Diffstat (limited to 'lib/unicode/mktables.PL')
-rwxr-xr-xlib/unicode/mktables.PL294
1 files changed, 294 insertions, 0 deletions
diff --git a/lib/unicode/mktables.PL b/lib/unicode/mktables.PL
new file mode 100755
index 0000000000..306f2a43c5
--- /dev/null
+++ b/lib/unicode/mktables.PL
@@ -0,0 +1,294 @@
+#!../../miniperl
+
+# Note: we try to keep filenames unique within first 8 chars. Using
+# subdirectories for the following helps.
+mkdir "In", 0777;
+mkdir "Is", 0777;
+mkdir "To", 0777;
+
+@todo = (
+# typical
+
+ ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
+ ['IsAlpha', '$cat =~ /^L[ulo]/', ''],
+ ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
+ ['IsDigit', '$cat =~ /^Nd$/', ''],
+ ['IsUpper', '$cat =~ /^Lu$/', ''],
+ ['IsLower', '$cat =~ /^Ll$/', ''],
+ ['IsPrint', '$cat =~ /^[^C]/', ''],
+ ['ToUpper', '$up', '$up'],
+ ['ToLower', '$down', '$down'],
+ ['ToTitle', '$title', '$title'],
+ ['ToDigit', '$dec ne ""', '$dec'],
+
+# Name
+
+ ['Name', '$name', '$name'],
+
+# Category
+
+ ['Category', '$cat', '$cat'],
+
+# Normative
+
+ ['IsM', '$cat =~ /^M/', ''], # Mark
+ ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
+ ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
+
+ ['IsN', '$cat =~ /^N/', ''], # Number
+ ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
+ ['IsNo', '$cat eq "No"', ''], # Number, Other
+
+ ['IsZ', '$cat =~ /^Z/', ''], # Zeparator
+ ['IsZs', '$cat eq "Zs"', ''], # Separator, Space
+ ['IsZl', '$cat eq "Zl"', ''], # Separator, Line
+ ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
+
+ ['IsC', '$cat =~ /^C/', ''], # Crazy
+ ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
+ ['IsCo', '$cat eq "Co"', ''], # Other, Private Use
+ ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
+
+# Informative
+
+ ['IsL', '$cat =~ /^L/', ''], # Letter
+ ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase
+ ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase
+ ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase
+ ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier
+ ['IsLo', '$cat eq "Lo"', ''], # Letter, Other
+
+ ['IsP', '$cat =~ /^P/', ''], # Punctuation
+ ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash
+ ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
+ ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
+ ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
+
+ ['IsS', '$cat =~ /^S/', ''], # Symbol
+ ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
+ ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
+ ['IsSo', '$cat eq "So"', ''], # Symbol, Other
+
+# Combining class
+ ['CombiningClass', '$comb', '$comb'],
+
+# BIDIRECTIONAL PROPERTIES
+
+ ['Bidirectional', '$bid', '$bid'],
+
+# Strong types:
+
+ ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic,
+ # syllabic, and logographic
+ # characters (e.g., CJK
+ # ideographs)
+ ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew,
+ # and punctuation specific to
+ # those scripts
+
+# Weak types:
+
+ ['IsBidiEN','$bid eq "EN"', ''], # European Number
+ ['IsBidiES','$bid eq "ES"', ''], # European Number Separator
+ ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator
+ ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number
+ ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator
+
+# Separators:
+
+ ['IsBidiB', '$bid eq "B"', ''], # Block Separator
+ ['IsBidiS', '$bid eq "S"', ''], # Segment Separator
+
+# Neutrals:
+
+ ['IsBidiWS','$bid eq "WS"', ''], # Whitespace
+ ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other
+ # characters: punctuation,
+ # symbols
+
+# Decomposition
+
+ ['Decomposition', '$decomp', '$decomp'],
+ ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''],
+ ['IsDecoCompat', '$decomp =~ /^</', ''],
+ ['IsDCfont', '$decomp =~ /^<font>/', ''],
+ ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
+ ['IsDCinitial', '$decomp =~ /^<initial>/', ''],
+ ['IsDCinital', '$decomp =~ /^<medial>/', ''],
+ ['IsDCfinal', '$decomp =~ /^<final>/', ''],
+ ['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
+ ['IsDCcircle', '$decomp =~ /^<circle>/', ''],
+ ['IsDCsuper', '$decomp =~ /^<super>/', ''],
+ ['IsDCsub', '$decomp =~ /^<sub>/', ''],
+ ['IsDCvertical', '$decomp =~ /^<vertical>/', ''],
+ ['IsDCwide', '$decomp =~ /^<wide>/', ''],
+ ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
+ ['IsDCsmall', '$decomp =~ /^<small>/', ''],
+ ['IsDCsquare', '$decomp =~ /^<square>/', ''],
+ ['IsDCcompat', '$decomp =~ /^<compat>/', ''],
+
+# Number
+
+ ['Number', '$num', '$num'],
+
+# Mirrored
+
+ ['IsMirrored', '$mir eq "Y"', ''],
+
+# Arabic
+
+ ['ArabLink', '1', '$link'],
+ ['ArabLnkGrp', '1', '$linkgroup'],
+
+# Jamo
+
+ ['JamoShort', '1', '$short'],
+);
+
+# This is not written for speed...
+
+foreach $file (@todo) {
+ my ($table, $wanted, $val) = @$file;
+ next if @ARGV and not grep { $_ eq $table } @ARGV;
+ print $table,"\n";
+ if ($table =~ /^(Is|In|To)(.*)/) {
+ open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
+ }
+ else {
+ open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
+ }
+ print OUT <<"END";
+return <<'END';
+END
+ print OUT proplist($table, $wanted, $val);
+ print OUT "END\n";
+ close OUT;
+}
+
+# Must treat blocks specially.
+
+exit if @ARGV and not grep { $_ eq Block } @ARGV;
+print "Block\n";
+open(UD, 'blocks.txt') or die "Can't open blocks.txt: $!\n";
+open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
+print OUT <<"END";
+return <<'END';
+END
+
+while (<UD>) {
+ next if /^#/;
+ next if /^$/;
+ chomp;
+ ($code, $last, $name) = split(/; */);
+ if ($name) {
+ print OUT "$code $last $name\n";
+ $name =~ s/\s+//g;
+ open(BLOCK, ">In/$name.pl");
+ print BLOCK <<"END2";
+return <<'END';
+$code $last
+END
+END2
+ close BLOCK;
+ }
+}
+
+print OUT "END\n";
+close OUT;
+
+##################################################
+
+sub proplist {
+ my ($table, $wanted, $val) = @_;
+ my @wanted;
+ my $out;
+ my $split;
+
+ if ($table =~ /^Arab/) {
+ open(UD, "arabshp.txt") or warn "Can't open $table: $!";
+
+ $split = '($code, $name, $link, $linkgroup) = split(/; */);';
+ }
+ elsif ($table =~ /^Jamo/) {
+ open(UD, "jamo2.txt") or warn "Can't open $table: $!";
+
+ $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
+ }
+ else {
+ open(UD, "UnicodeData-Latest.txt") or warn "Can't open $table: $!";
+
+ $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
+ $comment, $up, $down, $title) = split(/;/);';
+ }
+
+ if ($table =~ /^(?:To|Is)[A-Z]/) {
+ eval <<"END";
+ while (<UD>) {
+ next if /^#/;
+ next if /^\s/;
+ chop;
+ $split
+ if ($wanted) {
+ push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
+ }
+ }
+END
+ die $@ if $@;
+
+ while (@wanted) {
+ $beg = shift @wanted;
+ $last = $beg;
+ while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
+ (not $val or $wanted[0]->[1] == $last->[1] + 1)) {
+ $last = shift @wanted;
+ }
+ $out .= sprintf "%04x", $beg->[0];
+ if ($beg->[2]) {
+ $last = shift @wanted;
+ }
+ if ($beg == $last) {
+ $out .= "\t";
+ }
+ else {
+ $out .= sprintf "\t%04x", $last->[0];
+ }
+ $out .= sprintf "\t%04x", $beg->[1] if $val;
+ $out .= "\n";
+ }
+ }
+ else {
+ eval <<"END";
+ while (<UD>) {
+ next if /^#/;
+ next if /^\s*\$/;
+ chop;
+ $split
+ if ($wanted) {
+ push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
+ }
+ }
+END
+ die $@ if $@;
+
+ while (@wanted) {
+ $beg = shift @wanted;
+ $last = $beg;
+ while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
+ ($wanted[0]->[1] eq $last->[1])) {
+ $last = shift @wanted;
+ }
+ $out .= sprintf "%04x", $beg->[0];
+ if ($beg->[2]) {
+ $last = shift @wanted;
+ }
+ if ($beg == $last) {
+ $out .= "\t";
+ }
+ else {
+ $out .= sprintf "\t%04x", $last->[0];
+ }
+ $out .= sprintf "\t%s\n", $beg->[1];
+ }
+ }
+ $out;
+}