Initial commit. Generates character.uni, block.uni, and titelcase.uni.

author: Paul Fisher <rao@gnu.org> 1998-08-09 23:18:28 +0000
committer: Paul Fisher <rao@gnu.org> 1998-08-09 23:18:28 +0000
commit: 1f99d15ad8bc63791d2b67a31ceb6be194c4a6cc (patch)
tree: 4708be905f26b6e5235d3543ffa471c9400ff867
parent: c5ffc63a3c55d65882fee858e396d5ca0bfd690a (diff)
download: classpath-1f99d15ad8bc63791d2b67a31ceb6be194c4a6cc.tar.gz
1 files changed, 248 insertions, 0 deletions
diff --git a/doc/unicode/unicode-muncher.pl b/doc/unicode/unicode-muncher.pl
new file mode 100644
index 000000000..cf68b24b4
--- /dev/null
+++ b/doc/unicode/unicode-muncher.pl
@@ -0,0 +1,248 @@
+#!/usr/bin/perl
+# This is lots of really ugly hacked up Perl code.
+# It works.  Don't touch it, or it'll break.
+# I'm far from proud of it.  If you want to fix it so it
+# works with strict mode, please do so.
+# See unicode.database.format for information on the files
+# this program generates.
+
+# usage: unicode-muncher.pl UnicodeData-2.1.2.txt
+
+# Code for reading BLOCKS.TXT and generating stubs for Character.Subset
+#
+#  open (BLOCKS, $ARGV[1]) || die "Can't open Unicode block file: $!\n";
+#  &read_block;
+#
+#  sub read_block {
+#      ($start, $end, $block) = split(/; /, <BLOCKS>);
+#      chop $block; chop $block;
+#      if ($start =~ /#./) {
+#  	&read_block;
+#      } else { 
+#  	$curr_block++;
+#
+#  # Generate Character.Subset information	
+#  #	$block =~ tr/a-z/A-Z/;
+#  #	$block =~ y/ ,-/_/;
+#  #	print "$block = new Subset('\\u$start', '\\u$end'),\n";
+#
+#  	$start = hex($start);
+#  	$end   = hex($end);
+#
+#      };
+#  }
+
+sub write_char {
+    $S = $jnobreakspace << 5;
+    print CHAR 
+      (pack ("C", $S | $cat));
+    print CHAR (pack ("n", $jnum));
+    print CHAR (pack ("n", $jupper));
+    print CHAR (pack ("n", $jlower));
+}
+
+sub end_block {
+    print BLOCK (pack ("n", $start));
+    print BLOCK (pack ("n", $lhex));
+    $comp = 0 if ($comp != 1);
+    print BLOCK (pack ("C", $comp));
+    print BLOCK (pack ("N", $offset));
+
+    # calculate next offset
+    if ($comp == 1) { 
+	$offset += 7; 
+    } else { 
+	$offset += (7 * ($lhex - $start + 1)); 
+    }
+    # setup next starting block
+    $start = $hex;
+    # default to unknown compression
+    $comp = 2;
+}
+
+open (DATA, $ARGV[0]) || die "Can't open Unicode attribute file: $!\n";
+
+open (TITLECASE, ">titlecase.uni") || die ("Can't open titlecase file: $!\n");
+open (CHAR, ">character.uni") || die ("Can't open character file: $!\n");
+open (BLOCK, ">block.uni") || die ("Can't open block file: $!\n");
+
+$| = 1;
+print "GNU Classpath Unicode Attribute Database Generator 1.0\n";
+print "Copyright (C) 1998, Free Software Foundation, Inc.\n";
+print "Creating";
+
+$offset = 0;
+$hex = -1;
+while (<DATA>) {
+    $llhex = $lhex;
+    $lhex = $hex;
+    ($hex, $name, $category, $comb_class, $bidir, $decomp, $decimal, $digit,
+     $numeric, $mirrored, $unicode1n, $comment, $upcase, $lowcase, $titlecase)
+	= split(/;/);
+
+    print "." if (++$count % 1000 == 0);
+	
+    # convert unicode index strings to hex
+    $hex = hex($hex);
+    $upcase = hex($upcase);
+    $lowcase = hex($lowcase);
+    $titlecase = hex($titlecase);
+
+# read blocks - code used only when counting absolute blocks
+#    print "end of block\n" if (($lhex+1) != $hex);
+#     # read blocks until we find a valid range
+#    while ($hex > $end) {
+#	&read_block;
+#    }
+#    # $curr_block now contains the correct block
+
+# handle isIdentifierIgnorable()
+# this is according to the javadoc spec, which is broken.
+#    $jidentignore =  ($category =~ /(Cf)|(Cc)/ && 
+#		      !($unicode1n =~ /SEPARATOR|RETURN|TAB|FEED/)) ? 1 : 0;
+
+    $jnobreakspace = ($category eq "Zs" && $decomp =~ /noBreak/) ? 1 : 0;
+
+    # handle getNumericValue()
+    if ($numeric eq "") {
+	$jnum = 65535;		# does not have a numeric value
+    } else {
+	if ($numeric =~ /^[0-9]+$/) {
+	    $jnum = $numeric;	# nonnegative integer value
+	    die "sanity: numeric out of range!\n" if ($jnum >= 65534);
+	} else { 
+	    $jnum = 65534;	# other integer value
+	}
+    }
+
+    # handle uppercase mapping
+    $jupper = $upcase;
+
+    # handle lowercase mapping
+    $jlower = $lowcase;
+
+    # handle titlecase mapping - go ahead and output to file
+    if ($titlecase != $upcase && $titlecase != 0 && $upcase != 0) {
+	print TITLECASE (pack ("n2", $hex, $titlecase));
+    }
+
+    # handle category
+    $_ = $category;
+    CAT: {
+	if (/Cn/) { $cat = 0; last CAT; } # unassigned
+	if (/Lu/) { $cat = 1; last CAT; } # letter, uppercase
+	if (/Ll/) { $cat = 2; last CAT; } # letter, lowercase
+	if (/Lt/) { $cat = 3; last CAT; } # letter, titlecase
+	if (/Lm/) { $cat = 4; last CAT; } # letter, modifier
+	if (/Lo/) { $cat = 5; last CAT; } # letter, other
+	if (/Mn/) { $cat = 6; last CAT; } # mark, non-spacing
+	if (/Me/) { $cat = 7; last CAT; } # mark, enclosing
+	if (/Mc/) { $cat = 8; last CAT; } # mark, spacing combining
+	if (/Nd/) { $cat = 9; last CAT; } # number, decimal digit
+	if (/Nl/) { $cat = 10; last CAT; } # number, letter
+	if (/No/) { $cat = 11; last CAT; } # number, other
+	if (/Zs/) { $cat = 12; last CAT; } # separator, space
+	if (/Zl/) { $cat = 13; last CAT; } # separator, line
+	if (/Zp/) { $cat = 14; last CAT; } # separator, paragraph
+	if (/Cc/) { $cat = 15; last CAT; } # other, control
+	if (/Cf/) { $cat = 16; last CAT; } # other, format
+	# Sun skipped 17 - don't ask me why -- rao
+	if (/Co/) { $cat = 18; last CAT; } # other, private use
+	if (/Cs/) { $cat = 19; last CAT; } # other, surrogate
+	if (/Pd/) { $cat = 20; last CAT; } # punctuation, dash
+	if (/Ps/) { $cat = 21; last CAT; } # punctuation, open
+	if (/Pe/) { $cat = 22; last CAT; } # punctuation, close
+	if (/Pc/) { $cat = 23; last CAT; } # punctuation, connector
+	if (/Po/) { $cat = 24; last CAT; } # punctuation, other
+	if (/Sm/) { $cat = 25; last CAT; } # symbol, math
+	if (/Sc/) { $cat = 26; last CAT; } # symbol, currency
+	if (/Sk/) { $cat = 27; last CAT; } # symbol, modifier
+	if (/So/) { $cat = 28; last CAT; } # symbol, other
+    }
+
+  CHAR: {
+      # starting point
+      if ($hex == 0) { 
+	  &write_char;
+	  $comp = 2;		# compressed block state unknown until next ch
+	  $start = 0;
+	  $offset = 0;
+	  last CHAR; 
+      }
+
+      # handle mandatory blocks
+      if ($name =~ /First>$/) {
+	  &end_block;
+	  &write_char;
+	  $comp = 1;
+	  $mand_block = 1;
+	  last CHAR;
+      }
+      # end mandatory block
+      if ($mand_block) {
+	  $mand_block = 0;
+	  last CHAR;
+      }
+
+      # not sequential, end block.
+      if (($lhex+1) != $hex) { 
+	  &end_block; 
+	  &write_char; 
+	  last CHAR; 
+      }
+
+      # check to see if we can compress this character into the current block
+      if ($cat == $lcat &&
+	  $jnum == $ljnum &&
+	  $jnobreakspace == $ljnobreakspace &&
+	  $jupper == $ljupper &&
+	  $jlower == $ljlower) {
+	  if ($comp == 2) { $comp = 1; } # start compressing
+	  # end uncompressed block
+	  if ($comp == 0) { 
+	      $tmp = $lhex;
+	      $lhex = $llhex;
+	      &end_block;
+	      $start = $lhex = $tmp;
+	      $comp = 1; 
+	  }
+      } else { 
+  	  if ($comp == 2) { $comp = 0; };
+  	  if ($comp == 1) { &end_block; $comp = 2; }
+  	  &write_char;
+      }
+  }
+
+    # copy over all the variables to their "last" counterparts
+    $lcat = $cat;
+    $ljnum = $jnum;
+    $ljnobreakspace = $jnobreakspace;
+    $ljupper = $jupper;
+    $ljlower = $jlower;
+}
+$lhex = $hex;			# setup final block write
+&end_block;			# write final block
+
+close(DATA);
+close(TITLECASE);
+close(CHAR);
+close(BLOCK);
+
+print "ok\n";
+
+# Second step of compression -- elimate duplicate compressed blocks
+# in char.uni.  Should save about 2k; might implement later.
+#
+#  open (DATA, "<block.uni") || die ("Can't open block file: $!\n");
+#  open (CHAR, "<character.uni") || die ("Can't open char file: $!\n");
+#  $offset = 0;
+#  $comp = 0;
+#  while (read DATA, $buf, 9) {
+#      ($start, $end, $comp, $off) = unpack("nnCN", $buf);
+#      if ($comp == 1) {
+#  	seek CHAR, $off, 0;
+#  	read CHAR, $cbuf, 7;
+#  	push @$cbuf, $off;
+#  	$arrays{$cbuf} = $cbuf;
+#      }
+#  }
author	Paul Fisher <rao@gnu.org>	1998-08-09 23:18:28 +0000
committer	Paul Fisher <rao@gnu.org>	1998-08-09 23:18:28 +0000
commit	1f99d15ad8bc63791d2b67a31ceb6be194c4a6cc (patch)
tree	4708be905f26b6e5235d3543ffa471c9400ff867
parent	c5ffc63a3c55d65882fee858e396d5ca0bfd690a (diff)
download	classpath-1f99d15ad8bc63791d2b67a31ceb6be194c4a6cc.tar.gz