diff options
Diffstat (limited to 'libjava/scripts/unicode-decomp.pl')
-rwxr-xr-x | libjava/scripts/unicode-decomp.pl | 146 |
1 files changed, 146 insertions, 0 deletions
#!/usr/bin/perl -w
# unicode-decomp.pl - script to generate database for java.text.Collator
# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
#
# This file is part of libjava.
#
# This software is copyrighted work licensed under the terms of the
# Libjava License. Please consult the file "LIBJAVA_LICENSE" for
# details.

# Code for reading UnicodeData.txt and generating the character
# decomposition tables (java-chardecomp.h). For now, the relevant
# Unicode definition files are found in libjava/gnu/gcj/convert/.
#
# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <java-chardecomp.h>
#   where <UnicodeData.txt> is obtained from www.unicode.org (named
#   UnicodeData-3.0.0.txt for Unicode version 3.0.0), and
#   <java-chardecomp.h> is the final location of
#   include/java-chardecomp.h.
#   As of JDK 1.4, use Unicode version 3.0.0 for best results.
#
# If this exits with nonzero status, then you must investigate the
# cause of the problem.
# Diagnostics and other information to stderr.
# With -n, the files are not created, but all processing still occurs.

use strict;
use warnings;

# These map characters to their decompositions. Keys are numeric code
# points; values are the decomposition packed with pack "n*" (one
# 16-bit big-endian unit per decomposed character).
# NOTE(review): pack "n" only holds 16 bits and the table writer only
# emits keys up to 0xffff, so code points above U+FFFF are not
# representable here -- confirm before feeding newer Unicode data.
my %canonical_decomposition = ();
my %full_decomposition = ();

# Output handle for the generated header. It is file-scoped so that
# the table-writing subroutines at the bottom of the file can use it.
my $decomp_fh;

# Handle `-n' and validate the command line.
if (@ARGV && $ARGV[0] eq '-n')
{
    shift @ARGV;
    # Only substitute the output name when an input file was supplied;
    # otherwise a bare `-n' would leave an undefined input file name
    # and still satisfy the argument-count check below.
    $ARGV[1] = '/dev/null' if @ARGV;
}
die "Usage: $0 [-n] <UnicodeData.txt> <java-chardecomp.h>\n"
    unless @ARGV == 2;
my ($unicode_file, $output_file) = @ARGV;

# Three-argument open: the file names are never parsed for a mode.
open (my $unicode_fh, '<', $unicode_file)
    or die "Can't open Unicode attribute file: $!\n";

# Process the Unicode file.
$| = 1;
my $count = 0;
print STDERR "Parsing attributes file";
while (my $line = <$unicode_fh>)
{
    # Progress indicator: one dot per 1000 input lines.
    print STDERR "." unless $count++ % 1000;
    chomp $line;
    $line =~ s/\r//g;

    # Field 0 is the code point, field 5 the decomposition mapping.
    my ($ch, undef, undef, undef, undef, $decomp) = split /;/, $line;

    # Skip characters without a decomposition, and short or blank
    # lines for which the field does not exist at all.
    next unless defined $decomp && $decomp ne '';
    $ch = hex ($ch);

    my $is_full = 0;
    my @decomp = ();
    foreach my $item (split (' ', $decomp))
    {
        # A <tag> marks the mapping as a non-canonical decomposition;
        # the tag itself is not part of the expansion.
        if ($item =~ /^\<.*\>$/)
        {
            $is_full = 1;
            next;
        }
        push (@decomp, hex ($item));
    }

    my $s = pack "n*", @decomp;
    if ($is_full)
    {
        $full_decomposition{$ch} = $s;
    }
    else
    {
        $canonical_decomposition{$ch} = $s;
    }
}
close ($unicode_fh);

# Now generate decomposition tables.
open ($decomp_fh, '>', $output_file)
    or die "Can't open output file: $!\n";
print STDERR "\nGenerating tables\n";
print {$decomp_fh} <<EOF;
// java-chardecomp.h - Decomposition character tables -*- c++ -*-

#ifndef __JAVA_CHARDECOMP_H__
#define __JAVA_CHARDECOMP_H__


// These tables are automatically generated by the $0
// script. DO NOT EDIT the tables. Instead, fix the script
// and run it again.

// This file should only be included by natCollator.cc

struct decomp_entry
{
 jchar key;
 const char *value;
};

EOF

write_decompositions ();

print {$decomp_fh} "#endif /* __JAVA_CHARDECOMP_H__ */\n";

# Buffered write errors only surface at close, so check it.
close ($decomp_fh) or die "Can't close output file: $!\n";
print STDERR "Done\n";
exit;


# Write a single decomposition table to the output header.
#   $name     - table name prefix; the emitted C array is called
#               "${name}_decomposition".
#   $is_canon - accepted from the caller but currently unused.
#   %table    - map from code point to packed decomposition string.
# Entries are emitted in ascending key order; keys above 0xffff are
# not emitted.
sub write_single_decomposition
{
    my ($name, $is_canon, %table) = @_;

    print {$decomp_fh}
        "static const decomp_entry ${name}_decomposition[] =\n{\n";

    # We represent the expansion as a series of bytes, terminated
    # with a double nul. This is ugly, but relatively
    # space-efficient. Most expansions are short, but there are a
    # few that are very long (e.g. \uFDFA). This means that if we
    # chose a fixed-space representation we would waste a lot of
    # space.
    # NOTE(review): no explicit terminator bytes are written here; the
    # only terminator is the single NUL that the C string literal
    # supplies -- confirm this matches what the consumer expects.
    my @entries = ();
    foreach my $key (sort { $a <=> $b } grep { $_ <= 0xffff } keys %table)
    {
        # Each 16-bit unit becomes two hex escapes, high byte first.
        my $bytes = join ('',
                          map { sprintf ("\\x%02x\\x%02x", $_ >> 8, $_ & 0xff) }
                          unpack ("n*", $table{$key}));
        push (@entries, sprintf (" { 0x%04x, \"%s\" }", $key, $bytes));
    }

    print {$decomp_fh} join (",\n", @entries);
    print {$decomp_fh} "\n};\n\n";
    return;
}

# Write both tables: first the canonical one, then the full one.
sub write_decompositions
{
    write_single_decomposition ('canonical', 1, %canonical_decomposition);
    write_single_decomposition ('full', 0, %full_decomposition);
    return;
}