1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
|
# chartables.pl - A perl program to generate tables for use by the
# Character class.
# Copyright (C) 1998, 1999 Red Hat, Inc.
#
# This file is part of libjava.
#
# This software is copyrighted work licensed under the terms of the
# Libjava License. Please consult the file "LIBJAVA_LICENSE" for
# details.
# This program requires a `unidata.txt' file of the form distributed
# on the Unicode 2.0 CD ROM. Or, get it more conveniently here:
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
# Version `2.1.8' of this file was last used to update the Character class.
# Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
# "The Java Language Specification", ISBN 0-201-63451-1
# plus online API docs for JDK 1.2 beta from http://www.javasoft.com.
# Usage: perl chartables.pl [-n] UnicodeData-VERSION.txt
# If this exits with nonzero status, then you must investigate the
# cause of the problem.
# Diagnostics and other information to stderr.
# This creates the new include/java-chartables.h and
# include/java-chardecomp.h files directly.
# With -n, the files are not created, but all processing
# still occurs.
# Fields in the table.
$CODE = 0;
$NAME = 1;
$CATEGORY = 2;
$DECOMPOSITION = 5;
$DECIMAL = 6;
$DIGIT = 7;
$NUMERIC = 8;
$UPPERCASE = 12;
$LOWERCASE = 13;
$TITLECASE = 14;
# A special case.
$TAMIL_DIGIT_ONE = 0x0be7;
$TAMIL_DIGIT_NINE = 0x0bef;
# These are endpoints of legitimate gaps in the tables.
$CJK_IDEOGRAPH_END = 0x9fa5;
$HANGUL_END = 0xd7a3;
$HIGH_SURROGATE_END = 0xdb7f;
$PRIVATE_HIGH_SURROGATE_END = 0xdbff;
$LOW_SURROGATE_END = 0xdfff;
$PRIVATE_END = 0xf8ff;
%title_to_upper = ();
%title_to_lower = ();
%numerics = ();
%name = ();
@digit_start = ();
@digit_end = ();
@space_start = ();
@space_end = ();
# @letter_start = ();
# @letter_end = ();
@all_start = ();
@all_end = ();
@all_cats = ();
@upper_start = ();
@upper_end = ();
@upper_map = ();
%upper_anom = ();
@lower_start = ();
@lower_end = ();
@lower_map = ();
%lower_anom = ();
@attributes = ();
# There are a few characters which actually need two attributes.
# These are special-cased.
$ROMAN_START = 0x2160;
$ROMAN_END = 0x217f;
%second_attributes = ();
$prevcode = -1;
$status = 0;
%category_map =
(
'Mn' => 'NON_SPACING_MARK',
'Mc' => 'COMBINING_SPACING_MARK',
'Me' => 'ENCLOSING_MARK',
'Nd' => 'DECIMAL_DIGIT_NUMBER',
'Nl' => 'LETTER_NUMBER',
'No' => 'OTHER_NUMBER',
'Zs' => 'SPACE_SEPARATOR',
'Zl' => 'LINE_SEPARATOR',
'Zp' => 'PARAGRAPH_SEPARATOR',
'Cc' => 'CONTROL',
'Cf' => 'FORMAT',
'Cs' => 'SURROGATE',
'Co' => 'PRIVATE_USE',
'Cn' => 'UNASSIGNED',
'Lu' => 'UPPERCASE_LETTER',
'Ll' => 'LOWERCASE_LETTER',
'Lt' => 'TITLECASE_LETTER',
'Lm' => 'MODIFIER_LETTER',
'Lo' => 'OTHER_LETTER',
'Pc' => 'CONNECTOR_PUNCTUATION',
'Pd' => 'DASH_PUNCTUATION',
'Ps' => 'START_PUNCTUATION',
'Pe' => 'END_PUNCTUATION',
'Pi' => 'START_PUNCTUATION',
'Pf' => 'END_PUNCTUATION',
'Po' => 'OTHER_PUNCTUATION',
'Sm' => 'MATH_SYMBOL',
'Sc' => 'CURRENCY_SYMBOL',
'Sk' => 'MODIFIER_SYMBOL',
'So' => 'OTHER_SYMBOL'
);
# These maps characters to their decompositions.
%canonical_decomposition = ();
%full_decomposition = ();
# Handle `-n' and open output files.
local ($f1, $f2) = ('include/java-chartables.h',
'include/java-chardecomp.h');
if ($ARGV[0] eq '-n')
{
shift @ARGV;
$f1 = '/dev/null';
$f2 = '/dev/null';
}
open (CHARTABLE, "> $f1");
open (DECOMP, "> $f2");
# Process the Unicode file.
while (<>)
{
chop;
# Specify a limit for split so that we pick up trailing fields.
# We make the limit larger than we need, to catch the case where
# there are extra fields.
@fields = split (';', $_, 30);
# Convert code to number.
$ncode = hex ($fields[$CODE]);
if ($#fields != 14)
{
print STDERR ("Entry for \\u", $fields[$CODE],
" has wrong number of fields: ", $#fields, "\n");
}
$name{$fields[$CODE]} = $fields[$NAME];
# If we've found a gap in the table, fill it in.
if ($ncode != $prevcode + 1)
{
&process_gap (*fields, $prevcode, $ncode);
}
&process_char (*fields, $ncode);
$prevcode = $ncode;
}
if ($prevcode != 0xffff)
{
# Setting of `fields' parameter doesn't matter here.
&process_gap (*fields, $prevcode, 0x10000);
}
print CHARTABLE "// java-chartables.h - Character tables for java.lang.Character -*- c++ -*-\n\n";
print CHARTABLE "#ifndef __JAVA_CHARTABLES_H__\n";
print CHARTABLE "#define __JAVA_CHARTABLES_H__\n\n";
print CHARTABLE "// These tables are automatically generated by the chartables.pl\n";
print CHARTABLE "// script. DO NOT EDIT the tables. Instead, fix the script\n";
print CHARTABLE "// and run it again.\n\n";
print CHARTABLE "// This file should only be included by natCharacter.cc\n\n";
$bytes = 0;
# Titlecase mapping tables.
if ($#title_to_lower != $#title_to_upper)
{
# If this fails we need to reimplement toTitleCase.
print STDERR "titlecase mappings have different sizes\n";
$status = 1;
}
# Also ensure that the tables are entirely parallel.
foreach $key (sort keys %title_to_lower)
{
if (! defined $title_to_upper{$key})
{
print STDERR "titlecase mappings have different entries\n";
$status = 1;
}
}
&print_single_map ("title_to_lower_table", %title_to_lower);
&print_single_map ("title_to_upper_table", %title_to_upper);
print CHARTABLE "#ifdef COMPACT_CHARACTER\n\n";
printf CHARTABLE "#define TAMIL_DIGIT_ONE 0x%04x\n\n", $TAMIL_DIGIT_ONE;
# All numeric values.
&print_numerics;
# Digits only.
&print_block ("digit_table", *digit_start, *digit_end);
# Space characters.
&print_block ("space_table", *space_start, *space_end);
# Letters. We used to generate a separate letter table. But this
# doesn't really seem worthwhile. Simply using `all_table' saves us
# about 800 bytes, and only adds 3 table probes to isLetter.
# &print_block ("letter_table", *letter_start, *letter_end);
# Case tables.
&print_case_table ("upper", *upper_start, *upper_end, *upper_map, *upper_anom);
&print_case_table ("lower", *lower_start, *lower_end, *lower_map, *lower_anom);
# Everything else.
&print_all_block (*all_start, *all_end, *all_cats);
print CHARTABLE "#else /* COMPACT_CHARACTER */\n\n";
printf CHARTABLE "#define ROMAN_START 0x%04x\n", $ROMAN_START;
printf CHARTABLE "#define ROMAN_END 0x%04x\n\n", $ROMAN_END;
&print_fast_tables (*all_start, *all_end, *all_cats,
*attributes, *second_attributes);
print CHARTABLE "#endif /* COMPACT_CHARACTER */\n\n";
print CHARTABLE "#endif /* __JAVA_CHARTABLES_H__ */\n";
printf STDERR "Approximately %d bytes of data generated (compact case)\n",
$bytes;
# Now generate decomposition tables.
printf DECOMP "// java-chardecomp.h - Decomposition character tables -*- c++ -*-\n\n";
printf DECOMP "#ifndef __JAVA_CHARDECOMP_H__\n";
printf DECOMP "#define __JAVA_CHARDECOMP_H__\n\n";
print DECOMP "// These tables are automatically generated by the chartables.pl\n";
print DECOMP "// script. DO NOT EDIT the tables. Instead, fix the script\n";
print DECOMP "// and run it again.\n\n";
print DECOMP "// This file should only be included by natCollator.cc\n\n";
print DECOMP "struct decomp_entry\n{\n";
print DECOMP " jchar key;\n";
print DECOMP " const char *value;\n";
print DECOMP "};\n\n";
&write_decompositions;
printf DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";
close (CHARTABLE);
close (DECOMP);
exit $status;
# Process a gap in the space.
sub process_gap
{
local (*fields, $prevcode, $ncode) = @_;
local (@gap_fields, $i);
if ($ncode == $CJK_IDEOGRAPH_END
|| $ncode == $HANGUL_END
|| $ncode == $HIGH_SURROGATE_END
|| $ncode == $PRIVATE_HIGH_SURROGATE_END
|| $ncode == $LOW_SURROGATE_END
|| $ncode == $PRIVATE_END)
{
# The characters in the gap we just found are known to
# have the same properties as the character at the end of
# the gap.
@gap_fields = @fields;
}
else
{
# This prints too much to be enabled.
# print STDERR "Gap found at \\u", $fields[$CODE], "\n";
@gap_fields = ('', '', 'Cn', '', '', '', '', '', '', '', '',
'', '', '', '');
}
for ($i = $prevcode + 1; $i < $ncode; ++$i)
{
$gap_fields[$CODE] = sprintf ("%04x", $i);
$gap_fields[$NAME] = "CHARACTER " . $gap_fields[$CODE];
&process_char (*gap_fields, $i);
}
}
# Process a single character.
sub process_char
{
local (*fields, $ncode) = @_;
if ($fields[$DECOMPOSITION] ne '')
{
&add_decomposition ($ncode, $fields[$DECOMPOSITION]);
}
# If this is a titlecase character, mark it.
if ($fields[$CATEGORY] eq 'Lt')
{
$title_to_upper{$fields[$CODE]} = $fields[$UPPERCASE];
$title_to_lower{$fields[$CODE]} = $fields[$LOWERCASE];
}
else
{
# For upper and lower case mappings, we try to build compact
# tables that map range onto range. We specifically want to
# avoid titlecase characters. Java specifies a range check to
# make sure the character is not between 0x2000 and 0x2fff.
# We avoid that here because we need to generate table entries
# -- toLower and toUpper still work in that range.
if ($fields[$UPPERCASE] eq ''
&& ($fields[$LOWERCASE] ne ''
|| $fields[$NAME] =~ /CAPITAL (LETTER|LIGATURE)/))
{
if ($fields[$LOWERCASE] ne '')
{
&update_case_block (*upper_start, *upper_end, *upper_map,
$fields[$CODE], $fields[$LOWERCASE]);
&set_attribute ($ncode, hex ($fields[$LOWERCASE]));
}
else
{
$upper_anom{$fields[$CODE]} = 1;
}
}
elsif ($fields[$LOWERCASE] ne '')
{
print STDERR ("Java missed upper case char \\u",
$fields[$CODE], "\n");
}
elsif ($fields[$CATEGORY] eq 'Lu')
{
# This case is for letters which are marked as upper case
# but for which there is no lower case equivalent. For
# instance, LATIN LETTER YR.
}
if ($fields[$LOWERCASE] eq ''
&& ($fields[$UPPERCASE] ne ''
|| $fields[$NAME] =~ /SMALL (LETTER|LIGATURE)/))
{
if ($fields[$UPPERCASE] ne '')
{
&update_case_block (*lower_start, *lower_end, *lower_map,
$fields[$CODE], $fields[$UPPERCASE]);
&set_attribute ($ncode, hex ($fields[$UPPERCASE]));
}
else
{
$lower_anom{$fields[$CODE]} = 1;
}
}
elsif ($fields[$UPPERCASE] ne '')
{
print STDERR ("Java missed lower case char \\u",
$fields[$CODE], "\n");
}
elsif ($fields[$CATEGORY] eq 'Ll')
{
# This case is for letters which are marked as lower case
# but for which there is no upper case equivalent. For
# instance, FEMININE ORDINAL INDICATOR.
}
}
# If we have a non-decimal numeric value, add it to the list.
if ($fields[$CATEGORY] eq 'Nd'
&& ($ncode < 0x2000 || $ncode > 0x2fff)
&& $fields[$NAME] =~ /DIGIT/)
{
# This is a digit character that is handled elsewhere.
}
elsif ($fields[$DIGIT] ne '' || $fields[$NUMERIC] ne '')
{
# Do a simple check.
if ($fields[$DECIMAL] ne '')
{
# This catches bugs in an earlier implementation of
# chartables.pl. Now it is here for historical interest
# only.
# print STDERR ("Character \u", $fields[$CODE],
# " would have been missed as digit\n");
}
local ($val) = $fields[$DIGIT];
$val = $fields[$NUMERIC] if $val eq '';
local ($ok) = 1;
# If we have a value which is not a positive integer, then we
# set the value to -2 to make life easier for
# Character.getNumericValue.
if ($val !~ m/^[0-9]+$/)
{
if ($fields[$CATEGORY] ne 'Nl'
&& $fields[$CATEGORY] ne 'No')
{
# This shows a few errors in the Unicode table. These
# characters have a missing Numeric field, and the `N'
# for the mirrored field shows up there instead. I
# reported these characters to errata@unicode.org on
# Thu Sep 10 1998. They said it will be fixed in the
# 2.1.6 release of the tables.
print STDERR ("Character \u", $fields[$CODE],
" has value but is not numeric; val = '",
$val, "'\n");
# We skip these.
$ok = 0;
}
$val = "-2";
}
if ($ok)
{
$numerics{$fields[$CODE]} = $val;
&set_attribute ($ncode, $val);
}
}
# We build a table that lists ranges of ordinary decimal values.
# At each step we make sure that the digits are in the correct
# order, with no holes, as this is assumed by Character. If this
# fails, reimplementation is required. This implementation
# dovetails nicely with the Java Spec, which has strange rules for
# what constitutes a decimal value. In particular the Unicode
# name must contain the word `DIGIT'. The spec doesn't directly
# say that digits must have type `Nd' (or that their value must an
# integer), but that can be inferred from the list of digits in
# the book(s). Currently the only Unicode characters whose name
# includes `DIGIT' which would not fit are the Tibetan "half"
# digits.
if ($fields[$CATEGORY] eq 'Nd')
{
if (($ncode < 0x2000 || $ncode > 0x2fff)
&& $fields[$NAME] =~ /DIGIT/)
{
&update_digit_block (*digit_start, *digit_end, $fields[$CODE],
$fields[$DECIMAL]);
&set_attribute ($ncode, $fields[$DECIMAL]);
}
else
{
# If this fails then Character.getType will fail. We
# assume that things in `digit_table' are the only
# category `Nd' characters.
print STDERR ("Character \u", $fields[$CODE],
" is class Nd but not in digit table\n");
$status = 1;
}
}
# Keep track of space characters.
if ($fields[$CATEGORY] =~ /Z[slp]/)
{
&update_block (*space_start, *space_end, $fields[$CODE]);
}
# Keep track of letters.
# if ($fields[$CATEGORY] =~ /L[ultmo]/)
# {
# &update_letter_block (*letter_start, *letter_end, $fields[$CODE],
# $fields[$CATEGORY]);
# }
# Keep track of all characters. You might think we wouldn't have
# to do this for uppercase letters, or other characters we already
# "classify". The problem is that this classification is
# different. E.g., \u216f is uppercase by Java rules, but is a
# LETTER_NUMBER here.
&update_all_block (*all_start, *all_end, *all_cats,
$fields[$CODE], $fields[$CATEGORY]);
}
# Called to add a new decomposition.
sub add_decomposition
{
local ($ncode, $value) = @_;
local ($is_full) = 0;
local ($first) = 1;
local (@decomp) = ();
foreach (split (' ', $value))
{
if ($first && /^\<.*\>$/)
{
$is_full = 1;
}
else
{
push (@decomp, hex ($_));
}
$first = 0;
}
# We pack the value into a string because this means we can stick
# with Perl 4 features.
local ($s) = pack "I*", @decomp;
if ($is_full)
{
$full_decomposition{$ncode} = $s;
}
else
{
$canonical_decomposition{$ncode} = $s;
}
}
# Write a single decomposition table.
sub write_single_decomposition
{
local ($name, $is_canon, %table) = @_;
printf DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";
local ($key, @expansion, $char);
local ($first_line) = 1;
for ($key = 0; $key <= 65535; ++$key)
{
next if ! defined $table{$key};
printf DECOMP ",\n"
unless $first_line;
$first_line = 0;
printf DECOMP " { 0x%04x, \"", $key;
# We represent the expansion as a series of bytes, terminated
# with a double nul. This is ugly, but relatively
# space-efficient. Most expansions are short, but there are a
# few that are very long (e.g. \uFDFA). This means that if we
# chose a fixed-space representation we would waste a lot of
# space.
@expansion = unpack "I*", $table{$key};
foreach $char (@expansion)
{
printf DECOMP "\\x%02x\\x%02x", ($char / 256), ($char % 256);
}
printf DECOMP "\" }";
}
printf DECOMP "\n};\n\n";
}
sub write_decompositions
{
&write_single_decomposition ('canonical', 1, %canonical_decomposition);
&write_single_decomposition ('full', 0, %full_decomposition);
}
# We represent a block of characters with a pair of lists. This
# function updates the pair to account for the new character. Returns
# 1 if we added to the old block, 0 otherwise.
sub update_block
{
local (*start, *end, $char) = @_;
local ($nchar) = hex ($char);
local ($count) = $#end;
if ($count >= 0 && $end[$count] == $nchar - 1)
{
++$end[$count];
return 1;
}
else
{
++$count;
$start[$count] = $nchar;
$end[$count] = $nchar;
}
return 0;
}
# Return true if we will be appending this character to the end of the
# existing block.
sub block_append_p
{
local (*end, $char) = @_;
return $#end >= 0 && $end[$#end] == $char - 1;
}
# This updates the digit block. This table is much like an ordinary
# block, but it has an extra constraint.
sub update_digit_block
{
local (*start, *end, $char, $value) = @_;
&update_block ($start, $end, $char);
local ($nchar) = hex ($char);
# We want to make sure that the new digit's value is correct for
# its place in the block. However, we special-case Tamil digits,
# since Tamil does not have a digit `0'.
local ($count) = $#start;
if (($nchar < $TAMIL_DIGIT_ONE || $nchar > $TAMIL_DIGIT_NINE)
&& $nchar - $start[$count] != $value)
{
# If this fails then Character.digit_value will be wrong.
print STDERR "Character \\u", $char, " violates digit constraint\n";
$status = 1;
}
}
# Update letter table. We could be smart about avoiding upper or
# lower case letters, but it is much simpler to just track them all.
sub update_letter_block
{
local (*start, *end, $char, $category) = @_;
&update_block (*start, *end, $char);
}
# Update `all' table. This table holds all the characters we don't
# already categorize for other reasons. FIXME: if a given type has
# very few characters, we should just inline the code. E.g., there is
# only one paragraph separator.
sub update_all_block
{
local (*start, *end, *cats, $char, $category) = @_;
local ($nchar) = hex ($char);
local ($count) = $#end;
if ($count >= 0
&& $end[$count] == $nchar - 1
&& $cats[$count] eq $category)
{
++$end[$count];
}
else
{
++$count;
$start[$count] = $nchar;
$end[$count] = $nchar;
$cats[$count] = $category;
}
}
# Update a case table. We handle case tables specially because we
# want to map (e.g.) a block of uppercase characters directly onto the
# corresponding block of lowercase characters. Therefore we generate
# a new entry when the block would no longer map directly.
sub update_case_block
{
local (*start, *end, *map, $char, $mapchar) = @_;
local ($nchar) = hex ($char);
local ($nmap) = hex ($mapchar);
local ($count) = $#end;
if ($count >= 0
&& $end[$count] == $nchar - 1
&& $nchar - $start[$count] == $nmap - $map[$count])
{
++$end[$count];
}
else
{
++$count;
$start[$count] = $nchar;
$end[$count] = $nchar;
$map[$count] = $nmap;
}
}
# Set the attribute value for the character. Each character can have
# only one attribute.
sub set_attribute
{
local ($ncode, $attr) = @_;
if ($attributes{$ncode} ne '' && $attributes{$ncode} ne $attr)
{
if ($ncode >= $ROMAN_START && $ncode <= $ROMAN_END)
{
$second_attributes{$ncode} = $attr;
}
else
{
printf STDERR "character \\u%04x already has attribute\n", $ncode;
}
}
# Attributes can be interpreted as unsigned in some situations,
# so we check against 65535. This could cause errors -- we need
# to check the interpretation here.
elsif ($attr < -32768 || $attr > 65535)
{
printf STDERR "attribute out of range for character \\u%04x\n", $ncode;
}
else
{
$attributes{$ncode} = $attr;
}
}
# Print a block table.
sub print_block
{
local ($title, *start, *end) = @_;
print CHARTABLE "static const jchar ", $title, "[][2] =\n";
print CHARTABLE " {\n";
local ($i) = 0;
while ($i <= $#start)
{
print CHARTABLE " { ";
&print_char ($start[$i]);
print CHARTABLE ", ";
&print_char ($end[$i]);
print CHARTABLE " }";
print CHARTABLE "," if ($i != $#start);
print CHARTABLE "\n";
++$i;
$bytes += 4; # Two bytes per char.
}
print CHARTABLE " };\n\n";
}
# Print the numerics table.
sub print_numerics
{
local ($i, $key, $count, @keys);
$i = 0;
@keys = sort keys %numerics;
$count = @keys;
print CHARTABLE "static const jchar numeric_table[] =\n";
print CHARTABLE " { ";
foreach $key (@keys)
{
&print_char (hex ($key));
++$i;
print CHARTABLE ", " if $i < $count;
# Print 5 per line.
print CHARTABLE "\n " if ($i % 5 == 0);
$bytes += 2; # One character.
}
print CHARTABLE " };\n\n";
print CHARTABLE "static const jshort numeric_value[] =\n";
print CHARTABLE " { ";
$i = 0;
foreach $key (@keys)
{
print CHARTABLE $numerics{$key};
if ($numerics{$key} > 32767 || $numerics{$key} < -32768)
{
# This means our generated type info is incorrect. We
# could just detect and work around this here, but I'm
# lazy.
print STDERR "numeric value won't fit in a short\n";
$status = 1;
}
++$i;
print CHARTABLE ", " if $i < $count;
# Print 10 per line.
print CHARTABLE "\n " if ($i % 10 == 0);
$bytes += 2; # One short.
}
print CHARTABLE " };\n\n";
}
# Print a table that maps one single letter onto another. It assumes
# the map is index by char code.
sub print_single_map
{
local ($title, %map) = @_;
local (@keys) = sort keys %map;
$num = @keys;
print CHARTABLE "static const jchar ", $title, "[][2] =\n";
print CHARTABLE " {\n";
$i = 0;
for $key (@keys)
{
print CHARTABLE " { ";
&print_char (hex ($key));
print CHARTABLE ", ";
&print_char (hex ($map{$key}));
print CHARTABLE " }";
++$i;
if ($i < $num)
{
print CHARTABLE ",";
}
else
{
print CHARTABLE " ";
}
print CHARTABLE " // ", $name{$key}, "\n";
$bytes += 4; # Two bytes per char.
}
print CHARTABLE " };\n\n";
}
# Print the `all' block.
sub print_all_block
{
local (*start, *end, *cats) = @_;
&print_block ("all_table", *start, *end);
local ($i) = 0;
local ($sum) = 0;
while ($i <= $#start)
{
$sum += $end[$i] - $start[$i] + 1;
++$i;
}
# We do this computation just to make sure it isn't cheaper to
# simply list all the characters individually.
printf STDERR ("all_table encodes %d characters in %d entries\n",
$sum, $#start + 1);
print CHARTABLE "static const jbyte category_table[] =\n";
print CHARTABLE " { ";
$i = 0;
while ($i <= $#cats)
{
if ($i > 0 && $cats[$i] eq $cats[$i - 1])
{
# This isn't an error. We can have a duplicate because
# two ranges are not adjacent while the intervening
# characters are left out of the table for other reasons.
# We could exploit this to make the table a little smaller.
# printf STDERR "Duplicate all entry at \\u%04x\n", $start[$i];
}
print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]};
print CHARTABLE ", " if ($i < $#cats);
++$i;
print CHARTABLE "\n ";
++$bytes;
}
print CHARTABLE " };\n\n";
}
# Print case table.
sub print_case_table
{
local ($title, *start, *end, *map, *anomalous) = @_;
&print_block ($title . '_case_table', *start, *end);
print CHARTABLE "static const jchar ", $title, "_case_map_table[] =\n";
print CHARTABLE " { ";
local ($i) = 0;
while ($i <= $#map)
{
&print_char ($map[$i]);
print CHARTABLE ", " if $i < $#map;
++$i;
print CHARTABLE "\n " if $i % 5 == 0;
$bytes += 2;
}
print CHARTABLE " };\n";
local ($key, @keys);
@keys = sort keys %anomalous;
if ($title eq 'upper')
{
if ($#keys >= 0)
{
# If these are found we need to change Character.isUpperCase.
print STDERR "Found anomalous upper case characters\n";
$status = 1;
}
}
else
{
print CHARTABLE "\n";
print CHARTABLE "static const jchar ", $title, "_anomalous_table[] =\n";
print CHARTABLE " { ";
$i = 0;
foreach $key (@keys)
{
&print_char (hex ($key));
print CHARTABLE ", " if $i < $#keys;
++$i;
print CHARTABLE "\n " if $i % 5 == 0;
$bytes += 2;
}
print CHARTABLE " };\n";
}
print CHARTABLE "\n";
}
# Print the type table and attributes table for the fast version.
sub print_fast_tables
{
local (*start, *end, *cats, *atts, *second_atts) = @_;
print CHARTABLE "static const jbyte type_table[] =\n{ ";
local ($i, $j);
for ($i = 0; $i <= $#cats; ++$i)
{
for ($j = $start[$i]; $j <= $end[$i]; ++$j)
{
print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]};
print CHARTABLE "," if ($i < $#cats || $j < $end[$i]);
print CHARTABLE "\n ";
}
}
print CHARTABLE "\n };\n\n";
print CHARTABLE "static const jshort attribute_table[] =\n{ ";
for ($i = 0; $i <= 0xffff; ++$i)
{
$atts{$i} = 0 if ! defined $atts{$i};
print CHARTABLE $atts{$i};
print CHARTABLE ", " if $i < 0xffff;
print CHARTABLE "\n " if $i % 5 == 1;
}
print CHARTABLE "\n };\n\n";
print CHARTABLE "static const jshort secondary_attribute_table[] =\n{ ";
for ($i = $ROMAN_START; $i <= $ROMAN_END; ++$i)
{
print CHARTABLE $second_atts{$i};
print CHARTABLE ", " if $i < $ROMAN_END;
print CHARTABLE "\n " if $i % 5 == 1;
}
print CHARTABLE "\n };\n\n";
}
# Print a character constant.
sub print_char
{
local ($ncode) = @_;
printf CHARTABLE "0x%04x", $ncode;
}
|