summaryrefslogtreecommitdiff
path: root/codepage/gensubset.pl
diff options
context:
space:
mode:
Diffstat (limited to 'codepage/gensubset.pl')
-rwxr-xr-xcodepage/gensubset.pl38
1 files changed, 38 insertions, 0 deletions
diff --git a/codepage/gensubset.pl b/codepage/gensubset.pl
new file mode 100755
index 00000000..5fde460f
--- /dev/null
+++ b/codepage/gensubset.pl
@@ -0,0 +1,38 @@
+#!/usr/bin/perl
+#
+# Generate a subset of the UnicodeData.txt file, available from
+# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
+#
+# Usage:
+#   gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt
+#
+# A subset-file line counts only if exactly two fields remain once '#'
+# comments are stripped; the second field is read as a hex code point.
+#
+
+use strict;
+use warnings;
+
+my %need_these;    # code point (integer) => number of requests
+
+foreach my $file (@ARGV) {
+    open(my $fh, '<', $file) or die "$0: cannot open '$file': $!\n";
+    while (defined(my $line = <$fh>)) {
+        $line =~ s/\s*(\#.*|)$//;     # Remove comments and final blanks
+        my @f = split(' ', $line);    # ' ' also ignores leading blanks
+        $need_these{hex $f[1]}++ if (scalar @f == 2);
+    }
+    close($fh);
+}
+
+# Echo every input record (or range member) whose code point is needed.
+while (defined(my $line = <STDIN>)) {
+    my ($v, $rest) = split(/;/, $line, 2);
+    next unless defined $rest;        # No ';': not a data record
+    next unless $v =~ /^([0-9a-f]+)(?:\-([0-9a-f]+))?$/i;
+    my $r1 = hex $1;                  # "XXXX" or "XXXX-YYYY"
+    my $r2 = defined $2 ? hex $2 : $r1;
+    for my $r ($r1 .. $r2) {
+        printf "%04X;%s", $r, $rest if ($need_these{$r});
+    }
+}