summaryrefslogtreecommitdiff
path: root/libguile/unidata_to_charset.awk
diff options
context:
space:
mode:
Diffstat (limited to 'libguile/unidata_to_charset.awk')
-rw-r--r--libguile/unidata_to_charset.awk409
1 files changed, 409 insertions, 0 deletions
diff --git a/libguile/unidata_to_charset.awk b/libguile/unidata_to_charset.awk
new file mode 100644
index 000000000..11dfb2686
--- /dev/null
+++ b/libguile/unidata_to_charset.awk
@@ -0,0 +1,409 @@
+# unidata_to_charset.awk --- Compute SRFI-14 charsets from UnicodeData.txt
+#
+# Copyright (C) 2009, 2010, 2022 Free Software Foundation, Inc.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 3 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+# Utilities
+###########
+
+# Write MESSAGE to standard error and terminate with exit code STATUS.
+function die(status, message) {
+    exit_status = status;
+    print "unidata_to_charset.awk:", message | "cat 1>&2";
+    exit status;
+}
+
+# Parse the string S, made of the digits 0-9 and uppercase A-F, as a
+# hexadecimal number and return its value; any other input is fatal.
+# Note that R, C, B, I, and N are local variables that need not be set
+# by callers.  Most Awk implementations have an 'strtonum' function
+# that we could use, but it is not part of POSIX.
+function hex(s, r, c, b, i, n) {
+    n = length(s);
+    if (n == 0) {
+        die(1, "Cannot parse empty string as hexadecimal.");
+    }
+    r = 0;
+    for (i = 1; i <= n; i++) {
+        c = substr(s, i, 1);
+        if (c == "0") { b = 0; }
+        else if (c == "1") { b = 1; }
+        else if (c == "2") { b = 2; }
+        else if (c == "3") { b = 3; }
+        else if (c == "4") { b = 4; }
+        else if (c == "5") { b = 5; }
+        else if (c == "6") { b = 6; }
+        else if (c == "7") { b = 7; }
+        else if (c == "8") { b = 8; }
+        else if (c == "9") { b = 9; }
+        else if (c == "A") { b = 10; }
+        else if (c == "B") { b = 11; }
+        else if (c == "C") { b = 12; }
+        else if (c == "D") { b = 13; }
+        else if (c == "E") { b = 14; }
+        else if (c == "F") { b = 15; }
+        else { die(1, "Invalid hexadecimal character: " c); }
+        r = r * 16 + b;
+    }
+    return r;
+}
+
+# Program initialization
+########################
+
+BEGIN {
+    # UnicodeData.txt separates its columns with semicolons.
+    FS = ";";
+
+    # Set by 'die'; a non-zero value makes the END block bail out early.
+    exit_status = 0;
+
+    # Names of the charsets to emit, in output order.
+    all_charsets_count = 0;
+    all_charsets[all_charsets_count++] = "lower_case";
+    all_charsets[all_charsets_count++] = "upper_case";
+    all_charsets[all_charsets_count++] = "title_case";
+    all_charsets[all_charsets_count++] = "letter";
+    all_charsets[all_charsets_count++] = "digit";
+    all_charsets[all_charsets_count++] = "hex_digit";
+    all_charsets[all_charsets_count++] = "letter_plus_digit";
+    all_charsets[all_charsets_count++] = "graphic";
+    all_charsets[all_charsets_count++] = "whitespace";
+    all_charsets[all_charsets_count++] = "printing";
+    all_charsets[all_charsets_count++] = "iso_control";
+    all_charsets[all_charsets_count++] = "punctuation";
+    all_charsets[all_charsets_count++] = "symbol";
+    all_charsets[all_charsets_count++] = "blank";
+    all_charsets[all_charsets_count++] = "ascii";
+    all_charsets[all_charsets_count++] = "empty";
+    all_charsets[all_charsets_count++] = "designated";
+
+    # No charset has a pending range or any completed ranges yet.
+    for (i = 0; i < all_charsets_count; i++) {
+        cs = all_charsets[i];
+        state[cs, "count"] = 0;
+        state[cs, "start"] = -1;
+        state[cs, "end"] = -1;
+    }
+}
+
+# Record initialization
+#######################
+
+# Runs for every record: name the fields used by the rules below and
+# reset the per-record state.
+{
+    name = $2;
+    category = $3;
+    uppercase = $13;
+    lowercase = $14;
+
+    # A record covers a single code point unless it opens a range.
+    codepoint = codepoint_end = hex($1);
+    charset_count = 0;
+}
+
+# Some pairs of lines in UnicodeData.txt delimit ranges of characters:
+# a "..., First>" record followed by a "..., Last>" record.  The second
+# record is consumed here and supplies the end of the range.
+name ~ /First>$/ {
+    last_name = name;
+    sub(/First>$/, "Last>", last_name);
+    # 'getline' returns 0 at end of input and -1 on error, so a missing
+    # or mismatched "Last>" record is reported the same way.
+    if ((getline) <= 0 || last_name != $2) {
+        die(1, "Invalid range in Unicode data.");
+    }
+    codepoint_end = hex($1);
+}
+
+# Character set predicates
+##########################
+
+## The lower_case character set
+###############################
+
+# For Unicode, we follow Java's specification.  A character is
+# lowercase when all of the following hold:
+#  * it lies outside the range [U+2000,U+2FFF] ([8192,12287]);
+#  * the Unicode attribute table gives no lowercase mapping for it;
+#  * at least one of the following is true:
+#     o the Unicode attribute table gives a mapping to uppercase
+#       for the character, or
+#     o the name for the character in the Unicode attribute table
+#       contains the words "SMALL LETTER" or "SMALL LIGATURE".
+# Records satisfying all three conditions join "lower_case".
+
+!(codepoint >= 8192 && codepoint <= 12287) &&
+lowercase == "" &&
+(uppercase != "" || name ~ /(SMALL LETTER|SMALL LIGATURE)/) {
+    charsets[charset_count++] = "lower_case";
+}
+
+## The upper_case character set
+###############################
+
+# For Unicode, we follow Java's specification.  A character is
+# uppercase when all of the following hold:
+#  * it lies outside the range [U+2000,U+2FFF] ([8192,12287]);
+#  * the Unicode attribute table gives no uppercase mapping for it
+#    (this excludes titlecase characters);
+#  * at least one of the following is true:
+#     o the Unicode attribute table gives a mapping to lowercase
+#       for the character, or
+#     o the name for the character in the Unicode attribute table
+#       contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
+
+!(codepoint >= 8192 && codepoint <= 12287) &&
+uppercase == "" &&
+(lowercase != "" || name ~ /(CAPITAL LETTER|CAPITAL LIGATURE)/) {
+    charsets[charset_count++] = "upper_case";
+}
+
+## The title_case character set
+###############################
+
+# The titlecase characters are exactly those whose category in the
+# character attribute database is Lt.
+
+category == "Lt" {
+    charsets[charset_count++] = "title_case";
+}
+
+## The letter character set
+###########################
+
+# A letter is any character with one of the letter categories in the
+# Unicode character database:
+#   Lu (uppercase), Ll (lowercase), Lt (titlecase),
+#   Lm (modifier), Lo (other).
+# Every letter is also a member of "letter_plus_digit", the union of
+# the "letter" and "digit" character sets.
+
+category ~ /^L[ultmo]$/ {
+    charsets[charset_count++] = "letter";
+    charsets[charset_count++] = "letter_plus_digit";
+}
+
+## The digit character set
+##########################
+
+# A digit is any character whose category in the character attribute
+# database is Nd.  Latin-1 and ASCII only have 0123456789; Unicode has
+# further digits in other code blocks, such as the Gujarati and
+# Tibetan digits.  Every digit also joins "letter_plus_digit".
+
+category == "Nd" {
+    charsets[charset_count++] = "digit";
+    charsets[charset_count++] = "letter_plus_digit";
+}
+
+## The hex_digit character set
+##############################
+
+# Only 0-9 (48-57), A-F (65-70), and a-f (97-102) are hex digits.
+
+(48 <= codepoint && codepoint <= 57) ||
+(65 <= codepoint && codepoint <= 70) ||
+(97 <= codepoint && codepoint <= 102) {
+    charsets[charset_count++] = "hex_digit";
+}
+
+## The graphic character set
+############################
+
+# Characters that would 'use ink' when printed; all are "printing" too.
+
+category ~ /L|M|N|P|S/ {
+    charsets[charset_count++] = "graphic";
+    charsets[charset_count++] = "printing";
+}
+
+## The whitespace character set
+###############################
+
+# A whitespace character (also counted as "printing") either has one
+# of the separator categories Zs, Zl or Zp (space, line, paragraph)
+# in the Unicode character database, or is one of
+#  * U+0009 (09) Horizontal tabulation (\t control-I)
+#  * U+000A (10) Line feed (\n control-J)
+#  * U+000B (11) Vertical tabulation (\v control-K)
+#  * U+000C (12) Form feed (\f control-L)
+#  * U+000D (13) Carriage return (\r control-M)
+
+(codepoint >= 9 && codepoint <= 13) ||
+category ~ /Zs|Zl|Zp/ {
+    charsets[charset_count++] = "whitespace";
+    charsets[charset_count++] = "printing";
+}
+
+## The iso_control character set
+################################
+
+# The ISO control characters are the Unicode/Latin-1 characters in
+# [U+0000,U+001F] ([0,31]) and in [U+007F,U+009F] ([127,159]).
+
+(0 <= codepoint && codepoint <= 31) ||
+(127 <= codepoint && codepoint <= 159) {
+    charsets[charset_count++] = "iso_control";
+}
+
+## The punctuation character set
+################################
+
+# A punctuation character is any character that has one of the
+# punctuation categories in the Unicode character database (Pc, Pd,
+# Ps, Pe, Pi, Pf, or Po).
+
+# Note that SRFI-14 gives conflicting requirements here: it claims
+# that only the Unicode punctuation is necessary, yet it explicitly
+# calls out the soft hyphen (U+00AD) as punctuation.  Current versions
+# of Unicode consider U+00AD to be a formatting character, not
+# punctuation, so this rule leaves it out.
+
+category ~ /P/ {
+    charsets[charset_count++] = "punctuation";
+}
+
+## The symbol character set
+###########################
+
+# A symbol is any character whose category in the Unicode character
+# database is one of the symbol categories (Sm, Sc, Sk, or So).
+
+category ~ /S/ {
+    charsets[charset_count++] = "symbol";
+}
+
+## The blank character set
+##########################
+
+# Blank characters are horizontal whitespace: either
+#  * a character with the space separator category (Zs) in the
+#    Unicode character database, or
+#  * U+0009 (9) Horizontal tabulation (\t control-I).
+
+codepoint == 9 || category ~ /Zs/ {
+    charsets[charset_count++] = "blank";
+}
+
+## The ascii character set
+##########################
+
+codepoint < 128 {
+    charsets[charset_count++] = "ascii";
+}
+
+## The designated character set
+###############################
+
+# Designated: every character except the surrogates (category Cs).
+
+category !~ /Cs/ {
+    charsets[charset_count++] = "designated";
+}
+
+## Other character sets
+#######################
+
+# Note that the "letter_plus_digit" and "printing" character sets, which
+# are unions of other character sets, are included in the patterns
+# matching their constituent parts (i.e., the "letter_plus_digit"
+# character set is included as part of the "letter" and "digit"
+# patterns).
+#
+# Also, the "empty" character set is computed by doing precisely nothing!
+
+# Keeping track of state
+########################
+
+# Fold this record into the pending range of each charset it matched.
+{
+    for (i = 0; i < charset_count; i++) {
+        cs = charsets[i];
+        if (state[cs, "start"] != -1 && state[cs, "end"] + 1 != codepoint) {
+            # Not adjacent to the pending range: record that range as
+            # finished and start over.
+            count = state[cs, "count"]++;
+            state[cs, "ranges", count, 0] = state[cs, "start"];
+            state[cs, "ranges", count, 1] = state[cs, "end"];
+            state[cs, "start"] = -1;
+        }
+        if (state[cs, "start"] == -1) {
+            # No pending range: this record opens one.
+            state[cs, "start"] = codepoint;
+        }
+        state[cs, "end"] = codepoint_end;
+    }
+}
+
+# Printing and error handling
+#############################
+
+END {
+    # An 'exit' statement elsewhere still runs the 'END' blocks
+    # before the program actually exits.  When 'die' has recorded a
+    # failure in 'exit_status', reissue the exit right away so that
+    # no output is generated.
+    if (exit_status != 0) {
+        exit exit_status;
+    }
+
+    # Header of the generated C file.
+    print "/* srfi-14.i.c -- standard SRFI-14 character set data */";
+    print "";
+    print "/* This file is #include'd by srfi-14.c. */";
+    print "";
+    print "/* This file was generated from";
+    print " https://unicode.org/Public/UNIDATA/UnicodeData.txt";
+    print " with the unidata_to_charset.awk script. */";
+    print "";
+
+    for (i = 0; i < all_charsets_count; i++) {
+        cs = all_charsets[i];
+
+        # The range still pending at end of input is a finished one.
+        if (state[cs, "start"] != -1) {
+            count = state[cs, "count"]++;
+            state[cs, "ranges", count, 0] = state[cs, "start"];
+            state[cs, "ranges", count, 1] = state[cs, "end"];
+        }
+        count = state[cs, "count"];
+
+        # One initializer per range; all but the last end in a comma.
+        print "static const scm_t_char_range cs_" cs "_ranges[] = {";
+        for (j = 0; j < count; j++) {
+            rstart = state[cs, "ranges", j, 0];
+            rend = state[cs, "ranges", j, 1];
+            if (j == count - 1) {
+                printf(" {0x%04x, 0x%04x}\n", rstart, rend);
+            } else {
+                printf(" {0x%04x, 0x%04x},\n", rstart, rend);
+            }
+        }
+        print "};";
+        print "";
+
+        # The matching element count, with a blank line between
+        # consecutive charsets.
+        printf("static const size_t cs_%s_len = %d;\n", cs, count);
+        if (i != all_charsets_count - 1) {
+            print "";
+        }
+    }
+}
+
+# And we're done.