summaryrefslogtreecommitdiff
path: root/build/utils
diff options
context:
space:
mode:
authorMichihiro NAKAJIMA <ggcueroad@gmail.com>2011-05-03 05:37:31 -0400
committerMichihiro NAKAJIMA <ggcueroad@gmail.com>2011-05-03 05:37:31 -0400
commit8a33ea4f02f0675897cb0b436fac3747dea37d93 (patch)
tree84412647b98dbaea4eed014148c21c3a72e28aeb /build/utils
parentfe6433d15986d1d7dde1632982b4654f8208e1bf (diff)
downloadlibarchive-8a33ea4f02f0675897cb0b436fac3747dea37d93.tar.gz
If the character set of filenames in an archive is UTF-8, we should automatically
normalize them to avoid the situation where two filenames in one directory are different byte sequences but look identical because one is NFD and the other is NFC. The second reason is that iconv cannot correctly convert NFD characters to other character sets, so we have to convert NFD to NFC before iconv handles them, unless iconv supports UTF-8-MAC. The third reason is filename matching: if filenames in archives are NFD and the platform is not Mac OS, users cannot specify the filename they want to extract even though they can see it in a listing. NFD can now be displayed on some platforms, but creating an NFD string requires a character-set conversion utility; in particular, entering an NFD string is hard on Windows. SVN-Revision: 3275
Diffstat (limited to 'build/utils')
-rwxr-xr-xbuild/utils/gen_archive_string_composition_h.sh418
1 files changed, 418 insertions, 0 deletions
diff --git a/build/utils/gen_archive_string_composition_h.sh b/build/utils/gen_archive_string_composition_h.sh
new file mode 100755
index 00000000..95dbe167
--- /dev/null
+++ b/build/utils/gen_archive_string_composition_h.sh
@@ -0,0 +1,418 @@
+#!/bin/sh
+#
+# This needs http://unicode.org/Public/UNIDATA/UnicodeData.txt
+#
+inputfile="$1" # Expect UnicodeData.txt
+outfile=archive_string_composition.h
+pickout=/tmp/mk_unicode_composition_tbl$$.awk
+#################################################################################
+#
+# Write the file header of "archive_string_composition.h"
+#
+# Despite its name, append_copyright redirects with ">", so it creates or
+# truncates ${outfile}; everything else is appended after it.  The header
+# consists of the license text, a "generated file" notice, the internal-use
+# guard, the opening include guard and the declaration of
+# struct unicode_composition_table.  The here-document is unquoted, which
+# is why the dollar signs of the FreeBSD tag are backslash-escaped.
+#
+#################################################################################
+append_copyright()
+{
+cat > ${outfile} <<CR_END
+/*-
+ * Copyright (c) 2011 libarchive Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \$FreeBSD\$
+ *
+ */
+
+/*
+ * ATTENTION!
+ * This file is generated by build/utils/gen_archive_string_composition_h.sh
+ * from http://unicode.org/Public/UNIDATA/UnicodeData.txt
+ *
+ * See also http://unicode.org/report/tr15/
+ */
+
+#ifndef __LIBARCHIVE_BUILD
+#error This header is only to be used internally to libarchive.
+#endif
+
+#ifndef ARCHIVE_STRING_COMPOSITION_H_INCLUDED
+#define ARCHIVE_STRING_COMPOSITION_H_INCLUDED
+
+struct unicode_composition_table {
+ uint32_t cp1;
+ uint32_t cp2;
+ uint32_t nfc;
+};
+
+CR_END
+}
+#################################################################################
+#
+# awk script
+#
+#################################################################################
+cat > ${pickout} <<AWK_END
+#
+# NOTE: this here-document is unquoted, so the shell processes it before
+# awk ever sees it.  Dollar signs and backslashes that are meant for awk
+# are therefore written with an extra backslash in front of them, and
+# comments in here must not contain dollar signs or backquotes.
+#
+BEGIN {
+ # Input fields are separated by semicolons.
+ FS = ";"
+ # Code points (hex strings) of the first and the last record that has a
+ # non-zero combining class; filled in by the collection rule.
+ min = "";
+ max = "";
+ # Pipeline that receives the "cp1 cp2 nfc" lines printed by the last
+ # rule, sorts them and formats each one as a C initializer row.
+ cmd="sort | awk -F ' ' '{printf \"\\\\t{ 0x%s , 0x%s , 0x%s },\\\\n\",\$1,\$2,\$3}'"
+ # Open the composition table; the END block prints the closing brace.
+ print "static const struct unicode_composition_table u_composition_table[] = {"
+}
+END {
+	# Closing the pipe waits for the sort pipeline to finish, so all rows
+	# of u_composition_table are written before its closing brace.
+	close(cmd)
+	print "};"
+	print ""
+	#
+	# Output Canonical Combining Class tables used for translating NFD to NFC.
+	# min and max are the hex code points of the first and the last record
+	# with a non-zero combining class; highnum is the block number (the code
+	# point without its last two hex digits) of the last such record.
+	#
+	printf "#define CANONICAL_CLASS_MIN\\t0x%s\\n", min
+	printf "#define CANONICAL_CLASS_MAX\\t0x%s\\n", max
+	print ""
+	printf "#define IS_DECOMPOSABLE_BLOCK(uc)\\t\\\\\\n"
+	printf "\\t(((uc)>>8) <= 0x%X && u_decomposable_blocks[(uc)>>8])\\n", highnum
+	printf "static const char u_decomposable_blocks[0x%X+1] = {\\n\\t", highnum
+	#
+	# Output blockmap, 32 entries per line.
+	for (i = 0; i <= highnum; i++) {
+		if (i != 0 && i % 32 == 0)
+			printf "\\n\\t"
+		# Additionally Hangul[11XX(17), AC00(172) - D7FF(215)] is decomposable.
+		if (blockmap[i] || i == 17 || (i >= 172 && i <= 215))
+			printf "1,"
+		else
+			printf "0,"
+	}
+	printf "\\n};\\n\\n"
+	#
+	# Output a macro to get a canonical combining class.
+	# The lookup goes ccc_index (per 256 code points) -> ccc_val_index
+	# (per 16 code points) -> ccc_val (per code point).
+	#
+	print "/* Get Canonical Combining Class(CCC). */"
+	printf "#define CCC(uc)\\t\\\\\\n"
+	printf "\\t(((uc) > 0x%s)?0:\\\\\\n", max
+	printf "\\tccc_val[ccc_val_index[ccc_index[(uc)>>8]][((uc)>>4)&0x0F]][(uc)&0x0F])\\n"
+	print ""
+	#
+	# Output a canonical combining class value table.
+	# Row 0 is all zero; every 16-code-point range that contains a
+	# non-zero class gets the next row number (midcnt).
+	#
+	midcnt = 0
+	printf "/* The table of the value of Canonical Combining Class */\\n"
+	print "static const unsigned char ccc_val[][16] = {"
+	print " /* idx=0: XXXX0 - XXXXF */"
+	print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
+	for (h = 0; h <= highnum; h++) {
+		if (!blockmap[h])
+			continue;
+		for (m = 0; m < 16; m++) {
+			if (!xx_blockmap[h, m])
+				continue;
+			midcnt++
+			printf " /* idx=%d: %03X%1X0 - %03X%1XF */\\n {", midcnt, h, m, h, m
+			for (l = 0; l < 15; l++) {
+				printf "%d, ", xxx_blockmap[h, m, l]
+			}
+			printf "%d },\\n", xxx_blockmap[h, m, 15]
+		}
+	}
+	printf "};\\n"
+	#
+	# Output the index table of the canonical combining class value table.
+	# midcnt is counted again in the same order as above, so each entry
+	# is the row number that was assigned in ccc_val.
+	#
+	cnt = 0
+	midcnt = 0
+	printf "\\n/* The index table to ccc_val[*][16] */\\n"
+	print "static const unsigned char ccc_val_index[][16] = {"
+	print " /* idx=0: XXX00 - XXXFF */"
+	print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
+	for (h = 0; h <= highnum; h++) {
+		if (!blockmap[h])
+			continue;
+		cnt++
+		printf " /* idx=%d: %03X00 - %03XFF */\\n {", cnt, h, h
+		for (m = 0; m < 16; m++) {
+			if (m != 0)
+				printf ","
+			if (xx_blockmap[h, m]) {
+				midcnt++
+				printf "%2d", midcnt
+			} else
+				printf " 0"
+		}
+		printf " },\\n"
+	}
+	printf "};\\n"
+	#
+	# Output the index table to the index table of the canonical combining
+	# class value table, 24 entries per line.
+	#
+	printf "\\n/* The index table to ccc_val_index[*][16] */\\n"
+	printf "static const unsigned char ccc_index[] = {\\n "
+	cnt = 0
+	for (h = 0; h <= highnum; h++) {
+		if (h != 0 && h % 24 == 0)
+			printf "\\n "
+		if (blockmap[h]) {
+			cnt++;
+			printf "%2d,", cnt
+		} else
+			printf " 0,"
+	}
+	print "};"
+	print ""
+	print "#endif /* ARCHIVE_STRING_COMPOSITION_H_INCLUDED */"
+}
+#
+# Convert a string of upper-case hexadecimal digits to a number.
+# Characters other than 0-9 and A-F are skipped and an empty string
+# yields 0.  The parameters after "hex" are not meant to be passed by
+# callers: they are the awk idiom for local variables and keep the
+# global variables of the same names untouched.
+#
+function hextoi(hex,    dec, i, x, n)
+{
+	dec = 0
+	n = length(hex)
+	for (i = 1; i <= n; i++) {
+		# index() gives the 1-based position of the digit, or 0 if
+		# the character is not a hex digit.
+		x = index("0123456789ABCDEF", substr(hex, i, 1))
+		if (x > 0)
+			dec = dec * 16 + (x - 1)
+	}
+	return dec
+}
+#
+# Collect Canonical Combining Class values.
+#
+# Field 1 is the code point in hex and field 4 its combining class.
+# For every record whose class is not "0" this remembers the first and
+# the last such code point (min, max) and marks the code point in three
+# maps of increasing resolution:
+#   blockmap[high]                1 if the 256-code-point block has one
+#   xx_blockmap[high, mid]        1 if the 16-code-point range has one
+#   xxx_blockmap[high, mid, low]  the class value itself
+# where low is the last hex digit, mid the one before it and high all
+# remaining leading digits.  highnum is left global on purpose: the END
+# block uses its final value as the highest block number.
+#
+# Every regex end anchor is backslash-escaped because this text lives in
+# an unquoted here-document.
+#
+\$4 ~ /^[0-9A-F]+\$/ && \$4 !~ /^0\$/ {
+	if (min == "") {
+		min = \$1
+	}
+	max = \$1
+	high = substr(\$1, 1, length(\$1) - 2)
+	highnum = hextoi(high)
+	mid = substr(\$1, length(\$1) - 1, 1)
+	midnum = hextoi(mid)
+	low = substr(\$1, length(\$1), 1)
+	lownum = hextoi(low)
+	blockmap[highnum] = 1
+	xx_blockmap[highnum, midnum] = 1
+	xxx_blockmap[highnum, midnum, lownum] = \$4
+}
+#
+# Following code points are not decomposed in MAC OS.
+# U+2000 - U+2FFF
+# U+F900 - U+FAFF
+# U+2F800 - U+2FAFF
+#
+#\$1 ~/^2[0-9A-F][0-9A-F][0-9A-F]\$/ {
+# next
+#}
+#\$1 ~/^F[9A][0-9A-F][0-9A-F]\$/ {
+# next
+#}
+#\$1 ~/^2F[89A][0-9A-F][0-9A-F]\$/ {
+# next
+#}
+#
+# Composition exclusions: records for the code points listed in
+# http://unicode.org/Public/UNIDATA/CompositionExclusions.txt
+# are skipped here, before the rule that emits composition pairs.
+##
+# 1. Script Specifics
+##
+\$1 ~ /^095[89ABCDEF]\$/ { next }
+\$1 ~ /^09D[CDF]\$/ { next }
+\$1 ~ /^0A3[36]\$/ { next }
+\$1 ~ /^0A5[9ABE]\$/ { next }
+\$1 ~ /^0B5[CD]\$/ { next }
+\$1 ~ /^0F4[3D]\$/ { next }
+\$1 ~ /^0F5[27C]\$/ { next }
+\$1 ~ /^0F69\$/ { next }
+\$1 ~ /^0F7[68]\$/ { next }
+\$1 ~ /^0F9[3D]\$/ { next }
+\$1 ~ /^0FA[27C]\$/ { next }
+\$1 ~ /^0FB9\$/ { next }
+\$1 ~ /^FB1[DF]\$/ { next }
+\$1 ~ /^FB2[ABCDEF]\$/ { next }
+\$1 ~ /^FB3[012345689ABCE]\$/ { next }
+\$1 ~ /^FB4[01346789ABCDE]\$/ { next }
+##
+# 2. Post Composition Version precomposed characters
+#    (skipped so that they never reach the pair output rule)
+##
+\$1 ~ /^2ADC\$/ { next }
+\$1 ~ /^1D15[EF]\$/ { next }
+\$1 ~ /^1D16[01234]\$/ { next }
+\$1 ~ /^1D1B[BCDEF]\$/ { next }
+\$1 ~ /^1D1C0\$/ { next }
+##
+# 3. Singleton Decompositions
+#    (skipped so that they never reach the pair output rule)
+##
+\$1 ~ /^034[01]\$/ { next }
+\$1 ~ /^037[4E]\$/ { next }
+\$1 ~ /^0387\$/ { next }
+\$1 ~ /^1F7[13579BD]\$/ { next }
+\$1 ~ /^1FB[BE]\$/ { next }
+\$1 ~ /^1FC[9B]\$/ { next }
+\$1 ~ /^1FD[3B]\$/ { next }
+\$1 ~ /^1FE[3BEF]\$/ { next }
+\$1 ~ /^1FF[9BD]\$/ { next }
+\$1 ~ /^200[01]\$/ { next }
+\$1 ~ /^212[6AB]\$/ { next }
+\$1 ~ /^232[9A]\$/ { next }
+\$1 ~ /^F9[0-9A-F][0-9A-F]\$/ { next }
+\$1 ~ /^FA0[0-9A-D]\$/ { next }
+\$1 ~ /^FA1[025-9A-E]\$/ { next }
+\$1 ~ /^FA2[0256A-D]\$/ { next }
+\$1 ~ /^FA[3-5][0-9A-F]\$/ { next }
+\$1 ~ /^FA6[0-9A-D]\$/ { next }
+\$1 ~ /^FA[7-9A-C][0-9A-F]\$/ { next }
+\$1 ~ /^FAD[0-9]\$/ { next }
+\$1 ~ /^2F[89][0-9A-F][0-9A-F]\$/ { next }
+\$1 ~ /^2FA0[0-9A-F]\$/ { next }
+\$1 ~ /^2FA1[0-9A-D]\$/ { next }
+##
+# 4. Non-Starter Decompositions
+#    (skipped so that they never reach the pair output rule)
+##
+\$1 ~ /^0344\$/ { next }
+\$1 ~ /^0F7[35]\$/ { next }
+\$1 ~ /^0F81\$/ { next }
+#
+# Output combinations for NFD ==> NFC.
+#
+# Only records whose field 6 consists of exactly two hex code points are
+# used.  Each one is sent to the sort pipeline as "cp1 cp2 composed".
+# When field 1 has four digits, all three values get a leading zero
+# (presumably so that every line has the same width and sorts correctly
+# against five-digit code points).
+#
+\$6 ~ /^[0-9A-F]+ [0-9A-F]+\$/ {
+	split(\$6, cp, " ")
+	pad = (length(\$1) == 4) ? "0" : ""
+	print pad cp[1], pad cp[2], pad \$1 | cmd
+}
+AWK_END
+#################################################################################
+#
+# Run awk a script.
+#
+#################################################################################
+append_copyright
+awk -f ${pickout} ${inputfile} >> ${outfile}
+#
+# Remove awk the script.
+rm ${pickout}