summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog16
-rw-r--r--MANIFEST3
-rw-r--r--Makefile.in1
-rw-r--r--src/preproc/preconv/.cvsignore3
-rw-r--r--src/preproc/preconv/Makefile.sub6
-rw-r--r--src/preproc/preconv/preconv.cpp1086
-rw-r--r--test-groff.in1
7 files changed, 1115 insertions, 1 deletions
diff --git a/ChangeLog b/ChangeLog
index 855d95d1..8b0d073c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2005-12-30 Werner LEMBERG <wl@gnu.org>
+
+ New preprocessor `preconv' to convert input encodings to something
+ groff can understand. Not yet integrated within groff. Proper
+ autoconf stuff is missing too.
+
+ Tomohiro Kubota has written a first draft of this program, and some
+ ideas have been reused (while almost no code has been taken
+ actually).
+
+ * src/preproc/preconv/preconv.cpp, src/preproc/preconv/Makefile.sub:
+ New files.
+
+ * MANIFEST, Makefile.in (CCPROGDIRS), test-groff.in
+ (GROFF_BIN_PATH): Add preconv.
+
2005-12-12 Werner LEMBERG <wl@gnu.org>
* aclocal.m4 (GROFF_MAKEINFO): Fix regexps to be POSIX conformant.
diff --git a/MANIFEST b/MANIFEST
index f8d32d0e..65edcd31 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -1,6 +1,6 @@
MANIFEST
-Last update: 26 May 2005
+Last update: 18 Dec 2005
This file is part of groff, the GNU roff type-setting system.
@@ -105,6 +105,7 @@ the groff source distribution.
snprintf An implementation of snprintf() and friends.
./src/preproc Preprocessors.
+ preconv Input encoding conversion.
eqn Mathematical formulae.
grn Gremlin pictures.
html The preprocessor part of grohtml.
diff --git a/Makefile.in b/Makefile.in
index 81e2e615..897008d3 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -507,6 +507,7 @@ LIBDIRS=\
CCPROGDIRS=\
src/roff/groff \
src/roff/troff \
+ src/preproc/preconv \
src/preproc/tbl \
src/preproc/pic \
src/preproc/eqn \
diff --git a/src/preproc/preconv/.cvsignore b/src/preproc/preconv/.cvsignore
new file mode 100644
index 00000000..cd554c53
--- /dev/null
+++ b/src/preproc/preconv/.cvsignore
@@ -0,0 +1,3 @@
+Makefile.dep
+preconv
+preconv.n
diff --git a/src/preproc/preconv/Makefile.sub b/src/preproc/preconv/Makefile.sub
new file mode 100644
index 00000000..64e03d59
--- /dev/null
+++ b/src/preproc/preconv/Makefile.sub
@@ -0,0 +1,6 @@
+PROG=preconv$(EXEEXT)
+# MAN1=preconv.n
+XLIBS=$(LIBGROFF)
+MLIB=$(LIBM)
+OBJS=preconv.$(OBJEXT)
+CCSRCS=$(srcdir)/preconv.cpp
diff --git a/src/preproc/preconv/preconv.cpp b/src/preproc/preconv/preconv.cpp
new file mode 100644
index 00000000..b3ef6135
--- /dev/null
+++ b/src/preproc/preconv/preconv.cpp
@@ -0,0 +1,1086 @@
+// -*- C++ -*-
+/* Copyright (C) 2005
+ Free Software Foundation, Inc.
+ Written by Werner Lemberg (wl@gnu.org)
+
+This file is part of groff.
+
+groff is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+groff is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License along
+with groff; see the file COPYING. If not, write to the Free Software
+Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
+
+#define I18N
+
+#include "lib.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <errno.h>
+#include "errarg.h"
+#include "error.h"
+#include "nonposix.h"
+#include "stringclass.h"
+
+#ifdef I18N
+# include <locale.h>
+# include <langinfo.h>
+# include <iconv.h>
+# ifdef WORDS_BIGENDIAN
+# define UNICODE "UTF-32BE"
+# else
+# define UNICODE "UTF-32LE"
+# endif
+#endif /* I18N */
+
+#define MAX_VAR_LEN 100
+
+extern "C" const char *Version_string;
+
+// Encoding used when neither `-e' nor a coding tag in the file names
+// one; set once at the start of main().
+const char *default_encoding;
+// Encoding given with the `-e' option; empty if the option was not used.
+char user_encoding[MAX_VAR_LEN];
+// Working copy of the encoding name for the file being processed (filled
+// in by do_file(); emacs2mime() may shorten it in place).
+char encoding_string[MAX_VAR_LEN];
+// Non-zero if `-d' was given: progress messages are written to stderr.
+int debug = 0;
+
+// One entry of a name translation table: `from' is the name that is
+// looked up, `to' the name it is replaced with.
+struct conversion {
+ const char *from;
+ const char *to;
+};
+
+// The official list of MIME tags can be found at
+//
+// http://www.iana.org/assignments/character-sets
+//
+// For encodings which don't have a MIME tag we use GNU iconv's encoding
+// names (which also work with Bruno Haible's libiconv package). They
+// are marked with `*'.
+//
+// Encodings marked with `--' are special to Emacs or other applications and
+// shouldn't be used for data exchange.
+//
+// `Not covered' means that the encoding can be handled neither by GNU iconv
+// nor by libiconv, or just one of them has support for it.
+//
+// A special case is VIQR encoding: Despite having a MIME tag it is
+// missing in both libiconv 1.9.1 and iconv (coming with GNU libc 2.3.3).
+//
+// Finally, we add all aliases of GNU iconv for `ascii' (handled as
+// latin-1), `latin1', and `utf8' to catch those encoding names before iconv
+// is called.
+
+static const conversion
+emacs_to_mime[] = {
+ {"alternativnyj", ""}, // ?
+ {"arabic-iso-8bit", "ISO-8859-6"},
+ {"ascii", "ISO-8859-1"},
+ {"big5", "Big5"},
+ {"binary", ""}, // --
+ {"chinese-big5", "Big5"},
+ {"chinese-euc", ""}, // XEmacs?
+ {"chinese-hz", "HZ-GB-2312"},
+ {"chinese-iso-7bit", "ISO-2022-CN"},
+ {"chinese-iso-8bit", "GB2312"},
+ {"chinese-iso-8bit-with-esc", ""}, // --
+ {"cn-big5", "Big5"},
+ {"cn-gb", "GB2312"},
+ {"cn-gb-2312", "GB2312"},
+ {"compound-text", ""}, // --
+ {"compound-text-with-extension", ""}, // --
+ {"cp1125", "cp1125"}, // *
+ {"cp1250", "windows-1250"},
+ {"cp1251", "windows-1251"},
+ {"cp1252", "windows-1252"},
+ {"cp1253", "windows-1253"},
+ {"cp1254", "windows-1254"},
+ {"cp1255", "windows-1255"},
+ {"cp1256", "windows-1256"},
+ {"cp1257", "windows-1257"},
+ {"cp1258", "windows-1258"},
+ {"cp437", "IBM437"},
+ {"cp720", ""}, // not covered
+ {"cp737", "cp737"}, // *
+ {"cp775", "IBM775"},
+ {"cp850", "IBM850"},
+ {"cp851", "IBM851"},
+ {"cp852", "IBM852"},
+ {"cp855", "IBM855"},
+ {"cp857", "IBM857"},
+ {"cp860", "IBM860"},
+ {"cp861", "IBM861"},
+ {"cp862", "IBM862"},
+ {"cp863", "IBM863"},
+ {"cp864", "IBM864"},
+ {"cp865", "IBM865"},
+ {"cp866", "IBM866"},
+ {"cp866u", "cp1125"}, // *
+ {"cp869", "IBM869"},
+ {"cp874", "cp874"}, // *
+ {"cp878", "KOI8-R"},
+ {"cp932", "SHIFT_JIS"},
+ {"cp936", "GB2312"},
+ {"cp949", "EUC-KR"},
+ {"cp950", "Big5"},
+ {"csascii", "ISO-8859-1"}, // alias
+ {"csisolatin1", "ISO-8859-1"}, // alias
+ {"ctext", ""}, // --
+ {"ctext-no-compositions", ""}, // --
+ {"ctext-with-extensions", ""}, // --
+ {"cyrillic-alternativnyj", ""}, // ?
+ {"cyrillic-iso-8bit", "ISO-8859-5"},
+ {"cyrillic-iso-8bit-with-esc", ""}, // --
+ {"cyrillic-koi8", "KOI8-R"},
+ {"cyrillic-koi8-t", "KOI8-T"}, // *
+ {"devanagari", ""}, // not covered
+ {"dos", ""}, // --
+ {"emacs-mule", ""}, // --
+ {"euc-china", "GB2312"},
+ {"euc-cn", "GB2312"},
+ {"euc-japan", "EUC-JP"},
+ {"euc-japan-1990", "EUC-JP"},
+ {"euc-jisx0213", ""}, // XEmacs?
+ {"euc-jisx0213-with-esc", ""}, // XEmacs?
+ {"euc-jp", "EUC-JP"},
+ {"euc-korea", "EUC-KR"},
+ {"euc-kr", "EUC-KR"},
+ {"euc-taiwan", "EUC-TW"}, // *
+ {"euc-tw", "EUC-TW"}, // *
+ {"gb2312", "GB2312"},
+ {"georgian-ps", "GEORGIAN-PS"}, // *
+ {"greek-iso-8bit", "ISO-8859-7"},
+ {"greek-iso-8bit-with-esc", ""}, // --
+ {"hebrew-iso-8bit", "ISO-8859-8"},
+ {"hebrew-iso-8bit-with-esc", ""}, // --
+ {"hz", "HZ-GB-2312"},
+ {"hz-gb-2312", "HZ-GB-2312"},
+ {"in-is13194", ""}, // not covered
+ {"in-is13194-with-esc", ""}, // --
+ {"iso-10646/utf8", "UTF-8"}, // alias
+ {"iso-10646/utf-8", "UTF-8"}, // alias
+ {"iso-2022-7", ""}, // XEmacs?
+ {"iso-2022-7bit", ""}, // --
+ {"iso-2022-7bit-lock", ""}, // --
+ {"iso-2022-7bit-lock-ss2", ""}, // --
+ {"iso-2022-7bit-ss2", ""}, // --
+ {"iso-2022-8", ""}, // XEmacs?
+ {"iso-2022-8bit", ""}, // XEmacs?
+ {"iso-2022-8bit-lock", ""}, // XEmacs?
+ {"iso-2022-8bit-lock-ss2", ""}, // XEmacs?
+ {"iso-2022-8bit-ss2", ""}, // --
+ {"iso-2022-cjk", ""}, // --
+ {"iso-2022-cn", "ISO-2022-CN"},
+ {"iso-2022-cn-ext", "ISO-2022-CN-EXT"},
+ {"iso-2022-int-1", ""}, // --
+ {"iso-2022-jp", "ISO-2022-JP"},
+ {"iso-2022-jp-1978-irv", "ISO-2022-JP"},
+ {"iso-2022-jp-2", "ISO-2022-JP-2"},
+ {"iso-2022-jp-3", ""}, // XEmacs?
+ {"iso-2022-jp-3-compatible", ""}, // XEmacs?
+ {"iso-2022-jp-3-strict", ""}, // XEmacs?
+ {"iso-2022-kr", "ISO-2022-KR"},
+ {"iso-2022-lock", ""}, // XEmacs?
+ {"iso-8859-1", "ISO-8859-1"},
+ {"iso-8859-10", "ISO-8859-10"},
+ {"iso-8859-11", "ISO-8859-11"}, // *
+ {"iso-8859-13", "ISO-8859-13"},
+ {"iso-8859-14", "ISO-8859-14"},
+ {"iso-8859-15", "ISO-8859-15"},
+ {"iso-8859-16", "ISO-8859-16"},
+ {"iso-8859-2", "ISO-8859-2"},
+ {"iso-8859-3", "ISO-8859-3"},
+ {"iso-8859-4", "ISO-8859-4"},
+ {"iso-8859-5", "ISO-8859-5"},
+ {"iso-8859-6", "ISO-8859-6"},
+ {"iso-8859-7", "ISO-8859-7"},
+ {"iso-8859-8", "ISO-8859-8"},
+ {"iso-8859-8-e", "ISO-8859-8"},
+ {"iso-8859-8-i", "ISO-8859-8"},
+ {"iso-8859-9", "ISO-8859-9"},
+ {"iso-latin-1", "ISO-8859-1"},
+ {"iso-latin-10", "ISO-8859-16"},
+ {"iso-latin-1-with-esc", ""}, // --
+ {"iso-latin-2", "ISO-8859-2"},
+ {"iso-latin-2-with-esc", ""}, // --
+ {"iso-latin-3", "ISO-8859-3"},
+ {"iso-latin-3-with-esc", ""}, // --
+ {"iso-latin-4", "ISO-8859-4"},
+ {"iso-latin-4-with-esc", ""}, // --
+ {"iso-latin-5", "ISO-8859-9"},
+ {"iso-latin-5-with-esc", ""}, // --
+ {"iso-latin-6", "ISO-8859-10"},
+ {"iso-latin-7", "ISO-8859-13"},
+ {"iso-latin-8", "ISO-8859-14"},
+ {"iso-latin-9", "ISO-8859-15"},
+ {"iso-safe", ""}, // --
+ {"japanese-iso-7bit-1978-irv", "ISO-2022-JP"},
+ {"japanese-iso-8bit", "EUC-JP"},
+ {"japanese-iso-8bit-with-esc", ""}, // --
+ {"japanese-euc", ""}, // XEmacs?
+ {"japanese-shift-jis", "Shift_JIS"},
+ {"japanese-shift-jisx0213", ""}, // XEmacs?
+ {"junet", "ISO-2022-JP"},
+ {"koi8", "KOI8-R"},
+ {"koi8-r", "KOI8-R"},
+ {"koi8-t", "KOI8-T"}, // *
+ {"koi8-u", "KOI8-U"},
+ {"korean-euc", ""}, // XEmacs?
+ {"korean-iso-7bit-lock", "ISO-2022-KR"},
+ {"korean-iso-8bit", "EUC-KR"},
+ {"korean-iso-8bit-with-esc", ""}, // --
+ {"lao", ""}, // not covered
+ {"lao-with-esc", ""}, // --
+ {"latin1", "ISO-8859-1"}, // alias
+ {"latin-0", "ISO-8859-15"},
+ {"latin-1", "ISO-8859-1"},
+ {"latin-10", "ISO-8859-16"},
+ {"latin-2", "ISO-8859-2"},
+ {"latin-3", "ISO-8859-3"},
+ {"latin-4", "ISO-8859-4"},
+ {"latin-5", "ISO-8859-9"},
+ {"latin-6", "ISO-8859-10"},
+ {"latin-7", "ISO-8859-13"},
+ {"latin-8", "ISO-8859-14"},
+ {"latin-9", "ISO-8859-15"},
+ {"mac", ""}, // --
+ {"mac-roman", "MACINTOSH"},
+ {"mik", ""}, // not covered
+ {"mule-utf-16", "UTF-16"},
+ {"mule-utf-16be", "UTF-16BE"},
+ {"mule-utf-16-be", "UTF-16BE"},
+ {"mule-utf-16be-with-signature", "UTF-16"}, // not UTF-16BE
+ {"mule-utf-16le", "UTF-16LE"},
+ {"mule-utf-16-le", "UTF-16LE"},
+ {"mule-utf-16le-with-signature", "UTF-16"}, // not UTF-16LE
+ {"mule-utf-8", "UTF-8"},
+ {"next", "NEXTSTEP"}, // *
+ {"no-conversion", ""}, // --
+ {"old-jis", "ISO-2022-JP"},
+ {"pt154", "PT154"},
+ {"raw-text", ""}, // --
+ {"ruscii", "cp1125"}, // *
+ {"shift_jis", "Shift_JIS"},
+ {"shift_jisx0213", ""}, // XEmacs?
+ {"sjis", "Shift_JIS"},
+ {"tcvn", "TCVN"}, // *
+ {"tcvn-5712", "TCVN"}, // *
+ {"thai-tis620", "TIS-620"},
+ {"thai-tis620-with-esc", ""}, // --
+ {"th-tis620", "TIS-620"},
+ {"tibetan", ""}, // not covered
+ {"tibetan-iso-8bit", ""}, // not covered
+ {"tibetan-iso-8bit-with-esc", ""}, // --
+ {"tis-620", "TIS-620"},
+ {"tis620", "TIS-620"},
+ {"undecided", ""}, // --
+ {"unix", ""}, // --
+ {"us-ascii", "US-ASCII"},
+ {"utf8", "UTF-8"}, // alias
+ {"utf-16", "UTF-16"},
+ {"utf-16-be", "UTF-16BE"},
+ {"utf-16-be-with-signature", "UTF-16"}, // not UTF-16BE
+ {"utf-16-le", "UTF-16LE"},
+ {"utf-16-le-with-signature", "UTF-16"}, // not UTF-16LE
+ {"utf-7", "UTF-7"},
+ {"utf-7-safe", ""}, // XEmacs?
+ {"utf-8", "UTF-8"},
+ {"utf-8-ws", "UTF-8"}, // XEmacs?
+ {"vietnamese-tcvn", "TCVN"}, // *
+ {"vietnamese-viqr", "VIQR"}, // not covered
+ {"vietnamese-viscii", "VISCII"},
+ {"vietnamese-vscii", "VISCII"},
+ {"viqr", "VIQR"}, // not covered
+ {"viscii", "VISCII"},
+ {"vscii", ""}, // not covered
+ {"windows-1250", "windows-1250"},
+ {"windows-1251", "windows-1251"},
+ {"windows-1252", "windows-1252"},
+ {"windows-1253", "windows-1253"},
+ {"windows-1254", "windows-1254"},
+ {"windows-1255", "windows-1255"},
+ {"windows-1256", "windows-1256"},
+ {"windows-1257", "windows-1257"},
+ {"windows-1258", "windows-1258"},
+ {"x-ctext", ""}, // --
+ {"x-ctext-with-extensions", ""}, // --
+ {NULL, NULL},
+};
+
+// ---------------------------------------------------------
+// Convert encoding name from emacs to mime.
+//
+// A trailing end-of-line marker (`-dos', `-mac', or `-unix',
+// compared without regard to case) is first cut off
+// `emacs_enc' in place.  What remains is looked up, again
+// ignoring case, in `emacs_to_mime'.
+//
+// Return the table's replacement string on a match (this is
+// the empty string for entries without a usable equivalent);
+// otherwise return `emacs_enc' itself.
+// ---------------------------------------------------------
+char *
+emacs2mime(char *emacs_enc)
+{
+  static const char *const eol_suffixes[] = { "-dos", "-mac", "-unix" };
+  const size_t n_suffixes = sizeof (eol_suffixes) / sizeof (eol_suffixes[0]);
+  size_t emacs_enc_len = strlen(emacs_enc);
+  for (size_t i = 0; i < n_suffixes; i++) {
+    size_t suffix_len = strlen(eol_suffixes[i]);
+    // Names shorter than the suffix (`utf8', `hz', `unix', ...) cannot
+    // carry it; without this test we would read in front of the buffer.
+    if (emacs_enc_len < suffix_len)
+      continue;
+    char *tail = emacs_enc + emacs_enc_len - suffix_len;
+    if (!strcasecmp(tail, eol_suffixes[i])) {
+      *tail = 0;
+      break;                    // at most one marker is removed
+    }
+  }
+  for (const conversion *table = emacs_to_mime; table->from; table++)
+    if (!strcasecmp(emacs_enc, table->from))
+      return (char *)table->to;
+  return emacs_enc;
+}
+
+// ---------------------------------------------------------
+// Write code point `u' to stdout: values below 0x80 as a
+// single character, everything else as the groff escape
+// `\[uXXXX]' with at least four upper-case hex digits.
+// ---------------------------------------------------------
+inline void
+unicode_entity(int u)
+{
+  if (u >= 0x80) {
+    printf("\\[u%04X]", u);
+    return;
+  }
+  putchar(u);
+}
+
+// ---------------------------------------------------------
+// Conversion functions. All functions take `data', which
+// normally holds the first two lines, and a file pointer.
+// ---------------------------------------------------------
+
+// Conversion from ISO-8859-1 (aka Latin-1) to Unicode.
+//
+// Every byte is handed to unicode_entity() unchanged: first the bytes
+// already buffered in `data', then the rest of `fp' up to end of file.
+void
+conversion_latin1(FILE *fp, const string &data)
+{
+  const unsigned char *cur = (const unsigned char *)data.contents();
+  for (int remaining = data.length(); remaining > 0; remaining--)
+    unicode_entity(*cur++);
+  for (int ch = getc(fp); ch != EOF; ch = getc(fp))
+    unicode_entity(ch);
+}
+
+// A future version of groff shall support UTF-8 natively.
+// In this case, the UTF-8 stuff here in this file will be
+// moved to the troff program.
+
+// Incremental UTF-8 decoder.  Bytes are fed in one at a time with add();
+// each finished or rejected sequence is written out through
+// unicode_entity().
+struct utf8 {
+ // Stream given to the constructor; not read by any method shown here.
+ FILE *fp;
+ // Bytes of the sequence collected so far.
+ unsigned char s[6];
+ // Index into `s' at which the next input byte is stored; FIRST means
+ // that no sequence is in progress.
+ enum {
+ FIRST = 0,
+ SECOND,
+ THIRD,
+ FOURTH,
+ FIFTH,
+ SIXTH
+ } byte;
+ // Total length of the current sequence as announced by its first byte.
+ int expected_bytes;
+ // Non-zero as long as invalid() has not yet printed its debug message.
+ int invalid_warning;
+ // Non-zero as long as incomplete() has not yet printed its debug message.
+ int incomplete_warning;
+ utf8(FILE *);
+ // Reports a sequence that is still in progress as incomplete.
+ ~utf8();
+ // Feed one input byte to the decoder.
+ void add(unsigned char);
+ // Emit U+FFFD for an invalid sequence and reset the decoder.
+ void invalid();
+ // Emit U+FFFD for a truncated sequence and reset the decoder.
+ void incomplete();
+};
+
+// Remember the stream, start with no sequence in progress, and arm both
+// one-time warnings.
+utf8::utf8(FILE *f)
+  : fp(f),
+    byte(FIRST),
+    expected_bytes(1),
+    invalid_warning(1),
+    incomplete_warning(1)
+{
+}
+
+// A decoder destroyed in the middle of a multi-byte sequence reports the
+// bytes collected so far as an incomplete sequence.
+utf8::~utf8()
+{
+  const bool sequence_pending = byte != FIRST;
+  if (sequence_pending)
+    incomplete();
+}
+
+// Feed one byte `c' to the decoder.
+//
+// The byte is stored at position `byte' of `s'.  Once a sequence has its
+// announced number of bytes it is either written out with
+// unicode_entity() or rejected with invalid(); both ways the decoder
+// returns to state FIRST.
+inline void
+utf8::add(unsigned char c)
+{
+ s[byte] = c;
+ if (byte == FIRST) {
+ // First byte of a sequence: it determines the sequence length.
+ if (c < 0x80)
+ unicode_entity(c);
+ else if (c < 0xC0)
+ // 0x80..0xBF is a continuation byte without a preceding lead byte.
+ invalid();
+ else if (c < 0xE0) {
+ expected_bytes = 2;
+ byte = SECOND;
+ }
+ else if (c < 0xF0) {
+ expected_bytes = 3;
+ byte = SECOND;
+ }
+ else if (c < 0xF8) {
+ expected_bytes = 4;
+ byte = SECOND;
+ }
+ else if (c < 0xFC) {
+ expected_bytes = 5;
+ byte = SECOND;
+ }
+ else if (c < 0xFE) {
+ expected_bytes = 6;
+ byte = SECOND;
+ }
+ else
+ // 0xFE and 0xFF never start a sequence.
+ invalid();
+ return;
+ }
+ // Inside a sequence only continuation bytes (0x80..0xBF) are allowed.
+ // Anything else ends the sequence prematurely; incomplete() resets the
+ // state to FIRST, and `c' is then processed again as a first byte.
+ if (c < 0x80 || c > 0xBF) {
+ incomplete();
+ add(c);
+ return;
+ }
+ switch (byte) {
+ case FIRST:
+ // can't happen
+ break;
+ case SECOND:
+ if (expected_bytes == 2) {
+ // Lead bytes 0xC0 and 0xC1 would encode values below 0x80.
+ if (s[0] < 0xC2)
+ invalid();
+ else
+ unicode_entity(((s[0] & 0x1F) << 6)
+ | (s[1] ^ 0x80));
+ byte = FIRST;
+ }
+ else
+ byte = THIRD;
+ break;
+ case THIRD:
+ if (expected_bytes == 3) {
+ // Lead byte 0xE0 followed by a byte below 0xA0 would encode a value
+ // below 0x800.
+ // NOTE(review): surrogate values (lead byte 0xED, second byte
+ // 0xA0..0xBF) pass this test -- confirm whether that is intended.
+ if (!(s[0] >= 0xE1 || s[1] >= 0xA0))
+ invalid();
+ else
+ unicode_entity(((s[0] & 0x1F) << 12)
+ | ((s[1] ^ 0x80) << 6)
+ | (s[2] ^ 0x80));
+ byte = FIRST;
+ }
+ else
+ byte = FOURTH;
+ break;
+ case FOURTH:
+ // We reject everything greater than 0x10FFFF.
+ if (expected_bytes == 4) {
+ // The first half of the test rejects lead byte 0xF0 followed by a
+ // byte below 0x90 (a value below 0x10000); the second half
+ // implements the upper limit.
+ if (!((s[0] >= 0xF1 || s[1] >= 0x90)
+ && (s[0] < 0xF4 || (s[0] == 0xF4 && s[1] < 0x90))))
+ invalid();
+ else
+ unicode_entity(((s[0] & 0x07) << 18)
+ | ((s[1] ^ 0x80) << 12)
+ | ((s[2] ^ 0x80) << 6)
+ | (s[3] ^ 0x80));
+ byte = FIRST;
+ }
+ else
+ byte = FIFTH;
+ break;
+ case FIFTH:
+ // Sequences of five and six bytes are always rejected, but only after
+ // all of their bytes have been consumed, so that a single U+FFFD is
+ // emitted for the whole sequence.
+ if (expected_bytes == 5) {
+ invalid();
+ byte = FIRST;
+ }
+ else
+ byte = SIXTH;
+ break;
+ case SIXTH:
+ invalid();
+ byte = FIRST;
+ break;
+ }
+}
+
+// Replace an invalid sequence with U+FFFD and wait for a new first
+// byte.  With debugging enabled, a note is written to stderr the first
+// time this happens.
+void
+utf8::invalid()
+{
+  const bool warn_now = debug && invalid_warning;
+  if (warn_now) {
+    invalid_warning = 0;
+    fputs(" invalid byte(s) found in input stream --\n"
+          " each such sequence replaced with 0xFFFD\n", stderr);
+  }
+  byte = FIRST;
+  unicode_entity(0xFFFD);
+}
+
+// Replace a truncated sequence with U+FFFD and wait for a new first
+// byte.  With debugging enabled, a note is written to stderr the first
+// time this happens.
+void
+utf8::incomplete()
+{
+  const bool warn_now = debug && incomplete_warning;
+  if (warn_now) {
+    incomplete_warning = 0;
+    fputs(" incomplete sequence(s) found in input stream --\n"
+          " each such sequence replaced with 0xFFFD\n", stderr);
+  }
+  byte = FIRST;
+  unicode_entity(0xFFFD);
+}
+
+// Conversion from UTF-8 to Unicode.
+//
+// All input bytes -- those already buffered in `data' first, then the
+// rest of `fp' -- are run through one `utf8' decoder.  A sequence still
+// open at end of input is reported when the decoder is destroyed on
+// return.
+void
+conversion_utf8(FILE *fp, const string &data)
+{
+  utf8 decoder(fp);
+  const unsigned char *cur = (const unsigned char *)data.contents();
+  for (int remaining = data.length(); remaining > 0; remaining--)
+    decoder.add(*cur++);
+  for (int ch = getc(fp); ch != EOF; ch = getc(fp))
+    decoder.add(ch);
+}
+
+// Conversion from cp1047 (EBCDIC) to Unicode.
+//
+// Each input byte -- first those already buffered in `data', then the
+// rest of `fp' -- is mapped through the table below; the resulting value
+// (always below 0x100) is written with unicode_entity().
+void
+conversion_cp1047(FILE *fp, const string &data)
+{
+  // Indexed by the cp1047 byte value.  The table is read-only.
+  static const unsigned char cp1047[] = {
+    0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F,    // 0x00
+    0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+    0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87,    // 0x10
+    0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
+    0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B,    // 0x20
+    0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
+    0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04,    // 0x30
+    0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A,
+    0x20, 0xA0, 0xE2, 0xE4, 0xE0, 0xE1, 0xE3, 0xE5,    // 0x40
+    0xE7, 0xF1, 0xA2, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,
+    0x26, 0xE9, 0xEA, 0xEB, 0xE8, 0xED, 0xEE, 0xEF,    // 0x50
+    0xEC, 0xDF, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,
+    0x2D, 0x2F, 0xC2, 0xC4, 0xC0, 0xC1, 0xC3, 0xC5,    // 0x60
+    0xC7, 0xD1, 0xA6, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,
+    0xF8, 0xC9, 0xCA, 0xCB, 0xC8, 0xCD, 0xCE, 0xCF,    // 0x70
+    0xCC, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,
+    0xD8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,    // 0x80
+    0x68, 0x69, 0xAB, 0xBB, 0xF0, 0xFD, 0xFE, 0xB1,
+    0xB0, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70,    // 0x90
+    0x71, 0x72, 0xAA, 0xBA, 0xE6, 0xB8, 0xC6, 0xA4,
+    0xB5, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,    // 0xA0
+    0x79, 0x7A, 0xA1, 0xBF, 0xD0, 0x5B, 0xDE, 0xAE,
+    0xAC, 0xA3, 0xA5, 0xB7, 0xA9, 0xA7, 0xB6, 0xBC,    // 0xB0
+    0xBD, 0xBE, 0xDD, 0xA8, 0xAF, 0x5D, 0xB4, 0xD7,
+    0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,    // 0xC0
+    0x48, 0x49, 0xAD, 0xF4, 0xF6, 0xF2, 0xF3, 0xF5,
+    0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50,    // 0xD0
+    0x51, 0x52, 0xB9, 0xFB, 0xFC, 0xF9, 0xFA, 0xFF,
+    0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,    // 0xE0
+    0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5,
+    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,    // 0xF0
+    0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F,
+  };
+  int len = data.length();
+  const unsigned char *ptr = (const unsigned char *)data.contents();
+  for (int i = 0; i < len; i++)
+    unicode_entity(cp1047[ptr[i]]);
+  int c;
+  // getc() yields 0..255 (or EOF), so `c' is always a valid index.
+  while ((c = getc(fp)) != EOF)
+    unicode_entity(cp1047[c]);
+}
+
+// Locale-sensible conversion.
+#ifdef I18N
+
+// Write all code points collected in `outbuf' (that is, everything in
+// front of `outptr') with unicode_entity() and mark the buffer, which
+// is `outbuf_size' bytes long, as empty again.
+static void
+iconv_flush(int *outbuf, size_t outbuf_size,
+            char *&outptr, size_t &outbytes_left)
+{
+  for (int *ptr = outbuf; (char *)ptr < outptr; ptr++)
+    unicode_entity(*ptr);
+  outptr = (char *)outbuf;
+  outbytes_left = outbuf_size;
+}
+
+// Run `inbytes_left' bytes starting at `inptr' through iconv(),
+// flushing the output buffer whenever it is full.
+//
+// If the input ends with an incomplete multi-byte sequence, its bytes
+// are moved to the start of `inbuf' and their number is returned;
+// otherwise the return value is 0.
+static size_t
+iconv_convert(iconv_t handle, char *inptr, size_t inbytes_left,
+              char *inbuf, int *outbuf, size_t outbuf_size,
+              char *&outptr, size_t &outbytes_left)
+{
+  while (inbytes_left > 0) {
+    size_t status = iconv(handle, &inptr, &inbytes_left,
+                          &outptr, &outbytes_left);
+    if (status != (size_t)-1)
+      continue;
+    if (errno == EILSEQ) {
+      // Invalid byte sequence: skip one byte and try again.  XXX
+      inptr++;
+      inbytes_left--;
+    }
+    else if (errno == E2BIG)
+      // Output buffer is full.
+      iconv_flush(outbuf, outbuf_size, outptr, outbytes_left);
+    else if (errno == EINVAL) {
+      // Input ends with a partial sequence; keep it for the next round.
+      // `inptr' may point into `inbuf' itself, thus memmove().
+      memmove(inbuf, inptr, inbytes_left);
+      return inbytes_left;
+    }
+    else
+      // Any other error would repeat forever if we simply tried again.
+      fatal("iconv failed: %1", strerror(errno));
+  }
+  return 0;
+}
+
+// Convert the bytes buffered in `data' followed by the rest of `fp'
+// from encoding `enc' to Unicode and write the result with
+// unicode_entity().
+//
+// An encoding unknown to iconv is reported with error() and nothing is
+// converted; any other failure to set up or run the conversion is
+// fatal.  A partial multi-byte sequence left at the very end of the
+// input is dropped.
+void
+conversion_iconv(FILE *fp, const string &data, char *enc)
+{
+  iconv_t handle = iconv_open(UNICODE, enc);
+  if (handle == (iconv_t)-1) {
+    if (errno == EINVAL) {
+      error("encoding system `%1' not supported by iconv()", enc);
+      return;
+    }
+    fatal("iconv_open failed");
+  }
+  char inbuf[BUFSIZ];
+  int outbuf[BUFSIZ];
+  const size_t outbuf_size = sizeof outbuf;
+  char *outptr = (char *)outbuf;
+  size_t outbytes_left = outbuf_size;
+  // Handle `data'.
+  size_t pending = iconv_convert(handle, (char *)data.contents(),
+                                 data.length(), inbuf, outbuf, outbuf_size,
+                                 outptr, outbytes_left);
+  // Handle `fp'; new bytes are read behind the `pending' bytes of a
+  // sequence that was split by the previous chunk.
+  size_t read_bytes;
+  while ((read_bytes = fread(inbuf + pending, 1, BUFSIZ - pending, fp)) > 0)
+    pending = iconv_convert(handle, inbuf, pending + read_bytes,
+                            inbuf, outbuf, outbuf_size,
+                            outptr, outbytes_left);
+  // XXX use ferror?
+  iconv_flush(outbuf, outbuf_size, outptr, outbytes_left);
+  iconv_close(handle);
+}
+#endif /* I18N */
+
+// ---------------------------------------------------------
+// Handle Byte Order Mark.
+//
+// The encoding is not known yet when the first bytes are
+// read, so a BOM at the start of the stream has to be
+// recognized by hand.  As documented in the Unicode book it
+// is very unlikely that a normal text file (regardless of
+// the encoding) starts with the bytes of a BOM.
+//
+// Up to four bytes are read from `fp'.  If they start with
+// one of the known marks, the mark is appended to `BOM' and
+// the bytes following it to `data'; otherwise all bytes
+// read are appended to `data'.
+// ---------------------------------------------------------
+void
+get_BOM(FILE *fp, string &BOM, string &data)
+{
+  // The BOM is U+FEFF, which gives these byte sequences:
+  //
+  //   UTF-8:   EF BB BF
+  //   UTF-16:  FE FF  or  FF FE
+  //   UTF-32:  00 00 FE FF  or  FF FE 00 00
+  //
+  // The table is searched in order, longer marks first.  The UTF-32
+  // little-endian mark FF FE 00 00 is not listed; such input matches
+  // the two-byte entry FF FE instead.
+  static struct {
+    int len;
+    const char *str;
+  } BOM_table[] = {
+    {4, "\x00\x00\xFE\xFF"},
+    {3, "\xEF\xBB\xBF"},
+    {2, "\xFE\xFF"},
+    {2, "\xFF\xFE"},
+  };
+  const int table_size = sizeof (BOM_table) / sizeof (BOM_table[0]);
+  // Read the start of the stream.
+  char head[4];
+  int head_len = 0;
+  while (head_len < 4) {
+    int c = getc(fp);
+    if (c == EOF)
+      break;
+    head[head_len++] = char(c);
+  }
+  // Length of the first matching mark, 0 if there is none.
+  int mark_len = 0;
+  for (int k = 0; k < table_size && !mark_len; k++)
+    if (BOM_table[k].len <= head_len
+        && !memcmp(head, BOM_table[k].str, BOM_table[k].len))
+      mark_len = BOM_table[k].len;
+  // Distribute the bytes read.
+  for (int k = 0; k < head_len; k++) {
+    if (k < mark_len)
+      BOM += head[k];
+    else
+      data += head[k];
+  }
+}
+
+// ---------------------------------------------------------
+// If the next byte of `fp' is LF, append it to `data' (it
+// completes a CRLF pair).  Any other byte is pushed back;
+// at end of file nothing happens.
+// ---------------------------------------------------------
+static void
+take_lf_after_cr(FILE *fp, string &data)
+{
+  int c = getc(fp);
+  if (c == '\n')
+    data += char(c);
+  else if (c != EOF)
+    ungetc(c, fp);
+}
+
+// ---------------------------------------------------------
+// Get first two lines from input stream.
+//
+// `data' holds the bytes read so far; bytes are appended
+// from `fp' until `data' contains two line ends (CR, LF, or
+// CRLF) or end of file is reached.
+//
+// Return string (allocated with `new') without zero bytes
+// or NULL in case no coding tag can occur in the data
+// (which is stored unmodified in `data').
+// ---------------------------------------------------------
+char *
+get_tag_lines(FILE *fp, string &data)
+{
+  int newline_count = 0;
+  int c, prev = -1;
+  // Handle CR, LF, and CRLF as line separators.
+  for (int i = 0; i < data.length(); i++) {
+    c = data[i];
+    if (c == '\n' || c == '\r')
+      newline_count++;
+    if (c == '\n' && prev == '\r')
+      newline_count--;
+    prev = c;
+  }
+  if (newline_count > 1)
+    return NULL;
+  // `data' may end right between the CR and the LF of a CRLF pair; fetch
+  // the LF now so that it isn't taken for a line end of its own.
+  if (prev == '\r')
+    take_lf_after_cr(fp, data);
+  int emit_warning = 1;
+  for (int lines = newline_count; lines < 2; lines++) {
+    while ((c = getc(fp)) != EOF) {
+      if (c == '\0' && debug && emit_warning) {
+        fprintf(stderr,
+                " null byte(s) found in input stream --\n"
+                " search for encoding tag might return false result\n");
+        emit_warning = 0;
+      }
+      data += char(c);
+      if (c == '\n' || c == '\r')
+        break;
+    }
+    if (c == EOF)
+      break;
+    // Handle CR, LF, and CRLF as line separators.  Only a real LF is
+    // appended here; at end of file nothing must be added to `data'.
+    if (c == '\r')
+      take_lf_after_cr(fp, data);
+  }
+  return data.extract();
+}
+
+// ---------------------------------------------------------
+// Check whether C string starts with a comment.
+//
+// Accepted are a `.' followed by optional blanks and then
+// `\"' or `\#', and `\#' at the very start of the string.
+//
+// Return 1 if true, 0 otherwise (also for a null pointer).
+// ---------------------------------------------------------
+int
+is_comment_line(char *s)
+{
+  if (!s)
+    return 0;
+  switch (*s) {
+  case '.':
+    // Skip the dot and any blanks behind it.
+    do
+      s++;
+    while (*s == ' ' || *s == '\t');
+    if (*s != '\\')
+      return 0;
+    s++;
+    return *s == '"' || *s == '#';
+  case '\\':
+    return s[1] == '#';
+  default:
+    return 0;
+  }
+}
+
+// ---------------------------------------------------------
+// Get one variable/value pair from a local variables list
+// in a C string, which looks like this:
+//
+//   <variable1>: <value1>; <variable2>: <value2>; ...
+//
+// Leading and trailing blanks are ignored.  There might be
+// more than one blank after `:' and `;'.
+//
+// `*variable' and `*value' are set to static buffers which
+// are overwritten by the next call; names longer than
+// MAX_VAR_LEN - 1 characters are cut off.  The value is
+// empty if the variable is not followed by `:'.
+//
+// Return position of next variable/value pair or NULL if
+// at end of data.
+// ---------------------------------------------------------
+char *
+get_variable_value_pair(char *d1, char **variable, char **value)
+{
+  static char var[MAX_VAR_LEN], val[MAX_VAR_LEN];
+  *variable = var;
+  *value = val;
+  val[0] = 0;
+  char *cur = d1 + strspn(d1, " \t");
+  // Get variable.
+  int n = 0;
+  for (; n < MAX_VAR_LEN - 1 && *cur && !strchr(";: \t", *cur); cur++)
+    var[n++] = *cur;
+  var[n] = 0;
+  // Skip everything until `:', `;', or end of data.
+  cur += strcspn(cur, ":;");
+  switch (*cur) {
+  case 0:
+    return NULL;
+  case ';':
+    return cur + 1;
+  }
+  // We are at `:'; the value follows after optional blanks.
+  cur++;
+  cur += strspn(cur, " \t");
+  // Get value.
+  n = 0;
+  for (; n < MAX_VAR_LEN - 1 && *cur && !strchr("; \t", *cur); cur++)
+    val[n++] = *cur;
+  val[n] = 0;
+  // Skip everything until `;' or end of data.
+  cur += strcspn(cur, ";");
+  return *cur ? cur + 1 : NULL;
+}
+
+// ---------------------------------------------------------
+// Check encoding tag in the read buffer.
+//
+// We search for the following line:
+//
+//   .\"...-*-<local variables list>-*-
+//
+// (`...' might be anything).  There can be blanks after
+// the leading `.'; additionally, you might use `\#' starting
+// a line instead of `.\"'.
+//
+// The local variables list must occur within the first
+// comment block at the very beginning of the data stream.
+// Only the lines delivered by get_tag_lines() are examined;
+// a line ends at CR, LF, CRLF, or the end of the data.
+//
+// Within the <local variables list>, we search for
+//
+//   coding: <value>
+//
+// which specifies the coding system used for the data
+// stream.
+//
+// Return <value> if found, NULL otherwise.  The value is
+// held in a static buffer of get_variable_value_pair() and
+// must not be freed.
+//
+// Note that null bytes in the data are skipped before applying
+// the algorithm.  This should work even with files encoded as
+// UTF-16 or UTF-32 (or its siblings) in most cases.
+//
+// XXX Add support for tag at the end of buffer.
+// ---------------------------------------------------------
+char *
+check_encoding_tag(FILE *fp, string &data)
+{
+  char *inbuf = get_tag_lines(fp, data);
+  char *p = inbuf;
+  // is_comment_line() rejects a null pointer, so a failed
+  // get_tag_lines() needs no special handling.
+  while (is_comment_line(p)) {
+    // Cut the buffer at the end of the current line so that the
+    // searches below can't run into the next line.
+    char *lineend = p + strcspn(p, "\r\n");
+    const char terminator = *lineend;
+    *lineend = 0;
+    char *d1 = strstr(p, "-*-");
+    char *d2 = d1 ? strstr(d1 + 3, "-*-") : NULL;
+    if (d2) {
+      *d2 = 0;                  // the list ends at the closing `-*-'
+      d1 += 3;
+      while (d1) {
+        char *variable, *value;
+        d1 = get_variable_value_pair(d1, &variable, &value);
+        if (!strcasecmp(variable, "coding")) {
+          a_delete inbuf;
+          return value;
+        }
+      }
+    }
+    if (!terminator)
+      break;                    // that was the last line
+    p = lineend + 1;
+    if (terminator == '\r' && *p == '\n')
+      p++;                      // CRLF counts as one line end
+  }
+  a_delete inbuf;
+  return NULL;
+}
+
+// ---------------------------------------------------------
+// Handle an input file.  If filename is `-' handle stdin.
+//
+// The encoding is taken from the `-e' option if given, else
+// from a coding tag in the file, else from the default
+// encoding.  The converted data is written to stdout.
+//
+// Return 1 on success, 0 otherwise (file can't be opened,
+// or the encoding has no usable equivalent).
+// ---------------------------------------------------------
+int
+do_file(const char *filename)
+{
+  FILE *fp;
+  string BOM, data;
+  if (strcmp(filename, "-")) {
+    if (debug)
+      fprintf(stderr, "file `%s':\n", filename);
+    fp = fopen(filename, FOPEN_RB);
+    if (!fp) {
+      error("can't open `%1': %2", filename, strerror(errno));
+      return 0;
+    }
+  }
+  else {
+    if (debug)
+      fprintf(stderr, "standard input:\n");
+    SET_BINARY(fileno(stdin));
+    fp = stdin;
+  }
+  get_BOM(fp, BOM, data);
+  // Determine the encoding.
+  char *encoding;
+  if (user_encoding[0])
+    encoding = user_encoding;
+  else {
+    // `check_encoding_tag' returns a pointer to a static array (or NULL).
+    char *file_encoding = check_encoding_tag(fp, data);
+    if (!file_encoding) {
+      if (debug)
+        fprintf(stderr, " no file encoding\n");
+      file_encoding = (char *)default_encoding;
+    }
+    else
+      if (debug)
+        fprintf(stderr, " file encoding: `%s'\n", file_encoding);
+    encoding = file_encoding;
+  }
+  // Work on a private copy since emacs2mime() modifies its argument.
+  strncpy(encoding_string, encoding, MAX_VAR_LEN - 1);
+  encoding_string[MAX_VAR_LEN - 1] = 0;
+  encoding = encoding_string;
+  // Translate from MIME & Emacs encoding names to locale encoding names.
+  encoding = emacs2mime(encoding);
+  int success = 1;
+  if (!encoding[0]) {
+    // An empty name is what the translation table gives for encodings
+    // without a usable equivalent; don't hand it to a converter.
+    error("encoding `%1' not supported", encoding_string);
+    success = 0;
+  }
+  else {
+    if (debug)
+      fprintf(stderr, " encoding used: `%s'\n", encoding);
+    data = BOM + data;
+    // Call converter (converters write to stdout).
+    if (!strcasecmp(encoding, "ISO-8859-1"))
+      conversion_latin1(fp, data);
+    else if (!strcasecmp(encoding, "UTF-8"))
+      conversion_utf8(fp, data);
+    else if (!strcasecmp(encoding, "cp1047"))
+      conversion_cp1047(fp, data);
+    else {
+#ifdef I18N
+      conversion_iconv(fp, data, encoding);
+#else
+      error("encoding system `%1' not supported", encoding);
+      success = 0;
+#endif /* I18N */
+    }
+  }
+  if (fp != stdin)
+    fclose(fp);
+  return success;
+}
+
+// ---------------------------------------------------------
+// Print usage.
+//
+// The message, which names the program and the default
+// encoding, is written to `stream'.
+// ---------------------------------------------------------
+void
+usage(FILE *stream)
+{
+  fprintf(stream, "usage: %s [ option ] [ files ]\n"
+                  "\n"
+                  "-d show debugging messages\n"
+                  "-e encoding specify input encoding\n"
+                  "-h print this message\n"
+                  "-v print version number\n"
+                  "\n"
+                  "The default encoding is `%s'.\n",
+                  program_name, default_encoding);
+}
+
+// ---------------------------------------------------------
+// Main routine.
+//
+// Parse the options, then convert every file named on the
+// command line (or standard input if there is none).
+//
+// Return 0 if all files were handled successfully, 1
+// otherwise.
+// ---------------------------------------------------------
+int
+main(int argc, char **argv)
+{
+  // Determine the default encoding.  This must be done before
+  // getopt() is called since the usage message shows the default
+  // encoding.
+#ifdef I18N
+  setlocale(LC_ALL, "");
+  char *locale = setlocale(LC_CTYPE, NULL);
+  if (!locale || !strcmp(locale, "C") || !strcmp(locale, "POSIX"))
+    default_encoding = "latin1";
+  else {
+    default_encoding = nl_langinfo(CODESET);
+    if (!default_encoding)
+      default_encoding = "latin1";
+  }
+#else
+  default_encoding = "latin1";
+#endif /* I18N */
+
+  program_name = argv[0];
+  int opt;
+  static const struct option long_options[] = {
+    { "help", no_argument, 0, CHAR_MAX + 1 },
+    { "version", no_argument, 0, 'v' },
+    { NULL, 0, 0, 0 }
+  };
+
+  // Parse the command line options.
+  while ((opt = getopt_long(argc, argv, "de:hv", long_options, NULL)) != EOF)
+    switch (opt) {
+      case 'v':
+        printf("GNU preconv (groff) version %s %s iconv support\n",
+               Version_string,
+#ifdef I18N
+               "with"
+#else
+               "without"
+#endif /* I18N */
+              );
+        exit(0);
+        break;
+      case 'd':
+        debug = 1;
+        break;
+      case 'e':
+        if (optarg) {
+          strncpy(user_encoding, optarg, MAX_VAR_LEN - 1);
+          user_encoding[MAX_VAR_LEN - 1] = 0;
+        }
+        else
+          user_encoding[0] = 0;
+        break;
+      // `-h' is accepted by the option string above and announced in
+      // the usage message, so it must be handled here as well.
+      case 'h':
+      case CHAR_MAX + 1: // --help
+        usage(stdout);
+        exit(0);
+        break;
+      case '?':
+        usage(stderr);
+        exit(1);
+        break;
+      default:
+        assert(0);
+    }
+  int nbad = 0;
+  if (debug)
+    fprintf(stderr, "default encoding: `%s'\n", default_encoding);
+  if (optind >= argc)
+    nbad += !do_file("-");
+  else
+    for (int i = optind; i < argc; i++)
+      nbad += !do_file(argv[i]);
+  if (ferror(stdout) || fflush(stdout) < 0)
+    fatal("output error");
+  return nbad != 0;
+}
+
+/* end of preconv.cpp */
diff --git a/test-groff.in b/test-groff.in
index 013e1666..6a9e10fe 100644
--- a/test-groff.in
+++ b/test-groff.in
@@ -8,6 +8,7 @@ srcdir=@abs_top_srcdir@
GROFF_BIN_PATH=\
$builddir/roff/groff$SEP\
$builddir/roff/troff$SEP\
+$builddir/preproc/preconv$SEP\
$builddir/preproc/pic$SEP\
$builddir/preproc/eqn$SEP\
$builddir/preproc/tbl$SEP\