diff options
author | wl <wl> | 2006-01-05 15:45:00 +0000 |
---|---|---|
committer | wl <wl> | 2006-01-05 15:45:00 +0000 |
commit | 595c39b716c8b4b31587bde4fdaaed565836bafa (patch) | |
tree | 53b88f6138e63589d08a39ae2ebdf025aaa8bd19 /src/preproc/preconv | |
parent | 6d141e4c4e8d16b60e6a50d15cf80630443d00d8 (diff) | |
download | groff-595c39b716c8b4b31587bde4fdaaed565836bafa.tar.gz |
* src/preproc/preconv/preconv.cpp (do_file): Don't pass BOM to
`conversion_utf8'.
* src/preproc/preconv/preconv.man: New file. Not complete yet.
* src/preproc/preconv/Makefile.sub (MAN1): New variable.
Diffstat (limited to 'src/preproc/preconv')
-rw-r--r-- | src/preproc/preconv/Makefile.sub | 2 | ||||
-rw-r--r-- | src/preproc/preconv/preconv.cpp | 7 | ||||
-rw-r--r-- | src/preproc/preconv/preconv.man | 161 |
3 files changed, 165 insertions, 5 deletions
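diff --git a/src/preproc/preconv/Makefile.sub b/src/preproc/preconv/Makefile.sub index c5f2c9d5..e53050f1 100644 --- a/src/preproc/preconv/Makefile.sub +++ b/src/preproc/preconv/Makefile.sub @@ -1,5 +1,5 @@ PROG=preconv$(EXEEXT) -# MAN1=preconv.n +MAN1=preconv.n XLIBS=$(LIBGROFF) MLIB=$(LIBM) EXTRA_LDFLAGS=$(LIBICONV) diff --git a/src/preproc/preconv/preconv.cpp b/src/preproc/preconv/preconv.cpp index 0897c30d..e3a460e1 100644 --- a/src/preproc/preconv/preconv.cpp +++ b/src/preproc/preconv/preconv.cpp @@ -1051,20 +1051,19 @@ do_file(const char *filename) } if (debug_flag) fprintf(stderr, " encoding used: `%s'\n", encoding); - data = BOM + data; if (!raw_flag) printf(".lf 1 %s\n", filename); int success = 1; // Call converter (converters write to stdout). if (!strcasecmp(encoding, "ISO-8859-1")) - conversion_latin1(fp, data); + conversion_latin1(fp, BOM + data); else if (!strcasecmp(encoding, "UTF-8")) conversion_utf8(fp, data); else if (!strcasecmp(encoding, "cp1047")) - conversion_cp1047(fp, data); + conversion_cp1047(fp, BOM + data); else { #if HAVE_ICONV - conversion_iconv(fp, data, encoding); + conversion_iconv(fp, BOM + data, encoding); #else error("encoding system `%1' not supported", encoding); success = 0; diff --git a/src/preproc/preconv/preconv.man b/src/preproc/preconv/preconv.man new file mode 100644 index 00000000..23c14c7d --- /dev/null +++ b/src/preproc/preconv/preconv.man @@ -0,0 +1,161 @@ +.ig +Copyright (C) 2006 Free Software Foundation, Inc. + +Permission is granted to make and distribute verbatim copies of +this manual provided the copyright notice and this permission notice +are preserved on all copies. + +Permission is granted to copy and distribute modified versions of this +manual under the conditions for verbatim copying, provided that the +entire resulting derived work is distributed under the terms of a +permission notice identical to this one. +.. +. +.TH PRECONV @MAN1EXT@ "@MDATE@" "Groff Version @VERSION@" +. +. +.SH NAME +preconv \- convert encoding of input files to something GNU troff understands +. +. +.SH SYNOPSIS +.B preconv +[ +.B \-dhrv +] +[ +.BI \-e encoding +] +[ +.IR files \|.\|.\|.\| +] +. +.PP +It is possible to have whitespace between the +.B \-e +command line option and its parameter. +. +. +.SH DESCRIPTION +.B preconv +reads +.I files +and converts their encoding(s) to a form GNU +.BR troff (@MAN1EXT@) +can process, sending the data to standard output. +Currently, this means ASCII characters and `\e[uXXXX]' entities, where +`XXXX' is a hexadecimal number with four to six digits, representing a +Unicode input code. +Normally, +.B preconv +should be invoked with the +.B \-k +and +.B \-K +options of +.BR groff . +. +. +.SH OPTIONS +.TP +.B \-d +Emit debugging messages to standard error (mainly the encoding used). +. +.TP +.BI \-e encoding +Specify input encoding explicitly, overriding all other methods. +This corresponds to +.BR groff 's +.BI \-K encoding +option. +Without this switch, +.B preconv +uses the algorithm described below to select the input encoding. +. +.TP +.B \-h +Print help message. +. +.TP +.B \-r +Do not add .lf requests. +. +.TP +.B \-v +Print version number. +. +. +.SH USAGE +.B preconv +tries to find the input encoding with the following algorithm. +. +.IP 1. +If the input encoding has been explicitly specified with option +.BR \-e , +use it. +. +.IP 2. +Otherwise, check whether the input starts with a +.I Byte Order Mark +(BOM, see below). +If found, use it. +. +.IP 3. +Finally, check whether there is a known +.I coding tag +(see below) in either the first or second input line. 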
+ +Permission is granted to copy and distribute translations of this +manual into another language, under the above conditions for modified +versions, except that this permission notice may be included in +translations approved by the Free Software Foundation instead of in +the original English. +.. +. +.TH PRECONV @MAN1EXT@ "@MDATE@" "Groff Version @VERSION@" +. +. +.SH NAME +preconv \- convert encoding of input files to something GNU troff understands +. +. +.SH SYNOPSIS +.B preconv +[ +.B \-dhrv +] +[ +.BI \-e encoding +] +[ +.IR files \|.\|.\|.\| +] +. +.PP +It is possible to have whitespace between the +.B \-e +command line option and its parameter. +. +. +.SH DESCRIPTION +.B preconv +reads +.I files +and converts its encoding(s) to a form GNU +.BR troff (@MAN1EXT) +can process, sending the data to standard output. +Currently, this means ASCII characters and `\e[uXXXX]' entities, where +`XXXX' is a hexadecimal number with four to six digits, representing a +Unicode input code. +Normally, +.B preconv +should be invoked with the +.B \-k +and +.B \-K +options of +.BR groff . +. +. +.SH OPTIONS +.TP +.B \-d +Emit debugging messages to standard error (mainly the used encoding). +. +.TP +.BI \-e encoding +Specify input encoding explicitly, overriding all other methods. +This corresponds to +.BR groff 's +.BI \-K encoding +option. +Without this switch, +.B preconv +uses the algorithm described below to select the input encoding. +. +.TP +.B \-h +Print help message. +. +.TP +.B \-r +Do not add .lf requests. +. +.TP +.B \-v +Print version number. +. +. +.SH USAGE +.B preconv +tries to find the input encoding with the following algorithm. +. +.IP 1. +If the input encoding has been explicitly specified with option +.BR \-e , +use it. +. +.IP 2. +Otherwise, check whether the input starts with a +.I Byte Order Mark +(BOM, see below). +If found, use it. +. +.IP 3. +Finally, check whether there is a known +.I coding tag +(see below) in either the first or second input line. 
+If found, use it. +. +.IP 4. +If everything fails, use a default encoding as given by the current locale, +or `latin1' if the locale is set to `C', `POSIX', or empty. +. +.PP +Note that the +.B groff +program supports a +.B GROFF_ENCODING +environment variable which is eventually expanded to option +.BR \-k . +. +.SS "Byte Order Mark" +The Unicode Standard defines character U+FEFF as the Byte Order Mark +(BOM). +On the other hand, value U+FFFE is guaranteed not to be a Unicode character at +all. +This allows detection of the byte order within the data stream (either +big-endian or little-endian), and the MIME encodings `UTF-16' and `UTF-32' +mandate that the data stream starts with U+FEFF. +Similarly, the data stream encoded as `UTF-8' might start with a BOM (to +ease the conversion from and to UTF-16 and UTF-32). +In all cases, the byte order mark is +.I not +part of the data but part of the encoding protocol; in other words, +.BR preconv 's +output doesn't contain it. +. +.PP +Note that U+FEFF not at the start of the input data actually is emitted; +it then has the meaning of a `zero width no-break space' character \[en] +something not needed normally in +.BR groff . +. +.SS "Coding Tags" +To be written. +. +.SS "Iconv Issues" +To be written. +. +. +.SH "SEE ALSO" +.BR groff (@MAN1EXT@) +. +.\" Local Variables: +.\" mode: nroff +.\" End: |