summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorwl <wl>2010-12-13 15:30:19 +0000
committerwl <wl>2010-12-13 15:30:19 +0000
commitd5d8909e11e0c613f7a1dfba3a20a405ae7b4da4 (patch)
tree88c9d26db241f5a613239e1372eb22ff239070cc
parent67525a8a24c8a0a7d6413de6814c8901f9401a39 (diff)
downloadgroff-d5d8909e11e0c613f7a1dfba3a20a405ae7b4da4.tar.gz
Implement support for character classes.
This patch uses standard C++ headers, contrary to the rest of groff. Ideally, everything in groff should be updated to do the same. * src/include/font.h (glyph_to_unicode): New function. * src/libs/libgroff/font.cpp (glyph_to_unicode): Implement it. (font::contains, font::get_code): Use it. * src/roff/troff/charinfo.h: Include <vector> and <utility>. (charinfo): New members `ranges' and `nested_classes'. New member functions `get_unicode_code' and `get_flags'. New member functions `add_to_class', `is_class', and `contains'. (charinfo::overlaps_horizontally, charinfo::overlaps_vertically, charinfo::can_break_before, charinfo::can_break_after, charinfo::ends_sentence, charinfo::transparent, charinfo::ignore_hcodes): Use `get_flags', which handles character classes also. * src/roff/troff/input.cpp (char_class_dictionary): New global variable. (define_class): New function. (init_input_requests): Register `class'. (charinfo::get_unicode_code, charinfo::get_flags, charinfo::contains): Implement it. * NEWS, doc/groff.texinfo (Character Classes), man/groff_diff.man, man/groff.man: Document it.
-rw-r--r--ChangeLog34
-rw-r--r--NEWS4
-rw-r--r--doc/groff.texinfo87
-rw-r--r--man/groff.man11
-rw-r--r--man/groff_diff.man65
-rw-r--r--src/include/font.h5
-rw-r--r--src/libs/libgroff/font.cpp123
-rw-r--r--src/roff/troff/charinfo.h55
-rw-r--r--src/roff/troff/input.cpp171
9 files changed, 446 insertions, 109 deletions
diff --git a/ChangeLog b/ChangeLog
index d59fd991..e47e1a0b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,37 @@
+2010-12-06 Colin Watson <cjwatson@debian.org>
+ Daiki Ueno <ueno@unixuser.org>
+
+ Implement support for character classes.
+
+ This patch uses standard C++ headers, contrary to the rest of groff.
+ Ideally, everything in groff should be updated to do the same.
+
+ * src/include/font.h (glyph_to_unicode): New function.
+
+ * src/libs/libgroff/font.cpp (glyph_to_unicode): Implement it.
+ (font::contains, font::get_code): Use it.
+
+ * src/roff/troff/charinfo.h: Include <vector> and <utility>.
+ (charinfo): New members `ranges' and `nested_classes'.
+ New member functions `get_unicode_code' and `get_flags'.
+ New member functions `add_to_class', `is_class', and `contains'.
+ (charinfo::overlaps_horizontally, charinfo::overlaps_vertically,
+ charinfo::can_break_before, charinfo::can_break_after,
+ charinfo::ends_sentence, charinfo::transparent,
+ charinfo::ignore_hcodes): Use `get_flags', which handles
+ character classes also.
+
+ * src/roff/troff/input.cpp (char_class_dictionary): New global
+ variable.
+ (define_class): New function.
+ (init_input_requests): Register `class'.
+
+ (charinfo::get_unicode_code, charinfo::get_flags,
+ charinfo::contains): Implement it.
+
+ * NEWS, doc/groff.texinfo (Character Classes), man/groff_diff.man,
+ man/groff.man: Document it.
+
2010-11-11 Anton Shepelev <anton.txt@gmail.com>
[grohtml]: Improve texinfo documentation.
diff --git a/NEWS b/NEWS
index c5028c4b..00a41b47 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,10 @@ o There is a new warning category `file', enabled by default. The `mso'
request emits warnings in this category when the requested macro file does
not exist.
+o The new `class' request assigns a short name to a set of characters
+ which can be referred to in the `cflags' request. This is especially
+ useful to control line-breaking and hyphenation rules in CJK languages.
+
Tbl
---
diff --git a/doc/groff.texinfo b/doc/groff.texinfo
index 8d2e5c68..42cf6166 100644
--- a/doc/groff.texinfo
+++ b/doc/groff.texinfo
@@ -6052,7 +6052,7 @@ aaa bbb ccc ddd eee fff ggg hhh\h'0'\R':k \n[.k]'
@endExample
If you process this with the PostScript device (@code{-Tps}), there
-will be a line break eventually after @code{ggg} in both input lines.
+will be a line break eventually after @code{ggg} in both input lines.
However, after processing the space after @code{ggg}, the partially
collected line is not overfull yet, so @code{troff} continues to
collect input until it sees the space (or in this case, the newline)
@@ -8726,6 +8726,7 @@ special symbols (Greek, mathematics).
* Font Families::
* Font Positions::
* Using Symbols::
+* Character Classes::
* Special Fonts::
* Artificial Fonts::
* Ligatures and Kerning::
@@ -9122,7 +9123,7 @@ this is font 1 again
@c ---------------------------------------------------------------------
-@node Using Symbols, Special Fonts, Font Positions, Fonts and Symbols
+@node Using Symbols, Character Classes, Font Positions, Fonts and Symbols
@subsection Using Symbols
@cindex using symbols
@cindex symbols, using
@@ -9458,7 +9459,9 @@ width, depth, and height, nothing else. All manipulations with the
modified with the @code{cflags} request. The first argument is the sum
of the desired flags and the remaining arguments are the characters or
symbols to have those properties. It is possible to omit the spaces
-between the characters or symbols.
+between the characters or symbols. Instead of single characters or
+symbols you can also use character classes (see @ref{Character Classes}
+for more details).
@table @code
@item 1
@@ -9639,7 +9642,83 @@ The request @code{rfschar} removes glyph definitions defined with
@c ---------------------------------------------------------------------
-@node Special Fonts, Artificial Fonts, Using Symbols, Fonts and Symbols
+@node Character Classes, Special Fonts, Using Symbols, Fonts and Symbols
+@subsection Character Classes
+@cindex character classes
+@cindex classes, character
+
+Classes are particularly useful for East Asian languages such as
+Chinese, Japanese, and Korean, where the number of needed characters is
+much larger than in European languages, and where large sets of
+characters share the same properties.
+
+@Defreq {class, n c1 c2 @dots{}}
+@cindex character class (@code{class})
+@cindex defining character class (@code{class})
+@cindex class of characters (@code{class})
+In @code{groff}, a @dfn{character class} (or simply ``class'') is a set
+of characters grouped by a user-chosen aspect. The @code{class} request
+defines such classes so that other requests can refer to all characters
+belonging to this set with a single class name. Currently, only the
+@code{cflags} request can handle character classes.
+
+A @code{class} request takes a class name followed by a list of
+entities. In its simplest form, the entities are characters or symbols:
+
+@Example
+.class [prepunct] , : ; > @}
+@endExample
+
+Since class and glyph names share the same namespace, it is recommended
+to start and end the class name with @code{[} and @code{]},
+respectively, to avoid collisions with normal @code{groff} symbols (and
+symbols defined by the user). In particular, the presence of @code{]}
+in the symbol name intentionally prevents the usage of @code{\[...]},
+thus you must use the @code{\C} escape to access a class with such a
+name.
+
+@cindex GGL (groff glyph list)
+@cindex groff glyph list (GGL)
+You can also use a special character range notation, consisting of a
+start character or symbol, followed by @samp{-}, and an end character or
+symbol. Internally, @code{gtroff} converts these two symbol names to
+Unicode values (according to the groff glyph list) which then give the
+start and end value of the range. If that fails, the class definition
+is skipped.
+
+Finally, classes can be nested, too.
+
+Here is a more complex example:
+
+@Example
+.class [prepunctx] \C'[prepunct]' \[u2013]-\[u2016]
+@endExample
+
+The class @samp{prepunctx} now contains the contents of the class
+@code{prepunct} as defined above (the set @samp{, : ; > @}}), and
+characters in the range between @code{U+2013} and @code{U+2016}.
+
+If you want to add @samp{-} to a class, it must be the first character
+value in the argument list, otherwise it gets misinterpreted as a range.
+
+Note that it is not possible to use class names within range
+definitions.
+
+Typical use of the @code{class} request is to control line-breaking and
+hyphenation rules as defined by the @code{cflags} request. For example,
+to inhibit line breaks before the characters belonging to the
+@code{prepunctx} class, you can write:
+
+@Example
+.cflags 2 \C'[prepunctx]'
+@endExample
+
+See the @code{cflags} request in @ref{Using Symbols}, for more details.
+@endDefreq
+
+@c ---------------------------------------------------------------------
+
+@node Special Fonts, Artificial Fonts, Character Classes, Fonts and Symbols
@subsection Special Fonts
@cindex special fonts
@cindex fonts, special
diff --git a/man/groff.man b/man/groff.man
index c89dcfdd..8ec260c0 100644
--- a/man/groff.man
+++ b/man/groff.man
@@ -5,7 +5,7 @@ groff.man
This file is part of groff, the GNU roff type-setting system.
Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
- 2009
+ 2009, 2010
Free Software Foundation, Inc.
written by Bernd Warken <bwarken@mayn.de>
maintained by Werner Lemberg <wl@gnu.org>
@@ -1312,6 +1312,15 @@ Chop the last character off macro, string, or diversion
.IR object .
.
.TPx
+.REQ .class "name c1 c2 .\|.\|.\&"
+Assign a set of characters, character ranges, or classes
+.IR c1 ,
+.IR c2 ,
+.I .\|.\|.\&
+to
+.IR name .
+.
+.TPx
.REQ .close "stream"
Close the
.IR stream .
diff --git a/man/groff_diff.man b/man/groff_diff.man
index 115c6d26..e0d355ea 100644
--- a/man/groff_diff.man
+++ b/man/groff_diff.man
@@ -138,7 +138,7 @@ Interpolate string
taking
.IR arg1 ,
.IR arg2 ,
-.I .\|.\|.\&
+.IR .\|.\|. ,
as arguments.
.
.TP
@@ -965,10 +965,11 @@ This is the same as
.BR \[rs]p .
.
.TP
-.BI .cflags\ n\ c1\ c2\|.\|.\|.\&
+.BI .cflags\ "n c1 c2 .\|.\|."
Characters
.IR c1 ,
-.IR c2 ,\|.\|.\|.\&
+.IR c2 ,
+.IR .\|.\|. ,
have properties determined by
.IR n ,
which is ORed from the following:
@@ -1093,6 +1094,21 @@ This is useful for removing the newline from the end of diversions
that are to be interpolated as strings.
.
.TP
+.BI .class\ "name c1 c2 .\|.\|."
+Assign
+.I name
+to a set of characters
+.IR c1 ,
+.IR c2 ,
+.IR .\|.\|. ,
+so that they can be referred to from other requests easily (currently
+.B .cflags
+only).
+Character ranges (indicated by an intermediate `-') and nested classes
+are possible also.
+This is useful to assign properties to a large set of characters.
+.
+.TP
.BI .close\ stream
Close the stream named
.IR stream ;
@@ -1394,12 +1410,13 @@ request but before the list of fonts declared with
.BR .special .
.
.TP
-.BI .fspecial\ f\ s1\ s2\|.\|.\|.\&
+.BI .fspecial\ "f s1 s2 .\|.\|."
When the current font is\~\c
.IR f ,
fonts
.IR s1 ,
-.IR s2 ,\|.\|.\|.\&
+.IR s2 ,
+.IR .\|.\|. ,
are special, that is, they are searched for glyphs not in
the current font.
.
@@ -1470,7 +1487,7 @@ is missing,
switch to the previous glyph color.
.
.TP
-.BI .hcode \ c1\ code1\ c2\ code2\|.\|.\|.\&
+.BI .hcode\ "c1 code1 c2 code2 .\|.\|."
Set the hyphenation code of character
.I c1
to
@@ -1478,7 +1495,8 @@ to
and that of
.I c2
to
-.IR code2 .
+.IR code2 ,
+and so on.
A hyphenation code must be a single input character (not a special
character) other than a digit or a space.
.
@@ -1625,7 +1643,7 @@ except that the hyphenation patterns from
are appended to the patterns already loaded in the current language.
.
.TP
-.BI .hpfcode\ a\ b\ c\ d\ .\|.\|.
+.BI .hpfcode\ "a b c d .\|.\|."
After reading a hyphenation patterns file with the
.B hpf
or
@@ -1930,10 +1948,11 @@ and
with a positive value which are applied after the line is output.
.
.TP
-.BI .rchar\ c1\ c2\|.\|.\|.\&
+.BI .rchar\ "c1 c2 .\|.\|."
Remove the definitions of glyphs
.IR c1 ,
-.IR c2 ,\|.\|.\|.
+.IR c2 ,
+.I .\|.\|.
This undoes the effect of a
.B char
request.
@@ -1948,10 +1967,11 @@ from the macro one level higher.
No effect otherwise.
.
.TP
-.BI .rfschar\ c1\ c2\|.\|.\|.\&
+.BI .rfschar\ "c1 c2 .\|.\|."
Remove the font-specific definitions of glyphs
.IR c1 ,
-.IR c2 ,\|.\|.\|.
+.IR c2 ,
+.I .\|.\|.
This undoes the effect of a
.B fschar
request.
@@ -2039,7 +2059,7 @@ is missing, arguments are shifted by\~1.
Shifting by negative amounts is currently undefined.
.
.TP
-.BI .sizes\ s1\ s2\|.\|.\|.\|sn\ [0]
+.BI .sizes\ s1\ s2\ .\|.\|.\ sn\ [0]
This command is similar to the
.B sizes
command of a
@@ -2048,7 +2068,8 @@ file.
.
It sets the available font sizes for the current font to
.IR s1 ,
-.IR s2 ,\|.\|.\|.\|,\~ sn
+.IR s2 ,
+.IR .\|.\|.\| ,\~ sn
scaled points.
.
The list of sizes can be terminated by an optional\~\c
@@ -2063,10 +2084,11 @@ Contrary to the font file command, the list can't extend over more
than a single line.
.
.TP
-.BI .special\ s1\ s2\|.\|.\|.\&
+.BI .special\ "s1 s2 .\|.\|."
Fonts
.IR s1 ,
-.IR s2 ,\|.\|.\|.\&
+.IR s2 ,
+.IR .\|.\|. ,
are special and are searched for glyphs not in the current
font.
.
@@ -2563,18 +2585,21 @@ In GNU troff, as in UNIX troff, you should always follow a sentence
with either a newline or two spaces.
.
.TP
-.BI .ta\ n1\ n2\|.\|.\|.nn \ T\ r1\ r2\|.\|.\|.\|rn
+.BI .ta\ "n1 n2 .\|.\|. nn " "T " "r1 r2 .\|.\|. rn"
Set tabs at positions
.IR n1 ,
-.IR n2 ,\|.\|.\|.\|,
+.IR n2 ,
+.IR .\|.\|. ,
.I nn
and then set tabs at
.IR nn \|+\| r1 ,
-.IR nn \|+\| r2 ,\|.\|.\|.\|,
+.IR nn \|+\| r2 ,
+.IR .\|.\|. ,
.IR nn \|+\| rn
and then at
.IR nn \|+\| rn \|+\| r1 ,
-.IR nn \|+\| rn \|+\| r2 ,\|.\|.\|.\|,
+.IR nn \|+\| rn \|+\| r2 ,
+.IR .\|.\|. ,
.IR nn \|+\| rn \|+\| rn ,
and so on.
For example,
diff --git a/src/include/font.h b/src/include/font.h
index 944250b9..75d2ef16 100644
--- a/src/include/font.h
+++ b/src/include/font.h
@@ -1,5 +1,5 @@
// -*- C++ -*-
-/* Copyright (C) 1989, 1990, 1991, 1992, 2002, 2004, 2006, 2009
+/* Copyright (C) 1989, 1990, 1991, 1992, 2002, 2004, 2006, 2009, 2010
Free Software Foundation, Inc.
Written by James Clark (jjc@jclark.com)
@@ -73,6 +73,9 @@ inline int glyph_to_number(glyph *); // Convert the given glyph back to
// a numbered character.
inline int glyph_to_index(glyph *); // Return the unique index that is
// associated with the given glyph. It is >= 0.
+extern int glyph_to_unicode(glyph *); // Convert the given glyph to its
+ // Unicode codepoint. Return -1 if it does not
+ // designate a Unicode character.
inline int glyph_to_number(glyph *g)
{
diff --git a/src/libs/libgroff/font.cpp b/src/libs/libgroff/font.cpp
index d0b4a12e..8dff71e7 100644
--- a/src/libs/libgroff/font.cpp
+++ b/src/libs/libgroff/font.cpp
@@ -1,6 +1,6 @@
// -*- C++ -*-
/* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001, 2002, 2003, 2004, 2005,
- 2006, 2008, 2009
+ 2006, 2008, 2009, 2010
Free Software Foundation, Inc.
Written by James Clark (jjc@jclark.com)
@@ -147,6 +147,47 @@ void text_file::error(const char *format,
error_with_file_and_line(path, lineno, format, arg1, arg2, arg3);
}
+// Convert glyph `g' to a Unicode code point.
+//
+// The glyph's name, as returned by glyph_to_name(), is tried in this order:
+//   1. `charN' with N a decimal number: a single digit, two digits without
+//      a leading zero, or three digits without a leading zero and below 128;
+//   2. a name accepted by check_unicode_name(); everything after its first
+//      character is parsed as a hexadecimal number;
+//   3. a name known to glyph_name_to_unicode() whose mapping contains no
+//      `_' (presumably `_' separates the parts of a multi-code-point
+//      mapping -- TODO confirm).
+//
+// Return the code point, or -1 if the glyph has no name or none of the
+// cases above applies.
+int glyph_to_unicode(glyph *g)
+{
+  const char *nm = glyph_to_name(g);
+  if (nm != NULL) {
+    // ASCII character?
+    if (nm[0] == 'c' && nm[1] == 'h' && nm[2] == 'a' && nm[3] == 'r'
+        && (nm[4] >= '0' && nm[4] <= '9')) {
+      int n = (nm[4] - '0');
+      if (nm[5] == '\0')
+        return n;
+      // More than one digit: a leading zero is not accepted.
+      if (n > 0 && (nm[5] >= '0' && nm[5] <= '9')) {
+        n = 10*n + (nm[5] - '0');
+        if (nm[6] == '\0')
+          return n;
+        if (nm[6] >= '0' && nm[6] <= '9') {
+          n = 10*n + (nm[6] - '0');
+          if (nm[7] == '\0' && n < 128)
+            return n;
+        }
+      }
+    }
+    // Unicode character?
+    if (check_unicode_name(nm))
+      return (int)strtol(nm + 1, NULL, 16);
+    // If `nm' is a single letter `x', the glyph name is `\x'.
+    // Test nm[0] first so that an empty name is never indexed past its
+    // terminating null byte.
+    char buf[] = { '\\', '\0', '\0' };
+    if (nm[0] != '\0' && nm[1] == '\0') {
+      buf[1] = nm[0];
+      nm = buf;
+    }
+    // groff glyphs that map to Unicode?
+    const char *unicode = glyph_name_to_unicode(nm);
+    if (unicode != NULL && strchr(unicode, '_') == NULL)
+      return (int)strtol(unicode, NULL, 16);
+  }
+  return -1;
+}
/* font functions */
@@ -269,42 +310,11 @@ int font::contains(glyph *g)
return 1;
if (is_unicode) {
// Unicode font
- const char *nm = glyph_to_name(g);
- if (nm != NULL) {
- // ASCII character?
- if (nm[0] == 'c' && nm[1] == 'h' && nm[2] == 'a' && nm[3] == 'r'
- && (nm[4] >= '0' && nm[4] <= '9')) {
- int n = (nm[4] - '0');
- if (nm[5] == '\0')
- return 1;
- if (n > 0 && (nm[5] >= '0' && nm[5] <= '9')) {
- n = 10*n + (nm[5] - '0');
- if (nm[6] == '\0')
- return 1;
- if (nm[6] >= '0' && nm[6] <= '9') {
- n = 10*n + (nm[6] - '0');
- if (nm[7] == '\0' && n < 128)
- return 1;
- }
- }
- }
- // Unicode character?
- if (check_unicode_name(nm))
- return 1;
- // If `nm' is a single letter `x', the glyph name is `\x'.
- char buf[] = { '\\', '\0', '\0' };
- if (nm[1] == '\0') {
- buf[1] = nm[0];
- nm = buf;
- }
- // groff glyph name that maps to Unicode?
- const char *unicode = glyph_name_to_unicode(nm);
- if (unicode != NULL && strchr(unicode, '_') == NULL)
- return 1;
- }
+ // ASCII or Unicode character, or groff glyph name that maps to Unicode?
+ if (glyph_to_unicode(g) >= 0)
+ return 1;
// Numbered character?
- int n = glyph_to_number(g);
- if (n >= 0)
+ if (glyph_to_number(g) >= 0)
return 1;
}
return 0;
@@ -554,43 +564,10 @@ int font::get_code(glyph *g)
}
if (is_unicode) {
// Unicode font
- const char *nm = glyph_to_name(g);
- if (nm != NULL) {
- // ASCII character?
- if (nm[0] == 'c' && nm[1] == 'h' && nm[2] == 'a' && nm[3] == 'r'
- && (nm[4] >= '0' && nm[4] <= '9')) {
- int n = (nm[4] - '0');
- if (nm[5] == '\0')
- return n;
- if (n > 0 && (nm[5] >= '0' && nm[5] <= '9')) {
- n = 10*n + (nm[5] - '0');
- if (nm[6] == '\0')
- return n;
- if (nm[6] >= '0' && nm[6] <= '9') {
- n = 10*n + (nm[6] - '0');
- if (nm[7] == '\0' && n < 128)
- return n;
- }
- }
- }
- // Unicode character?
- if (check_unicode_name(nm)) {
- char *ignore;
- return (int)strtol(nm + 1, &ignore, 16);
- }
- // If `nm' is a single letter `x', the glyph name is `\x'.
- char buf[] = { '\\', '\0', '\0' };
- if (nm[1] == '\0') {
- buf[1] = nm[0];
- nm = buf;
- }
- // groff glyphs that map to Unicode?
- const char *unicode = glyph_name_to_unicode(nm);
- if (unicode != NULL && strchr(unicode, '_') == NULL) {
- char *ignore;
- return (int)strtol(unicode, &ignore, 16);
- }
- }
+ // ASCII or Unicode character, or groff glyph name that maps to Unicode?
+ int uni = glyph_to_unicode(g);
+ if (uni >= 0)
+ return uni;
// Numbered character?
int n = glyph_to_number(g);
if (n >= 0)
diff --git a/src/roff/troff/charinfo.h b/src/roff/troff/charinfo.h
index 2c2c2685..c71383ed 100644
--- a/src/roff/troff/charinfo.h
+++ b/src/roff/troff/charinfo.h
@@ -1,5 +1,5 @@
// -*- C++ -*-
-/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2002, 2006, 2009
+/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2002, 2006, 2009, 2010
Free Software Foundation, Inc.
Written by James Clark (jjc@jclark.com)
@@ -18,6 +18,9 @@ for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
+#include <vector>
+#include <utility>
+
class macro;
class charinfo : glyph {
@@ -35,6 +38,9 @@ class charinfo : glyph {
char translate_input; // non-zero means that asciify_code is
// active for .asciify (set by .trin)
char_mode mode;
+ // Unicode character classes
+ std::vector<std::pair<int, int> > ranges;
+ std::vector<charinfo *> nested_classes;
public:
enum { // Values for the flags bitmask. See groff
// manual, description of the `.cflags' request.
@@ -66,6 +72,7 @@ public:
unsigned char get_hyphenation_code();
unsigned char get_ascii_code();
unsigned char get_asciify_code();
+ int get_unicode_code();
void set_hyphenation_code(unsigned char);
void set_ascii_code(unsigned char);
void set_asciify_code(unsigned char);
@@ -73,6 +80,7 @@ public:
int get_translation_input();
charinfo *get_translation(int = 0);
void set_translation(charinfo *, int, int);
+ unsigned char get_flags();
void set_flags(unsigned char);
void set_special_translation(int, int);
int get_special_translation(int = 0);
@@ -87,6 +95,13 @@ public:
int is_fallback();
int is_special();
symbol *get_symbol();
+ void add_to_class(int);
+ void add_to_class(int, int);
+ void add_to_class(charinfo *);
+ bool is_class();
+ bool contains(int);
+ bool contains(symbol);
+ bool contains(charinfo *);
};
charinfo *get_charinfo(symbol);
@@ -95,37 +110,37 @@ charinfo *get_charinfo_by_number(int);
inline int charinfo::overlaps_horizontally()
{
- return flags & OVERLAPS_HORIZONTALLY;
+ return get_flags() & OVERLAPS_HORIZONTALLY;
}
inline int charinfo::overlaps_vertically()
{
- return flags & OVERLAPS_VERTICALLY;
+ return get_flags() & OVERLAPS_VERTICALLY;
}
inline int charinfo::can_break_before()
{
- return flags & BREAK_BEFORE;
+ return get_flags() & BREAK_BEFORE;
}
inline int charinfo::can_break_after()
{
- return flags & BREAK_AFTER;
+ return get_flags() & BREAK_AFTER;
}
inline int charinfo::ends_sentence()
{
- return flags & ENDS_SENTENCE;
+ return get_flags() & ENDS_SENTENCE;
}
inline int charinfo::transparent()
{
- return flags & TRANSPARENT;
+ return get_flags() & TRANSPARENT;
}
inline int charinfo::ignore_hcodes()
{
- return flags & IGNORE_HCODES;
+ return get_flags() & IGNORE_HCODES;
}
inline int charinfo::numbered()
@@ -214,5 +229,27 @@ inline int charinfo::first_time_not_found()
inline symbol *charinfo::get_symbol()
{
- return( &nm );
+ return &nm;
+}
+
+// Add the single code point `c' to this class.
+inline void charinfo::add_to_class(int c)
+{
+  // A lone character is stored as the one-element range [c, c].
+  // TODO ranges cumbersome for single characters?
+  ranges.push_back(std::make_pair(c, c));
+}
+
+// Add the code point range from `lo' to `hi' to this class.  The bounds
+// are stored exactly as given; no ordering check is done here.
+inline void charinfo::add_to_class(int lo, int hi)
+{
+  ranges.push_back(std::make_pair(lo, hi));
+}
+
+// Record `ci' as a nested class of this class.  Only the pointer is
+// stored; nothing is copied from `ci'.
+inline void charinfo::add_to_class(charinfo *ci)
+{
+ nested_classes.push_back(ci);
+}
+
+// A charinfo counts as a character class as soon as it holds at least
+// one range or at least one nested class.
+inline bool charinfo::is_class()
+{
+  return !(ranges.empty() && nested_classes.empty());
}
diff --git a/src/roff/troff/input.cpp b/src/roff/troff/input.cpp
index 5335c1ce..cdb82fe2 100644
--- a/src/roff/troff/input.cpp
+++ b/src/roff/troff/input.cpp
@@ -1,6 +1,6 @@
// -*- C++ -*-
/* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001, 2002, 2003, 2004, 2005,
- 2006, 2007, 2008, 2009
+ 2006, 2007, 2008, 2009, 2010
Free Software Foundation, Inc.
Written by James Clark (jjc@jclark.com)
@@ -6740,6 +6740,102 @@ void hyphenation_patterns_file_code()
skip_line();
}
+dictionary char_class_dictionary(501);
+
+// Handler for the `class' request (registered in init_input_requests).
+//
+// Reads a class name followed by a list of members.  A member is either
+// a single character, a range `start-end', or another character class.
+// Members are added to the charinfo obtained for the class name; if at
+// least one member was added, that charinfo is entered into
+// `char_class_dictionary' under the class name.
+//
+// On a malformed argument a warning is emitted, the rest of the line is
+// skipped, and the function returns without registering the class.
+// Members already added to the charinfo before the error stay added.
+void define_class()
+{
+ tok.skip();
+ symbol nm = get_name(1);
+ if (nm.is_null()) {
+ skip_line();
+ return;
+ }
+ charinfo *ci = get_charinfo(nm);
+ // child1 is the most recently read member that has not been added yet;
+ // it is held back because a following `-' would make it the start of
+ // a range.  child2 is the end of such a range.
+ charinfo *child1 = 0, *child2 = 0;
+ while (!tok.newline() && !tok.eof()) {
+ tok.skip();
+ if (child1 != 0 && tok.ch() == '-') {
+ // Range: the pending member is the start, the next token the end.
+ tok.next();
+ child2 = tok.get_char(1);
+ if (!child2) {
+ warning(WARN_MISSING,
+ "missing end of character range in class `%1'",
+ nm.contents());
+ skip_line();
+ return;
+ }
+ if (child1->is_class() || child2->is_class()) {
+ warning(WARN_SYNTAX,
+ "nested character class is not allowed in range definition");
+ skip_line();
+ return;
+ }
+ // Both ends must map to a Unicode code point (negative means none).
+ int u1 = child1->get_unicode_code();
+ int u2 = child2->get_unicode_code();
+ if (u1 < 0) {
+ warning(WARN_SYNTAX,
+ "invalid start value in character range");
+ skip_line();
+ return;
+ }
+ if (u2 < 0) {
+ warning(WARN_SYNTAX,
+ "invalid end value in character range");
+ skip_line();
+ return;
+ }
+ ci->add_to_class(u1, u2);
+ child1 = child2 = 0;
+ // NOTE(review): `tok' does not appear to be advanced after child2
+ // was read, so the range's end character looks like it is read
+ // again as child1 below and later added a second time as a single
+ // character -- confirm whether that is intended.
+ }
+ else if (child1 != 0) {
+ // No `-' follows: flush the pending member as a nested class or as
+ // a single code point.
+ if (child1->is_class())
+ ci->add_to_class(child1);
+ else {
+ int u1 = child1->get_unicode_code();
+ if (u1 < 0) {
+ warning(WARN_SYNTAX,
+ "invalid character value in class `%1'",
+ nm.contents());
+ skip_line();
+ return;
+ }
+ ci->add_to_class(u1);
+ }
+ child1 = 0;
+ }
+ // Read the next member and keep it pending until the following token
+ // is known.
+ child1 = tok.get_char(1);
+ tok.next();
+ if (!child1) {
+ if (!tok.newline())
+ skip_line();
+ break;
+ }
+ }
+ // Flush the member still pending when the argument list ended.
+ if (child1 != 0) {
+ if (child1->is_class())
+ ci->add_to_class(child1);
+ else {
+ int u1 = child1->get_unicode_code();
+ if (u1 < 0) {
+ warning(WARN_SYNTAX,
+ "invalid character value in class `%1'",
+ nm.contents());
+ skip_line();
+ return;
+ }
+ ci->add_to_class(u1);
+ }
+ child1 = 0;
+ }
+ if (!ci->is_class()) {
+ warning(WARN_SYNTAX,
+ "empty class definition for `%1'",
+ nm.contents());
+ return;
+ }
+ // Register the class so that charinfo::get_flags can find it.
+ (void)char_class_dictionary.lookup(nm, ci);
+}
+
charinfo *token::get_char(int required)
{
if (type == TOKEN_CHAR)
@@ -7817,6 +7913,7 @@ void init_input_requests()
init_request("cflags", char_flags);
init_request("char", define_character);
init_request("chop", chop_macro);
+ init_request("class", define_class);
init_request("close", close_request);
init_request("color", activate_color);
init_request("composite", composite_request);
@@ -8367,6 +8464,13 @@ charinfo::charinfo(symbol s)
number = -1;
}
+// Return this character's Unicode code point: its ASCII code when that
+// is non-zero, otherwise whatever glyph_to_unicode() yields for it.
+int charinfo::get_unicode_code()
+{
+  return (ascii_code != '\0') ? ascii_code : glyph_to_unicode(this);
+}
+
void charinfo::set_hyphenation_code(unsigned char c)
{
hyphenation_code = c;
@@ -8388,6 +8492,27 @@ void charinfo::set_translation(charinfo *ci, int tt, int ti)
transparent_translate = tt;
}
+// Get the union of all flags affecting this charinfo: its own flags
+// ORed with the flags of every class in `char_class_dictionary' that
+// contains this character's Unicode code point.
+unsigned char charinfo::get_flags()
+{
+  unsigned char all_flags = flags;
+  // The code point is the same for every class examined, so look it up
+  // once instead of once per dictionary entry.
+  const int code = get_unicode_code();
+  dictionary_iterator iter(char_class_dictionary);
+  charinfo *cp;
+  symbol s;
+  while (iter.get(&s, (void **)&cp)) {
+    assert(!s.is_null());
+    if (cp->contains(code)) {
+#if defined(DEBUGGING)
+      if (debug_state)
+        fprintf(stderr, "charinfo::get_flags %p %s %d\n",
+                (void *)cp, cp->nm.contents(), cp->flags);
+#endif
+      all_flags |= cp->flags;
+    }
+  }
+  return all_flags;
+}
+
void charinfo::set_special_translation(int c, int tt)
{
special_translation = c;
@@ -8432,6 +8557,50 @@ int charinfo::get_number()
return number;
}
+// Return true if code point `c' lies within one of this class's ranges
+// (both bounds included) or is contained in one of its nested classes,
+// which are searched recursively.
+bool charinfo::contains(int c)
+{
+  typedef std::vector<std::pair<int, int> >::const_iterator range_iterator;
+  for (range_iterator r = ranges.begin(); r != ranges.end(); ++r) {
+    if (r->first <= c && c <= r->second) {
+#if defined(DEBUGGING)
+      if (debug_state)
+        fprintf(stderr, "charinfo::contains(%d)\n", c);
+#endif
+      return true;
+    }
+  }
+
+  typedef std::vector<charinfo *>::const_iterator class_iterator;
+  for (class_iterator k = nested_classes.begin();
+       k != nested_classes.end(); ++k) {
+    if ((*k)->contains(c))
+      return true;
+  }
+
+  return false;
+}
+
+// Return true if the glyph named by `s' maps, via glyph_name_to_unicode(),
+// to a hexadecimal code point (a mapping without `_') that this class
+// contains.  Names with no such mapping are never contained.
+bool charinfo::contains(symbol s)
+{
+  const char *hex = glyph_name_to_unicode(s.contents());
+  if (hex == NULL || strchr(hex, '_') != NULL)
+    return false;
+  return contains((int)strtol(hex, NULL, 16));
+}
+
+// Membership test taking a charinfo.  This overload is a placeholder:
+// its argument is ignored and it always returns false.
+bool charinfo::contains(charinfo *)
+{
+ // TODO: not implemented yet.
+ return false;
+}
+
symbol UNNAMED_SYMBOL("---");
// For numbered characters not between 0 and 255, we make a symbol out