From d5d8909e11e0c613f7a1dfba3a20a405ae7b4da4 Mon Sep 17 00:00:00 2001 From: wl Date: Mon, 13 Dec 2010 15:30:19 +0000 Subject: Implement support for character classes. This patch uses standard C++ headers, contrary to the rest of groff. Ideally, everything in groff should be updated to do the same. * src/include/font.h (glyph_to_unicode): New function. * src/libs/libgroff/font.cpp (glyph_to_unicode): Implement it. (font::contains, font::get_code): Use it. * src/roff/troff/charinfo.h: Include <vector> and <utility>. (charinfo): New members `ranges' and `nested_classes'. New member functions `get_unicode_code' and `get_flags'. New member functions `add_to_class', `is_class', and `contains'. (charinfo::overlaps_horizontally, charinfo::overlaps_vertically, charinfo::can_break_before, charinfo::can_break_after, charinfo::ends_sentence, charinfo::transparent, charinfo::ignore_hcodes): Use `get_flags', which handles character classes also. * src/roff/troff/input.cpp (char_class_dictionary): New global variable. (define_class): New function. (init_input_requests): Register `class'. (charinfo::get_unicode_code, charinfo::get_flags, charinfo::contains): Implement it. * NEWS, doc/groff.texinfo (Character Classes), man/groff_diff.man, man/groff.man: Document it. --- ChangeLog | 34 +++++++++ NEWS | 4 ++ doc/groff.texinfo | 87 +++++++++++++++++++++-- man/groff.man | 11 ++- man/groff_diff.man | 65 +++++++++++------ src/include/font.h | 5 +- src/libs/libgroff/font.cpp | 123 +++++++++++++------------------- src/roff/troff/charinfo.h | 55 ++++++++++++--- src/roff/troff/input.cpp | 171 ++++++++++++++++++++++++++++++++++++++++++++- 9 files changed, 446 insertions(+), 109 deletions(-) diff --git a/ChangeLog b/ChangeLog index d59fd991..e47e1a0b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,37 @@ +2010-12-06 Colin Watson + Daiki Ueno + + Implement support for character classes. + + This patch uses standard C++ headers, contrary to the rest of groff. 
+ Ideally, everything in groff should be updated to do the same. + + * src/include/font.h (glyph_to_unicode): New function. + + * src/libs/libgroff/font.cpp (glyph_to_unicode): Implement it. + (font::contains, font::get_code): Use it. + + * src/roff/troff/charinfo.h: Include <vector> and <utility>. + (charinfo): New members `ranges' and `nested_classes'. + New member functions `get_unicode_code' and `get_flags'. + New member functions `add_to_class', `is_class', and `contains'. + (charinfo::overlaps_horizontally, charinfo::overlaps_vertically, + charinfo::can_break_before, charinfo::can_break_after, + charinfo::ends_sentence, + charinfo::transparent, charinfo::ignore_hcodes): Use `get_flags', + which handles character classes also. + + * src/roff/troff/input.cpp (char_class_dictionary): New global + variable. + (define_class): New function. + (init_input_requests): Register `class'. + + (charinfo::get_unicode_code, charinfo::get_flags, + charinfo::contains): Implement it. + + * NEWS, doc/groff.texinfo (Character Classes), man/groff_diff.man, + man/groff.man: Document it. + 2010-11-11 Anton Shepelev [grohtml]: Improve texinfo documentation. diff --git a/NEWS b/NEWS index c5028c4b..00a41b47 100644 --- a/NEWS +++ b/NEWS @@ -24,6 +24,10 @@ o There is a new warning category `file', enabled by default. The `mso' request emits warnings in this category when the requested macro file does not exist. +o The new `class' request assigns a short name to a set of characters + which can be referred to in the `cflags' request. This is especially + useful to control line-breaking and hyphenation rules in CJK languages. 
+ Tbl --- diff --git a/doc/groff.texinfo b/doc/groff.texinfo index 8d2e5c68..42cf6166 100644 --- a/doc/groff.texinfo +++ b/doc/groff.texinfo @@ -6052,7 +6052,7 @@ aaa bbb ccc ddd eee fff ggg hhh\h'0'\R':k \n[.k]' @endExample If you process this with the PostScript device (@code{-Tps}), there -will be a line break eventually after @code{ggg} in both input lines. +will be a line break eventually after @code{ggg} in both input lines. However, after processing the space after @code{ggg}, the partially collected line is not overfull yet, so @code{troff} continues to collect input until it sees the space (or in this case, the newline) @@ -8726,6 +8726,7 @@ special symbols (Greek, mathematics). * Font Families:: * Font Positions:: * Using Symbols:: +* Character Classes:: * Special Fonts:: * Artificial Fonts:: * Ligatures and Kerning:: @@ -9122,7 +9123,7 @@ this is font 1 again @c --------------------------------------------------------------------- -@node Using Symbols, Special Fonts, Font Positions, Fonts and Symbols +@node Using Symbols, Character Classes, Font Positions, Fonts and Symbols @subsection Using Symbols @cindex using symbols @cindex symbols, using @@ -9458,7 +9459,9 @@ width, depth, and height, nothing else. All manipulations with the modified with the @code{cflags} request. The first argument is the sum of the desired flags and the remaining arguments are the characters or symbols to have those properties. It is possible to omit the spaces -between the characters or symbols. +between the characters or symbols. Instead of single characters or +symbols you can also use character classes (see @ref{Character Classes} +for more details). 
@table @code @item 1 @@ -9639,7 +9642,83 @@ The request @code{rfschar} removes glyph definitions defined with @c --------------------------------------------------------------------- -@node Special Fonts, Artificial Fonts, Using Symbols, Fonts and Symbols +@node Character Classes, Special Fonts, Using Symbols, Fonts and Symbols +@subsection Character Classes +@cindex character classes +@cindex classes, character + +Classes are particularly useful for East Asian languages such as +Chinese, Japanese, and Korean, where the number of needed characters is +much larger than in European languages, and where large sets of +characters share the same properties. + +@Defreq {class, n c1 c2 @dots{}} +@cindex character class (@code{class}) +@cindex defining character class (@code{class}) +@cindex class of characters (@code{class}) +In @code{groff}, a @dfn{character class} (or simply ``class'') is a set +of characters, grouped by some user aspect. The @code{class} request +defines such classes so that other requests can refer to all characters +belonging to this set with a single class name. Currently, only the +@code{cflags} request can handle character classes. + +A @code{class} request takes a class name followed by a list of +entities. In its simplest form, the entities are characters or symbols: + +@Example +.class [prepunct] , : ; > @} +@endExample + +Since class and glyph names share the same namespace, it is recommended +to start and end the class name with @code{[} and @code{]}, +respectively, to avoid collisions with normal @code{groff} symbols (and +symbols defined by the user). In particular, the presence of @code{]} +in the symbol name intentionally prevents the usage of @code{\[...]}, +thus you must use the @code{\C} escape to access a class with such a +name. 
+ +@cindex GGL (groff glyph list) +@cindex groff glyph list (GGL) +You can also use a special character range notation, consisting of a +start character or symbol, followed by @samp{-}, and an end character or +symbol. Internally, @code{gtroff} converts these two symbol names to +Unicode values (according to the groff glyph list) which then give the +start and end value of the range. If that fails, the class definition +is skipped. + +Finally, classes can be nested, too. + +Here is a more complex example: + +@Example +.class [prepunctx] \C'[prepunct]' \[u2013]-\[u2016] +@endExample + +The class @samp{prepunctx} now contains the contents of the class +@code{prepunct} as defined above (the set @samp{, : ; > @}}), and +characters in the range between @code{U+2013} and @code{U+2016}. + +If you want to add @samp{-} to a class, it must be the first character +value in the argument list, otherwise it gets misinterpreted as a range. + +Note that it is not possible to use class names within range +definitions. + +Typical use of the @code{class} request is to control line-breaking and +hyphenation rules as defined by the @code{cflags} request. For example, +to inhibit line breaks before the characters belonging to the +@code{prepunctx} class, you can write: + +@Example +.cflags 2 \C'[prepunctx]' +@endExample + +See the @code{cflags} request in @ref{Using Symbols}, for more details. +@endDefreq + +@c --------------------------------------------------------------------- + +@node Special Fonts, Artificial Fonts, Character Classes, Fonts and Symbols @subsection Special Fonts @cindex special fonts @cindex fonts, special diff --git a/man/groff.man b/man/groff.man index c89dcfdd..8ec260c0 100644 --- a/man/groff.man +++ b/man/groff.man @@ -5,7 +5,7 @@ groff.man This file is part of groff, the GNU roff type-setting system. Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, - 2009 + 2009, 2010 Free Software Foundation, Inc. 
written by Bernd Warken maintained by Werner Lemberg @@ -1312,6 +1312,15 @@ Chop the last character off macro, string, or diversion .IR object . . .TPx +.REQ .class "name c1 c2 .\|.\|.\&" +Assign a set of characters, character ranges, or classes +.IR c1 , +.IR c2 , +.I .\|.\|.\& +to +.IR name . +. +.TPx .REQ .close "stream" Close the .IR stream . diff --git a/man/groff_diff.man b/man/groff_diff.man index 115c6d26..e0d355ea 100644 --- a/man/groff_diff.man +++ b/man/groff_diff.man @@ -138,7 +138,7 @@ Interpolate string taking .IR arg1 , .IR arg2 , -.I .\|.\|.\& +.IR .\|.\|. , as arguments. . .TP @@ -965,10 +965,11 @@ This is the same as .BR \[rs]p . . .TP -.BI .cflags\ n\ c1\ c2\|.\|.\|.\& +.BI .cflags\ "n c1 c2 .\|.\|." Characters .IR c1 , -.IR c2 ,\|.\|.\|.\& +.IR c2 , +.IR .\|.\|. , have properties determined by .IR n , which is ORed from the following: @@ -1093,6 +1094,21 @@ This is useful for removing the newline from the end of diversions that are to be interpolated as strings. . .TP +.BI .class\ "name c1 c2 .\|.\|." +Assign +.I name +to a set of characters +.IR c1 , +.IR c2 , +.IR .\|.\|. , +so that they can be referred to from other requests easily (currently +.B .cflags +only). +Character ranges (indicated by an intermediate `-') and nested classes +are possible also. +This is useful to assign properties to a large set of characters. +. +.TP .BI .close\ stream Close the stream named .IR stream ; @@ -1394,12 +1410,13 @@ request but before the list of fonts declared with .BR .special . . .TP -.BI .fspecial\ f\ s1\ s2\|.\|.\|.\& +.BI .fspecial\ "f s1 s2 .\|.\|." When the current font is\~\c .IR f , fonts .IR s1 , -.IR s2 ,\|.\|.\|.\& +.IR s2 , +.IR .\|.\|. , are special, that is, they are searched for glyphs not in the current font. . @@ -1470,7 +1487,7 @@ is missing, switch to the previous glyph color. . .TP -.BI .hcode \ c1\ code1\ c2\ code2\|.\|.\|.\& +.BI .hcode\ "c1 code1 c2 code2 .\|.\|." 
Set the hyphenation code of character .I c1 to @@ -1478,7 +1495,8 @@ to and that of .I c2 to -.IR code2 . +.IR code2 , +and so on. A hyphenation code must be a single input character (not a special character) other than a digit or a space. . @@ -1625,7 +1643,7 @@ except that the hyphenation patterns from are appended to the patterns already loaded in the current language. . .TP -.BI .hpfcode\ a\ b\ c\ d\ .\|.\|. +.BI .hpfcode\ "a b c d .\|.\|." After reading a hyphenation patterns file with the .B hpf or @@ -1930,10 +1948,11 @@ and with a positive value which are applied after the line is output. . .TP -.BI .rchar\ c1\ c2\|.\|.\|.\& +.BI .rchar\ "c1 c2 .\|.\|." Remove the definitions of glyphs .IR c1 , -.IR c2 ,\|.\|.\|. +.IR c2 , +.I .\|.\|. This undoes the effect of a .B char request. @@ -1948,10 +1967,11 @@ from the macro one level higher. No effect otherwise. . .TP -.BI .rfschar\ c1\ c2\|.\|.\|.\& +.BI .rfschar\ "c1 c2 .\|.\|." Remove the font-specific definitions of glyphs .IR c1 , -.IR c2 ,\|.\|.\|. +.IR c2 , +.I .\|.\|. This undoes the effect of a .B fschar request. @@ -2039,7 +2059,7 @@ is missing, arguments are shifted by\~1. Shifting by negative amounts is currently undefined. . .TP -.BI .sizes\ s1\ s2\|.\|.\|.\|sn\ [0] +.BI .sizes\ s1\ s2\ .\|.\|.\ sn\ [0] This command is similar to the .B sizes command of a @@ -2048,7 +2068,8 @@ file. . It sets the available font sizes for the current font to .IR s1 , -.IR s2 ,\|.\|.\|.\|,\~ sn +.IR s2 , +.IR .\|.\|.\| ,\~ sn scaled points. . The list of sizes can be terminated by an optional\~\c @@ -2063,10 +2084,11 @@ Contrary to the font file command, the list can't extend over more than a single line. . .TP -.BI .special\ s1\ s2\|.\|.\|.\& +.BI .special\ "s1 s2 .\|.\|." Fonts .IR s1 , -.IR s2 ,\|.\|.\|.\& +.IR s2 , +.IR .\|.\|. , are special and are searched for glyphs not in the current font. . 
@@ -2563,18 +2585,21 @@ In GNU troff, as in UNIX troff, you should always follow a sentence with either a newline or two spaces. . .TP -.BI .ta\ n1\ n2\|.\|.\|.nn \ T\ r1\ r2\|.\|.\|.\|rn +.BI .ta\ "n1 n2 .\|.\|. nn " "T " "r1 r2 .\|.\|. rn" Set tabs at positions .IR n1 , -.IR n2 ,\|.\|.\|.\|, +.IR n2 , +.IR .\|.\|. , .I nn and then set tabs at .IR nn \|+\| r1 , -.IR nn \|+\| r2 ,\|.\|.\|.\|, +.IR nn \|+\| r2 , +.IR .\|.\|. , .IR nn \|+\| rn and then at .IR nn \|+\| rn \|+\| r1 , -.IR nn \|+\| rn \|+\| r2 ,\|.\|.\|.\|, +.IR nn \|+\| rn \|+\| r2 , +.IR .\|.\|. , .IR nn \|+\| rn \|+\| rn , and so on. For example, diff --git a/src/include/font.h b/src/include/font.h index 944250b9..75d2ef16 100644 --- a/src/include/font.h +++ b/src/include/font.h @@ -1,5 +1,5 @@ // -*- C++ -*- -/* Copyright (C) 1989, 1990, 1991, 1992, 2002, 2004, 2006, 2009 +/* Copyright (C) 1989, 1990, 1991, 1992, 2002, 2004, 2006, 2009, 2010 Free Software Foundation, Inc. Written by James Clark (jjc@jclark.com) @@ -73,6 +73,9 @@ inline int glyph_to_number(glyph *); // Convert the given glyph back to // a numbered character. inline int glyph_to_index(glyph *); // Return the unique index that is // associated with the given glyph. It is >= 0. +extern int glyph_to_unicode(glyph *); // Convert the given glyph to its + // Unicode codepoint. Return -1 if it does not + // designate a Unicode character. inline int glyph_to_number(glyph *g) { diff --git a/src/libs/libgroff/font.cpp b/src/libs/libgroff/font.cpp index d0b4a12e..8dff71e7 100644 --- a/src/libs/libgroff/font.cpp +++ b/src/libs/libgroff/font.cpp @@ -1,6 +1,6 @@ // -*- C++ -*- /* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001, 2002, 2003, 2004, 2005, - 2006, 2008, 2009 + 2006, 2008, 2009, 2010 Free Software Foundation, Inc. 
Written by James Clark (jjc@jclark.com) @@ -147,6 +147,47 @@ void text_file::error(const char *format, error_with_file_and_line(path, lineno, format, arg1, arg2, arg3); } +int glyph_to_unicode(glyph *g) +{ + const char *nm = glyph_to_name(g); + if (nm != NULL) { + // ASCII character? + if (nm[0] == 'c' && nm[1] == 'h' && nm[2] == 'a' && nm[3] == 'r' + && (nm[4] >= '0' && nm[4] <= '9')) { + int n = (nm[4] - '0'); + if (nm[5] == '\0') + return n; + if (n > 0 && (nm[5] >= '0' && nm[5] <= '9')) { + n = 10*n + (nm[5] - '0'); + if (nm[6] == '\0') + return n; + if (nm[6] >= '0' && nm[6] <= '9') { + n = 10*n + (nm[6] - '0'); + if (nm[7] == '\0' && n < 128) + return n; + } + } + } + // Unicode character? + if (check_unicode_name(nm)) { + char *ignore; + return (int)strtol(nm + 1, &ignore, 16); + } + // If `nm' is a single letter `x', the glyph name is `\x'. + char buf[] = { '\\', '\0', '\0' }; + if (nm[1] == '\0') { + buf[1] = nm[0]; + nm = buf; + } + // groff glyphs that map to Unicode? + const char *unicode = glyph_name_to_unicode(nm); + if (unicode != NULL && strchr(unicode, '_') == NULL) { + char *ignore; + return (int)strtol(unicode, &ignore, 16); + } + } + return -1; +} /* font functions */ @@ -269,42 +310,11 @@ int font::contains(glyph *g) return 1; if (is_unicode) { // Unicode font - const char *nm = glyph_to_name(g); - if (nm != NULL) { - // ASCII character? - if (nm[0] == 'c' && nm[1] == 'h' && nm[2] == 'a' && nm[3] == 'r' - && (nm[4] >= '0' && nm[4] <= '9')) { - int n = (nm[4] - '0'); - if (nm[5] == '\0') - return 1; - if (n > 0 && (nm[5] >= '0' && nm[5] <= '9')) { - n = 10*n + (nm[5] - '0'); - if (nm[6] == '\0') - return 1; - if (nm[6] >= '0' && nm[6] <= '9') { - n = 10*n + (nm[6] - '0'); - if (nm[7] == '\0' && n < 128) - return 1; - } - } - } - // Unicode character? - if (check_unicode_name(nm)) - return 1; - // If `nm' is a single letter `x', the glyph name is `\x'. 
- char buf[] = { '\\', '\0', '\0' }; - if (nm[1] == '\0') { - buf[1] = nm[0]; - nm = buf; - } - // groff glyph name that maps to Unicode? - const char *unicode = glyph_name_to_unicode(nm); - if (unicode != NULL && strchr(unicode, '_') == NULL) - return 1; - } + // ASCII or Unicode character, or groff glyph name that maps to Unicode? + if (glyph_to_unicode(g) >= 0) + return 1; // Numbered character? - int n = glyph_to_number(g); - if (n >= 0) + if (glyph_to_number(g) >= 0) return 1; } return 0; @@ -554,43 +564,10 @@ int font::get_code(glyph *g) } if (is_unicode) { // Unicode font - const char *nm = glyph_to_name(g); - if (nm != NULL) { - // ASCII character? - if (nm[0] == 'c' && nm[1] == 'h' && nm[2] == 'a' && nm[3] == 'r' - && (nm[4] >= '0' && nm[4] <= '9')) { - int n = (nm[4] - '0'); - if (nm[5] == '\0') - return n; - if (n > 0 && (nm[5] >= '0' && nm[5] <= '9')) { - n = 10*n + (nm[5] - '0'); - if (nm[6] == '\0') - return n; - if (nm[6] >= '0' && nm[6] <= '9') { - n = 10*n + (nm[6] - '0'); - if (nm[7] == '\0' && n < 128) - return n; - } - } - } - // Unicode character? - if (check_unicode_name(nm)) { - char *ignore; - return (int)strtol(nm + 1, &ignore, 16); - } - // If `nm' is a single letter `x', the glyph name is `\x'. - char buf[] = { '\\', '\0', '\0' }; - if (nm[1] == '\0') { - buf[1] = nm[0]; - nm = buf; - } - // groff glyphs that map to Unicode? - const char *unicode = glyph_name_to_unicode(nm); - if (unicode != NULL && strchr(unicode, '_') == NULL) { - char *ignore; - return (int)strtol(unicode, &ignore, 16); - } - } + // ASCII or Unicode character, or groff glyph name that maps to Unicode? + int uni = glyph_to_unicode(g); + if (uni >= 0) + return uni; // Numbered character? 
int n = glyph_to_number(g); if (n >= 0) diff --git a/src/roff/troff/charinfo.h b/src/roff/troff/charinfo.h index 2c2c2685..c71383ed 100644 --- a/src/roff/troff/charinfo.h +++ b/src/roff/troff/charinfo.h @@ -1,5 +1,5 @@ // -*- C++ -*- -/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2002, 2006, 2009 +/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2002, 2006, 2009, 2010 Free Software Foundation, Inc. Written by James Clark (jjc@jclark.com) @@ -18,6 +18,9 @@ for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <vector> +#include <utility> + class macro; class charinfo : glyph { @@ -35,6 +38,9 @@ class charinfo : glyph { char translate_input; // non-zero means that asciify_code is // active for .asciify (set by .trin) char_mode mode; + // Unicode character classes + std::vector<std::pair<int, int> > ranges; + std::vector<charinfo *> nested_classes; public: enum { // Values for the flags bitmask. See groff // manual, description of the `.cflags' request. @@ -66,6 +72,7 @@ public: unsigned char get_hyphenation_code(); unsigned char get_ascii_code(); unsigned char get_asciify_code(); + int get_unicode_code(); void set_hyphenation_code(unsigned char); void set_ascii_code(unsigned char); void set_asciify_code(unsigned char); @@ -73,6 +80,7 @@ public: int get_translation_input(); charinfo *get_translation(int = 0); void set_translation(charinfo *, int, int); + unsigned char get_flags(); void set_flags(unsigned char); void set_special_translation(int, int); int get_special_translation(int = 0); @@ -87,6 +95,13 @@ public: int is_fallback(); int is_special(); symbol *get_symbol(); + void add_to_class(int); + void add_to_class(int, int); + void add_to_class(charinfo *); + bool is_class(); + bool contains(int); + bool contains(symbol); + bool contains(charinfo *); }; charinfo *get_charinfo(symbol); @@ -95,37 +110,37 @@ charinfo *get_charinfo_by_number(int); inline int charinfo::overlaps_horizontally() { - return flags & OVERLAPS_HORIZONTALLY; + 
return get_flags() & OVERLAPS_HORIZONTALLY; } inline int charinfo::overlaps_vertically() { - return flags & OVERLAPS_VERTICALLY; + return get_flags() & OVERLAPS_VERTICALLY; } inline int charinfo::can_break_before() { - return flags & BREAK_BEFORE; + return get_flags() & BREAK_BEFORE; } inline int charinfo::can_break_after() { - return flags & BREAK_AFTER; + return get_flags() & BREAK_AFTER; } inline int charinfo::ends_sentence() { - return flags & ENDS_SENTENCE; + return get_flags() & ENDS_SENTENCE; } inline int charinfo::transparent() { - return flags & TRANSPARENT; + return get_flags() & TRANSPARENT; } inline int charinfo::ignore_hcodes() { - return flags & IGNORE_HCODES; + return get_flags() & IGNORE_HCODES; } inline int charinfo::numbered() @@ -214,5 +229,27 @@ inline int charinfo::first_time_not_found() inline symbol *charinfo::get_symbol() { - return( &nm ); + return &nm; +} + +inline void charinfo::add_to_class(int c) +{ + // TODO ranges cumbersome for single characters? + ranges.push_back(std::pair<int, int>(c, c)); +} + +inline void charinfo::add_to_class(int lo, + int hi) +{ + ranges.push_back(std::pair<int, int>(lo, hi)); +} + +inline void charinfo::add_to_class(charinfo *ci) +{ + nested_classes.push_back(ci); +} + +inline bool charinfo::is_class() +{ + return (!ranges.empty() || !nested_classes.empty()); } diff --git a/src/roff/troff/input.cpp b/src/roff/troff/input.cpp index 5335c1ce..cdb82fe2 100644 --- a/src/roff/troff/input.cpp +++ b/src/roff/troff/input.cpp @@ -1,6 +1,6 @@ // -*- C++ -*- /* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001, 2002, 2003, 2004, 2005, - 2006, 2007, 2008, 2009 + 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. 
Written by James Clark (jjc@jclark.com) @@ -6740,6 +6740,102 @@ void hyphenation_patterns_file_code() skip_line(); } +dictionary char_class_dictionary(501); + +void define_class() +{ + tok.skip(); + symbol nm = get_name(1); + if (nm.is_null()) { + skip_line(); + return; + } + charinfo *ci = get_charinfo(nm); + charinfo *child1 = 0, *child2 = 0; + while (!tok.newline() && !tok.eof()) { + tok.skip(); + if (child1 != 0 && tok.ch() == '-') { + tok.next(); + child2 = tok.get_char(1); + if (!child2) { + warning(WARN_MISSING, + "missing end of character range in class `%1'", + nm.contents()); + skip_line(); + return; + } + if (child1->is_class() || child2->is_class()) { + warning(WARN_SYNTAX, + "nested character class is not allowed in range definition"); + skip_line(); + return; + } + int u1 = child1->get_unicode_code(); + int u2 = child2->get_unicode_code(); + if (u1 < 0) { + warning(WARN_SYNTAX, + "invalid start value in character range"); + skip_line(); + return; + } + if (u2 < 0) { + warning(WARN_SYNTAX, + "invalid end value in character range"); + skip_line(); + return; + } + ci->add_to_class(u1, u2); + child1 = child2 = 0; + } + else if (child1 != 0) { + if (child1->is_class()) + ci->add_to_class(child1); + else { + int u1 = child1->get_unicode_code(); + if (u1 < 0) { + warning(WARN_SYNTAX, + "invalid character value in class `%1'", + nm.contents()); + skip_line(); + return; + } + ci->add_to_class(u1); + } + child1 = 0; + } + child1 = tok.get_char(1); + tok.next(); + if (!child1) { + if (!tok.newline()) + skip_line(); + break; + } + } + if (child1 != 0) { + if (child1->is_class()) + ci->add_to_class(child1); + else { + int u1 = child1->get_unicode_code(); + if (u1 < 0) { + warning(WARN_SYNTAX, + "invalid character value in class `%1'", + nm.contents()); + skip_line(); + return; + } + ci->add_to_class(u1); + } + child1 = 0; + } + if (!ci->is_class()) { + warning(WARN_SYNTAX, + "empty class definition for `%1'", + nm.contents()); + return; + } + 
(void)char_class_dictionary.lookup(nm, ci); +} + charinfo *token::get_char(int required) { if (type == TOKEN_CHAR) @@ -7817,6 +7913,7 @@ void init_input_requests() init_request("cflags", char_flags); init_request("char", define_character); init_request("chop", chop_macro); + init_request("class", define_class); init_request("close", close_request); init_request("color", activate_color); init_request("composite", composite_request); @@ -8367,6 +8464,13 @@ charinfo::charinfo(symbol s) number = -1; } +int charinfo::get_unicode_code() +{ + if (ascii_code != '\0') + return ascii_code; + return glyph_to_unicode(this); +} + void charinfo::set_hyphenation_code(unsigned char c) { hyphenation_code = c; @@ -8388,6 +8492,27 @@ void charinfo::set_translation(charinfo *ci, int tt, int ti) transparent_translate = tt; } +// Get the union of all flags affecting this charinfo. +unsigned char charinfo::get_flags() +{ + unsigned char all_flags = flags; + dictionary_iterator iter(char_class_dictionary); + charinfo *cp; + symbol s; + while (iter.get(&s, (void **)&cp)) { + assert(!s.is_null()); + if (cp->contains(get_unicode_code())) { +#if defined(DEBUGGING) + if (debug_state) + fprintf(stderr, "charinfo::get_flags %p %s %d\n", + (void *)cp, cp->nm.contents(), cp->flags); +#endif + all_flags |= cp->flags; + } + } + return all_flags; +} + void charinfo::set_special_translation(int c, int tt) { special_translation = c; @@ -8432,6 +8557,50 @@ int charinfo::get_number() return number; } +bool charinfo::contains(int c) +{ + std::vector<std::pair<int, int> >::const_iterator ranges_iter; + ranges_iter = ranges.begin(); + while (ranges_iter != ranges.end()) { + if (c >= ranges_iter->first && c <= ranges_iter->second) { +#if defined(DEBUGGING) + if (debug_state) + fprintf(stderr, "charinfo::contains(%d)\n", c); +#endif + return true; + } + ++ranges_iter; + } + + std::vector<charinfo *>::const_iterator nested_iter; + nested_iter = nested_classes.begin(); + while (nested_iter != nested_classes.end()) { + if 
((*nested_iter)->contains(c)) + return true; + ++nested_iter; + } + + return false; +} + +bool charinfo::contains(symbol s) +{ + const char *unicode = glyph_name_to_unicode(s.contents()); + if (unicode != NULL && strchr(unicode, '_') == NULL) { + char *ignore; + int c = (int)strtol(unicode, &ignore, 16); + return contains(c); + } + else + return false; +} + +bool charinfo::contains(charinfo *) +{ + // TODO + return false; +} + symbol UNNAMED_SYMBOL("---"); // For numbered characters not between 0 and 255, we make a symbol out -- cgit v1.2.1