From 81d80a6f71cc6d6fbd3d1111498318fb22a6857a Mon Sep 17 00:00:00 2001 From: wl Date: Sat, 18 Dec 2010 09:13:16 +0000 Subject: Improve CJK support with new values for `.cflags'. This patch introduces three new values to `.cflags': don't break before character: 128 don't break after character: 256 allow inter-character break: 512 They are handled differently if compared to other cflags values: (1) hcode values are completely ignored (2) similar to kern values, and contrary to the other cflags values, troff looks at pairs of characters to decide whether a break gets inserted A yet-to-be-written patch should add inter-character spacing if those flags are active; currently, only zero-width breakpoints are inserted. * src/roff/troff/charinfo.h (charinfo): Change type of `flags' to `unsigned int'. Update callers accordingly. New enum values `DONT_BREAK_BEFORE', `DONT_BREAK_AFTER', and `INTER_CHAR_SPACE'. New member functions `prohibit_break_before', `prohibit_break_after', and `inter_char_space'. * src/roff/troff/input.cpp: Updated. * src/roff/troff/node.cpp (inter_char_space_node): New class similar to kern_pair_node, collecting charinfo entities with the abovementioned cflags values. (break_char_type): Add new enum values. (glyph_node::merge_glyph_node): Handle abovementioned cflags values and emit an `inter_char_space_node' if necessary. * tmac/ja.tmac: Use new cflags values. * doc/groff.texinfo, NEWS, man/groff_diff.man: Document new values. --- ChangeLog | 40 +++++++ NEWS | 7 ++ doc/groff.texinfo | 21 ++++ man/groff_diff.man | 26 ++++- src/roff/troff/charinfo.h | 43 ++++++-- src/roff/troff/input.cpp | 4 +- src/roff/troff/node.cpp | 258 ++++++++++++++++++++++++++++++++++++++++++++-- tmac/ja.tmac | 6 +- 8 files changed, 382 insertions(+), 23 deletions(-) diff --git a/ChangeLog b/ChangeLog index f5963d93..acd42273 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,43 @@ +2010-12-18 Werner LEMBERG + + Improve CJK support with new values for `.cflags'. 
+ + This patch introduces three new values to `.cflags': + + don't break before character: 128 + don't break after character: 256 + allow inter-character break: 512 + + They are handled differently if compared to other cflags values: + + (1) hcode values are completely ignored + (2) similar to kern values, and contrary to the other cflags + values, troff looks at pairs of characters to decide whether a + break gets inserted + + A yet-to-be-written patch should add inter-character spacing if + those flags are active; currently, only zero-width breakpoints are + inserted. + + * src/roff/troff/charinfo.h (charinfo): Change type of `flags' to + `unsigned int'. Update callers accordingly. + New enum values `DONT_BREAK_BEFORE', `DONT_BREAK_AFTER', and + `INTER_CHAR_SPACE'. + New member functions `prohibit_break_before', + `prohibit_break_after', and `inter_char_space'. + * src/roff/troff/input.cpp: Updated. + + * src/roff/troff/node.cpp (inter_char_space_node): New class similar + to kern_pair_node, collecting charinfo entities with the + abovementioned cflags values. + (break_char_type): Add new enum values. + (glyph_node::merge_glyph_node): Handle abovementioned cflags values + and emit an `inter_char_space_node' if necessary. + + * tmac/ja.tmac: Use new cflags values. + + * doc/groff.texinfo, NEWS, man/groff_diff.man: Document new values. + 2010-12-18 Werner LEMBERG Remove compiler warning. diff --git a/NEWS b/NEWS index 00a41b47..ce6410cc 100644 --- a/NEWS +++ b/NEWS @@ -28,6 +28,13 @@ o The new `class' request assigns a short name to a set of characters which can be referred to in the `cflags' request. This is especially useful to control line-breaking and hyphenation rules in CJK languages. +o Three new values for the `cflags' request have been added, which are + needed for proper CJK support. 
+ + 128 prohibit break before but allow break after character + 256 prohibit break after but allow break before character + 512 allow break before and after character + Tbl --- diff --git a/doc/groff.texinfo b/doc/groff.texinfo index 42cf6166..dc81fb5a 100644 --- a/doc/groff.texinfo +++ b/doc/groff.texinfo @@ -9537,6 +9537,27 @@ into your document. Note, however, that this can lead to bad layout if done without thinking; in most situations, a better solution instead of changing the @code{cflags} value is to insert @code{\:} right after the hyphen at the places which really need a break point. + +@item 128 +Prohibit a line break before the character, but allow a line break after +the character. This works only in combination with flags 256 and 512 +(see below) and has no effect otherwise. + +@item 256 +Prohibit a line break after the character, but allow a line break before +the character. This works only in combination with flags 128 and 512 +(see below) and has no effect otherwise. + +@item 512 +Allow line break before or after the character. This works only in +combination with flags 128 and 256 and has no effect otherwise. + +Contrary to flag values 2 and@tie{}4, the flags 128, 256, and 512 work +pairwise. If, for example, the left character has value 512, and the +right character 128, no line break gets inserted. If we use +value@tie{}6 instead for the left character, a line break after the +character can't be suppressed since the right neighbour character +doesn't get examined. @end table @endDefreq diff --git a/man/groff_diff.man b/man/groff_diff.man index e0d355ea..408d7bf7 100644 --- a/man/groff_diff.man +++ b/man/groff_diff.man @@ -993,7 +993,6 @@ Lines can be broken after the character (initially characters have this property); a line is not broken at a character with this property unless the characters on each side both have non-zero hyphenation codes. - This can be overridden with value 64. . .IP 8 @@ -1021,8 +1020,33 @@ have this property). 
Ignore hyphenation code values of the surrounding characters. Use this in combination with values 2 and\~4 (initially no characters have this property). +. +.IP 128 +Prohibit a line break before the character, but allow a line break after the +character. +This works only in combination with flags 256 and 512 and has no effect +otherwise. +. +.IP 256 +Prohibit a line break after the character, but allow a line break before +the character. +This works only in combination with flags 128 and 512 and has no effect +otherwise. +. +.IP 512 +Allow line break before or after the character. +This works only in combination with flags 128 and 256 and has no effect +otherwise. .RE . +.IP +Contrary to flag values 2 and\~4, the flags 128, 256, and 512 work pairwise. +If, for example, the left character has value 512, and the right character +128, no line break gets inserted. +If we use value\~6 instead for the left character, a line break after the +character can't be suppressed since the right neighbour character doesn't +get examined. +. .TP .BI .char\ c\ string [This request can both define characters and glyphs.] diff --git a/src/roff/troff/charinfo.h b/src/roff/troff/charinfo.h index c71383ed..544f24f2 100644 --- a/src/roff/troff/charinfo.h +++ b/src/roff/troff/charinfo.h @@ -29,7 +29,7 @@ class charinfo : glyph { macro *mac; unsigned char special_translation; unsigned char hyphenation_code; - unsigned char flags; + unsigned int flags; unsigned char ascii_code; unsigned char asciify_code; char not_found; @@ -44,13 +44,16 @@ class charinfo : glyph { public: enum { // Values for the flags bitmask. See groff // manual, description of the `.cflags' request. 
- ENDS_SENTENCE = 1, - BREAK_BEFORE = 2, - BREAK_AFTER = 4, - OVERLAPS_HORIZONTALLY = 8, - OVERLAPS_VERTICALLY = 16, - TRANSPARENT = 32, - IGNORE_HCODES = 64 + ENDS_SENTENCE = 0x01, + BREAK_BEFORE = 0x02, + BREAK_AFTER = 0x04, + OVERLAPS_HORIZONTALLY = 0x08, + OVERLAPS_VERTICALLY = 0x10, + TRANSPARENT = 0x20, + IGNORE_HCODES = 0x40, + DONT_BREAK_BEFORE = 0x80, + DONT_BREAK_AFTER = 0x100, + INTER_CHAR_SPACE = 0x200 }; enum { TRANSLATE_NONE, @@ -69,6 +72,9 @@ public: int can_break_after(); int transparent(); int ignore_hcodes(); + int prohibit_break_before(); + int prohibit_break_after(); + int inter_char_space(); unsigned char get_hyphenation_code(); unsigned char get_ascii_code(); unsigned char get_asciify_code(); @@ -80,8 +86,8 @@ public: int get_translation_input(); charinfo *get_translation(int = 0); void set_translation(charinfo *, int, int); - unsigned char get_flags(); - void set_flags(unsigned char); + unsigned int get_flags(); + void set_flags(unsigned int); void set_special_translation(int, int); int get_special_translation(int = 0); macro *set_macro(macro *); @@ -143,6 +149,21 @@ inline int charinfo::ignore_hcodes() return get_flags() & IGNORE_HCODES; } +inline int charinfo::prohibit_break_before() +{ + return get_flags() & DONT_BREAK_BEFORE; +} + +inline int charinfo::prohibit_break_after() +{ + return get_flags() & DONT_BREAK_AFTER; +} + +inline int charinfo::inter_char_space() +{ + return get_flags() & INTER_CHAR_SPACE; +} + inline int charinfo::numbered() { return number >= 0; @@ -185,7 +206,7 @@ inline unsigned char charinfo::get_asciify_code() return (translate_input ? 
asciify_code : 0); } -inline void charinfo::set_flags(unsigned char c) +inline void charinfo::set_flags(unsigned int c) { flags = c; } diff --git a/src/roff/troff/input.cpp b/src/roff/troff/input.cpp index 8d5c1b33..4d579f80 100644 --- a/src/roff/troff/input.cpp +++ b/src/roff/troff/input.cpp @@ -8496,9 +8496,9 @@ void charinfo::set_translation(charinfo *ci, int tt, int ti) } // Get the union of all flags affecting this charinfo. -unsigned char charinfo::get_flags() +unsigned int charinfo::get_flags() { - unsigned char all_flags = flags; + unsigned int all_flags = flags; dictionary_iterator iter(char_class_dictionary); charinfo *cp; symbol s; diff --git a/src/roff/troff/node.cpp b/src/roff/troff/node.cpp index 9a964a0d..ec082ff2 100644 --- a/src/roff/troff/node.cpp +++ b/src/roff/troff/node.cpp @@ -2111,6 +2111,47 @@ node *node::merge_glyph_node(glyph_node *) return 0; } +class inter_char_space_node : public node { + hunits amount; + char left_break_code; + char right_break_code; + color *col; + node *n1; + node *n2; +public: + inter_char_space_node(hunits, char, char, color *, node *, node *, + statem *, int, node * = 0); + ~inter_char_space_node(); + node *copy(); + node *merge_glyph_node(glyph_node *); + node *add_self(node *, hyphen_list **); + hyphen_list *get_hyphen_list(hyphen_list *, int *); + node *add_discretionary_hyphen(); + hunits width(); + node *last_char_node(); + hunits italic_correction(); + hunits subscript_correction(); + void tprint(troff_output_file *); + hyphenation_type get_hyphenation_type(); + int ends_sentence(); + void ascii_print(ascii_output_file *); + void asciify(macro *); + int same(node *); + const char *type(); + int force_tprint(); + int is_tag(); + void vertical_extent(vunits *, vunits *); +}; + +enum break_char_type { + CAN_BREAK_BEFORE = 0x01, + CAN_BREAK_AFTER = 0x02, + IGNORE_HCODES = 0x04, + PROHIBIT_BREAK_BEFORE = 0x08, + PROHIBIT_BREAK_AFTER = 0x10, + INTER_CHAR_SPACE = 0x20 +}; + node 
*glyph_node::merge_glyph_node(glyph_node *gn) { if (tf == gn->tf && gcol == gn->gcol && fcol == gn->fcol) { @@ -2129,6 +2170,28 @@ node *glyph_node::merge_glyph_node(glyph_node *gn) gn->div_nest_level, next1); } } + int left_bc = 0, right_bc = 0; + if (ci->prohibit_break_before()) + left_bc = PROHIBIT_BREAK_BEFORE; + if (gn->ci->prohibit_break_before()) + right_bc = PROHIBIT_BREAK_BEFORE; + if (ci->prohibit_break_after()) + left_bc |= PROHIBIT_BREAK_AFTER; + if (gn->ci->prohibit_break_after()) + right_bc |= PROHIBIT_BREAK_AFTER; + if (ci->inter_char_space()) + left_bc |= INTER_CHAR_SPACE; + if (gn->ci->inter_char_space()) + right_bc |= INTER_CHAR_SPACE; + if (left_bc && right_bc) { + node *next1 = next; + next = 0; + // ic_space not supported yet + int ic_space = 0; + return new inter_char_space_node(ic_space, left_bc, right_bc, + gcol, this, gn, state, + gn->div_nest_level, next1); + } return 0; } @@ -2757,12 +2820,6 @@ int italic_corrected_node::character_type() return n->character_type(); } -enum break_char_type { - CAN_BREAK_BEFORE = 0x01, - CAN_BREAK_AFTER = 0x02, - IGNORE_HCODES = 0x04 -}; - class break_char_node : public node { node *ch; char break_code; @@ -5718,6 +5775,195 @@ int dbreak_node::is_tag() return 0; } +inter_char_space_node::inter_char_space_node(hunits n, + char left, char right, + color *c, node *first, node *second, + statem* s, int pop, node *x) +: node(x, s, pop), amount(n), left_break_code(left), right_break_code(right), + col(c), n1(first), n2(second) +{ +} + +inter_char_space_node::~inter_char_space_node() +{ + if (n1 != 0) + delete n1; + if (n2 != 0) + delete n2; +} + +node *inter_char_space_node::merge_glyph_node(glyph_node *gn) +{ + node *nd = n2->merge_glyph_node(gn); + if (nd == 0) + return 0; + n2 = nd; + nd = n2->merge_self(n1); + if (nd) { + nd->next = next; + n1 = 0; + n2 = 0; + delete this; + return nd; + } + return this; +} + +hunits inter_char_space_node::italic_correction() +{ + return n2->italic_correction(); +} + +hunits 
inter_char_space_node::subscript_correction() +{ + return n2->subscript_correction(); +} + +void inter_char_space_node::vertical_extent(vunits *min, vunits *max) +{ + n1->vertical_extent(min, max); + vunits min2, max2; + n2->vertical_extent(&min2, &max2); + if (min2 < *min) + *min = min2; + if (max2 > *max) + *max = max2; +} + +node *inter_char_space_node::add_discretionary_hyphen() +{ + tfont *tf = n1->get_tfont(); + if (tf) { + if (tf->contains(soft_hyphen_char)) { + color *gcol = n2->get_glyph_color(); + color *fcol = n2->get_fill_color(); + node *next1 = next; + next = 0; + node *n = copy(); + glyph_node *gn = new glyph_node(soft_hyphen_char, tf, gcol, fcol, + state, div_nest_level); + node *nn = n->merge_glyph_node(gn); + if (nn == 0) { + gn->next = n; + nn = gn; + } + return new dbreak_node(this, nn, state, div_nest_level, next1); + } + } + return this; +} + +node *inter_char_space_node::copy() +{ + return new inter_char_space_node(amount, left_break_code, right_break_code, + col, n1->copy(), n2->copy(), + state, div_nest_level); +} + +hyphen_list *inter_char_space_node::get_hyphen_list(hyphen_list *tail, + int *count) +{ + hyphen_list *hl = n2->get_hyphen_list(tail, count); + return n1->get_hyphen_list(hl, count); +} + +node *inter_char_space_node::add_self(node *n, hyphen_list **p) +{ + n = n1->add_self(n, p); + if (left_break_code & INTER_CHAR_SPACE + || left_break_code & PROHIBIT_BREAK_AFTER) { + if (right_break_code & PROHIBIT_BREAK_BEFORE) + // stretchable zero-width space not implemented yet + ; + else { + // breakable, stretchable zero-width space not implemented yet + n = new space_node(H0, col, n); + n->freeze_space(); + } + } + n = n2->add_self(n, p); + n1 = n2 = 0; + delete this; + return n; +} + +hunits inter_char_space_node::width() +{ + return n1->width() + n2->width(); +} + +node *inter_char_space_node::last_char_node() +{ + node *nd = n2->last_char_node(); + if (nd) + return nd; + return n1->last_char_node(); +} + +int 
inter_char_space_node::ends_sentence() +{ + switch (n2->ends_sentence()) { + case 0: + return 0; + case 1: + return 1; + case 2: + break; + default: + assert(0); + } + return n1->ends_sentence(); +} + +void inter_char_space_node::ascii_print(ascii_output_file *ascii) +{ + n1->ascii_print(ascii); + n2->ascii_print(ascii); +} + +void inter_char_space_node::asciify(macro *m) +{ + n1->asciify(m); + n2->asciify(m); + n1 = n2 = 0; + delete this; +} + +hyphenation_type inter_char_space_node::get_hyphenation_type() +{ + return HYPHEN_MIDDLE; +} + +void inter_char_space_node::tprint(troff_output_file *out) +{ + n1->tprint(out); + n2->tprint(out); +} + +int inter_char_space_node::same(node *nd) +{ + return (amount == ((inter_char_space_node *)nd)->amount + && left_break_code == ((inter_char_space_node *)nd)->left_break_code + && right_break_code == ((inter_char_space_node *)nd)->right_break_code + && same_node(n1, ((inter_char_space_node *)nd)->n1) + && same_node(n2, ((inter_char_space_node *)nd)->n2)); +} + +const char *inter_char_space_node::type() +{ + return "inter_char_space_node"; +} + +int inter_char_space_node::force_tprint() +{ + return 0; +} + +int inter_char_space_node::is_tag() +{ + return 0; +} + int break_char_node::same(node *nd) { return break_code == ((break_char_node *)nd)->break_code diff --git a/tmac/ja.tmac b/tmac/ja.tmac index d6883f8f..f0ecd75b 100644 --- a/tmac/ja.tmac +++ b/tmac/ja.tmac @@ -44,6 +44,6 @@ .class [CJKnormal] \ \[u3041]-\[u3096] \[u30A0]-\[u30FF] \[u4E00]-\[u9FFF] . -.cflags 2 \C'[CJKprepunct]' -.cflags 4 \C'[CJKpostpunct]' -.cflags 66 \C'[CJKnormal]' +.cflags 128 \C'[CJKprepunct]' +.cflags 266 \C'[CJKpostpunct]' +.cflags 512 \C'[CJKnormal]' -- cgit v1.2.1