4 files changed, 270 insertions, 84 deletions
diff --git a/src/include/font.h b/src/include/font.h
index 944250b9..75d2ef16 100644
--- a/src/include/font.h
+++ b/src/include/font.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-/* Copyright (C) 1989, 1990, 1991, 1992, 2002, 2004, 2006, 2009
+/* Copyright (C) 1989, 1990, 1991, 1992, 2002, 2004, 2006, 2009, 2010
    Free Software Foundation, Inc.
      Written by James Clark (jjc@jclark.com)
 
@@ -73,6 +73,9 @@ inline int glyph_to_number(glyph *);	// Convert the given glyph back to
 			// a numbered character.
 inline int glyph_to_index(glyph *);	// Return the unique index that is
 			// associated with the given glyph. It is >= 0.
+extern int glyph_to_unicode(glyph *);	// Convert the given glyph to its
+			// Unicode code point.  Return -1 if it is unnamed
+			// or does not designate a Unicode character.
 
 inline int glyph_to_number(glyph *g)
 {
diff --git a/src/libs/libgroff/font.cpp b/src/libs/libgroff/font.cpp
index d0b4a12e..8dff71e7 100644
--- a/src/libs/libgroff/font.cpp
+++ b/src/libs/libgroff/font.cpp
@@ -1,6 +1,6 @@
 // -*- C++ -*-
 /* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001, 2002, 2003, 2004, 2005,
-                 2006, 2008, 2009
+                 2006, 2008, 2009, 2010
    Free Software Foundation, Inc.
      Written by James Clark (jjc@jclark.com)
 
@@ -147,6 +147,47 @@ void text_file::error(const char *format,
     error_with_file_and_line(path, lineno, format, arg1, arg2, arg3);
 }
 
+// Map glyph `g' to a Unicode code point.  Return -1 if `g' has no name
+// or its name does not designate a Unicode character.
+int glyph_to_unicode(glyph *g)
+{
+  const char *nm = glyph_to_name(g);
+  if (nm != NULL) {
+    // ASCII character, i.e. `charN' with decimal N < 128?
+    if (nm[0] == 'c' && nm[1] == 'h' && nm[2] == 'a' && nm[3] == 'r'
+	&& (nm[4] >= '0' && nm[4] <= '9')) {
+      int n = (nm[4] - '0');
+      if (nm[5] == '\0')
+	return n;
+      if (n > 0 && (nm[5] >= '0' && nm[5] <= '9')) {
+	n = 10*n + (nm[5] - '0');
+	if (nm[6] == '\0')
+	  return n;
+	if (nm[6] >= '0' && nm[6] <= '9') {
+	  n = 10*n + (nm[6] - '0');
+	  if (nm[7] == '\0' && n < 128)
+	    return n;
+	}
+      }
+    }
+    // Unicode character?  Its hexadecimal code follows the first letter.
+    if (check_unicode_name(nm))
+      return (int)strtol(nm + 1, NULL, 16);
+    // If `nm' is a single letter `x', the glyph name is `\x'.  Testing
+    // nm[0] first keeps an empty name from being read past its end.
+    char buf[] = { '\\', '\0', '\0' };
+    if (nm[0] != '\0' && nm[1] == '\0') {
+      buf[1] = nm[0];
+      nm = buf;
+    }
+    // groff glyphs that map to Unicode?  Mappings that contain `_' are
+    // rejected here.
+    const char *unicode = glyph_name_to_unicode(nm);
+    if (unicode != NULL && strchr(unicode, '_') == NULL)
+      return (int)strtol(unicode, NULL, 16);
+  }
+  return -1;
+}
 
 /* font functions */
 
@@ -269,42 +310,11 @@ int font::contains(glyph *g)
     return 1;
   if (is_unicode) {
     // Unicode font
-    const char *nm = glyph_to_name(g);
-    if (nm != NULL) {
-      // ASCII character?
-      if (nm[0] == 'c' && nm[1] == 'h' && nm[2] == 'a' && nm[3] == 'r'
-          && (nm[4] >= '0' && nm[4] <= '9')) {
-	int n = (nm[4] - '0');
-	if (nm[5] == '\0')
-	  return 1;
-	if (n > 0 && (nm[5] >= '0' && nm[5] <= '9')) {
-	  n = 10*n + (nm[5] - '0');
-	  if (nm[6] == '\0')
-	    return 1;
-	  if (nm[6] >= '0' && nm[6] <= '9') {
-	    n = 10*n + (nm[6] - '0');
-	    if (nm[7] == '\0' && n < 128)
-	      return 1;
-	  }
-	}
-      }
-      // Unicode character?
-      if (check_unicode_name(nm))
-	return 1;
-      // If `nm' is a single letter `x', the glyph name is `\x'.
-      char buf[] = { '\\', '\0', '\0' };
-      if (nm[1] == '\0') {
-	buf[1] = nm[0];
-        nm = buf;
-      }
-      // groff glyph name that maps to Unicode?
-      const char *unicode = glyph_name_to_unicode(nm);
-      if (unicode != NULL && strchr(unicode, '_') == NULL)
-	return 1;
-    }
+    // ASCII or Unicode character, or groff glyph name that maps to Unicode?
+    if (glyph_to_unicode(g) >= 0)
+      return 1;
     // Numbered character?
-    int n = glyph_to_number(g);
-    if (n >= 0)
+    if (glyph_to_number(g) >= 0)
       return 1;
   }
   return 0;
@@ -554,43 +564,10 @@ int font::get_code(glyph *g)
   }
   if (is_unicode) {
     // Unicode font
-    const char *nm = glyph_to_name(g);
-    if (nm != NULL) {
-      // ASCII character?
-      if (nm[0] == 'c' && nm[1] == 'h' && nm[2] == 'a' && nm[3] == 'r'
-          && (nm[4] >= '0' && nm[4] <= '9')) {
-	int n = (nm[4] - '0');
-	if (nm[5] == '\0')
-	  return n;
-	if (n > 0 && (nm[5] >= '0' && nm[5] <= '9')) {
-	  n = 10*n + (nm[5] - '0');
-	  if (nm[6] == '\0')
-	    return n;
-	  if (nm[6] >= '0' && nm[6] <= '9') {
-	    n = 10*n + (nm[6] - '0');
-	    if (nm[7] == '\0' && n < 128)
-	      return n;
-	  }
-	}
-      }
-      // Unicode character?
-      if (check_unicode_name(nm)) {
-	char *ignore;
-	return (int)strtol(nm + 1, &ignore, 16);
-      }
-      // If `nm' is a single letter `x', the glyph name is `\x'.
-      char buf[] = { '\\', '\0', '\0' };
-      if (nm[1] == '\0') {
-	buf[1] = nm[0];
-        nm = buf;
-      }
-      // groff glyphs that map to Unicode?
-      const char *unicode = glyph_name_to_unicode(nm);
-      if (unicode != NULL && strchr(unicode, '_') == NULL) {
-	char *ignore;
-	return (int)strtol(unicode, &ignore, 16);
-      }
-    }
+    // ASCII or Unicode character, or groff glyph name that maps to Unicode?
+    int uni = glyph_to_unicode(g);
+    if (uni >= 0)
+      return uni;
     // Numbered character?
     int n = glyph_to_number(g);
     if (n >= 0)
diff --git a/src/roff/troff/charinfo.h b/src/roff/troff/charinfo.h
index 2c2c2685..c71383ed 100644
--- a/src/roff/troff/charinfo.h
+++ b/src/roff/troff/charinfo.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2002, 2006, 2009
+/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2002, 2006, 2009, 2010
    Free Software Foundation, Inc.
      Written by James Clark (jjc@jclark.com)
 
@@ -18,6 +18,9 @@ for more details.
 You should have received a copy of the GNU General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>. */
 
+#include <vector>
+#include <utility>
+
 class macro;
 
 class charinfo : glyph {
@@ -35,6 +38,9 @@ class charinfo : glyph {
   char translate_input;		// non-zero means that asciify_code is
 				// active for .asciify (set by .trin)
   char_mode mode;
+  // Unicode character classes
+  std::vector<std::pair<int, int> > ranges;
+  std::vector<charinfo *> nested_classes;
 public:
   enum {		// Values for the flags bitmask.  See groff
 			// manual, description of the `.cflags' request.
@@ -66,6 +72,7 @@ public:
   unsigned char get_hyphenation_code();
   unsigned char get_ascii_code();
   unsigned char get_asciify_code();
+  int get_unicode_code();
   void set_hyphenation_code(unsigned char);
   void set_ascii_code(unsigned char);
   void set_asciify_code(unsigned char);
@@ -73,6 +80,7 @@ public:
   int get_translation_input();
   charinfo *get_translation(int = 0);
   void set_translation(charinfo *, int, int);
+  unsigned char get_flags();
   void set_flags(unsigned char);
   void set_special_translation(int, int);
   int get_special_translation(int = 0);
@@ -87,6 +95,13 @@ public:
   int is_fallback();
   int is_special();
   symbol *get_symbol();
+  void add_to_class(int);
+  void add_to_class(int, int);
+  void add_to_class(charinfo *);
+  bool is_class();
+  bool contains(int);
+  bool contains(symbol);
+  bool contains(charinfo *);
 };
 
 charinfo *get_charinfo(symbol);
@@ -95,37 +110,37 @@ charinfo *get_charinfo_by_number(int);
 
 inline int charinfo::overlaps_horizontally()
 {
-  return flags & OVERLAPS_HORIZONTALLY;
+  return get_flags() & OVERLAPS_HORIZONTALLY;	// own or class-supplied flag
 }
 
 inline int charinfo::overlaps_vertically()
 {
-  return flags & OVERLAPS_VERTICALLY;
+  return get_flags() & OVERLAPS_VERTICALLY;	// own or class-supplied flag
 }
 
 inline int charinfo::can_break_before()
 {
-  return flags & BREAK_BEFORE;
+  return get_flags() & BREAK_BEFORE;	// own or class-supplied flag
 }
 
 inline int charinfo::can_break_after()
 {
-  return flags & BREAK_AFTER;
+  return get_flags() & BREAK_AFTER;	// own or class-supplied flag
 }
 
 inline int charinfo::ends_sentence()
 {
-  return flags & ENDS_SENTENCE;
+  return get_flags() & ENDS_SENTENCE;	// own or class-supplied flag
 }
 
 inline int charinfo::transparent()
 {
-  return flags & TRANSPARENT;
+  return get_flags() & TRANSPARENT;	// own or class-supplied flag
 }
 
 inline int charinfo::ignore_hcodes()
 {
-  return flags & IGNORE_HCODES;
+  return get_flags() & IGNORE_HCODES;	// own or class-supplied flag
 }
 
 inline int charinfo::numbered()
@@ -214,5 +229,27 @@ inline int charinfo::first_time_not_found()
 
 inline symbol *charinfo::get_symbol()
 {
-  return( &nm );
+  return &nm;	// address of this object's own `nm' member
+}
+
+inline void charinfo::add_to_class(int c)
+{
+  // Stored as the range [c, c].  TODO: cumbersome for single characters?
+  add_to_class(c, c);
+}
+
+inline void charinfo::add_to_class(int lo, int hi)
+{
+  const std::pair<int, int> range(lo, hi);	// inclusive on both ends
+  ranges.push_back(range);
+}
+
+inline void charinfo::add_to_class(charinfo *ci)
+{
+  nested_classes.push_back(ci);	// keeps the pointer; `ci' is not copied
+}
+
+inline bool charinfo::is_class()
+{
+  return !(ranges.empty() && nested_classes.empty());	// has any member?
 }
diff --git a/src/roff/troff/input.cpp b/src/roff/troff/input.cpp
index 5335c1ce..cdb82fe2 100644
--- a/src/roff/troff/input.cpp
+++ b/src/roff/troff/input.cpp
@@ -1,6 +1,6 @@
 // -*- C++ -*-
 /* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001, 2002, 2003, 2004, 2005,
-                 2006, 2007, 2008, 2009
+                 2006, 2007, 2008, 2009, 2010
    Free Software Foundation, Inc.
      Written by James Clark (jjc@jclark.com)
 
@@ -6740,6 +6740,102 @@ void hyphenation_patterns_file_code()
   skip_line();
 }
 
+dictionary char_class_dictionary(501);	// class name -> its charinfo
+
+// Add `member' to class `ci': nested if it is a class itself, else as a
+// single character.  Return 0 after a warning if it has no Unicode code.
+static int add_class_member(charinfo *ci, charinfo *member, symbol nm)
+{
+  if (member->is_class()) {
+    ci->add_to_class(member);
+    return 1;
+  }
+  int u = member->get_unicode_code();
+  if (u < 0) {
+    warning(WARN_SYNTAX,
+	    "invalid character value in class `%1'",
+	    nm.contents());
+    return 0;
+  }
+  ci->add_to_class(u);
+  return 1;
+}
+
+// Handler for the `class' request (see init_input_requests).  Read a
+// class name and its members: single characters, ranges `c1-c2', and
+// classes that already have members.  A bad range or a member without
+// a Unicode code ends the request before char_class_dictionary is used.
+void define_class()
+{
+  tok.skip();
+  symbol nm = get_name(1);
+  if (nm.is_null()) {
+    skip_line();
+    return;
+  }
+  charinfo *ci = get_charinfo(nm);
+  // `child1' is held back one step: a following `-' makes it a range start.
+  charinfo *child1 = 0, *child2 = 0;
+  while (!tok.newline() && !tok.eof()) {
+    tok.skip();
+    if (child1 != 0 && tok.ch() == '-') {
+      tok.next();
+      child2 = tok.get_char(1);
+      if (!child2) {
+	warning(WARN_MISSING,
+		"missing end of character range in class `%1'",
+		nm.contents());
+	skip_line();
+	return;
+      }
+      if (child1->is_class() || child2->is_class()) {
+	warning(WARN_SYNTAX,
+		"nested character class is not allowed in range definition");
+	skip_line();
+	return;
+      }
+      int u1 = child1->get_unicode_code();
+      int u2 = child2->get_unicode_code();
+      if (u1 < 0) {
+	warning(WARN_SYNTAX,
+		"invalid start value in character range");
+	skip_line();
+	return;
+      }
+      if (u2 < 0) {
+	warning(WARN_SYNTAX,
+		"invalid end value in character range");
+	skip_line();
+	return;
+      }
+      ci->add_to_class(u1, u2);
+    }
+    else if (child1 != 0 && !add_class_member(ci, child1, nm)) {
+      skip_line();
+      return;
+    }
+    child1 = tok.get_char(1);
+    tok.next();
+    if (!child1) {
+      if (!tok.newline())
+	skip_line();
+      break;
+    }
+  }
+  // Add the member still pending when the line ended.
+  if (child1 != 0 && !add_class_member(ci, child1, nm)) {
+    skip_line();
+    return;
+  }
+  if (!ci->is_class()) {
+    warning(WARN_SYNTAX,
+	    "empty class definition for `%1'",
+	    nm.contents());
+    return;
+  }
+  (void)char_class_dictionary.lookup(nm, ci);
+}
+
 charinfo *token::get_char(int required)
 {
   if (type == TOKEN_CHAR)
@@ -7817,6 +7913,7 @@ void init_input_requests()
   init_request("cflags", char_flags);
   init_request("char", define_character);
   init_request("chop", chop_macro);
+  init_request("class", define_class);
   init_request("close", close_request);
   init_request("color", activate_color);
   init_request("composite", composite_request);
@@ -8367,6 +8464,13 @@ charinfo::charinfo(symbol s)
   number = -1;
 }
 
+// Return the Unicode code of this character: its ASCII code if one is
+// set, else whatever glyph_to_unicode() derives from the glyph.
+int charinfo::get_unicode_code()
+{
+  return (ascii_code != '\0') ? ascii_code : glyph_to_unicode(this);
+}
+
 void charinfo::set_hyphenation_code(unsigned char c)
 {
   hyphenation_code = c;
@@ -8388,6 +8492,27 @@ void charinfo::set_translation(charinfo *ci, int tt, int ti)
   transparent_translate = tt;
 }
 
+// Get the union of all flags affecting this charinfo: its own flags and
+// those of each class in char_class_dictionary that contains its code.
+unsigned char charinfo::get_flags()
+{
+  unsigned char result = flags;
+  dictionary_iterator classes(char_class_dictionary);
+  symbol name;
+  for (charinfo *cls; classes.get(&name, (void **)&cls); ) {
+    assert(!name.is_null());
+    if (!cls->contains(get_unicode_code()))
+      continue;
+#if defined(DEBUGGING)
+    if (debug_state)
+      fprintf(stderr, "charinfo::get_flags %p %s %d\n",
+		      (void *)cls, cls->nm.contents(), cls->flags);
+#endif
+    result |= cls->flags;
+  }
+  return result;
+}
+
 void charinfo::set_special_translation(int c, int tt)
 {
   special_translation = c;
@@ -8432,6 +8557,50 @@ int charinfo::get_number()
   return number;
 }
 
+// Return true if Unicode code `c' lies in one of this class's ranges or
+// in a class nested in it (searched recursively).
+bool charinfo::contains(int c)
+{
+  std::vector<std::pair<int, int> >::const_iterator r;
+  for (r = ranges.begin(); r != ranges.end(); ++r) {
+    if (c >= r->first && c <= r->second) {
+#if defined(DEBUGGING)
+      if (debug_state)
+	fprintf(stderr, "charinfo::contains(%d)\n", c);
+#endif
+      return true;
+    }
+  }
+
+  std::vector<charinfo *>::const_iterator nc;
+  for (nc = nested_classes.begin(); nc != nested_classes.end(); ++nc) {
+    // A class listed as its own member adds nothing new, and following
+    // it would recurse without end.  Longer cycles are not detected.
+    if (*nc != this && (*nc)->contains(c))
+      return true;
+  }
+
+  return false;
+}
+
+// Return true if the glyph named `s' maps (via glyph_name_to_unicode)
+// to a code contained in this class.
+bool charinfo::contains(symbol s)
+{
+  const char *mapped = glyph_name_to_unicode(s.contents());
+  // No mapping, or a mapping containing `_': not contained.
+  if (mapped == NULL || strchr(mapped, '_') != NULL)
+    return false;
+  const int code = (int)strtol(mapped, NULL, 16);	// hexadecimal
+  return contains(code);
+}
+
+bool charinfo::contains(charinfo *)
+{
+  // TODO: not implemented; ignores its argument and always returns false.
+  return false;
+}
+
 symbol UNNAMED_SYMBOL("---");
 
 // For numbered characters not between 0 and 255, we make a symbol out