Add support for N and C lines.

At the moment, there's a horrible hack with a hardcoded table of character names and unicode code points. Hopefully eventually this can be replaced by parsing a file or calling a program... but I don't know how yet :-(
author: Rupert Swarbrick <rswarbrick@gmail.com> 2010-12-16 00:50:07 +0000
committer: Shaun McCance <shaunm@gnome.org> 2010-12-20 10:45:23 -0500
commit: 6a9eefa8e1d210b396aa0ae1ec375d7c4642e36f (patch)
tree: 37da7ba03de144896b70d744153783fd4136a7b8
parent: a1ebbd0651842f8f4f3cb672d5ef22322770fa9b (diff)
download: yelp-6a9eefa8e1d210b396aa0ae1ec375d7c4642e36f.tar.gz
1 files changed, 223 insertions, 9 deletions
diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c
index a02cc146..645d9fdf 100644
--- a/libyelp/yelp-man-parser.c
+++ b/libyelp/yelp-man-parser.c
@@ -104,6 +104,15 @@ struct _YelpManParser {
     /* Set to TRUE if there's been a newline since the last text was
      * parsed. */
     gboolean newline;
+
+    /* Count the number of 'N' lines we've seen since the last h
+     * command. This is because for some reason N doesn't
+     * automatically move the position forward. Thus immediately after
+     * one, you see a h24 or the like. Unless there's a space. Then it
+     * might be wh48. This is set in parse_N (obviously) and used in
+     * parse_h.
+     */
+    guint N_count;
 };
 
 static gboolean parser_parse_line (YelpManParser *parser, GError **error);
@@ -124,6 +133,8 @@ DECLARE_LINE_PARSER (parse_text);
 DECLARE_LINE_PARSER (parse_w);
 DECLARE_LINE_PARSER (parse_body_text);
 DECLARE_LINE_PARSER (parse_n);
+DECLARE_LINE_PARSER (parse_N);
+DECLARE_LINE_PARSER (parse_C);
 
 /* Declare a sort of alist registry of parsers for different lines. */
 struct LineParsePair
@@ -138,6 +149,8 @@ static struct LineParsePair line_parsers[] = {
     { "t", parse_text },
     { "w", parse_w },
     { "n", parse_n },
+    { "N", parse_N },
+    { "C", parse_C },
     { NULL, NULL }
 };
 
@@ -146,6 +159,134 @@ static struct LineParsePair line_parsers[] = {
  * bits) */
 static void finish_span (YelpManParser *parser);
 static guint dx_to_em_count (YelpManParser *parser, guint dx);
+static void append_nbsps (YelpManParser *parser, guint k);
+
+/******************************************************************************/
+/* Translations for the 'C' command. This is indeed hackish, but the
+ * -Tutf8 output doesn't seem to give include files so we can do this
+ * at runtime :-(
+ *
+ * On my machine, this data's at /usr/share/groff/current/tmac/ in
+ * latin1.tmac, unicode.tmac and I worked out the lq and rq from
+ * running man: I'm not sure where that comes from!
+ */
+struct StringPair
+{
+    const gchar *from;
+    gunichar to;
+};
+static const struct StringPair char_translations[] = {
+    { "r!", 161 },
+    { "ct", 162 },
+    { "Po", 163 },
+    { "Cs", 164 },
+    { "Ye", 165 },
+    { "bb", 166 },
+    { "sc", 167 },
+    { "ad", 168 },
+    { "co", 169 },
+    { "Of", 170 },
+    { "Fo", 171 },
+    { "tno", 172 },
+    { "%", 173 },
+    { "rg", 174 },
+    { "a-", 175 },
+    { "de", 176 },
+    { "t+-", 177 },
+    { "S2", 178 },
+    { "S3", 179 },
+    { "aa", 180 },
+    { "mc", 181 },
+    { "ps", 182 },
+    { "pc", 183 },
+    { "ac", 184 },
+    { "S1", 185 },
+    { "Om", 186 },
+    { "Fc", 187 },
+    { "14", 188 },
+    { "12", 189 },
+    { "34", 190 },
+    { "r?", 191 },
+    { "`A", 192 },
+    { "'A", 193 },
+    { "^A", 194 },
+    { "~A", 195 },
+    { ":A", 196 },
+    { "oA", 197 },
+    { "AE", 198 },
+    { ",C", 199 },
+    { "`E", 200 },
+    { "'E", 201 },
+    { "^E", 202 },
+    { ":E", 203 },
+    { "`I", 204 },
+    { "'I", 205 },
+    { "^I", 206 },
+    { ":I", 207 },
+    { "-D", 208 },
+    { "~N", 209 },
+    { "`O", 210 },
+    { "'O", 211 },
+    { "^O", 212 },
+    { "~O", 213 },
+    { ":O", 214 },
+    { "tmu", 215 },
+    { "/O", 216 },
+    { "`U", 217 },
+    { "'U", 218 },
+    { "^U", 219 },
+    { ":U", 220 },
+    { "'Y", 221 },
+    { "TP", 222 },
+    { "ss", 223 },
+    { "`a", 224 },
+    { "'a", 225 },
+    { "^a", 226 },
+    { "~a", 227 },
+    { ":a", 228 },
+    { "oa", 229 },
+    { "ae", 230 },
+    { ",c", 231 },
+    { "`e", 232 },
+    { "'e", 233 },
+    { "^e", 234 },
+    { ":e", 235 },
+    { "`i", 236 },
+    { "'i", 237 },
+    { "^i", 238 },
+    { ":i", 239 },
+    { "Sd", 240 },
+    { "~n", 241 },
+    { "`o", 242 },
+    { "'o", 243 },
+    { "^o", 244 },
+    { "~o", 245 },
+    { ":o", 246 },
+    { "tdi", 247 },
+    { "/o", 248 },
+    { "`u", 249 },
+    { "'u", 250 },
+    { "^u", 251 },
+    { ":u", 252 },
+    { "'y", 253 },
+    { "Tp", 254 },
+    { ":y", 255 },
+    { "hy", '-' },
+    { "oq", '`' },
+    { "cq", '\'' },
+    { "lq", 8220 }, // left smart quotes
+    { "rq", 8221 }, // right smart quotes
+    { "en", 8211 }, // en-dash
+    { "em", 8212 }, // em-dash
+    { "la", 10216 }, // left angle bracket
+    { "ra", 10217 }, // left angle bracket
+    { "rs", '\\' },
+    { "<=", 8804 }, // < or equal to sign
+    { ">=", 8805 }, // > or equal to sign
+    { "aq", '\'' },
+    { "tm", 8482 }, // trademark symbol
+    { NULL, 0 }
+};
 
 /******************************************************************************/
 
@@ -170,9 +311,9 @@ get_troff (gchar *path, GError **error)
 {
     gint stdout;
     GError *err = NULL;
-    gchar *argv[] = { "man", "-Z", "-Tutf8", NULL, NULL };
+    gchar *argv[] = { "man", "-Z", "-Tutf8", "-EUTF-8", NULL, NULL };
 
-    argv[3] = path;
+    argv[4] = path;
 
     if (!g_spawn_async_with_pipes (NULL, argv, NULL,
                                    G_SPAWN_SEARCH_PATH, NULL, NULL,
@@ -374,7 +515,7 @@ static gboolean
 parse_h (YelpManParser *parser, GError **error)
 {
     guint dx;
-    guint k;
+    int k;
     const gchar *str;
 
     if (SSCANF ("h%u", 1, &dx)) {
@@ -396,12 +537,11 @@ parse_h (YelpManParser *parser, GError **error)
         (str[0] != '\0') &&
         (str[strlen (str)-1] != ' ')) {
 
-        dx = dx_to_em_count (parser, dx);
-        for (k=0; k<dx; k++) {
-            /* 0xc2 0xa0 is nonbreaking space in utf8 */
-            g_string_append_c (parser->accumulator, 0xc2);
-            g_string_append_c (parser->accumulator, 0xa0);
-        }
+        k = dx_to_em_count (parser, dx) - parser->N_count;
+        parser->N_count = 0;
+        if (k < 0) k = 0;
+
+        append_nbsps (parser, k);
     }
 
     return TRUE;
@@ -638,3 +778,77 @@ dx_to_em_count (YelpManParser *parser, guint dx)
 {
     return (int)(dx / ((float)parser->char_width));
 }
+
+static gboolean
+parse_N (YelpManParser *parser, GError **error)
+{
+    gint n;
+    if (SSCANF ("N%i", 1, &n)) {
+        RAISE_PARSE_ERROR ("Strange format for N line: %s");
+    }
+    if (n > 127) {
+        RAISE_PARSE_ERROR ("N line has non-7-bit character: %s");
+    }
+    if (n < -200) {
+        RAISE_PARSE_ERROR ("Bizarrely many nbsps in N line: %s");
+    }
+
+    if (n < 0) {
+        append_nbsps (parser, -n);
+        parser->N_count += -n;
+    }
+    else {
+        g_string_append_c (parser->accumulator, (gchar)n);
+        parser->N_count++;
+    }
+
+    return TRUE;
+}
+
+static void
+append_nbsps (YelpManParser *parser, guint k)
+{
+    for (; k > 0; k--) {
+        /* 0xc2 0xa0 is nonbreaking space in utf8 */
+        g_string_append_c (parser->accumulator, 0xc2);
+        g_string_append_c (parser->accumulator, 0xa0);
+    }
+}
+
+static gboolean
+parse_C (YelpManParser *parser, GError **error)
+{
+    gchar name[16];
+    gunichar code = 0;
+    guint k;
+    gint len;
+
+    if (SSCANF ("C%16s", 1, name)) {
+        RAISE_PARSE_ERROR ("Can't understand special character: %s");
+    }
+
+    for (k=0; char_translations[k].from; k++) {
+        if (g_str_equal (char_translations[k].from, name)) {
+            code = char_translations[k].to;
+            break;
+        }
+    }
+    if (sscanf (name, "u%x", &k) == 1) {
+        code = k;
+    }
+
+    if (!code) {
+        g_warning ("Couldn't parse troff special character: '%s'",
+                   name);
+        code = 65533; /* Unicode replacement character */
+    }
+
+    /* Output buffer must be length >= 6. 16 >= 6, so we're ok. */
+    len = g_unichar_to_utf8 (code, name);
+    name[len] = '\0';
+    g_string_append (parser->accumulator, name);
+
+    parser->N_count++;
+
+    return TRUE;
+}
author	Rupert Swarbrick <rswarbrick@gmail.com>	2010-12-16 00:50:07 +0000
committer	Shaun McCance <shaunm@gnome.org>	2010-12-20 10:45:23 -0500
commit	6a9eefa8e1d210b396aa0ae1ec375d7c4642e36f (patch)
tree	37da7ba03de144896b70d744153783fd4136a7b8
parent	a1ebbd0651842f8f4f3cb672d5ef22322770fa9b (diff)
download	yelp-6a9eefa8e1d210b396aa0ae1ec375d7c4642e36f.tar.gz