summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRupert Swarbrick <rswarbrick@gmail.com>2010-12-16 00:50:07 +0000
committerShaun McCance <shaunm@gnome.org>2010-12-20 10:45:23 -0500
commit6a9eefa8e1d210b396aa0ae1ec375d7c4642e36f (patch)
tree37da7ba03de144896b70d744153783fd4136a7b8
parenta1ebbd0651842f8f4f3cb672d5ef22322770fa9b (diff)
downloadyelp-6a9eefa8e1d210b396aa0ae1ec375d7c4642e36f.tar.gz
Add support for N and C lines.
At the moment, there's a horrible hack with a hardcoded table of character names and unicode code points. Hopefully eventually this can be replaced by parsing a file or calling a program... but I don't know how yet :-(
-rw-r--r--libyelp/yelp-man-parser.c232
1 files changed, 223 insertions, 9 deletions
diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c
index a02cc146..645d9fdf 100644
--- a/libyelp/yelp-man-parser.c
+++ b/libyelp/yelp-man-parser.c
@@ -104,6 +104,15 @@ struct _YelpManParser {
/* Set to TRUE if there's been a newline since the last text was
* parsed. */
gboolean newline;
+
+ /* Count the characters emitted by 'N' and 'C' lines since the last
+ * h command. This is because for some reason N doesn't
+ * automatically move the position forward. Thus immediately after
+ * one, you see a h24 or the like. Unless there's a space. Then it
+ * might be wh48. This is incremented in parse_N and parse_C, and
+ * consumed (then reset to zero) in parse_h.
+ */
+ guint N_count;
};
static gboolean parser_parse_line (YelpManParser *parser, GError **error);
@@ -124,6 +133,8 @@ DECLARE_LINE_PARSER (parse_text);
DECLARE_LINE_PARSER (parse_w);
DECLARE_LINE_PARSER (parse_body_text);
DECLARE_LINE_PARSER (parse_n);
+DECLARE_LINE_PARSER (parse_N);
+DECLARE_LINE_PARSER (parse_C);
/* Declare a sort of alist registry of parsers for different lines. */
struct LineParsePair
@@ -138,6 +149,8 @@ static struct LineParsePair line_parsers[] = {
{ "t", parse_text },
{ "w", parse_w },
{ "n", parse_n },
+ { "N", parse_N },
+ { "C", parse_C },
{ NULL, NULL }
};
@@ -146,6 +159,134 @@ static struct LineParsePair line_parsers[] = {
* bits) */
static void finish_span (YelpManParser *parser);
static guint dx_to_em_count (YelpManParser *parser, guint dx);
+static void append_nbsps (YelpManParser *parser, guint k);
+
+/******************************************************************************/
+/* Translations for the 'C' command. This is indeed hackish, but the
+ * -Tutf8 output doesn't seem to name the include files we would need
+ * in order to build this table at runtime :-(
+ *
+ * On my machine, this data is in /usr/share/groff/current/tmac/, in
+ * latin1.tmac and unicode.tmac. I worked out lq and rq by running
+ * man; I'm not sure where those two come from!
+ */
+struct StringPair
+{
+ const gchar *from; /* troff special-character name, e.g. "lq" */
+ gunichar to; /* Unicode code point emitted for that name */
+};
+static const struct StringPair char_translations[] = { /* scanned linearly by parse_C */
+ { "r!", 161 },
+ { "ct", 162 },
+ { "Po", 163 },
+ { "Cs", 164 },
+ { "Ye", 165 },
+ { "bb", 166 },
+ { "sc", 167 },
+ { "ad", 168 },
+ { "co", 169 },
+ { "Of", 170 },
+ { "Fo", 171 },
+ { "tno", 172 },
+ { "%", 173 },
+ { "rg", 174 },
+ { "a-", 175 },
+ { "de", 176 },
+ { "t+-", 177 },
+ { "S2", 178 },
+ { "S3", 179 },
+ { "aa", 180 },
+ { "mc", 181 },
+ { "ps", 182 },
+ { "pc", 183 },
+ { "ac", 184 },
+ { "S1", 185 },
+ { "Om", 186 },
+ { "Fc", 187 },
+ { "14", 188 },
+ { "12", 189 },
+ { "34", 190 },
+ { "r?", 191 },
+ { "`A", 192 },
+ { "'A", 193 },
+ { "^A", 194 },
+ { "~A", 195 },
+ { ":A", 196 },
+ { "oA", 197 },
+ { "AE", 198 },
+ { ",C", 199 },
+ { "`E", 200 },
+ { "'E", 201 },
+ { "^E", 202 },
+ { ":E", 203 },
+ { "`I", 204 },
+ { "'I", 205 },
+ { "^I", 206 },
+ { ":I", 207 },
+ { "-D", 208 },
+ { "~N", 209 },
+ { "`O", 210 },
+ { "'O", 211 },
+ { "^O", 212 },
+ { "~O", 213 },
+ { ":O", 214 },
+ { "tmu", 215 },
+ { "/O", 216 },
+ { "`U", 217 },
+ { "'U", 218 },
+ { "^U", 219 },
+ { ":U", 220 },
+ { "'Y", 221 },
+ { "TP", 222 },
+ { "ss", 223 },
+ { "`a", 224 },
+ { "'a", 225 },
+ { "^a", 226 },
+ { "~a", 227 },
+ { ":a", 228 },
+ { "oa", 229 },
+ { "ae", 230 },
+ { ",c", 231 },
+ { "`e", 232 },
+ { "'e", 233 },
+ { "^e", 234 },
+ { ":e", 235 },
+ { "`i", 236 },
+ { "'i", 237 },
+ { "^i", 238 },
+ { ":i", 239 },
+ { "Sd", 240 },
+ { "~n", 241 },
+ { "`o", 242 },
+ { "'o", 243 },
+ { "^o", 244 },
+ { "~o", 245 },
+ { ":o", 246 },
+ { "tdi", 247 },
+ { "/o", 248 },
+ { "`u", 249 },
+ { "'u", 250 },
+ { "^u", 251 },
+ { ":u", 252 },
+ { "'y", 253 },
+ { "Tp", 254 },
+ { ":y", 255 },
+ { "hy", '-' },
+ { "oq", '`' },
+ { "cq", '\'' },
+ { "lq", 8220 }, // left smart quotes
+ { "rq", 8221 }, // right smart quotes
+ { "en", 8211 }, // en-dash
+ { "em", 8212 }, // em-dash
+ { "la", 10216 }, // left angle bracket
+ { "ra", 10217 }, // right angle bracket
+ { "rs", '\\' },
+ { "<=", 8804 }, // < or equal to sign
+ { ">=", 8805 }, // > or equal to sign
+ { "aq", '\'' },
+ { "tm", 8482 }, // trademark symbol
+ { NULL, 0 } /* sentinel: a NULL name ends the lookup loop */
+};
/******************************************************************************/
@@ -170,9 +311,9 @@ get_troff (gchar *path, GError **error)
{
gint stdout;
GError *err = NULL;
- gchar *argv[] = { "man", "-Z", "-Tutf8", NULL, NULL };
+ gchar *argv[] = { "man", "-Z", "-Tutf8", "-EUTF-8", NULL, NULL };
- argv[3] = path;
+ argv[4] = path;
if (!g_spawn_async_with_pipes (NULL, argv, NULL,
G_SPAWN_SEARCH_PATH, NULL, NULL,
@@ -374,7 +515,7 @@ static gboolean
parse_h (YelpManParser *parser, GError **error)
{
guint dx;
- guint k;
+ int k;
const gchar *str;
if (SSCANF ("h%u", 1, &dx)) {
@@ -396,12 +537,11 @@ parse_h (YelpManParser *parser, GError **error)
(str[0] != '\0') &&
(str[strlen (str)-1] != ' ')) {
- dx = dx_to_em_count (parser, dx);
- for (k=0; k<dx; k++) {
- /* 0xc2 0xa0 is nonbreaking space in utf8 */
- g_string_append_c (parser->accumulator, 0xc2);
- g_string_append_c (parser->accumulator, 0xa0);
- }
+ k = dx_to_em_count (parser, dx) - parser->N_count;
+ parser->N_count = 0;
+ if (k < 0) k = 0;
+
+ append_nbsps (parser, k);
}
return TRUE;
@@ -638,3 +778,77 @@ dx_to_em_count (YelpManParser *parser, guint dx)
{
return (int)(dx / ((float)parser->char_width));
}
+
+static gboolean
+parse_N (YelpManParser *parser, GError **error)
+{
+ gint value; /* integer payload of the N line */
+ if (SSCANF ("N%i", 1, &value)) {
+ RAISE_PARSE_ERROR ("Strange format for N line: %s");
+ }
+ if (value > 127) {
+ RAISE_PARSE_ERROR ("N line has non-7-bit character: %s");
+ }
+ if (value < -200) {
+ RAISE_PARSE_ERROR ("Bizarrely many nbsps in N line: %s");
+ }
+
+ /* 0..127 is one literal character; -k means k non-breaking spaces. */
+ if (value >= 0) {
+ g_string_append_c (parser->accumulator, (gchar)value);
+ parser->N_count += 1;
+ return TRUE;
+ }
+ /* Record how much was emitted so that parse_h can discount it. */
+ append_nbsps (parser, -value);
+ parser->N_count += -value;
+ return TRUE;
+}
+
+static void
+append_nbsps (YelpManParser *parser, guint k)
+{
+ guint i;
+ /* "\xc2\xa0" is U+00A0 NO-BREAK SPACE encoded as UTF-8. */
+ for (i = 0; i < k; i++) {
+ g_string_append (parser->accumulator, "\xc2\xa0");
+ }
+}
+
+static gboolean
+parse_C (YelpManParser *parser, GError **error)
+{
+ /* Handle a "C<name>" line: append the named special character. */
+ gchar name[16];
+ gunichar code = 0;
+ guint k;
+ gint len;
+ /* %15s, not %16s: the width excludes the NUL that sscanf appends,
+ * so 15 is the most that fits in name[16] without overflowing. */
+ if (SSCANF ("C%15s", 1, name)) {
+ RAISE_PARSE_ERROR ("Can't understand special character: %s");
+ }
+ /* Named glyphs come from the table; a uXXXX name carries its own
+ * code point, accepted only if it is a valid Unicode character. */
+ for (k=0; char_translations[k].from; k++) {
+ if (g_str_equal (char_translations[k].from, name)) {
+ code = char_translations[k].to;
+ break;
+ }
+ }
+ if (sscanf (name, "u%x", &k) == 1 && g_unichar_validate (k)) {
+ code = k;
+ }
+ if (!code) {
+ g_warning ("Couldn't parse troff special character: '%s'",
+ name);
+ code = 65533; /* Unicode replacement character */
+ }
+ /* Output buffer must be length >= 6. 16 >= 6, so we're ok. */
+ len = g_unichar_to_utf8 (code, name);
+ name[len] = '\0';
+ g_string_append (parser->accumulator, name);
+ /* One character was emitted; parse_h discounts it via N_count. */
+ parser->N_count++;
+ return TRUE;
+}