/*
 * Purple - Internet Messaging Library
 * Copyright (C) Pidgin Developers <devel@pidgin.im>
 *
 * Purple is the legal property of its developers, whose names are too numerous
 * to list here. Please refer to the COPYRIGHT file distributed with this
 * source distribution.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <https://www.gnu.org/licenses/>.
 */

#include "purplemarkup.h"

#include "util.h"

/*
 * Decode the single HTML entity that starts at `text`.
 *
 * text:   string whose first byte must be '&'; may be NULL.
 * length: optional out-parameter; on success receives the number of bytes of
 *         `text` that make up the entity, including the leading '&' and the
 *         trailing ';'.  Left untouched on failure.
 *
 * Returns the replacement text, or NULL when `text` is NULL, does not start
 * with '&', or does not start with a recognised entity.
 *
 * Recognised forms (names are matched case-insensitively):
 *   &amp; &lt; &gt; &nbsp; &copy; &quot; &reg; &apos;
 *   &#NNN;  (decimal)   and   &#xHHH;  (hexadecimal)
 *
 * For the named entities the result is a string literal.  For numeric
 * references the result points into a function-local static buffer, so it is
 * overwritten by the next numeric decode and must be copied if it has to
 * outlive that call.
 */
const char *
purple_markup_unescape_entity(const char *text, int *length)
{
	const char *pln;
	int len;

	if (!text || *text != '&')
		return NULL;

/* Compares the head of `text` with the literal `s` and, as a side effect,
 * stores the literal's length in `len` so the matching branch leaves `len`
 * holding the number of bytes consumed. */
#define IS_ENTITY(s)  (!g_ascii_strncasecmp(text, s, (len = sizeof(s) - 1)))

	if(IS_ENTITY("&amp;"))
		pln = "&";
	else if(IS_ENTITY("&lt;"))
		pln = "<";
	else if(IS_ENTITY("&gt;"))
		pln = ">";
	else if(IS_ENTITY("&nbsp;"))
		pln = " ";
	else if(IS_ENTITY("&copy;"))
		pln = "\302\251";      /* or use g_unichar_to_utf8(0xa9); */
	else if(IS_ENTITY("&quot;"))
		pln = "\"";
	else if(IS_ENTITY("&reg;"))
		pln = "\302\256";      /* or use g_unichar_to_utf8(0xae); */
	else if(IS_ENTITY("&apos;"))
		pln = "\'";
	else if(text[1] == '#' && (g_ascii_isxdigit(text[2]) || text[2] == 'x')) {
		/* Numeric character reference: "&#" [ "x" ] digits ";" */
		static char buf[7];
		const char *start = text + 2;
		char *end;
		guint64 pound;
		int base = 10;
		int buflen;

		if (*start == 'x') {
			base = 16;
			start++;
		}

		pound = g_ascii_strtoull(start, &end, base);
		/* Reject NUL, values that do not fit the gunichar conversion
		 * below, and references that are not terminated by ';'.  A
		 * reference with no parsable digits also lands here because
		 * the conversion then yields 0. */
		if (pound == 0 || pound > INT_MAX || *end != ';') {
			return NULL;
		}

		/* +1 accounts for the terminating ';'. */
		len = (end - text) + 1;

		buflen = g_unichar_to_utf8((gunichar)pound, buf);
		buf[buflen] = '\0';
		pln = buf;
	}
	else
		return NULL;

	if (length)
		*length = len;
	return pln;
}

struct
purple_parse_tag { char *src_tag; char *dest_tag; gboolean ignore; }; /* NOTE: Do not put `do {} while(0)` around this macro (as this is the method recommended in the GCC docs). It contains 'continue's that should affect the while-loop in purple_markup_html_to_xhtml and doing the above would break that. Also, remember to put braces in constructs that require them for multiple statements when using this macro. */ #define ALLOW_TAG_ALT(x, y) if(!g_ascii_strncasecmp(c, "<" x " ", strlen("<" x " "))) { \ const char *o = c + strlen("<" x); \ const char *p = NULL, *q = NULL, *r = NULL; \ /* o = iterating over full tag \ * p = > (end of tag) \ * q = start of quoted bit \ * r = < inside tag \ */ \ GString *innards = g_string_new(""); \ while(o && *o) { \ if(!q && (*o == '\"' || *o == '\'') ) { \ q = o; \ } else if(q) { \ if(*o == *q) { /* end of quoted bit */ \ char *unescaped = g_strndup(q+1, o-q-1); \ char *escaped = g_markup_escape_text(unescaped, -1); \ g_string_append_printf(innards, "%c%s%c", *q, escaped, *q); \ g_free(unescaped); \ g_free(escaped); \ q = NULL; \ } else if(*c == '\\') { \ o++; \ } \ } else if(*o == '<') { \ r = o; \ } else if(*o == '>') { \ p = o; \ break; \ } else { \ innards = g_string_append_c(innards, *o); \ } \ o++; \ } \ if(p && !r) { /* got an end of tag and no other < earlier */\ if(*(p-1) != '/') { \ struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); \ pt->src_tag = x; \ pt->dest_tag = y; \ tags = g_list_prepend(tags, pt); \ } \ if(xhtml) { \ xhtml = g_string_append(xhtml, "<" y); \ xhtml = g_string_append(xhtml, innards->str); \ xhtml = g_string_append_c(xhtml, '>'); \ } \ c = p + 1; \ } else { /* got end of tag with earlier < *or* didn't get anything */ \ if(xhtml) \ xhtml = g_string_append(xhtml, "<"); \ if(plain) \ plain = g_string_append_c(plain, '<'); \ c++; \ } \ g_string_free(innards, TRUE); \ continue; \ } \ if(!g_ascii_strncasecmp(c, "<" x, strlen("<" x)) && \ (*(c+strlen("<" x)) == '>' || \ 
!g_ascii_strncasecmp(c+strlen("<" x), "/>", 2))) { \ if(xhtml) \ xhtml = g_string_append(xhtml, "<" y); \ c += strlen("<" x); \ if(*c != '/') { \ struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); \ pt->src_tag = x; \ pt->dest_tag = y; \ tags = g_list_prepend(tags, pt); \ if(xhtml) \ xhtml = g_string_append_c(xhtml, '>'); \ } else { \ if(xhtml) \ xhtml = g_string_append(xhtml, "/>");\ } \ c = strchr(c, '>') + 1; \ continue; \ } /* Don't forget to check the note above for ALLOW_TAG_ALT. */ #define ALLOW_TAG(x) ALLOW_TAG_ALT(x, x) void purple_markup_html_to_xhtml(const char *html, char **xhtml_out, char **plain_out) { GString *xhtml = NULL; GString *plain = NULL; GString *url = NULL; GString *cdata = NULL; GList *tags = NULL, *tag; const char *c = html; char quote = '\0'; #define CHECK_QUOTE(ptr) if (*(ptr) == '\'' || *(ptr) == '\"') \ quote = *(ptr++); \ else \ quote = '\0'; #define VALID_CHAR(ptr) (*(ptr) && *(ptr) != quote && (quote || (*(ptr) != ' ' && *(ptr) != '>'))) g_return_if_fail(xhtml_out != NULL || plain_out != NULL); if(xhtml_out) xhtml = g_string_new(""); if(plain_out) plain = g_string_new(""); while(c && *c) { if(*c == '<') { if(*(c+1) == '/') { /* closing tag */ tag = tags; while(tag) { struct purple_parse_tag *pt = tag->data; if(!g_ascii_strncasecmp((c+2), pt->src_tag, strlen(pt->src_tag)) && *(c+strlen(pt->src_tag)+2) == '>') { c += strlen(pt->src_tag) + 3; break; } tag = tag->next; } if(tag) { while(tags) { struct purple_parse_tag *pt = tags->data; if(xhtml && !pt->ignore) g_string_append_printf(xhtml, "", pt->dest_tag); if(plain && purple_strequal(pt->src_tag, "a")) { /* if this is a link, we have to add the url to the plaintext, too */ if (cdata && url && (!g_string_equal(cdata, url) && (g_ascii_strncasecmp(url->str, "mailto:", 7) != 0 || g_utf8_collate(url->str + 7, cdata->str) != 0))) { char *unescaped = purple_unescape_html(url->str); g_string_append_printf(plain, " <%s>", g_strstrip(unescaped)); g_free(unescaped); } if (cdata) 
{ g_string_free(cdata, TRUE); cdata = NULL; } } if(tags == tag) break; tags = g_list_delete_link(tags, tags); g_free(pt); } g_free(tag->data); tags = g_list_delete_link(tags, tag); } else { /* a closing tag we weren't expecting... * we'll let it slide, if it's really a tag...if it's * just a ') { c = end+1; } else { if(xhtml) xhtml = g_string_append(xhtml, "<"); if(plain) plain = g_string_append_c(plain, '<'); c++; } } } else { /* opening tag */ ALLOW_TAG("blockquote"); ALLOW_TAG("cite"); ALLOW_TAG("div"); ALLOW_TAG("em"); ALLOW_TAG("h1"); ALLOW_TAG("h2"); ALLOW_TAG("h3"); ALLOW_TAG("h4"); ALLOW_TAG("h5"); ALLOW_TAG("h6"); /* we only allow html to start the message */ if(c == html) { ALLOW_TAG("html"); } ALLOW_TAG_ALT("i", "em"); ALLOW_TAG_ALT("italic", "em"); ALLOW_TAG("li"); ALLOW_TAG("ol"); ALLOW_TAG("p"); ALLOW_TAG("pre"); ALLOW_TAG("q"); ALLOW_TAG("span"); ALLOW_TAG("ul"); /* we skip
because it's not legal in XHTML-IM. However, * we still want to send something sensible, so we put a * linebreak in its place.
also needs special handling * because putting a
to close it would just be dumb. */ if((!g_ascii_strncasecmp(c, "' || !g_ascii_strncasecmp(c+3, "/>", 2) || !g_ascii_strncasecmp(c+3, " />", 3))) { c = strchr(c, '>') + 1; if(xhtml) xhtml = g_string_append(xhtml, "
"); if(plain && *c != '\n') plain = g_string_append_c(plain, '\n'); continue; } if(!g_ascii_strncasecmp(c, "", 3) || !g_ascii_strncasecmp(c, "", strlen("")) || !g_ascii_strncasecmp(c, "", strlen(""))) { struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); if (*(c+2) == '>') pt->src_tag = "b"; else if (*(c+2) == 'o') pt->src_tag = "bold"; else pt->src_tag = "strong"; pt->dest_tag = "span"; tags = g_list_prepend(tags, pt); c = strchr(c, '>') + 1; if(xhtml) xhtml = g_string_append(xhtml, ""); continue; } if(!g_ascii_strncasecmp(c, "", 3) || !g_ascii_strncasecmp(c, "", strlen(""))) { struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); pt->src_tag = *(c+2) == '>' ? "u" : "underline"; pt->dest_tag = "span"; tags = g_list_prepend(tags, pt); c = strchr(c, '>') + 1; if (xhtml) xhtml = g_string_append(xhtml, ""); continue; } if(!g_ascii_strncasecmp(c, "", 3) || !g_ascii_strncasecmp(c, "", strlen(""))) { struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); pt->src_tag = *(c+2) == '>' ? 
"s" : "strike"; pt->dest_tag = "span"; tags = g_list_prepend(tags, pt); c = strchr(c, '>') + 1; if(xhtml) xhtml = g_string_append(xhtml, ""); continue; } if(!g_ascii_strncasecmp(c, "", 5)) { struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); pt->src_tag = "sub"; pt->dest_tag = "span"; tags = g_list_prepend(tags, pt); c = strchr(c, '>') + 1; if(xhtml) xhtml = g_string_append(xhtml, ""); continue; } if(!g_ascii_strncasecmp(c, "", 5)) { struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); pt->src_tag = "sup"; pt->dest_tag = "span"; tags = g_list_prepend(tags, pt); c = strchr(c, '>') + 1; if(xhtml) xhtml = g_string_append(xhtml, ""); continue; } if (!g_ascii_strncasecmp(c, "' || *(c+4) == ' ')) { const char *p = c + 4; GString *src = NULL, *alt = NULL; #define ESCAPE(from, to) \ CHECK_QUOTE(from); \ while (VALID_CHAR(from)) { \ int len; \ if ((*from == '&') && (purple_markup_unescape_entity(from, &len) == NULL)) \ to = g_string_append(to, "&"); \ else if (*from == '\'') \ to = g_string_append(to, "'"); \ else \ to = g_string_append_c(to, *from); \ from++; \ } while (*p && *p != '>') { if (!g_ascii_strncasecmp(p, "src=", 4)) { const char *q = p + 4; if (src) g_string_free(src, TRUE); src = g_string_new(""); ESCAPE(q, src); p = q; } else if (!g_ascii_strncasecmp(p, "alt=", 4)) { const char *q = p + 4; if (alt) g_string_free(alt, TRUE); alt = g_string_new(""); ESCAPE(q, alt); p = q; } else { p++; } } #undef ESCAPE if ((c = strchr(p, '>')) != NULL) c++; else c = p; /* src and alt are required! */ if(src && xhtml) g_string_append_printf(xhtml, "%s", g_strstrip(src->str), alt ? 
alt->str : ""); if(alt) { if(plain) { char *unescaped = purple_unescape_html(alt->str); plain = g_string_append(plain, unescaped); g_free(unescaped); } if(!src && xhtml) { xhtml = g_string_append(xhtml, alt->str); } g_string_free(alt, TRUE); } g_string_free(src, TRUE); continue; } if (!g_ascii_strncasecmp(c, "' || *(c+2) == ' ')) { const char *p = c + 2; struct purple_parse_tag *pt; while (*p && *p != '>') { if (!g_ascii_strncasecmp(p, "href=", 5)) { const char *q = p + 5; if (url) g_string_free(url, TRUE); url = g_string_new(""); if (cdata) g_string_free(cdata, TRUE); cdata = g_string_new(""); CHECK_QUOTE(q); while (VALID_CHAR(q)) { int len; if ((*q == '&') && (purple_markup_unescape_entity(q, &len) == NULL)) url = g_string_append(url, "&"); else if (*q == '"') url = g_string_append(url, """); else url = g_string_append_c(url, *q); q++; } p = q; } else { p++; } } if ((c = strchr(p, '>')) != NULL) c++; else c = p; pt = g_new0(struct purple_parse_tag, 1); pt->src_tag = "a"; pt->dest_tag = "a"; tags = g_list_prepend(tags, pt); if(xhtml) g_string_append_printf(xhtml, "", url ? 
g_strstrip(url->str) : ""); continue; } #define ESCAPE(from, to) \ CHECK_QUOTE(from); \ while (VALID_CHAR(from)) { \ int len; \ if ((*from == '&') && (purple_markup_unescape_entity(from, &len) == NULL)) \ to = g_string_append(to, "&"); \ else if (*from == '\'') \ to = g_string_append_c(to, '\"'); \ else \ to = g_string_append_c(to, *from); \ from++; \ } if(!g_ascii_strncasecmp(c, "' || *(c+5) == ' ')) { const char *p = c + 5; GString *style = g_string_new(""); struct purple_parse_tag *pt; while (*p && *p != '>') { if (!g_ascii_strncasecmp(p, "back=", 5)) { const char *q = p + 5; GString *color = g_string_new(""); ESCAPE(q, color); g_string_append_printf(style, "background: %s; ", color->str); g_string_free(color, TRUE); p = q; } else if (!g_ascii_strncasecmp(p, "color=", 6)) { const char *q = p + 6; GString *color = g_string_new(""); ESCAPE(q, color); g_string_append_printf(style, "color: %s; ", color->str); g_string_free(color, TRUE); p = q; } else if (!g_ascii_strncasecmp(p, "face=", 5)) { const char *q = p + 5; GString *face = g_string_new(""); ESCAPE(q, face); g_string_append_printf(style, "font-family: %s; ", g_strstrip(face->str)); g_string_free(face, TRUE); p = q; } else if (!g_ascii_strncasecmp(p, "size=", 5)) { const char *q = p + 5; int sz; const char *size = "medium"; CHECK_QUOTE(q); sz = atoi(q); switch (sz) { case 1: size = "xx-small"; break; case 2: size = "small"; break; case 3: size = "medium"; break; case 4: size = "large"; break; case 5: size = "x-large"; break; case 6: case 7: size = "xx-large"; break; default: break; } g_string_append_printf(style, "font-size: %s; ", size); p = q; } else { p++; } } if ((c = strchr(p, '>')) != NULL) c++; else c = p; pt = g_new0(struct purple_parse_tag, 1); pt->src_tag = "font"; pt->dest_tag = "span"; tags = g_list_prepend(tags, pt); if(style->len && xhtml) g_string_append_printf(xhtml, "", g_strstrip(style->str)); else pt->ignore = TRUE; g_string_free(style, TRUE); continue; } #undef ESCAPE if 
(!g_ascii_strncasecmp(c, "", g_strstrip(color->str)); g_string_free(color, TRUE); if ((c = strchr(p, '>')) != NULL) c++; else c = p; pt->src_tag = "body"; pt->dest_tag = "span"; tags = g_list_prepend(tags, pt); did_something = TRUE; break; } p++; } if (did_something) continue; } /* this has to come after the special case for bgcolor */ ALLOW_TAG("body"); if(!g_ascii_strncasecmp(c, ""); if(p) { if(xhtml) xhtml = g_string_append(xhtml, "