diff options
author | Wez Furlong <wez@php.net> | 2001-05-28 11:00:06 +0000 |
---|---|---|
committer | Wez Furlong <wez@php.net> | 2001-05-28 11:00:06 +0000 |
commit | d38cba8697857fdf9ffdb11f001bb986803473ea (patch) | |
tree | e213df214193d8349b2442d214bd9f9302307c5f | |
parent | 2bdef9c83ffea933d3654575d6bd38894d5856c6 (diff) | |
download | php-git-d38cba8697857fdf9ffdb11f001bb986803473ea.tar.gz |
Added charset awareness to htmlentities() and htmlspecialchars(); use an
optional third parameter to specify the charset; otherwise tries to determine
it from the LC_CTYPE locale setting.
-rw-r--r-- | ext/standard/html.c | 358 | ||||
-rw-r--r-- | ext/standard/html.h | 2 |
2 files changed, 317 insertions, 43 deletions
diff --git a/ext/standard/html.c b/ext/standard/html.c index 85f370f568..6c967b5958 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -23,11 +23,27 @@ #include "reg.h" #include "html.h" +#if HAVE_LOCALE_H +#include <locale.h> +#endif + /* This must be fixed to handle the input string according to LC_CTYPE. Defaults to ISO-8859-1 for now. */ - -static char EntTable[][7] = -{ + +enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, + cs_8859_15, cs_utf_8 }; +typedef const char * entity_table_t; + +/* codepage 1252 is a Windows extension to iso-8859-1. */ +static entity_table_t ent_cp_1252[] = { + NULL, NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger", + "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig", + NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo", + "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo", + "oelig", NULL, NULL, "Yuml" +}; + +static entity_table_t ent_iso_8859_1[] = { "nbsp","iexcl","cent","pound","curren","yen","brvbar", "sect","uml","copy","ordf","laquo","not","shy","reg", "macr","deg","plusmn","sup2","sup3","acute","micro", @@ -45,10 +61,212 @@ static char EntTable[][7] = "uuml","yacute","thorn","yuml" }; -PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style) +static entity_table_t ent_iso_8859_15[] = { + "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron", + "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg", + "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */ + "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm", + "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute", + "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", + "Eacute","Ecirc","Euml","Igrave","Iacute","Icirc", + "Iuml","ETH","Ntilde","Ograve","Oacute","Ocirc","Otilde", + "Ouml","times","Oslash","Ugrave","Uacute","Ucirc","Uuml", + "Yacute","THORN","szlig","agrave","aacute","acirc", + "atilde","auml","aring","aelig","ccedil","egrave", + "eacute","ecirc","euml","igrave","iacute","icirc", + "iuml","eth","ntilde","ograve","oacute","ocirc","otilde", + "ouml","divide","oslash","ugrave","uacute","ucirc", + "uuml","yacute","thorn","yuml" +}; + +struct html_entity_map { + enum entity_charset charset; /* charset identifier */ + unsigned short basechar; /* char code at start of table */ + unsigned short endchar; /* last char code in the table */ + entity_table_t * table; /* the table of mappings */ +}; + +static const struct html_entity_map entity_map[] = { + { cs_cp1252, 0x80, 0x9f, ent_cp_1252 }, + { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, + { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_terminator } +}; + +static const struct { + const char * codeset; + enum entity_charset charset; +} charset_map[] = { + { "ISO-8859-1", cs_8859_1 }, + { "ISO-8859-15", cs_8859_15 }, + { "utf-8", cs_utf_8 }, + { "cp1252", cs_cp1252 }, + { NULL } +}; + +inline static unsigned short get_next_char(enum entity_charset charset, + unsigned char * str, + int * newpos, + unsigned char * mbseq, + int * mbseqlen +) +{ + int pos = *newpos; + int mbpos = 0; + unsigned short this_char = str[pos++]; + + mbseq[mbpos++] = this_char; + + if (charset == cs_utf_8) { + unsigned long utf = 0; + int stat = 0; + int more = 1; + + /* unpack utf-8 encoding into a wide char. + * Code stolen from the mbstring extension */ + + do { + if (this_char < 0x80) { + more = 0; + break; + } + else if (this_char < 0xc0) { + switch(stat) { + case 0x10: /* 2, 2nd */ + case 0x21: /* 3, 3rd */ + case 0x32: /* 4, 4th */ + case 0x43: /* 5, 5th */ + case 0x54: /* 6, 6th */ + /* last byte in sequence */ + more = 0; + utf |= (this_char & 0x3f); + this_char = utf; + break; + case 0x20: /* 3, 2nd */ + case 0x31: /* 4, 3rd */ + case 0x42: /* 5, 4th */ + case 0x53: /* 6, 5th */ + /* penultimate char */ + utf |= ((this_char & 0x3f) << 6); + stat++; + break; + case 0x30: /* 4, 2nd */ + case 0x41: /* 5, 3rd */ + case 0x52: /* 6, 4th */ + utf |= ((this_char & 0x3f) << 12); + stat++; + break; + case 0x40: /* 5, 2nd */ + case 0x51: + utf |= ((this_char & 0x3f) << 18); + stat++; + break; + case 0x50: /* 6, 2nd */ + utf |= ((this_char & 0x3f) << 24); + stat++; + default: + /* invalid */ + more = 0; + } + } + /* lead byte */ + else if (this_char < 0xe0) { + stat = 0x10; /* 2 byte */ + utf = (this_char & 0x1f) << 6; + } else if (this_char < 0xf0) { + stat = 0x20; /* 3 byte */ + utf = (this_char & 0xf) << 12; + } else if (this_char < 0xf8) { + stat = 0x30; /* 4 byte */ + utf = (this_char & 0x7) << 18; + } else if (this_char < 0xfc) { + stat = 0x40; /* 5 byte */ + utf = (this_char & 0x3) << 24; + } else if (this_char < 0xfe) { + stat = 0x50; /* 6 byte */ + utf = (this_char & 0x1) << 30; + } + else { + /* invalid; bail */ + more = 0; + break; + } + if (more) + { + this_char = str[pos++]; + mbseq[mbpos++] = this_char; + } + } while(more); + } + *newpos = pos; + mbseq[mbpos] = '\0'; + *mbseqlen = mbpos; + return this_char; +} + +/* returns the charset identifier based on current locale or a hint. + * defaults to iso-8859-1 */ +static enum entity_charset determine_charset(char * charset_hint) +{ + int i; + enum entity_charset charset = cs_8859_1; + int len; + +#if HAVE_LOCALE_H + if (charset_hint == NULL) + { + /* try to figure out the charset from the locale */ + char * localename; + char * dot, * at; + + /* lang[_territory][.codeset][@modifier] */ + localename = setlocale(LC_CTYPE, NULL); + + dot = strchr(localename, '.'); + if (dot) { + dot++; + /* locale specifies a codeset */ + at = strchr(dot, '@'); + if (at) + len = at - dot; + else + len = strlen(dot); + charset_hint = dot; + } + else { + /* no explicit name; see if the name itself + * is the charset */ + charset_hint = localename; + len = strlen(charset_hint); + } + } + else + len = strlen(charset_hint); +#else + if (charset_hint) + len = strlen(charset_hint); +#endif + + if (charset_hint) { + /* now walk the charset map and look for the codeset */ + for (i = 0; charset_map[i].codeset; i++) { + if (strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) { + charset = charset_map[i].charset; + break; + } + } + } + + return charset; +} + +PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char * hint_charset) { int i, maxlen, len; char *new; + enum entity_charset charset = determine_charset(hint_charset); maxlen = 2 * oldlen; if (maxlen < 128) @@ -56,49 +274,89 @@ PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newle new = emalloc (maxlen); len = 0; - i = oldlen; - while (i--) { + i = 0; + while (i < oldlen) { + int mbseqlen; + unsigned char mbsequence[16]; /* allow up to 15 characters + in a multibyte sequence + it should be more than enough.. */ + unsigned short this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen); + int matches_map = 0; + if (len + 9 > maxlen) new = erealloc (new, maxlen += 128); - if (38 == *old) { - memcpy (new + len, "&", 5); - len += 5; - } else if (34 == *old && !(quote_style&ENT_NOQUOTES)) { - memcpy (new + len, """, 6); - len += 6; - } else if (39 == *old && (quote_style&ENT_QUOTES)) { - memcpy (new + len, "'", 6); - len += 6; - } else if (60 == *old) { - memcpy (new + len, "<", 4); - len += 4; - } else if (62 == *old) { - memcpy (new + len, ">", 4); - len += 4; - } else if (all && 160 <= *old) { - new [len++] = '&'; - strcpy (new + len, EntTable [*old - 160]); - len += strlen (EntTable [*old - 160]); - new [len++] = ';'; - } else { - new [len++] = *old; + + if (all) { + /* look for a match in the maps for this charset */ + int j; + unsigned char * rep; + + for (j=0; entity_map[j].charset != cs_terminator; j++) { + if (entity_map[j].charset == charset + && this_char >= entity_map[j].basechar + && this_char <= entity_map[j].endchar) + { + rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar]; + if (rep == NULL) { + /* there is no entity for this position; fall through and + * just output the character itself */ + break; + } + + matches_map = 1; + break; + } + } + + if (matches_map) { + new[len++] = '&'; + strcpy(new + len, rep); + len += strlen(rep); + new[len++] = ';'; + } + } + if (!matches_map) { + if (38 == this_char) { + memcpy (new + len, "&", 5); + len += 5; + } else if (34 == this_char && !(quote_style&ENT_NOQUOTES)) { + memcpy (new + len, """, 6); + len += 6; + } else if (39 == this_char && (quote_style&ENT_QUOTES)) { + memcpy (new + len, "'", 6); + len += 6; + } else if (60 == this_char) { + memcpy (new + len, "<", 4); + len += 4; + } else if (62 == this_char) { + memcpy (new + len, ">", 4); + len += 4; + } else if (this_char > 0xff) { + /* a wide char without a named entity; pass through the original sequence */ + memcpy(new + len, mbsequence, mbseqlen); + len += mbseqlen; + } else { + new [len++] = this_char; + } } - old++; } new [len] = '\0'; *newlen = len; return new; + + } static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all) { - zval **arg, **quotes; + zval **arg, **quotes, **charset; int len, quote_style = ENT_COMPAT; int ac = ZEND_NUM_ARGS(); + char *hint_charset = NULL; char *new; - if (ac < 1 || ac > 2 || zend_get_parameters_ex(ac, &arg, "es) == FAILURE) { + if (ac < 1 || ac > 3 || zend_get_parameters_ex(ac, &arg, "es, &charset) == FAILURE) { WRONG_PARAM_COUNT; } @@ -107,8 +365,13 @@ static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all) convert_to_long_ex(quotes); quote_style = (*quotes)->value.lval; } + if (ac == 3) { + convert_to_string_ex(charset); + hint_charset = Z_STRVAL_PP(charset); + } + - new = php_escape_html_entities((*arg)->value.str.val, (*arg)->value.str.len, &len, all, quote_style); + new = php_escape_html_entities((*arg)->value.str.val, (*arg)->value.str.len, &len, all, quote_style, hint_charset); RETVAL_STRINGL(new,len,0); } @@ -124,7 +387,7 @@ void register_html_constants(INIT_FUNC_ARGS) REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS); } -/* {{{ proto string htmlspecialchars(string string [, int quote_style]) +/* {{{ proto string htmlspecialchars(string string [, int quote_style][, string charset]) Convert special characters to HTML entities */ PHP_FUNCTION(htmlspecialchars) { @@ -132,7 +395,7 @@ PHP_FUNCTION(htmlspecialchars) } /* }}} */ -/* {{{ proto string htmlentities(string string [, int quote_style]) +/* {{{ proto string htmlentities(string string [, int quote_style][, string charset]) Convert all applicable characters to HTML entities */ PHP_FUNCTION(htmlentities) { @@ -140,15 +403,16 @@ PHP_FUNCTION(htmlentities) } /* }}} */ -/* {{{ proto array get_html_translation_table([int table [, int quote_style]]) +/* {{{ proto array get_html_translation_table([int table [, int quote_style][, string charset]]) Returns the internal translation table used by htmlspecialchars and htmlentities */ PHP_FUNCTION(get_html_translation_table) { zval **whichone, **quotes; - int which = 0, quote_style = ENT_COMPAT; + int which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT; int ac = ZEND_NUM_ARGS(); - int inx; + int i, j; char ind[ 2 ]; + enum entity_charset charset = determine_charset(NULL); if (ac < 0 || ac > 2 || zend_get_parameters_ex(ac, &whichone, "es) == FAILURE) { WRONG_PARAM_COUNT; @@ -169,11 +433,21 @@ PHP_FUNCTION(get_html_translation_table) switch (which) { case HTML_ENTITIES: - for (inx = 160; inx <= 255; inx++) { - char buffer[16]; - ind[0] = inx; - sprintf(buffer,"&%s;",EntTable[inx-160]); - add_assoc_string(return_value,ind,buffer,1); + for (j=0; entity_map[j].charset != cs_terminator; j++) { + if (entity_map[j].charset != charset) + continue; + for (i = 0; i < entity_map[j].endchar - entity_map[j].basechar; i++) + { + char buffer[16]; + + if (entity_map[j].table[i] == NULL) + continue; + /* what about wide chars here ?? */ + ind[0] = i + entity_map[j].basechar; + sprintf(buffer, "&%s;", entity_map[j].table[i]); + add_assoc_string(return_value, ind, buffer, 1); + + } } /* break thru */ diff --git a/ext/standard/html.h b/ext/standard/html.h index ae0f5e8d65..682ced2273 100644 --- a/ext/standard/html.h +++ b/ext/standard/html.h @@ -31,6 +31,6 @@ PHP_FUNCTION(htmlspecialchars); PHP_FUNCTION(htmlentities); PHP_FUNCTION(get_html_translation_table); -PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style); +PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char * hint_charset); #endif /* HTML_H */ |