summary refs log tree commit diff
path: root/ext/standard/html.c
diff options
context:
space:
mode:
author Moriyoshi Koizumi <moriyoshi@php.net> 2009-10-09 10:02:38 +0000
committer Moriyoshi Koizumi <moriyoshi@php.net> 2009-10-09 10:02:38 +0000
commit 9d19866476291c318ba50c9002ca0e96f356345e (patch)
tree 0ff0dbb95fc5f63ed4cf496a29ccf47cd01a20b6 /ext/standard/html.c
parent f88c96a7f358264738929b0a9d445497d22099a9 (diff)
download php-git-9d19866476291c318ba50c9002ca0e96f356345e.tar.gz
- Fixed bug #49785 (insufficient input string validation of htmlspecialchars()).
Diffstat (limited to 'ext/standard/html.c')
-rw-r--r-- ext/standard/html.c 231
1 files changed, 103 insertions, 128 deletions
diff --git a/ext/standard/html.c b/ext/standard/html.c
index 6759d66023..8845a8aa8c 100644
--- a/ext/standard/html.c
+++ b/ext/standard/html.c
@@ -380,8 +380,8 @@ static entity_table_t ent_macroman[] = {
struct html_entity_map {
enum entity_charset charset; /* charset identifier */
- unsigned short basechar; /* char code at start of table */
- unsigned short endchar; /* last char code in the table */
+ unsigned int basechar; /* char code at start of table */
+ unsigned int endchar; /* last char code in the table */
entity_table_t *table; /* the table of mappings */
};
@@ -492,7 +492,7 @@ struct basic_entities_dec {
/* {{{ get_next_char
*/
-inline static unsigned short get_next_char(enum entity_charset charset,
+inline static unsigned int get_next_char(enum entity_charset charset,
unsigned char * str,
int str_len,
int * newpos,
@@ -503,206 +503,181 @@ inline static unsigned short get_next_char(enum entity_charset charset,
int pos = *newpos;
int mbpos = 0;
int mbspace = *mbseqlen;
- unsigned short this_char = str[pos++];
+ unsigned int this_char = 0;
unsigned char next_char;
*status = SUCCESS;
-
+
if (mbspace <= 0) {
*mbseqlen = 0;
- return this_char;
+ CHECK_LEN(pos, 1);
+ *newpos = pos + 1;
+ return str[pos];
}
-
- MB_WRITE((unsigned char)this_char);
-
+
switch (charset) {
case cs_utf_8:
{
- unsigned long utf = 0;
- int stat = 0;
- int more = 1;
-
- /* unpack utf-8 encoding into a wide char.
- * Code stolen from the mbstring extension */
-
- do {
+ unsigned char c;
+ CHECK_LEN(pos, 1);
+ c = str[pos];
+ if (c < 0x80) {
+ MB_WRITE(c);
+ this_char = c;
+ pos++;
+ } else if (c < 0xc0) {
+ *status = FAILURE;
+ return 0;
+ } else if (c < 0xe0) {
+ CHECK_LEN(pos, 2);
+ this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
if (this_char < 0x80) {
- more = 0;
- if(stat) {
- /* we didn't finish the UTF sequence correctly */
- --pos;
- *status = FAILURE;
- }
- break;
- } else if (this_char < 0xc0) {
- switch (stat) {
- case 0x10: /* 2, 2nd */
- case 0x21: /* 3, 3rd */
- case 0x32: /* 4, 4th */
- case 0x43: /* 5, 5th */
- case 0x54: /* 6, 6th */
- /* last byte in sequence */
- more = 0;
- utf |= (this_char & 0x3f);
- this_char = (unsigned short)utf;
- break;
- case 0x20: /* 3, 2nd */
- case 0x31: /* 4, 3rd */
- case 0x42: /* 5, 4th */
- case 0x53: /* 6, 5th */
- /* penultimate char */
- utf |= ((this_char & 0x3f) << 6);
- stat++;
- break;
- case 0x30: /* 4, 2nd */
- case 0x41: /* 5, 3rd */
- case 0x52: /* 6, 4th */
- utf |= ((this_char & 0x3f) << 12);
- stat++;
- break;
- case 0x40: /* 5, 2nd */
- case 0x51:
- utf |= ((this_char & 0x3f) << 18);
- stat++;
- break;
- case 0x50: /* 6, 2nd */
- utf |= ((this_char & 0x3f) << 24);
- stat++;
- break;
- default:
- /* invalid */
- *status = FAILURE;
- more = 0;
- }
+ *status = FAILURE;
+ return 0;
}
- /* lead byte */
- else if (this_char < 0xe0) {
- stat = 0x10; /* 2 byte */
- utf = (this_char & 0x1f) << 6;
- CHECK_LEN(pos, 1);
- } else if (this_char < 0xf0) {
- stat = 0x20; /* 3 byte */
- utf = (this_char & 0xf) << 12;
- CHECK_LEN(pos, 2);
- } else if (this_char < 0xf8) {
- stat = 0x30; /* 4 byte */
- utf = (this_char & 0x7) << 18;
- CHECK_LEN(pos, 3);
- } else if (this_char < 0xfc) {
- stat = 0x40; /* 5 byte */
- utf = (this_char & 0x3) << 24;
- CHECK_LEN(pos, 4);
- } else if (this_char < 0xfe) {
- stat = 0x50; /* 6 byte */
- utf = (this_char & 0x1) << 30;
- CHECK_LEN(pos, 5);
- } else {
- /* invalid; bail */
- more = 0;
+ MB_WRITE((unsigned char)c);
+ MB_WRITE((unsigned char)str[pos + 1]);
+ pos += 2;
+ } else if (c < 0xf0) {
+ CHECK_LEN(pos, 3);
+ this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
+ if (this_char < 0x800) {
*status = FAILURE;
- break;
+ return 0;
}
-
- if (more) {
- this_char = str[pos++];
- MB_WRITE((unsigned char)this_char);
+ MB_WRITE((unsigned char)c);
+ MB_WRITE((unsigned char)str[pos + 1]);
+ MB_WRITE((unsigned char)str[pos + 2]);
+ pos += 3;
+ } else if (c < 0xf8) {
+ CHECK_LEN(pos, 4);
+ this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
+ if (this_char < 0x10000) {
+ *status = FAILURE;
+ return 0;
}
- } while (more);
+ MB_WRITE((unsigned char)c);
+ MB_WRITE((unsigned char)str[pos + 1]);
+ MB_WRITE((unsigned char)str[pos + 2]);
+ MB_WRITE((unsigned char)str[pos + 3]);
+ pos += 4;
+ } else {
+ *status = FAILURE;
+ return 0;
+ }
}
break;
case cs_big5:
case cs_gb2312:
case cs_big5hkscs:
{
+ CHECK_LEN(pos, 1);
+ this_char = str[pos++];
/* check if this is the first of a 2-byte sequence */
- if (this_char >= 0xa1 && this_char <= 0xfe) {
+ if (this_char >= 0x81 && this_char <= 0xfe) {
/* peek at the next char */
CHECK_LEN(pos, 1);
- next_char = str[pos];
+ next_char = str[pos++];
if ((next_char >= 0x40 && next_char <= 0x7e) ||
(next_char >= 0xa1 && next_char <= 0xfe)) {
/* yes, this a wide char */
- this_char <<= 8;
+ MB_WRITE(this_char);
MB_WRITE(next_char);
- this_char |= next_char;
- pos++;
+ this_char = (this_char << 8) | next_char;
+ } else {
+ *status = FAILURE;
+ return 0;
}
-
+ } else {
+ MB_WRITE(this_char);
}
- break;
}
+ break;
case cs_sjis:
{
+ CHECK_LEN(pos, 1);
+ this_char = str[pos++];
/* check if this is the first of a 2-byte sequence */
- if ( (this_char >= 0x81 && this_char <= 0x9f) ||
- (this_char >= 0xe0 && this_char <= 0xef)
- ) {
+ if ((this_char >= 0x81 && this_char <= 0x9f) ||
+ (this_char >= 0xe0 && this_char <= 0xfc)) {
/* peek at the next char */
CHECK_LEN(pos, 1);
- next_char = str[pos];
+ next_char = str[pos++];
if ((next_char >= 0x40 && next_char <= 0x7e) ||
(next_char >= 0x80 && next_char <= 0xfc))
{
/* yes, this a wide char */
- this_char <<= 8;
+ MB_WRITE(this_char);
MB_WRITE(next_char);
- this_char |= next_char;
- pos++;
+ this_char = (this_char << 8) | next_char;
+ } else {
+ *status = FAILURE;
+ return 0;
}
-
+ } else {
+ MB_WRITE(this_char);
}
break;
}
case cs_eucjp:
{
+ CHECK_LEN(pos, 1);
+ this_char = str[pos++];
/* check if this is the first of a multi-byte sequence */
if (this_char >= 0xa1 && this_char <= 0xfe) {
/* peek at the next char */
CHECK_LEN(pos, 1);
- next_char = str[pos];
+ next_char = str[pos++];
if (next_char >= 0xa1 && next_char <= 0xfe) {
/* yes, this a jis kanji char */
- this_char <<= 8;
+ MB_WRITE(this_char);
MB_WRITE(next_char);
- this_char |= next_char;
- pos++;
+ this_char = (this_char << 8) | next_char;
+ } else {
+ *status = FAILURE;
+ return 0;
}
-
} else if (this_char == 0x8e) {
/* peek at the next char */
CHECK_LEN(pos, 1);
- next_char = str[pos];
+ next_char = str[pos++];
if (next_char >= 0xa1 && next_char <= 0xdf) {
/* JIS X 0201 kana */
- this_char <<= 8;
+ MB_WRITE(this_char);
MB_WRITE(next_char);
- this_char |= next_char;
- pos++;
+ this_char = (this_char << 8) | next_char;
+ } else {
+ *status = FAILURE;
+ return 0;
}
-
} else if (this_char == 0x8f) {
/* peek at the next two char */
unsigned char next2_char;
CHECK_LEN(pos, 2);
next_char = str[pos];
- next2_char = str[pos+1];
+ next2_char = str[pos + 1];
+ pos += 2;
if ((next_char >= 0xa1 && next_char <= 0xfe) &&
(next2_char >= 0xa1 && next2_char <= 0xfe)) {
/* JIS X 0212 hojo-kanji */
- this_char <<= 8;
+ MB_WRITE(this_char);
MB_WRITE(next_char);
- this_char |= next_char;
- pos++;
- this_char <<= 8;
MB_WRITE(next2_char);
- this_char |= next2_char;
- pos++;
+ this_char = (this_char << 16) | (next_char << 8) | next2_char; /* pack lead, 2nd and 3rd byte; the 3rd is next2_char */
+ } else {
+ *status = FAILURE;
+ return 0;
}
-
+ } else {
+ MB_WRITE(this_char);
}
break;
}
default:
+ /* single-byte charsets */
+ CHECK_LEN(pos, 1);
+ this_char = str[pos++];
+ MB_WRITE(this_char);
break;
}
MB_RETURN;
@@ -1133,7 +1108,7 @@ PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *ne
unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */
int mbseqlen = sizeof(mbsequence);
int status = SUCCESS;
- unsigned short this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
+ unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
if(status == FAILURE) {
/* invalid MB sequence */