diff options
Diffstat (limited to 'Zend/zend_language_scanner.l')
| -rw-r--r-- | Zend/zend_language_scanner.l | 254 |
1 files changed, 223 insertions, 31 deletions
diff --git a/Zend/zend_language_scanner.l b/Zend/zend_language_scanner.l
index b55237db9a..1d54b53ad8 100644
--- a/Zend/zend_language_scanner.l
+++ b/Zend/zend_language_scanner.l
@@ -181,16 +181,13 @@ ZEND_API void zend_save_lexical_state(zend_lex_state *lex_state TSRMLS_DC)
 	lex_state->filename = zend_get_compiled_filename(TSRMLS_C);
 	lex_state->lineno = CG(zend_lineno);
 
-	if (CG(multibyte)) {
-		lex_state->script_org = SCNG(script_org);
-		lex_state->script_org_size = SCNG(script_org_size);
-		lex_state->script_filtered = SCNG(script_filtered);
-		lex_state->script_filtered_size = SCNG(script_filtered_size);
-		lex_state->input_filter = SCNG(input_filter);
-		lex_state->output_filter = SCNG(output_filter);
-		lex_state->script_encoding = SCNG(script_encoding);
-		lex_state->internal_encoding = SCNG(internal_encoding);
-	}
+	lex_state->script_org = SCNG(script_org);
+	lex_state->script_org_size = SCNG(script_org_size);
+	lex_state->script_filtered = SCNG(script_filtered);
+	lex_state->script_filtered_size = SCNG(script_filtered_size);
+	lex_state->input_filter = SCNG(input_filter);
+	lex_state->output_filter = SCNG(output_filter);
+	lex_state->script_encoding = SCNG(script_encoding);
 }
 
 ZEND_API void zend_restore_lexical_state(zend_lex_state *lex_state TSRMLS_DC)
@@ -209,24 +206,22 @@ ZEND_API void zend_restore_lexical_state(zend_lex_state *lex_state TSRMLS_DC)
 	YYSETCONDITION(lex_state->yy_state);
 	CG(zend_lineno) = lex_state->lineno;
 	zend_restore_compiled_filename(lex_state->filename TSRMLS_CC);
-	if (CG(multibyte)) {
-		if (SCNG(script_org)) {
-			efree(SCNG(script_org));
-			SCNG(script_org) = NULL;
-		}
-		if (SCNG(script_filtered)) {
-			efree(SCNG(script_filtered));
-			SCNG(script_filtered) = NULL;
-		}
-		SCNG(script_org) = lex_state->script_org;
-		SCNG(script_org_size) = lex_state->script_org_size;
-		SCNG(script_filtered) = lex_state->script_filtered;
-		SCNG(script_filtered_size) = lex_state->script_filtered_size;
-		SCNG(input_filter) = lex_state->input_filter;
-		SCNG(output_filter) = lex_state->output_filter;
-		SCNG(script_encoding) = lex_state->script_encoding;
-		SCNG(internal_encoding) = lex_state->internal_encoding;
+
+	if (SCNG(script_org)) {
+		efree(SCNG(script_org));
+		SCNG(script_org) = NULL;
 	}
+	if (SCNG(script_filtered)) {
+		efree(SCNG(script_filtered));
+		SCNG(script_filtered) = NULL;
+	}
+	SCNG(script_org) = lex_state->script_org;
+	SCNG(script_org_size) = lex_state->script_org_size;
+	SCNG(script_filtered) = lex_state->script_filtered;
+	SCNG(script_filtered_size) = lex_state->script_filtered_size;
+	SCNG(input_filter) = lex_state->input_filter;
+	SCNG(output_filter) = lex_state->output_filter;
+	SCNG(script_encoding) = lex_state->script_encoding;
 }
 
 ZEND_API void zend_destroy_file_handle(zend_file_handle *file_handle TSRMLS_DC)
@@ -239,6 +234,203 @@ ZEND_API void zend_destroy_file_handle(zend_file_handle *file_handle TSRMLS_DC)
 	}
 }
 
+#define BOM_UTF32_BE "\x00\x00\xfe\xff"
+#define BOM_UTF32_LE "\xff\xfe\x00\x00"
+#define BOM_UTF16_BE "\xfe\xff"
+#define BOM_UTF16_LE "\xff\xfe"
+#define BOM_UTF8 "\xef\xbb\xbf"
+
+static const zend_encoding *zend_multibyte_detect_utf_encoding(const unsigned char *script, size_t script_size TSRMLS_DC)
+{
+	const unsigned char *p;
+	int wchar_size = 2;
+	int le = 0;
+
+	/* utf-16 or utf-32? */
+	p = script;
+	while ((p-script) < script_size) {
+		p = memchr(p, 0, script_size-(p-script)-2);
+		if (!p) {
+			break;
+		}
+		if (*(p+1) == '\0' && *(p+2) == '\0') {
+			wchar_size = 4;
+			break;
+		}
+
+		/* searching for UTF-32 specific byte orders, so this will do */
+		p += 4;
+	}
+
+	/* BE or LE? */
+	p = script;
+	while ((p-script) < script_size) {
+		if (*p == '\0' && *(p+wchar_size-1) != '\0') {
+			/* BE */
+			le = 0;
+			break;
+		} else if (*p != '\0' && *(p+wchar_size-1) == '\0') {
+			/* LE* */
+			le = 1;
+			break;
+		}
+		p += wchar_size;
+	}
+
+	if (wchar_size == 2) {
+		return le ? zend_multibyte_encoding_utf16le : zend_multibyte_encoding_utf16be;
+	} else {
+		return le ? zend_multibyte_encoding_utf32le : zend_multibyte_encoding_utf32be;
+	}
+
+	return NULL;
+}
+
+static const zend_encoding* zend_multibyte_detect_unicode(TSRMLS_D)
+{
+	const zend_encoding *script_encoding = NULL;
+	int bom_size;
+	unsigned char *script;
+	unsigned char *pos1, *pos2;
+
+	if (LANG_SCNG(script_org_size) < sizeof(BOM_UTF32_LE)-1) {
+		return NULL;
+	}
+
+	/* check out BOM */
+	if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_BE, sizeof(BOM_UTF32_BE)-1)) {
+		script_encoding = zend_multibyte_encoding_utf32be;
+		bom_size = sizeof(BOM_UTF32_BE)-1;
+	} else if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_LE, sizeof(BOM_UTF32_LE)-1)) {
+		script_encoding = zend_multibyte_encoding_utf32le;
+		bom_size = sizeof(BOM_UTF32_LE)-1;
+	} else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_BE, sizeof(BOM_UTF16_BE)-1)) {
+		script_encoding = zend_multibyte_encoding_utf16be;
+		bom_size = sizeof(BOM_UTF16_BE)-1;
+	} else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_LE, sizeof(BOM_UTF16_LE)-1)) {
+		script_encoding = zend_multibyte_encoding_utf16le;
+		bom_size = sizeof(BOM_UTF16_LE)-1;
+	} else if (!memcmp(LANG_SCNG(script_org), BOM_UTF8, sizeof(BOM_UTF8)-1)) {
+		script_encoding = zend_multibyte_encoding_utf8;
+		bom_size = sizeof(BOM_UTF8)-1;
+	}
+
+	if (script_encoding) {
+		/* remove BOM */
+		script = (unsigned char*)emalloc(LANG_SCNG(script_org_size)+1-bom_size);
+		memcpy(script, LANG_SCNG(script_org)+bom_size, LANG_SCNG(script_org_size)+1-bom_size);
+		efree(LANG_SCNG(script_org));
+		LANG_SCNG(script_org) = script;
+		LANG_SCNG(script_org_size) -= bom_size;
+
+		return script_encoding;
+	}
+
+	/* script contains NULL bytes -> auto-detection */
+	if ((pos1 = memchr(LANG_SCNG(script_org), 0, LANG_SCNG(script_org_size)))) {
+		/* check if the NULL byte is after the __HALT_COMPILER(); */
+		pos2 = LANG_SCNG(script_org);
+
+		while (pos1 - pos2 >= sizeof("__HALT_COMPILER();")-1) {
+			pos2 = memchr(pos2, '_', pos1 - pos2);
+			if (!pos2) break;
+			pos2++;
+			if (strncasecmp((char*)pos2, "_HALT_COMPILER", sizeof("_HALT_COMPILER")-1) == 0) {
+				pos2 += sizeof("_HALT_COMPILER")-1;
+				while (*pos2 == ' ' ||
+				       *pos2 == '\t' ||
+				       *pos2 == '\r' ||
+				       *pos2 == '\n') {
+					pos2++;
+				}
+				if (*pos2 == '(') {
+					pos2++;
+					while (*pos2 == ' ' ||
+					       *pos2 == '\t' ||
+					       *pos2 == '\r' ||
+					       *pos2 == '\n') {
+						pos2++;
+					}
+					if (*pos2 == ')') {
+						pos2++;
+						while (*pos2 == ' ' ||
+						       *pos2 == '\t' ||
+						       *pos2 == '\r' ||
+						       *pos2 == '\n') {
+							pos2++;
+						}
+						if (*pos2 == ';') {
+							return NULL;
+						}
+					}
+				}
+			}
+		}
+		/* make best effort if BOM is missing */
+		return zend_multibyte_detect_utf_encoding(LANG_SCNG(script_org), LANG_SCNG(script_org_size) TSRMLS_CC);
+	}
+
+	return NULL;
+}
+
+static const zend_encoding* zend_multibyte_find_script_encoding(TSRMLS_D)
+{
+	const zend_encoding *script_encoding;
+
+	if (CG(detect_unicode)) {
+		/* check out bom(byte order mark) and see if containing wchars */
+		script_encoding = zend_multibyte_detect_unicode(TSRMLS_C);
+		if (script_encoding != NULL) {
+			/* bom or wchar detection is prior to 'script_encoding' option */
+			return script_encoding;
+		}
+	}
+
+	/* if no script_encoding specified, just leave alone */
+	if (!CG(script_encoding_list) || !CG(script_encoding_list_size)) {
+		return NULL;
+	}
+
+	/* if multiple encodings specified, detect automagically */
+	if (CG(script_encoding_list_size) > 1) {
+		return zend_multibyte_encoding_detector(LANG_SCNG(script_org), LANG_SCNG(script_org_size), CG(script_encoding_list), CG(script_encoding_list_size) TSRMLS_CC);
+	}
+
+	return CG(script_encoding_list)[0];
+}
+
+ZEND_API int zend_multibyte_set_filter(const zend_encoding *onetime_encoding TSRMLS_DC)
+{
+	const zend_encoding *internal_encoding = zend_multibyte_get_internal_encoding(TSRMLS_C);
+	const zend_encoding *script_encoding = onetime_encoding ? onetime_encoding: zend_multibyte_find_script_encoding(TSRMLS_C);
+
+	if (!script_encoding) {
+		return FAILURE;
+	}
+
+	/* judge input/output filter */
+	LANG_SCNG(script_encoding) = script_encoding;
+	LANG_SCNG(input_filter) = NULL;
+	LANG_SCNG(output_filter) = NULL;
+
+	if (!internal_encoding || LANG_SCNG(script_encoding) == internal_encoding) {
+		if (!zend_multibyte_check_lexer_compatibility(LANG_SCNG(script_encoding))) {
+			/* and if not, work around w/ script_encoding -> utf-8 -> script_encoding conversion */
+			LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
+			LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
+		} else {
+			LANG_SCNG(input_filter) = NULL;
+			LANG_SCNG(output_filter) = NULL;
+		}
+		return SUCCESS;
+	}
+
+	/* both script and internal encodings are incompatible w/ flex */
+	LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
+	LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
+
+	return 0;
+}
 
 ZEND_API int open_file_for_scanning(zend_file_handle *file_handle TSRMLS_DC)
 {
@@ -286,13 +478,13 @@ ZEND_API int open_file_for_scanning(zend_file_handle *file_handle TSRMLS_DC)
 				SCNG(input_filter)(&SCNG(script_filtered), &SCNG(script_filtered_size), SCNG(script_org), SCNG(script_org_size) TSRMLS_CC);
 				if (SCNG(script_filtered) == NULL) {
 					zend_error_noreturn(E_COMPILE_ERROR, "Could not convert the script from the detected "
-							"encoding \"%s\" to a compatible encoding", LANG_SCNG(script_encoding)->name);
+							"encoding \"%s\" to a compatible encoding", zend_multibyte_get_encoding_name(LANG_SCNG(script_encoding)));
 				}
 			}
 			SCNG(yy_start) = SCNG(script_filtered) - offset;
 			yy_scan_buffer((char *)SCNG(script_filtered), SCNG(script_filtered_size) TSRMLS_CC);
 		} else {
-			SCNG(yy_start) = buf - offset;
+			SCNG(yy_start) = (unsigned char *)buf - offset;
 			yy_scan_buffer(buf, size TSRMLS_CC);
 		}
 	} else {
@@ -441,7 +633,7 @@ ZEND_API int zend_prepare_string_for_scanning(zval *str, char *filename TSRMLS_D
 	SCNG(script_org) = (unsigned char *)estrdup(str->value.str.val);
 	SCNG(script_org_size) = str->value.str.len;
 
-	zend_multibyte_set_filter(CG(internal_encoding) TSRMLS_CC);
+	zend_multibyte_set_filter(zend_multibyte_get_internal_encoding(TSRMLS_C) TSRMLS_CC);
 
 	if (!SCNG(input_filter)) {
 		SCNG(script_filtered) = (unsigned char*)emalloc(SCNG(script_org_size)+1);
@@ -615,7 +807,7 @@ ZEND_API void zend_multibyte_yyinput_again(zend_encoding_filter old_input_filter
 	/* calculate current position */
 	offset = original_offset = YYCURSOR - SCNG(yy_start);
 	if (old_input_filter && offset > 0) {
-		zend_encoding *new_encoding = SCNG(script_encoding);
+		const zend_encoding *new_encoding = SCNG(script_encoding);
 		zend_encoding_filter new_filter = SCNG(input_filter);
 		SCNG(script_encoding) = old_encoding;
 		SCNG(input_filter) = old_input_filter;
