summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRui Hirokawa <hirokawa@php.net>2002-05-08 12:33:44 +0000
committerRui Hirokawa <hirokawa@php.net>2002-05-08 12:33:44 +0000
commitf30b722f14521fbad2fabe5fdcaa2b60fe97eebb (patch)
tree0b386e8a7bcb10b5736b4d32500ccb34c0bc9ce7
parent2b5beee5ad6947af588e4d5f7131f189b8c90391 (diff)
downloadphp-git-f30b722f14521fbad2fabe5fdcaa2b60fe97eebb.tar.gz
Added support for converting from the script character encoding to the internal character encoding. This feature is very useful for Japanese users who write scripts in Shift_JIS, because some Shift_JIS characters contain the byte 0x5c (backslash), which causes trouble in the Zend parser. This patch was written by Masaki Fujimoto.
-rw-r--r--ext/mbstring/mbfilter.c54
-rw-r--r--ext/mbstring/mbfilter.h8
-rw-r--r--ext/mbstring/mbstring.c207
-rw-r--r--ext/mbstring/mbstring.h14
-rw-r--r--main/main.c7
-rw-r--r--sapi/apache/mod_php4.c9
6 files changed, 295 insertions, 4 deletions
diff --git a/ext/mbstring/mbfilter.c b/ext/mbstring/mbfilter.c
index 3e065fa453..4e335f12fb 100644
--- a/ext/mbstring/mbfilter.c
+++ b/ext/mbstring/mbfilter.c
@@ -685,12 +685,12 @@ static mbfl_encoding mbfl_encoding_2022jp = {
#if defined(HAVE_MBSTR_CN)
-static const char *mbfl_encoding_euc_cn_aliases[] = {"EUC_CN", "eucCN", "x-euc-cn", NULL};
+static const char *mbfl_encoding_euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", NULL};
static mbfl_encoding mbfl_encoding_euc_cn = {
mbfl_no_encoding_euc_cn,
"EUC-CN",
- "EUC-CN",
+ "CN-GB",
(const char *(*)[])&mbfl_encoding_euc_cn_aliases,
mblen_table_euccn,
MBFL_ENCTYPE_MBCS
@@ -721,12 +721,12 @@ static mbfl_encoding mbfl_encoding_euc_tw = {
MBFL_ENCTYPE_MBCS
};
-static const char *mbfl_encoding_big5_aliases[] = {"big5", "CP950", NULL};
+static const char *mbfl_encoding_big5_aliases[] = {"CN-BIG5", "BIG5", "BIG-FIVE", "BIGFIVE", "CP950", NULL};
static mbfl_encoding mbfl_encoding_big5 = {
mbfl_no_encoding_big5,
"BIG-5",
- "BIG-5",
+ "CN-BIG5",
(const char *(*)[])&mbfl_encoding_big5_aliases,
mblen_table_big5,
MBFL_ENCTYPE_MBCS
@@ -6995,7 +6995,53 @@ mbfl_strlen(mbfl_string *string TSRMLS_DC)
return len;
}
+#ifdef ZEND_MULTIBYTE
+/*
+ * oddlen
+ *
+ * For encodings that have a byte-length table, walks `string` one
+ * character at a time (using the length looked up from each lead byte)
+ * and returns how many bytes the last step ran past string->len, i.e.
+ * the number of bytes missing from a truncated trailing character
+ * (0 when the data ends on a character boundary).
+ *
+ * Single-byte encodings and encodings without a length table yield 0.
+ * Returns -1 when `string` is NULL or its encoding is unknown.
+ */
+int
+mbfl_oddlen(mbfl_string *string)
+{
+	int len, n, m, k;
+	unsigned char *p;
+	const unsigned char *mbtab;
+	mbfl_encoding *encoding;
+
+	/* check the argument before string->no_encoding is read */
+	if (string == NULL) {
+		return -1;
+	}
+	encoding = mbfl_no2encoding(string->no_encoding);
+	if (encoding == NULL) {
+		return -1;
+	}
+	/*
+	 * NOTE(review): len is never updated from string->len, so the two
+	 * fixed-width branches below always return 0. Kept as is to preserve
+	 * what callers currently observe -- confirm whether string->len was
+	 * intended here.
+	 */
+	len = 0;
+	if (encoding->flag & MBFL_ENCTYPE_SBCS) {
+		return 0;
+	} else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) {
+		return len % 2;
+	} else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) {
+		return len % 4;
+	} else if (encoding->mblen_table != NULL) {
+		mbtab = encoding->mblen_table;
+		n = 0;
+		p = string->val;
+		k = string->len;
+		/* count */
+		if (p != NULL) {
+			/* assumes every table entry is >= 1, otherwise n never advances -- TODO confirm */
+			while (n < k) {
+				m = mbtab[*p];
+				n += m;
+				p += m;
+			};
+		}
+		/* n overshoots k only when the final character is incomplete */
+		return n-k;
+	} else {
+		/* how can i do ? */
+		return 0;
+	}
+	/* NOT REACHED */
+}
+#endif /* ZEND_MULTIBYTE */
+
+
/*
* strpos
*/
diff --git a/ext/mbstring/mbfilter.h b/ext/mbstring/mbfilter.h
index 65ee94b573..a5077bd578 100644
--- a/ext/mbstring/mbfilter.h
+++ b/ext/mbstring/mbfilter.h
@@ -461,6 +461,14 @@ mbfl_identify_encoding_no(mbfl_string *string, enum mbfl_no_encoding *elist, int
int
mbfl_strlen(mbfl_string *string TSRMLS_DC);
+#ifdef ZEND_MULTIBYTE
+/*
+ * oddlen
+ */
+int
+mbfl_oddlen(mbfl_string *string);
+#endif /* ZEND_MULTIBYTE */
+
/*
* strpos
*/
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 40d16522d2..5b40ff3f89 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -65,6 +65,10 @@
#include "php_content_types.h"
#include "SAPI.h"
+#ifdef ZEND_MULTIBYTE
+#include "zend_multibyte.h"
+#endif /* ZEND_MULTIBYTE */
+
#if HAVE_MBSTRING
#if HAVE_MBREGEX
@@ -524,6 +528,25 @@ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
return SUCCESS;
}
+#ifdef ZEND_MULTIBYTE
+/*
+ * INI update handler: parses the new value as an encoding list and, on
+ * success, replaces the stored script encoding list (releasing the old
+ * one). Returns FAILURE and leaves the stored list alone when the value
+ * cannot be parsed.
+ */
+static PHP_INI_MH(OnUpdate_mbstring_script_encoding)
+{
+	int *parsed_list, parsed_size;
+
+	/* NOTE(review): the trailing 1 presumably requests a malloc()-backed list, matching the free() below -- confirm */
+	if (!php_mbstring_parse_encoding_list(new_value, new_value_length, &parsed_list, &parsed_size, 1)) {
+		return FAILURE;
+	}
+
+	/* drop the previous list before installing the new one */
+	if (MBSTRG(script_encoding_list) != NULL) {
+		free(MBSTRG(script_encoding_list));
+	}
+	MBSTRG(script_encoding_list) = parsed_list;
+	MBSTRG(script_encoding_list_size) = parsed_size;
+
+	return SUCCESS;
+}
+#endif /* ZEND_MULTIBYTE */
+
static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
{
if (new_value != NULL) {
@@ -546,6 +569,9 @@ PHP_INI_BEGIN()
PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding)
+#ifdef ZEND_MULTIBYTE
+ PHP_INI_ENTRY("mbstring.script_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_script_encoding)
+#endif /* ZEND_MULTIBYTE */
PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
STD_PHP_INI_ENTRY("mbstring.func_overload", "0", PHP_INI_SYSTEM, OnUpdateInt, func_overload, zend_mbstring_globals, mbstring_globals)
PHP_INI_END()
@@ -579,6 +605,10 @@ php_mbstring_init_globals(zend_mbstring_globals *pglobals TSRMLS_DC)
MBSTRG(internal_encoding) = mbfl_no_encoding_euc_jp;
MBSTRG(current_internal_encoding) = mbfl_no_encoding_euc_jp;
#endif
+#ifdef ZEND_MULTIBYTE
+ MBSTRG(script_encoding_list) = NULL;
+ MBSTRG(script_encoding_list_size) = 0;
+#endif /* ZEND_MULTIBYTE */
MBSTRG(http_output_encoding) = mbfl_no_encoding_pass;
MBSTRG(current_http_output_encoding) = mbfl_no_encoding_pass;
MBSTRG(http_input_identify) = mbfl_no_encoding_invalid;
@@ -640,6 +670,11 @@ PHP_MSHUTDOWN_FUNCTION(mbstring)
if (MBSTRG(http_input_list)) {
free(MBSTRG(http_input_list));
}
+#ifdef ZEND_MULTIBYTE
+ if (MBSTRG(script_encoding_list)) {
+ free(MBSTRG(script_encoding_list));
+ }
+#endif /* ZEND_MULTIBYTE */
if (MBSTRG(detect_order_list)) {
free(MBSTRG(detect_order_list));
}
@@ -858,6 +893,9 @@ PHP_FUNCTION(mb_internal_encoding)
RETURN_FALSE;
} else {
MBSTRG(current_internal_encoding) = no_encoding;
+#ifdef ZEND_MULTIBYTE
+ zend_multibyte_set_internal_encoding(Z_STRVAL_PP(arg1), Z_STRLEN_PP(arg1) TSRMLS_CC);
+#endif /* ZEND_MULTIBYTE */
RETURN_TRUE;
}
} else {
@@ -3174,6 +3212,175 @@ PHP_FUNCTION(mb_get_info)
}
/* }}} */
+
+#ifdef ZEND_MULTIBYTE
+/*
+ * Passes mbstring's multibyte configuration to the Zend Engine:
+ *  - the script encoding list, joined into one comma-separated string
+ *    of encoding names (NULL / length 0 when no name is available);
+ *  - the detector and oddlen callbacks, plus the converter callback and
+ *    the current internal encoding when MBSTR_ENC_TRANS is defined.
+ *
+ * Returns 0 on success, -1 if the name list could not be allocated.
+ */
+PHPAPI int php_mbstring_set_zend_encoding(TSRMLS_D)
+{
+	/* 'd better use mbfl_memory_device? */
+	char *name, *list = NULL, *grown;
+	int n, *entry, list_size = 0;
+	zend_encoding_detector encoding_detector;
+	zend_encoding_converter encoding_converter;
+	zend_multibyte_oddlen multibyte_oddlen;
+
+	/* notify script encoding to Zend Engine */
+	entry = MBSTRG(script_encoding_list);
+	n = MBSTRG(script_encoding_list_size);
+	while (n > 0) {
+		name = (char *)mbfl_no_encoding2name(*entry);
+		if (name) {
+			/*
+			 * The +1 pays for the terminating NUL on the first name and
+			 * for the ',' separator on every later one.
+			 */
+			list_size += strlen(name) + 1;
+			if (!list) {
+				list = (char *)emalloc(list_size);
+				if (!list) {
+					return -1;
+				}
+				*list = '\0';
+			} else {
+				/* keep the old pointer until the resize is known to have worked */
+				grown = (char *)erealloc(list, list_size);
+				if (!grown) {
+					efree(list);
+					return -1;
+				}
+				list = grown;
+				strcat(list, ",");
+			}
+			strcat(list, name);
+		}
+		entry++;
+		n--;
+	}
+	zend_multibyte_set_script_encoding(list, (list ? strlen(list) : 0) TSRMLS_CC);
+	if (list) {
+		efree(list);
+	}
+
+	encoding_detector = php_mbstring_encoding_detector;
+	encoding_converter = NULL;
+	multibyte_oddlen = php_mbstring_oddlen;
+
+#if defined(MBSTR_ENC_TRANS)
+	/* notify internal encoding to Zend Engine */
+	name = (char*)mbfl_no_encoding2name(MBSTRG(current_internal_encoding));
+	/* the loop above treats this lookup as nullable, so do the same here */
+	if (name) {
+		zend_multibyte_set_internal_encoding(name, strlen(name) TSRMLS_CC);
+	}
+
+	encoding_converter = php_mbstring_encoding_converter;
+#endif /* defined(MBSTR_ENC_TRANS) */
+
+	zend_multibyte_set_functions(encoding_detector, encoding_converter,
+			multibyte_oddlen TSRMLS_CC);
+
+	return 0;
+}
+
+/*
+ * mb_detect_encoding (interface for Zend Engine)
+ *
+ * Parses `arg_list` as an encoding list and identifies which of those
+ * encodings the first `arg_length` bytes of `arg_string` are in.
+ *
+ * Returns an estrdup()'d copy of the identified encoding name, which the
+ * caller must efree(), or NULL when `arg_list` is NULL, yields no
+ * encodings, or no encoding could be identified.
+ */
+char* php_mbstring_encoding_detector(char *arg_string, int arg_length, char *arg_list TSRMLS_DC)
+{
+	mbfl_string string;
+	const char *ret;
+	enum mbfl_no_encoding *elist;
+	int size, *list;
+
+	/* no candidate list: same outcome as an empty one, without strlen(NULL) */
+	if (arg_list == NULL) {
+		return NULL;
+	}
+
+	/* make encoding list */
+	list = NULL;
+	size = 0;
+	php_mbstring_parse_encoding_list(arg_list, strlen(arg_list), &list, &size, 0);
+	if (size <= 0) {
+		/* release anything the parser handed back before bailing out */
+		if (list != NULL) {
+			efree((void *)list);
+		}
+		return NULL;
+	}
+
+	if (list != NULL) {
+		elist = (enum mbfl_no_encoding *)list;
+	} else {
+		elist = MBSTRG(current_detect_order_list);
+		size = MBSTRG(current_detect_order_list_size);
+	}
+
+	mbfl_string_init(&string);
+	string.no_language = MBSTRG(current_language);
+	string.val = arg_string;
+	string.len = arg_length;
+	ret = mbfl_identify_encoding_name(&string, elist, size);
+	if (list != NULL) {
+		efree((void *)list);
+	}
+	if (ret != NULL) {
+		return estrdup(ret);
+	} else {
+		return NULL;
+	}
+}
+
+
+/*
+ * mb_convert_encoding (interface for Zend Engine)
+ *
+ * Converts `from_length` bytes at `from` from `encoding_from` to
+ * `encoding_to`. On success stores the converted buffer and its length
+ * in *to / *to_length and returns 0. Returns -1, leaving *to and
+ * *to_length untouched, when either encoding name is unknown, the
+ * converter cannot be created, or the conversion yields no result.
+ */
+int php_mbstring_encoding_converter(char **to, int *to_length, char *from,
+		int from_length, const char *encoding_to, const char *encoding_from
+		TSRMLS_DC)
+{
+	mbfl_string src, dst, *converted;
+	enum mbfl_no_encoding no_from, no_to;
+	mbfl_buffer_converter *conv;
+
+	/* resolve both encoding names, target first */
+	no_to = mbfl_name2no_encoding(encoding_to);
+	if (no_to == mbfl_no_encoding_invalid) {
+		return -1;
+	}
+	no_from = mbfl_name2no_encoding(encoding_from);
+	if (no_from == mbfl_no_encoding_invalid) {
+		return -1;
+	}
+
+	/* describe the input buffer; dst starts out empty */
+	mbfl_string_init(&src);
+	mbfl_string_init(&dst);
+	src.no_encoding = no_from;
+	src.no_language = MBSTRG(current_language);
+	src.val = from;
+	src.len = from_length;
+
+	/* set up the converter with the current illegal-character policy */
+	conv = mbfl_buffer_converter_new(no_from, no_to, src.len);
+	if (conv == NULL) {
+		return -1;
+	}
+	mbfl_buffer_converter_illegal_mode(conv, MBSTRG(current_filter_illegal_mode));
+	mbfl_buffer_converter_illegal_substchar(conv, MBSTRG(current_filter_illegal_substchar));
+
+	/* run the conversion; the converter is released on both outcomes */
+	converted = mbfl_buffer_converter_feed_result(conv, &src, &dst);
+	if (converted == NULL) {
+		mbfl_buffer_converter_delete(conv);
+		return -1;
+	}
+	*to = converted->val;
+	*to_length = converted->len;
+	mbfl_buffer_converter_delete(conv);
+
+	return 0;
+}
+
+
+/*
+ * oddlen (interface for Zend Engine)
+ *
+ * Wraps the caller's buffer in an mbfl_string tagged with `encoding`
+ * and returns the result of mbfl_oddlen() for it. Returns 0 when
+ * `encoding` is not a recognised encoding name.
+ */
+int php_mbstring_oddlen(char *string, int length, const char *encoding TSRMLS_DC)
+{
+	mbfl_string subject;
+	enum mbfl_no_encoding no_enc;
+
+	/* an unknown encoding name means there is nothing to measure */
+	no_enc = mbfl_name2no_encoding(encoding);
+	if (no_enc == mbfl_no_encoding_invalid) {
+		return 0;
+	}
+
+	mbfl_string_init(&subject);
+	subject.no_language = MBSTRG(current_language);
+	subject.no_encoding = no_enc;
+	subject.val = string;
+	subject.len = length;
+
+	return mbfl_oddlen(&subject);
+}
+
+#endif /* ZEND_MULTIBYTE */
+
#endif /* HAVE_MBSTRING */
/*
diff --git a/ext/mbstring/mbstring.h b/ext/mbstring/mbstring.h
index 22ff290c5d..98623e77b4 100644
--- a/ext/mbstring/mbstring.h
+++ b/ext/mbstring/mbstring.h
@@ -129,6 +129,10 @@ ZEND_BEGIN_MODULE_GLOBALS(mbstring)
int current_language;
int internal_encoding;
int current_internal_encoding;
+#ifdef ZEND_MULTIBYTE
+ int *script_encoding_list;
+ int script_encoding_list_size;
+#endif /* ZEND_MULTIBYTE */
int http_output_encoding;
int current_http_output_encoding;
int http_input_identify;
@@ -177,6 +181,16 @@ struct mb_overload_def {
#define MBSTRG(v) (mbstring_globals.v)
#endif
+#ifdef ZEND_MULTIBYTE
+PHPAPI int php_mbstring_set_zend_encoding(TSRMLS_D);
+char* php_mbstring_encoding_detector(char *string, int length, char *list
+ TSRMLS_DC);
+int php_mbstring_encoding_converter(char **to, int *to_length, char *from,
+ int from_length, const char *encoding_to, const char *encoding_from
+ TSRMLS_DC);
+int php_mbstring_oddlen(char *string, int length, const char *encoding TSRMLS_DC);
+#endif /* ZEND_MULTIBYTE */
+
#else /* HAVE_MBSTRING */
#define mbstring_module_ptr NULL
diff --git a/main/main.c b/main/main.c
index fcd73d8239..16b47b6cea 100644
--- a/main/main.c
+++ b/main/main.c
@@ -75,6 +75,10 @@
#include "php_logos.h"
#include "php_streams.h"
+#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
+#include "ext/mbstring/mbstring.h"
+#endif /* defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING) */
+
#include "SAPI.h"
/* }}} */
@@ -1402,6 +1406,9 @@ PHPAPI int php_execute_script(zend_file_handle *primary_file TSRMLS_DC)
} else {
append_file_p = NULL;
}
+#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
+ php_mbstring_set_zend_encoding(TSRMLS_C);
+#endif /* ZEND_MULTIBYTE && HAVE_MBSTRING */
retval = (zend_execute_scripts(ZEND_REQUIRE TSRMLS_CC, NULL, 3, prepend_file_p, primary_file, append_file_p) == SUCCESS);
} zend_end_try();
diff --git a/sapi/apache/mod_php4.c b/sapi/apache/mod_php4.c
index d3ed4ef0bd..9c23523178 100644
--- a/sapi/apache/mod_php4.c
+++ b/sapi/apache/mod_php4.c
@@ -21,6 +21,10 @@
#include "php_apache_http.h"
+#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
+#include "ext/mbstring/mbstring.h"
+#endif /* defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING) */
+
#undef shutdown
/* {{{ Prototypes
@@ -459,6 +463,11 @@ static int send_php(request_rec *r, int display_source_mode, char *filename)
fh.opened_path = NULL;
fh.free_filename = 0;
fh.type = ZEND_HANDLE_FILENAME;
+
+#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
+ php_mbstring_set_zend_encoding(TSRMLS_C);
+#endif /* defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING) */
+
zend_execute_scripts(ZEND_INCLUDE TSRMLS_CC, NULL, 1, &fh);
return OK;
}