diff options
Diffstat (limited to 'ext/mbstring/libmbfl/mbfl/mbfilter.c')
-rw-r--r-- | ext/mbstring/libmbfl/mbfl/mbfilter.c | 279 |
1 files changed, 90 insertions, 189 deletions
diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c index 793dd8e078..1904786576 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c @@ -86,9 +86,11 @@ #include "mbfl_filter_output.h" #include "mbfilter_8bit.h" #include "mbfilter_wchar.h" -#include "filters/mbfilter_ascii.h" +#include "mbstring.h" +#include "php_unicode.h" #include "filters/mbfilter_base64.h" #include "filters/mbfilter_qprint.h" +#include "filters/mbfilter_singlebyte.h" #include "filters/mbfilter_tl_jisx0201_jisx0208.h" #include "filters/mbfilter_utf8.h" @@ -200,7 +202,6 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str size_t n; unsigned char *p; mbfl_convert_filter *filter; - int (*filter_function)(int c, mbfl_convert_filter *filter); ZEND_ASSERT(convd); ZEND_ASSERT(string); @@ -212,9 +213,8 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str filter = convd->filter1; if (filter != NULL) { - filter_function = filter->filter_function; while (n > 0) { - if ((*filter_function)(*p++, filter) < 0) { + if ((*filter->filter_function)(*p++, filter) < 0) { return p - string->val; } n--; @@ -234,9 +234,6 @@ mbfl_buffer_converter_flush(mbfl_buffer_converter *convd) if (convd->filter1 != NULL) { mbfl_convert_filter_flush(convd->filter1); } - if (convd->filter2 != NULL) { - mbfl_convert_filter_flush(convd->filter2); - } return 0; } @@ -262,9 +259,6 @@ mbfl_buffer_converter_feed_result(mbfl_buffer_converter *convd, mbfl_string *str if (convd->filter1 != NULL) { mbfl_convert_filter_flush(convd->filter1); } - if (convd->filter2 != NULL) { - mbfl_convert_filter_flush(convd->filter2); - } result->encoding = convd->to; return mbfl_memory_device_result(&convd->device, result); } @@ -291,126 +285,102 @@ size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd) /* * encoding detector */ -mbfl_encoding_detector * -mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict) +static int mbfl_estimate_encoding_likelihood(int c, void* data) { - mbfl_encoding_detector *identd; - - int i, num; - mbfl_identify_filter *filter; + mbfl_convert_filter *filter = *((mbfl_convert_filter**)data); + uintptr_t *score = (uintptr_t*)(&filter->opaque); + + /* Receive wchars decoded from test string using candidate encoding + * If the test string was invalid in the candidate encoding, we assume + * it's the wrong one. */ + if (c & MBFL_WCSGROUP_THROUGH) { + filter->num_illegalchar++; + } else if (php_unicode_is_cntrl(c) || php_unicode_is_private(c)) { + /* Otherwise, count how many control characters and 'private use' + * codepoints we see. Those are rarely used and may indicate that + * the candidate encoding is not the right one. */ + *score += 10; + } else if (php_unicode_is_punct(c)) { + /* Punctuation is also less common than letters/digits */ + (*score)++; + } + return c; +} - if (elist == NULL || elistsz <= 0) { +mbfl_encoding_detector *mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict) +{ + if (!elistsz) { return NULL; } - /* allocate */ - identd = emalloc(sizeof(mbfl_encoding_detector)); - identd->filter_list = ecalloc(elistsz, sizeof(mbfl_identify_filter *)); - - /* create filters */ - i = 0; - num = 0; - while (i < elistsz) { - filter = mbfl_identify_filter_new2(elist[i]); - if (filter != NULL) { - identd->filter_list[num] = filter; - num++; - } - i++; + mbfl_encoding_detector *identd = emalloc(sizeof(mbfl_encoding_detector)); + identd->filter_list = ecalloc(elistsz, sizeof(mbfl_convert_filter*)); + for (int i = 0; i < elistsz; i++) { + identd->filter_list[i] = mbfl_convert_filter_new(elist[i], &mbfl_encoding_wchar, + mbfl_estimate_encoding_likelihood, NULL, &identd->filter_list[i]); + identd->filter_list[i]->opaque = (void*)0; } - identd->filter_list_size = num; - - /* set strict flag */ + identd->filter_list_size = elistsz; identd->strict = strict; - return identd; } - -void -mbfl_encoding_detector_delete(mbfl_encoding_detector *identd) +void mbfl_encoding_detector_delete(mbfl_encoding_detector *identd) { - int i; - - if (identd != NULL) { - if (identd->filter_list != NULL) { - i = identd->filter_list_size; - while (i > 0) { - i--; - mbfl_identify_filter_delete(identd->filter_list[i]); - } - efree((void *)identd->filter_list); - } - efree((void *)identd); + for (int i = 0; i < identd->filter_list_size; i++) { + mbfl_convert_filter_delete(identd->filter_list[i]); } + efree(identd->filter_list); + efree(identd); } -int -mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *string) +int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *string) { - int res = 0; - /* feed data */ - if (identd != NULL && string != NULL && string->val != NULL) { - int num = identd->filter_list_size; - size_t n = string->len; - unsigned char *p = string->val; - int bad = 0; - while (n > 0) { - int i; - for (i = 0; i < num; i++) { - mbfl_identify_filter *filter = identd->filter_list[i]; - if (!filter->flag) { - (*filter->filter_function)(*p, filter); - if (filter->flag) { - bad++; - } + int num = identd->filter_list_size; + size_t n = string->len; + unsigned char *p = string->val; + int bad = 0; + + while (n--) { + for (int i = 0; i < num; i++) { + mbfl_convert_filter *filter = identd->filter_list[i]; + if (!filter->num_illegalchar) { + (*filter->filter_function)(*p, filter); + if (filter->num_illegalchar) { + bad++; } } - if ((num - 1) <= bad) { - res = 1; - break; - } - p++; - n--; } + if ((num - 1) <= bad && !identd->strict) { + return 1; + } + p++; } - return res; + if (identd->strict) { + for (int i = 0; i < num; i++) { + mbfl_convert_filter *filter = identd->filter_list[i]; + (filter->filter_flush)(filter); + } + } + + return 0; } const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd) { - mbfl_identify_filter *filter; - const mbfl_encoding *encoding = NULL; - int n; - - /* judge */ - if (identd != NULL) { - n = identd->filter_list_size - 1; - while (n >= 0) { - filter = identd->filter_list[n]; - if (!filter->flag) { - if (!identd->strict || !filter->status) { - encoding = filter->encoding; - } - } - n--; - } + uintptr_t best_score = UINT_MAX; /* Low score is 'better' */ + const mbfl_encoding *enc = NULL; - /* fallback judge */ - if (!encoding) { - n = identd->filter_list_size - 1; - while (n >= 0) { - filter = identd->filter_list[n]; - if (!filter->flag) { - encoding = filter->encoding; - } - n--; - } + for (int i = 0; i < identd->filter_list_size; i++) { + mbfl_convert_filter *filter = identd->filter_list[i]; + if (!filter->num_illegalchar && (uintptr_t)filter->opaque < best_score) { + enc = filter->from; + best_score = (uintptr_t)filter->opaque; } } - return encoding; + return enc; } /* @@ -479,83 +449,19 @@ mbfl_convert_encoding( return mbfl_memory_device_result(&device, result); } - /* * identify encoding */ -const mbfl_encoding * -mbfl_identify_encoding(mbfl_string *string, const mbfl_encoding **elist, int elistsz, int strict) +const mbfl_encoding *mbfl_identify_encoding(mbfl_string *string, const mbfl_encoding **elist, int elistsz, int strict) { - int i, num, bad; - size_t n; - unsigned char *p; - mbfl_identify_filter *flist, *filter; - const mbfl_encoding *encoding; - - /* flist is an array of mbfl_identify_filter instances */ - flist = ecalloc(elistsz, sizeof(mbfl_identify_filter)); - - num = 0; - if (elist != NULL) { - for (i = 0; i < elistsz; i++) { - if (!mbfl_identify_filter_init2(&flist[num], elist[i])) { - num++; - } - } - } - - /* feed data */ - n = string->len; - p = string->val; - - if (p != NULL) { - bad = 0; - while (n > 0) { - for (i = 0; i < num; i++) { - filter = &flist[i]; - if (!filter->flag) { - (*filter->filter_function)(*p, filter); - if (filter->flag) { - bad++; - } - } - } - if ((num - 1) <= bad && !strict) { - break; - } - p++; - n--; - } - } - - /* judge */ - encoding = NULL; - - for (i = 0; i < num; i++) { - filter = &flist[i]; - if (!filter->flag) { - if (strict && filter->status) { - continue; - } - encoding = filter->encoding; - break; - } - } - - /* fall-back judge */ - if (!encoding) { - for (i = 0; i < num; i++) { - filter = &flist[i]; - if (!filter->flag && (!strict || !filter->status)) { - encoding = filter->encoding; - break; - } - } + if (!elistsz) { + return NULL; } - - efree((void *)flist); - - return encoding; + mbfl_encoding_detector *identd = mbfl_encoding_detector_new(elist, elistsz, strict); + mbfl_encoding_detector_feed(identd, string); + const mbfl_encoding *enc = mbfl_encoding_detector_judge(identd); + mbfl_encoding_detector_delete(identd); + return enc; } /* @@ -578,9 +484,9 @@ mbfl_strlen(const mbfl_string *string) len = 0; if (encoding->flag & MBFL_ENCTYPE_SBCS) { len = string->len; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { + } else if (encoding->flag & MBFL_ENCTYPE_WCS2) { len = string->len/2; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { + } else if (encoding->flag & MBFL_ENCTYPE_WCS4) { len = string->len/4; } else if (encoding->mblen_table != NULL) { const unsigned char *mbtab = encoding->mblen_table; @@ -931,14 +837,14 @@ mbfl_substr( mbfl_string_init(result); result->encoding = string->encoding; - if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE | MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) || + if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) || encoding->mblen_table != NULL) { len = string->len; if (encoding->flag & MBFL_ENCTYPE_SBCS) { start = from; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { + } else if (encoding->flag & MBFL_ENCTYPE_WCS2) { start = from*2; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { + } else if (encoding->flag & MBFL_ENCTYPE_WCS4) { start = from*4; } else { const unsigned char *mbtab = encoding->mblen_table; @@ -963,9 +869,9 @@ mbfl_substr( end = len; } else if (encoding->flag & MBFL_ENCTYPE_SBCS) { end = start + length; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { + } else if (encoding->flag & MBFL_ENCTYPE_WCS2) { end = start + length*2; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { + } else if (encoding->flag & MBFL_ENCTYPE_WCS4) { end = start + length*4; } else { const unsigned char *mbtab = encoding->mblen_table; @@ -1078,18 +984,13 @@ mbfl_strcut( mbfl_string_init(result); result->encoding = string->encoding; - if ((encoding->flag & (MBFL_ENCTYPE_SBCS - | MBFL_ENCTYPE_WCS2BE - | MBFL_ENCTYPE_WCS2LE - | MBFL_ENCTYPE_WCS4BE - | MBFL_ENCTYPE_WCS4LE)) - || encoding->mblen_table != NULL) { + if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) || encoding->mblen_table != NULL) { const unsigned char *start = NULL; const unsigned char *end = NULL; unsigned char *w; size_t sz; - if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { + if (encoding->flag & MBFL_ENCTYPE_WCS2) { from &= -2; if (length >= string->len - from) { @@ -1098,7 +999,7 @@ mbfl_strcut( start = string->val + from; end = start + (length & -2); - } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { + } else if (encoding->flag & MBFL_ENCTYPE_WCS4) { from &= -4; if (length >= string->len - from) { |