summary refs log tree commit diff
path: root/ext/mbstring/libmbfl/mbfl/mbfilter.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/mbstring/libmbfl/mbfl/mbfilter.c')
-rw-r--r-- ext/mbstring/libmbfl/mbfl/mbfilter.c 279
1 files changed, 90 insertions, 189 deletions
diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c
index 793dd8e078..1904786576 100644
--- a/ext/mbstring/libmbfl/mbfl/mbfilter.c
+++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c
@@ -86,9 +86,11 @@
#include "mbfl_filter_output.h"
#include "mbfilter_8bit.h"
#include "mbfilter_wchar.h"
-#include "filters/mbfilter_ascii.h"
+#include "mbstring.h"
+#include "php_unicode.h"
#include "filters/mbfilter_base64.h"
#include "filters/mbfilter_qprint.h"
+#include "filters/mbfilter_singlebyte.h"
#include "filters/mbfilter_tl_jisx0201_jisx0208.h"
#include "filters/mbfilter_utf8.h"
@@ -200,7 +202,6 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str
size_t n;
unsigned char *p;
mbfl_convert_filter *filter;
- int (*filter_function)(int c, mbfl_convert_filter *filter);
ZEND_ASSERT(convd);
ZEND_ASSERT(string);
@@ -212,9 +213,8 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str
filter = convd->filter1;
if (filter != NULL) {
- filter_function = filter->filter_function;
while (n > 0) {
- if ((*filter_function)(*p++, filter) < 0) {
+ if ((*filter->filter_function)(*p++, filter) < 0) {
return p - string->val;
}
n--;
@@ -234,9 +234,6 @@ mbfl_buffer_converter_flush(mbfl_buffer_converter *convd)
if (convd->filter1 != NULL) {
mbfl_convert_filter_flush(convd->filter1);
}
- if (convd->filter2 != NULL) {
- mbfl_convert_filter_flush(convd->filter2);
- }
return 0;
}
@@ -262,9 +259,6 @@ mbfl_buffer_converter_feed_result(mbfl_buffer_converter *convd, mbfl_string *str
if (convd->filter1 != NULL) {
mbfl_convert_filter_flush(convd->filter1);
}
- if (convd->filter2 != NULL) {
- mbfl_convert_filter_flush(convd->filter2);
- }
result->encoding = convd->to;
return mbfl_memory_device_result(&convd->device, result);
}
@@ -291,126 +285,102 @@ size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd)
/*
* encoding detector
*/
-mbfl_encoding_detector *
-mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict)
+static int mbfl_estimate_encoding_likelihood(int c, void* data) /* Output callback of a candidate's decode filter: scores one decoded value c; data is the address of the slot holding that filter */
{
- mbfl_encoding_detector *identd;
-
- int i, num;
- mbfl_identify_filter *filter;
+ mbfl_convert_filter *filter = *((mbfl_convert_filter**)data); /* data is a pointer to the filter pointer, not the filter itself */
+ uintptr_t *score = (uintptr_t*)(&filter->opaque); /* the opaque pointer field is reused in place as an integer penalty counter */
+
+ /* Receive wchars decoded from test string using candidate encoding
+ * If the test string was invalid in the candidate encoding, we assume
+ * it's the wrong one. */
+ if (c & MBFL_WCSGROUP_THROUGH) {
+ filter->num_illegalchar++; /* a non-zero count is what the feed/judge functions treat as "candidate ruled out" */
+ } else if (php_unicode_is_cntrl(c) || php_unicode_is_private(c)) {
+ /* Otherwise, count how many control characters and 'private use'
+ * codepoints we see. Those are rarely used and may indicate that
+ * the candidate encoding is not the right one. */
+ *score += 10;
+ } else if (php_unicode_is_punct(c)) {
+ /* Punctuation is also less common than letters/digits */
+ (*score)++;
+ }
+ return c; /* the input value is handed back unchanged */
+}
- if (elist == NULL || elistsz <= 0) {
+mbfl_encoding_detector *mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict) /* Builds a detector with one decode-to-wchar filter per entry of elist; returns NULL when there is no usable candidate list */
+{
+ if (elist == NULL || elistsz <= 0) { /* same guard the removed code had: a NULL list or non-positive count must not reach ecalloc() or elist[i] */
return NULL;
}
- /* allocate */
- identd = emalloc(sizeof(mbfl_encoding_detector));
- identd->filter_list = ecalloc(elistsz, sizeof(mbfl_identify_filter *));
-
- /* create filters */
- i = 0;
- num = 0;
- while (i < elistsz) {
- filter = mbfl_identify_filter_new2(elist[i]);
- if (filter != NULL) {
- identd->filter_list[num] = filter;
- num++;
- }
- i++;
+ mbfl_encoding_detector *identd = emalloc(sizeof(mbfl_encoding_detector));
+ identd->filter_list = ecalloc(elistsz, sizeof(mbfl_convert_filter*));
+ for (int i = 0; i < elistsz; i++) {
+ identd->filter_list[i] = mbfl_convert_filter_new(elist[i], &mbfl_encoding_wchar,
+ mbfl_estimate_encoding_likelihood, NULL, &identd->filter_list[i]); /* the callback's data argument is the address of this filter's own list slot */
+ identd->filter_list[i]->opaque = (void*)0; /* opaque doubles as the penalty score, starting at zero. NOTE(review): the new filter is dereferenced unchecked, whereas the removed code skipped NULL filters - confirm mbfl_convert_filter_new cannot fail here */
}
- identd->filter_list_size = num;
-
- /* set strict flag */
+ identd->filter_list_size = elistsz;
identd->strict = strict;
-
return identd;
}
-
-void
-mbfl_encoding_detector_delete(mbfl_encoding_detector *identd)
+void mbfl_encoding_detector_delete(mbfl_encoding_detector *identd) /* Deletes every filter in the detector's list, then frees the list and the detector itself */
{
- int i;
-
- if (identd != NULL) {
- if (identd->filter_list != NULL) {
- i = identd->filter_list_size;
- while (i > 0) {
- i--;
- mbfl_identify_filter_delete(identd->filter_list[i]);
- }
- efree((void *)identd->filter_list);
- }
- efree((void *)identd);
+ for (int i = 0; i < identd->filter_list_size; i++) { /* NOTE(review): identd is no longer NULL-checked, yet mbfl_encoding_detector_new() returns NULL for an empty list - confirm every caller guards */
+ mbfl_convert_filter_delete(identd->filter_list[i]);
}
+ efree(identd->filter_list);
+ efree(identd);
}
-int
-mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *string)
+int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *string) /* Pushes each byte of string through every candidate filter that is still viable; returns 1 on an early stop in non-strict mode, otherwise 0 */
{
- int res = 0;
- /* feed data */
- if (identd != NULL && string != NULL && string->val != NULL) {
- int num = identd->filter_list_size;
- size_t n = string->len;
- unsigned char *p = string->val;
- int bad = 0;
- while (n > 0) {
- int i;
- for (i = 0; i < num; i++) {
- mbfl_identify_filter *filter = identd->filter_list[i];
- if (!filter->flag) {
- (*filter->filter_function)(*p, filter);
- if (filter->flag) {
- bad++;
- }
+ int num = identd->filter_list_size; /* NOTE(review): identd, string and string->val are used without the NULL checks the removed code performed - confirm callers guarantee them */
+ size_t n = string->len;
+ unsigned char *p = string->val;
+ int bad = 0; /* number of candidates ruled out during this call */
+
+ while (n--) {
+ for (int i = 0; i < num; i++) {
+ mbfl_convert_filter *filter = identd->filter_list[i];
+ if (!filter->num_illegalchar) { /* candidates that already reported an illegal sequence are not fed again */
+ (*filter->filter_function)(*p, filter);
+ if (filter->num_illegalchar) {
+ bad++;
}
}
- if ((num - 1) <= bad) {
- res = 1;
- break;
- }
- p++;
- n--;
}
+ if ((num - 1) <= bad && !identd->strict) { /* non-strict only: stop as soon as at most one candidate has not been ruled out in this call */
+ return 1;
+ }
+ p++;
}
- return res;
+ if (identd->strict) { /* strict mode consumes the whole input and then flushes every filter; presumably this surfaces a truncated trailing sequence - confirm against the filters' flush handlers */
+ for (int i = 0; i < num; i++) {
+ mbfl_convert_filter *filter = identd->filter_list[i];
+ (filter->filter_flush)(filter);
+ }
+ }
+
+ return 0;
}
const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd)
{
- mbfl_identify_filter *filter;
- const mbfl_encoding *encoding = NULL;
- int n;
-
- /* judge */
- if (identd != NULL) {
- n = identd->filter_list_size - 1;
- while (n >= 0) {
- filter = identd->filter_list[n];
- if (!filter->flag) {
- if (!identd->strict || !filter->status) {
- encoding = filter->encoding;
- }
- }
- n--;
- }
+ uintptr_t best_score = UINTPTR_MAX; /* Low score is 'better'; start from the maximum of the score's own type (UINT_MAX is narrower than uintptr_t on LP64) */
+ const mbfl_encoding *enc = NULL; /* stays NULL when every candidate reported an illegal sequence */
- /* fallback judge */
- if (!encoding) {
- n = identd->filter_list_size - 1;
- while (n >= 0) {
- filter = identd->filter_list[n];
- if (!filter->flag) {
- encoding = filter->encoding;
- }
- n--;
- }
+ for (int i = 0; i < identd->filter_list_size; i++) {
+ mbfl_convert_filter *filter = identd->filter_list[i];
+ if (!filter->num_illegalchar && (uintptr_t)filter->opaque < best_score) { /* strict '<' keeps the earliest list entry when scores tie */
+ enc = filter->from; /* the candidate encoding is the filter's source encoding */
+ best_score = (uintptr_t)filter->opaque;
}
}
- return encoding;
+ return enc;
}
/*
@@ -479,83 +449,19 @@ mbfl_convert_encoding(
return mbfl_memory_device_result(&device, result);
}
-
/*
* identify encoding
*/
-const mbfl_encoding *
-mbfl_identify_encoding(mbfl_string *string, const mbfl_encoding **elist, int elistsz, int strict)
+const mbfl_encoding *mbfl_identify_encoding(mbfl_string *string, const mbfl_encoding **elist, int elistsz, int strict) /* One-shot detection: builds a detector over elist, feeds it string, and returns the judged encoding or NULL */
{
- int i, num, bad;
- size_t n;
- unsigned char *p;
- mbfl_identify_filter *flist, *filter;
- const mbfl_encoding *encoding;
-
- /* flist is an array of mbfl_identify_filter instances */
- flist = ecalloc(elistsz, sizeof(mbfl_identify_filter));
-
- num = 0;
- if (elist != NULL) {
- for (i = 0; i < elistsz; i++) {
- if (!mbfl_identify_filter_init2(&flist[num], elist[i])) {
- num++;
- }
- }
- }
-
- /* feed data */
- n = string->len;
- p = string->val;
-
- if (p != NULL) {
- bad = 0;
- while (n > 0) {
- for (i = 0; i < num; i++) {
- filter = &flist[i];
- if (!filter->flag) {
- (*filter->filter_function)(*p, filter);
- if (filter->flag) {
- bad++;
- }
- }
- }
- if ((num - 1) <= bad && !strict) {
- break;
- }
- p++;
- n--;
- }
- }
-
- /* judge */
- encoding = NULL;
-
- for (i = 0; i < num; i++) {
- filter = &flist[i];
- if (!filter->flag) {
- if (strict && filter->status) {
- continue;
- }
- encoding = filter->encoding;
- break;
- }
- }
-
- /* fall-back judge */
- if (!encoding) {
- for (i = 0; i < num; i++) {
- filter = &flist[i];
- if (!filter->flag && (!strict || !filter->status)) {
- encoding = filter->encoding;
- break;
- }
- }
+ if (elist == NULL || elistsz <= 0) { /* a NULL list (tolerated by the removed code) or a non-positive count yields "no encoding" instead of reaching the detector */
+ return NULL;
}
-
- efree((void *)flist);
-
- return encoding;
+ mbfl_encoding_detector *identd = mbfl_encoding_detector_new(elist, elistsz, strict);
+ mbfl_encoding_detector_feed(identd, string); /* the early-stop return value is not needed for a one-shot run */
+ const mbfl_encoding *enc = mbfl_encoding_detector_judge(identd);
+ mbfl_encoding_detector_delete(identd); /* enc is read from the filter before the detector is freed */
+ return enc;
}
/*
@@ -578,9 +484,9 @@ mbfl_strlen(const mbfl_string *string)
len = 0;
if (encoding->flag & MBFL_ENCTYPE_SBCS) {
len = string->len;
- } else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) {
+ } else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
len = string->len/2;
- } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) {
+ } else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
len = string->len/4;
} else if (encoding->mblen_table != NULL) {
const unsigned char *mbtab = encoding->mblen_table;
@@ -931,14 +837,14 @@ mbfl_substr(
mbfl_string_init(result);
result->encoding = string->encoding;
- if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE | MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) ||
+ if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) ||
encoding->mblen_table != NULL) {
len = string->len;
if (encoding->flag & MBFL_ENCTYPE_SBCS) {
start = from;
- } else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) {
+ } else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
start = from*2;
- } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) {
+ } else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
start = from*4;
} else {
const unsigned char *mbtab = encoding->mblen_table;
@@ -963,9 +869,9 @@ mbfl_substr(
end = len;
} else if (encoding->flag & MBFL_ENCTYPE_SBCS) {
end = start + length;
- } else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) {
+ } else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
end = start + length*2;
- } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) {
+ } else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
end = start + length*4;
} else {
const unsigned char *mbtab = encoding->mblen_table;
@@ -1078,18 +984,13 @@ mbfl_strcut(
mbfl_string_init(result);
result->encoding = string->encoding;
- if ((encoding->flag & (MBFL_ENCTYPE_SBCS
- | MBFL_ENCTYPE_WCS2BE
- | MBFL_ENCTYPE_WCS2LE
- | MBFL_ENCTYPE_WCS4BE
- | MBFL_ENCTYPE_WCS4LE))
- || encoding->mblen_table != NULL) {
+ if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) || encoding->mblen_table != NULL) {
const unsigned char *start = NULL;
const unsigned char *end = NULL;
unsigned char *w;
size_t sz;
- if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) {
+ if (encoding->flag & MBFL_ENCTYPE_WCS2) {
from &= -2;
if (length >= string->len - from) {
@@ -1098,7 +999,7 @@ mbfl_strcut(
start = string->val + from;
end = start + (length & -2);
- } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) {
+ } else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
from &= -4;
if (length >= string->len - from) {