diff options
-rw-r--r-- | ChangeLog | 4 | ||||
-rw-r--r-- | enc/big5.c | 4 | ||||
-rw-r--r-- | enc/cp949.c | 4 | ||||
-rw-r--r-- | enc/emacs_mule.c | 16 | ||||
-rw-r--r-- | enc/euc_jp.c | 16 | ||||
-rw-r--r-- | enc/euc_kr.c | 4 | ||||
-rw-r--r-- | enc/euc_tw.c | 4 | ||||
-rw-r--r-- | enc/gb18030.c | 16 | ||||
-rw-r--r-- | enc/gbk.c | 4 | ||||
-rw-r--r-- | enc/shift_jis.c | 17 | ||||
-rw-r--r-- | enc/unicode.c | 20 | ||||
-rw-r--r-- | enc/utf_16be.c | 6 | ||||
-rw-r--r-- | enc/utf_16le.c | 6 | ||||
-rw-r--r-- | enc/utf_32be.c | 3 | ||||
-rw-r--r-- | enc/utf_32le.c | 3 | ||||
-rw-r--r-- | enc/utf_8.c | 706 | ||||
-rw-r--r-- | encoding.c | 19 | ||||
-rw-r--r-- | include/ruby/encoding.h | 7 | ||||
-rw-r--r-- | include/ruby/oniguruma.h | 5 | ||||
-rw-r--r-- | regenc.c | 17 | ||||
-rw-r--r-- | regenc.h | 4 | ||||
-rw-r--r-- | regexec.c | 52 | ||||
-rw-r--r-- | string.c | 162 | ||||
-rw-r--r-- | test/ruby/enc/test_utf16.rb | 12 | ||||
-rw-r--r-- | test/ruby/test_m17n.rb | 15 |
25 files changed, 234 insertions, 892 deletions
@@ -1,3 +1,7 @@ +Thu Sep 18 21:37:14 2008 Tanaka Akira <akr@fsij.org> + + * grapheme cluster implementation reverted. [ruby-dev:36375] + Thu Sep 18 20:50:36 2008 Nobuyoshi Nakada <nobu@ruby-lang.org> * lib/rake.rb (Rake::Application#standard_exception_handling): diff --git a/enc/big5.c b/enc/big5.c index 825051c519..9993f472e1 100644 --- a/enc/big5.c +++ b/enc/big5.c @@ -108,9 +108,9 @@ big5_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) } static OnigCodePoint -big5_mbc_to_code(const UChar* p, const UChar* end, int *precise_ret, OnigEncoding enc) +big5_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) { - return onigenc_mbn_mbc_to_code(enc, p, end, precise_ret); + return onigenc_mbn_mbc_to_code(enc, p, end); } static int diff --git a/enc/cp949.c b/enc/cp949.c index b551b04997..009443aed4 100644 --- a/enc/cp949.c +++ b/enc/cp949.c @@ -130,9 +130,9 @@ cp949_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) } static OnigCodePoint -cp949_mbc_to_code(const UChar* p, const UChar* end, int *precise_ret, OnigEncoding enc) +cp949_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) { - return onigenc_mbn_mbc_to_code(enc, p, end, precise_ret); + return onigenc_mbn_mbc_to_code(enc, p, end); } static int diff --git a/enc/emacs_mule.c b/enc/emacs_mule.c index d58b9dfa9d..be7f842259 100644 --- a/enc/emacs_mule.c +++ b/enc/emacs_mule.c @@ -223,9 +223,21 @@ mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) } static OnigCodePoint -mbc_to_code(const UChar* p, const UChar* end, int *precise_ret, OnigEncoding enc) +mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) { - return onigenc_mbn_mbc_to_code(enc, p, end, precise_ret); + int c, i, len; + OnigCodePoint n; + + len = enclen(enc, p, end); + n = (OnigCodePoint )*p++; + if (len == 1) return n; + + for (i = 1; i < len; i++) { + if (p >= end) break; + c = *p++; + n <<= 8; n += c; + } + return n; } static int diff --git a/enc/euc_jp.c b/enc/euc_jp.c index 0424b88559..21f30ad2f3 100644 --- a/enc/euc_jp.c +++ b/enc/euc_jp.c @@ -133,9 +133,21 @@ mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) } static OnigCodePoint -mbc_to_code(const UChar* p, const UChar* end, int *precise_ret, OnigEncoding enc) +mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) { - return onigenc_mbn_mbc_to_code(enc, p, end, precise_ret); + int c, i, len; + OnigCodePoint n; + + len = enclen(enc, p, end); + n = (OnigCodePoint )*p++; + if (len == 1) return n; + + for (i = 1; i < len; i++) { + if (p >= end) break; + c = *p++; + n <<= 8; n += c; + } + return n; } static int diff --git a/enc/euc_kr.c b/enc/euc_kr.c index fa39638aad..f20a57e69a 100644 --- a/enc/euc_kr.c +++ b/enc/euc_kr.c @@ -108,9 +108,9 @@ euckr_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) } static OnigCodePoint -euckr_mbc_to_code(const UChar* p, const UChar* end, int *precise_ret, OnigEncoding enc) +euckr_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) { - return onigenc_mbn_mbc_to_code(enc, p, end, precise_ret); + return onigenc_mbn_mbc_to_code(enc, p, end); } static int diff --git a/enc/euc_tw.c b/enc/euc_tw.c index 8e8597625c..d025a0dd45 100644 --- a/enc/euc_tw.c +++ b/enc/euc_tw.c @@ -150,9 +150,9 @@ euctw_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) } static OnigCodePoint -euctw_mbc_to_code(const UChar* p, const UChar* end, int *precise_ret, OnigEncoding enc ARG_UNUSED) +euctw_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc ARG_UNUSED) { - return onigenc_mbn_mbc_to_code(enc, p, end, precise_ret); + return onigenc_mbn_mbc_to_code(enc, p, end); } static int diff --git a/enc/gb18030.c b/enc/gb18030.c index ba6111b504..16c8c1c17f 100644 --- a/enc/gb18030.c +++ b/enc/gb18030.c @@ -164,9 +164,21 @@ gb18030_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) } static OnigCodePoint -gb18030_mbc_to_code(const UChar* p, const UChar* end, int *precise_ret, OnigEncoding enc) +gb18030_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) { - return onigenc_mbn_mbc_to_code(enc, p, end, precise_ret); + int c, i, len; + OnigCodePoint n; + + len = enclen(enc, p, end); + n = (OnigCodePoint )(*p++); + if (len == 1) return n; + + for (i = 1; i < len; i++) { + if (p >= end) break; + c = *p++; + n <<= 8; n += c; + } + return n; } static int @@ -130,9 +130,9 @@ gbk_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) } static OnigCodePoint -gbk_mbc_to_code(const UChar* p, const UChar* end, int *precise_ret, OnigEncoding enc) +gbk_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) { - return onigenc_mbn_mbc_to_code(enc, p, end, precise_ret); + return onigenc_mbn_mbc_to_code(enc, p, end); } static int diff --git a/enc/shift_jis.c b/enc/shift_jis.c index c081f3b898..b4d8592a27 100644 --- a/enc/shift_jis.c +++ b/enc/shift_jis.c @@ -145,9 +145,22 @@ code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED) } static OnigCodePoint -mbc_to_code(const UChar* p, const UChar* end, int *precise_ret, OnigEncoding enc) +mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) { - return onigenc_mbn_mbc_to_code(enc, p, end, precise_ret); + int c, i, len; + OnigCodePoint n; + + len = enclen(enc, p, end); + c = *p++; + n = c; + if (len == 1) return n; + + for (i = 1; i < len; i++) { + if (p >= end) break; + c = *p++; + n <<= 8; n += c; + } + return n; } static int diff --git a/enc/unicode.c b/enc/unicode.c index 5d2eba79b6..2dfcbba3f6 100644 --- a/enc/unicode.c +++ b/enc/unicode.c @@ -10966,13 +10966,13 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc, { CodePointList3 *to; OnigCodePoint code; - int i, len, rlen, r; + int i, len, rlen; const UChar *p = *pp; if (CaseFoldInited == 0) init_case_fold_table(); - code = ONIGENC_MBC_PRECISE_CODEPOINT(enc, p, end, &r); - len = ONIGENC_MBCLEN_CHARFOUND_LEN(r); + code = ONIGENC_MBC_TO_CODE(enc, p, end); + len = enclen(enc, p, end); *pp += len; #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI @@ -11160,7 +11160,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) { - int n, i, j, k, len, r; + int n, i, j, k, len; OnigCodePoint code, codes[3]; CodePointList3 *to, *z3; CodePointList2 *z2; @@ -11169,8 +11169,8 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, n = 0; - code = ONIGENC_MBC_PRECISE_CODEPOINT(enc, p, end, &r); - len = ONIGENC_MBCLEN_CHARFOUND_LEN(r); + code = ONIGENC_MBC_TO_CODE(enc, p, end); + len = enclen(enc, p, end); #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { @@ -11311,7 +11311,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, int clen; codes[0] = code; - code = ONIGENC_MBC_PRECISE_CODEPOINT(enc, p, end, &r); + code = ONIGENC_MBC_TO_CODE(enc, p, end); if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0 && to->n == 1) { codes[1] = to->code[0]; @@ -11319,7 +11319,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, else codes[1] = code; - clen = ONIGENC_MBCLEN_CHARFOUND_LEN(r); + clen = enclen(enc, p, end); len += clen; if (onig_st_lookup(Unfold2Table, (st_data_t )codes, (void* )&z2) != 0) { for (i = 0; i < z2->n; i++) { @@ -11332,7 +11332,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, p += clen; if (p < end) { - code = ONIGENC_MBC_PRECISE_CODEPOINT(enc, p, end, &r); + code = ONIGENC_MBC_TO_CODE(enc, p, end); if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0 && to->n == 1) { codes[2] = to->code[0]; @@ -11340,7 +11340,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, else codes[2] = code; - clen = ONIGENC_MBCLEN_CHARFOUND_LEN(r); + clen = enclen(enc, p, end); len += clen; if (onig_st_lookup(Unfold3Table, (st_data_t )codes, (void* )&z2) != 0) { diff --git a/enc/utf_16be.c b/enc/utf_16be.c index 2868989977..8d7c8e9b11 100644 --- a/enc/utf_16be.c +++ b/enc/utf_16be.c @@ -103,15 +103,9 @@ utf16be_is_mbc_newline(const UChar* p, const UChar* end, static OnigCodePoint utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED, - int *precise_ret, OnigEncoding enc) { OnigCodePoint code; - int ret; - - ret = utf16be_mbc_enc_len(p, end, enc); - if (precise_ret) - *precise_ret = ret; if (UTF16_IS_SURROGATE_FIRST(*p)) { code = ((((p[0] - 0xd8) << 2) + ((p[1] & 0xc0) >> 6) + 1) << 16) diff --git a/enc/utf_16le.c b/enc/utf_16le.c index d47a8f6275..c8a1e7a10a 100644 --- a/enc/utf_16le.c +++ b/enc/utf_16le.c @@ -95,17 +95,11 @@ utf16le_is_mbc_newline(const UChar* p, const UChar* end, static OnigCodePoint utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED, - int *precise_ret, OnigEncoding enc ARG_UNUSED) { OnigCodePoint code; UChar c0 = *p; UChar c1 = *(p+1); - int ret; - - ret = utf16le_mbc_enc_len(p, end, enc); - if (precise_ret) - *precise_ret = ret; if (UTF16_IS_SURROGATE_FIRST(c1)) { code = ((((c1 - 0xd8) << 2) + ((c0 & 0xc0) >> 6) + 1) << 16) diff --git a/enc/utf_32be.c b/enc/utf_32be.c index 7bdf9aa7b5..61e7d0f1b5 100644 --- a/enc/utf_32be.c +++ b/enc/utf_32be.c @@ -61,11 +61,8 @@ utf32be_is_mbc_newline(const UChar* p, const UChar* end, static OnigCodePoint utf32be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED, - int *precise_ret, OnigEncoding enc ARG_UNUSED) { - if (precise_ret) - *precise_ret = ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4); return (OnigCodePoint )(((p[0] * 256 + p[1]) * 256 + p[2]) * 256 + p[3]); } diff --git a/enc/utf_32le.c b/enc/utf_32le.c index 2754d33320..3a0a41bba7 100644 --- a/enc/utf_32le.c +++ b/enc/utf_32le.c @@ -61,11 +61,8 @@ utf32le_is_mbc_newline(const UChar* p, const UChar* end, static OnigCodePoint utf32le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED, - int *precise_ret, OnigEncoding enc ARG_UNUSED) { - if (precise_ret) - *precise_ret = ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4); return (OnigCodePoint )(((p[3] * 256 + p[2]) * 256 + p[1]) * 256 + p[0]); } diff --git a/enc/utf_8.c b/enc/utf_8.c index 5913e786a2..3b9387c613 100644 --- a/enc/utf_8.c +++ b/enc/utf_8.c @@ -241,657 +241,6 @@ mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) ONIGENC_CONSTRUCT_MBCLEN_INVALID(); } -static OnigCodePoint mbc_to_code(const UChar* p, const UChar* end, int *precise_ret, OnigEncoding enc); - -/* generated from GraphemeBreakProperty-5.1.0.txt - * Since CR LF is handled in another layer such as IO with text mode, - * CR and LF are merged into CONTROL. */ -#define GRAPHEME_BIT_CONTROL 0x001 -#define GRAPHEME_BIT_EXTEND 0x002 -#define GRAPHEME_BIT_PREPEND 0x004 -#define GRAPHEME_BIT_SPACINGMARK 0x008 -#define GRAPHEME_BIT_L 0x010 -#define GRAPHEME_BIT_V 0x020 -#define GRAPHEME_BIT_T 0x040 -#define GRAPHEME_BIT_LV 0x080 -#define GRAPHEME_BIT_LVT 0x100 -const struct grapheme_table_t { /* codepoint_min <= c < codepoint_min+num_codepoints */ - OnigCodePoint codepoint_min; - unsigned short num_codepoints; - unsigned short properties; -} grapheme_table[] = { - {0x00000,32,0x001}, {0x0007F,33,0x001}, - {0x000AD,1,0x001}, {0x00300,112,0x002}, - {0x00483,7,0x002}, {0x00591,45,0x002}, - {0x005BF,1,0x002}, {0x005C1,2,0x002}, - {0x005C4,2,0x002}, {0x005C7,1,0x002}, - {0x00600,4,0x001}, {0x00610,11,0x002}, - {0x0064B,20,0x002}, {0x00670,1,0x002}, - {0x006D6,7,0x002}, {0x006DD,1,0x001}, - {0x006DE,7,0x002}, {0x006E7,2,0x002}, - {0x006EA,4,0x002}, {0x0070F,1,0x001}, - {0x00711,1,0x002}, {0x00730,27,0x002}, - {0x007A6,11,0x002}, {0x007EB,9,0x002}, - {0x00901,2,0x002}, {0x00903,1,0x008}, - {0x0093C,1,0x002}, {0x0093E,3,0x008}, - {0x00941,8,0x002}, {0x00949,4,0x008}, - {0x0094D,1,0x002}, {0x00951,4,0x002}, - {0x00962,2,0x002}, {0x00981,1,0x002}, - {0x00982,2,0x008}, {0x009BC,1,0x002}, - {0x009BE,1,0x002}, {0x009BF,2,0x008}, - {0x009C1,4,0x002}, {0x009C7,2,0x008}, - {0x009CB,2,0x008}, {0x009CD,1,0x002}, - {0x009D7,1,0x002}, {0x009E2,2,0x002}, - {0x00A01,2,0x002}, {0x00A03,1,0x008}, - {0x00A3C,1,0x002}, {0x00A3E,3,0x008}, - {0x00A41,2,0x002}, {0x00A47,2,0x002}, - {0x00A4B,3,0x002}, {0x00A51,1,0x002}, - {0x00A70,2,0x002}, {0x00A75,1,0x002}, - {0x00A81,2,0x002}, {0x00A83,1,0x008}, - {0x00ABC,1,0x002}, {0x00ABE,3,0x008}, - {0x00AC1,5,0x002}, {0x00AC7,2,0x002}, - {0x00AC9,1,0x008}, {0x00ACB,2,0x008}, - {0x00ACD,1,0x002}, {0x00AE2,2,0x002}, - {0x00B01,1,0x002}, {0x00B02,2,0x008}, - {0x00B3C,1,0x002}, {0x00B3E,2,0x002}, - {0x00B40,1,0x008}, {0x00B41,4,0x002}, - {0x00B47,2,0x008}, {0x00B4B,2,0x008}, - {0x00B4D,1,0x002}, {0x00B56,2,0x002}, - {0x00B62,2,0x002}, {0x00B82,1,0x002}, - {0x00BBE,1,0x002}, {0x00BBF,1,0x008}, - {0x00BC0,1,0x002}, {0x00BC1,2,0x008}, - {0x00BC6,3,0x008}, {0x00BCA,3,0x008}, - {0x00BCD,1,0x002}, {0x00BD7,1,0x002}, - {0x00C01,3,0x008}, {0x00C3E,3,0x002}, - {0x00C41,4,0x008}, {0x00C46,3,0x002}, - {0x00C4A,4,0x002}, {0x00C55,2,0x002}, - {0x00C62,2,0x002}, {0x00C82,2,0x008}, - {0x00CBC,1,0x002}, {0x00CBE,1,0x008}, - {0x00CBF,1,0x002}, {0x00CC0,2,0x008}, - {0x00CC2,1,0x002}, {0x00CC3,2,0x008}, - {0x00CC6,1,0x002}, {0x00CC7,2,0x008}, - {0x00CCA,2,0x008}, {0x00CCC,2,0x002}, - {0x00CD5,2,0x002}, {0x00CE2,2,0x002}, - {0x00D02,2,0x008}, {0x00D3E,1,0x002}, - {0x00D3F,2,0x008}, {0x00D41,4,0x002}, - {0x00D46,3,0x008}, {0x00D4A,3,0x008}, - {0x00D4D,1,0x002}, {0x00D57,1,0x002}, - {0x00D62,2,0x002}, {0x00D82,2,0x008}, - {0x00DCA,1,0x002}, {0x00DCF,1,0x002}, - {0x00DD0,2,0x008}, {0x00DD2,3,0x002}, - {0x00DD6,1,0x002}, {0x00DD8,7,0x008}, - {0x00DDF,1,0x002}, {0x00DF2,2,0x008}, - {0x00E30,11,0x002}, {0x00E40,5,0x004}, - {0x00E45,1,0x002}, {0x00E47,8,0x002}, - {0x00EB0,10,0x002}, {0x00EBB,2,0x002}, - {0x00EC0,5,0x004}, {0x00EC8,6,0x002}, - {0x00F18,2,0x002}, {0x00F35,1,0x002}, - {0x00F37,1,0x002}, {0x00F39,1,0x002}, - {0x00F3E,2,0x008}, {0x00F71,14,0x002}, - {0x00F7F,1,0x008}, {0x00F80,5,0x002}, - {0x00F86,2,0x002}, {0x00F90,8,0x002}, - {0x00F99,36,0x002}, {0x00FC6,1,0x002}, - {0x0102B,2,0x008}, {0x0102D,4,0x002}, - {0x01031,1,0x008}, {0x01032,6,0x002}, - {0x01038,1,0x008}, {0x01039,2,0x002}, - {0x0103B,2,0x008}, {0x0103D,2,0x002}, - {0x01056,2,0x008}, {0x01058,2,0x002}, - {0x0105E,3,0x002}, {0x01062,3,0x008}, - {0x01067,7,0x008}, {0x01071,4,0x002}, - {0x01082,1,0x002}, {0x01083,2,0x008}, - {0x01085,2,0x002}, {0x01087,6,0x008}, - {0x0108D,1,0x002}, {0x0108F,1,0x008}, - {0x01100,90,0x010}, {0x0115F,1,0x010}, - {0x01160,67,0x020}, {0x011A8,82,0x040}, - {0x0135F,1,0x002}, {0x01712,3,0x002}, - {0x01732,3,0x002}, {0x01752,2,0x002}, - {0x01772,2,0x002}, {0x017B4,2,0x001}, - {0x017B6,1,0x008}, {0x017B7,7,0x002}, - {0x017BE,8,0x008}, {0x017C6,1,0x002}, - {0x017C7,2,0x008}, {0x017C9,11,0x002}, - {0x017DD,1,0x002}, {0x0180B,3,0x002}, - {0x018A9,1,0x002}, {0x01920,3,0x002}, - {0x01923,4,0x008}, {0x01927,2,0x002}, - {0x01929,3,0x008}, {0x01930,2,0x008}, - {0x01932,1,0x002}, {0x01933,6,0x008}, - {0x01939,3,0x002}, {0x019B0,17,0x008}, - {0x019C8,2,0x008}, {0x01A17,2,0x002}, - {0x01A19,3,0x008}, {0x01B00,4,0x002}, - {0x01B04,1,0x008}, {0x01B34,1,0x002}, - {0x01B35,1,0x008}, {0x01B36,5,0x002}, - {0x01B3B,1,0x008}, {0x01B3C,1,0x002}, - {0x01B3D,5,0x008}, {0x01B42,1,0x002}, - {0x01B43,2,0x008}, {0x01B6B,9,0x002}, - {0x01B80,2,0x002}, {0x01B82,1,0x008}, - {0x01BA1,1,0x008}, {0x01BA2,4,0x002}, - {0x01BA6,2,0x008}, {0x01BA8,2,0x002}, - {0x01BAA,1,0x008}, {0x01C24,8,0x008}, - {0x01C2C,8,0x002}, {0x01C34,2,0x008}, - {0x01C36,2,0x002}, {0x01DC0,39,0x002}, - {0x01DFE,2,0x002}, {0x0200B,1,0x001}, - {0x0200C,2,0x002}, {0x0200E,2,0x001}, - {0x02028,7,0x001}, {0x02060,5,0x001}, - {0x0206A,6,0x001}, {0x020D0,33,0x002}, - {0x02DE0,32,0x002}, {0x0302A,6,0x002}, - {0x03099,2,0x002}, {0x0A66F,4,0x002}, - {0x0A67C,2,0x002}, {0x0A802,1,0x002}, - {0x0A806,1,0x002}, {0x0A80B,1,0x002}, - {0x0A823,2,0x008}, {0x0A825,2,0x002}, - {0x0A827,1,0x008}, {0x0A880,2,0x008}, - {0x0A8B4,16,0x008}, {0x0A8C4,1,0x002}, - {0x0A926,8,0x002}, {0x0A947,11,0x002}, - {0x0A952,2,0x008}, {0x0AA29,6,0x002}, - {0x0AA2F,2,0x008}, {0x0AA31,2,0x002}, - {0x0AA33,2,0x008}, {0x0AA35,2,0x002}, - {0x0AA43,1,0x002}, {0x0AA4C,1,0x002}, - {0x0AA4D,1,0x008}, {0x0AC00,1,0x080}, - {0x0AC01,27,0x100}, {0x0AC1C,1,0x080}, - {0x0AC1D,27,0x100}, {0x0AC38,1,0x080}, - {0x0AC39,27,0x100}, {0x0AC54,1,0x080}, - {0x0AC55,27,0x100}, {0x0AC70,1,0x080}, - {0x0AC71,27,0x100}, {0x0AC8C,1,0x080}, - {0x0AC8D,27,0x100}, {0x0ACA8,1,0x080}, - {0x0ACA9,27,0x100}, {0x0ACC4,1,0x080}, - {0x0ACC5,27,0x100}, {0x0ACE0,1,0x080}, - {0x0ACE1,27,0x100}, {0x0ACFC,1,0x080}, - {0x0ACFD,27,0x100}, {0x0AD18,1,0x080}, - {0x0AD19,27,0x100}, {0x0AD34,1,0x080}, - {0x0AD35,27,0x100}, {0x0AD50,1,0x080}, - {0x0AD51,27,0x100}, {0x0AD6C,1,0x080}, - {0x0AD6D,27,0x100}, {0x0AD88,1,0x080}, - {0x0AD89,27,0x100}, {0x0ADA4,1,0x080}, - {0x0ADA5,27,0x100}, {0x0ADC0,1,0x080}, - {0x0ADC1,27,0x100}, {0x0ADDC,1,0x080}, - {0x0ADDD,27,0x100}, {0x0ADF8,1,0x080}, - {0x0ADF9,27,0x100}, {0x0AE14,1,0x080}, - {0x0AE15,27,0x100}, {0x0AE30,1,0x080}, - {0x0AE31,27,0x100}, {0x0AE4C,1,0x080}, - {0x0AE4D,27,0x100}, {0x0AE68,1,0x080}, - {0x0AE69,27,0x100}, {0x0AE84,1,0x080}, - {0x0AE85,27,0x100}, {0x0AEA0,1,0x080}, - {0x0AEA1,27,0x100}, {0x0AEBC,1,0x080}, - {0x0AEBD,27,0x100}, {0x0AED8,1,0x080}, - {0x0AED9,27,0x100}, {0x0AEF4,1,0x080}, - {0x0AEF5,27,0x100}, {0x0AF10,1,0x080}, - {0x0AF11,27,0x100}, {0x0AF2C,1,0x080}, - {0x0AF2D,27,0x100}, {0x0AF48,1,0x080}, - {0x0AF49,27,0x100}, {0x0AF64,1,0x080}, - {0x0AF65,27,0x100}, {0x0AF80,1,0x080}, - {0x0AF81,27,0x100}, {0x0AF9C,1,0x080}, - {0x0AF9D,27,0x100}, {0x0AFB8,1,0x080}, - {0x0AFB9,27,0x100}, {0x0AFD4,1,0x080}, - {0x0AFD5,27,0x100}, {0x0AFF0,1,0x080}, - {0x0AFF1,27,0x100}, {0x0B00C,1,0x080}, - {0x0B00D,27,0x100}, {0x0B028,1,0x080}, - {0x0B029,27,0x100}, {0x0B044,1,0x080}, - {0x0B045,27,0x100}, {0x0B060,1,0x080}, - {0x0B061,27,0x100}, {0x0B07C,1,0x080}, - {0x0B07D,27,0x100}, {0x0B098,1,0x080}, - {0x0B099,27,0x100}, {0x0B0B4,1,0x080}, - {0x0B0B5,27,0x100}, {0x0B0D0,1,0x080}, - {0x0B0D1,27,0x100}, {0x0B0EC,1,0x080}, - {0x0B0ED,27,0x100}, {0x0B108,1,0x080}, - {0x0B109,27,0x100}, {0x0B124,1,0x080}, - {0x0B125,27,0x100}, {0x0B140,1,0x080}, - {0x0B141,27,0x100}, {0x0B15C,1,0x080}, - {0x0B15D,27,0x100}, {0x0B178,1,0x080}, - {0x0B179,27,0x100}, {0x0B194,1,0x080}, - {0x0B195,27,0x100}, {0x0B1B0,1,0x080}, - {0x0B1B1,27,0x100}, {0x0B1CC,1,0x080}, - {0x0B1CD,27,0x100}, {0x0B1E8,1,0x080}, - {0x0B1E9,27,0x100}, {0x0B204,1,0x080}, - {0x0B205,27,0x100}, {0x0B220,1,0x080}, - {0x0B221,27,0x100}, {0x0B23C,1,0x080}, - {0x0B23D,27,0x100}, {0x0B258,1,0x080}, - {0x0B259,27,0x100}, {0x0B274,1,0x080}, - {0x0B275,27,0x100}, {0x0B290,1,0x080}, - {0x0B291,27,0x100}, {0x0B2AC,1,0x080}, - {0x0B2AD,27,0x100}, {0x0B2C8,1,0x080}, - {0x0B2C9,27,0x100}, {0x0B2E4,1,0x080}, - {0x0B2E5,27,0x100}, {0x0B300,1,0x080}, - {0x0B301,27,0x100}, {0x0B31C,1,0x080}, - {0x0B31D,27,0x100}, {0x0B338,1,0x080}, - {0x0B339,27,0x100}, {0x0B354,1,0x080}, - {0x0B355,27,0x100}, {0x0B370,1,0x080}, - {0x0B371,27,0x100}, {0x0B38C,1,0x080}, - {0x0B38D,27,0x100}, {0x0B3A8,1,0x080}, - {0x0B3A9,27,0x100}, {0x0B3C4,1,0x080}, - {0x0B3C5,27,0x100}, {0x0B3E0,1,0x080}, - {0x0B3E1,27,0x100}, {0x0B3FC,1,0x080}, - {0x0B3FD,27,0x100}, {0x0B418,1,0x080}, - {0x0B419,27,0x100}, {0x0B434,1,0x080}, - {0x0B435,27,0x100}, {0x0B450,1,0x080}, - {0x0B451,27,0x100}, {0x0B46C,1,0x080}, - {0x0B46D,27,0x100}, {0x0B488,1,0x080}, - {0x0B489,27,0x100}, {0x0B4A4,1,0x080}, - {0x0B4A5,27,0x100}, {0x0B4C0,1,0x080}, - {0x0B4C1,27,0x100}, {0x0B4DC,1,0x080}, - {0x0B4DD,27,0x100}, {0x0B4F8,1,0x080}, - {0x0B4F9,27,0x100}, {0x0B514,1,0x080}, - {0x0B515,27,0x100}, {0x0B530,1,0x080}, - {0x0B531,27,0x100}, {0x0B54C,1,0x080}, - {0x0B54D,27,0x100}, {0x0B568,1,0x080}, - {0x0B569,27,0x100}, {0x0B584,1,0x080}, - {0x0B585,27,0x100}, {0x0B5A0,1,0x080}, - {0x0B5A1,27,0x100}, {0x0B5BC,1,0x080}, - {0x0B5BD,27,0x100}, {0x0B5D8,1,0x080}, - {0x0B5D9,27,0x100}, {0x0B5F4,1,0x080}, - {0x0B5F5,27,0x100}, {0x0B610,1,0x080}, - {0x0B611,27,0x100}, {0x0B62C,1,0x080}, - {0x0B62D,27,0x100}, {0x0B648,1,0x080}, - {0x0B649,27,0x100}, {0x0B664,1,0x080}, - {0x0B665,27,0x100}, {0x0B680,1,0x080}, - {0x0B681,27,0x100}, {0x0B69C,1,0x080}, - {0x0B69D,27,0x100}, {0x0B6B8,1,0x080}, - {0x0B6B9,27,0x100}, {0x0B6D4,1,0x080}, - {0x0B6D5,27,0x100}, {0x0B6F0,1,0x080}, - {0x0B6F1,27,0x100}, {0x0B70C,1,0x080}, - {0x0B70D,27,0x100}, {0x0B728,1,0x080}, - {0x0B729,27,0x100}, {0x0B744,1,0x080}, - {0x0B745,27,0x100}, {0x0B760,1,0x080}, - {0x0B761,27,0x100}, {0x0B77C,1,0x080}, - {0x0B77D,27,0x100}, {0x0B798,1,0x080}, - {0x0B799,27,0x100}, {0x0B7B4,1,0x080}, - {0x0B7B5,27,0x100}, {0x0B7D0,1,0x080}, - {0x0B7D1,27,0x100}, {0x0B7EC,1,0x080}, - {0x0B7ED,27,0x100}, {0x0B808,1,0x080}, - {0x0B809,27,0x100}, {0x0B824,1,0x080}, - {0x0B825,27,0x100}, {0x0B840,1,0x080}, - {0x0B841,27,0x100}, {0x0B85C,1,0x080}, - {0x0B85D,27,0x100}, {0x0B878,1,0x080}, - {0x0B879,27,0x100}, {0x0B894,1,0x080}, - {0x0B895,27,0x100}, {0x0B8B0,1,0x080}, - {0x0B8B1,27,0x100}, {0x0B8CC,1,0x080}, - {0x0B8CD,27,0x100}, {0x0B8E8,1,0x080}, - {0x0B8E9,27,0x100}, {0x0B904,1,0x080}, - {0x0B905,27,0x100}, {0x0B920,1,0x080}, - {0x0B921,27,0x100}, {0x0B93C,1,0x080}, - {0x0B93D,27,0x100}, {0x0B958,1,0x080}, - {0x0B959,27,0x100}, {0x0B974,1,0x080}, - {0x0B975,27,0x100}, {0x0B990,1,0x080}, - {0x0B991,27,0x100}, {0x0B9AC,1,0x080}, - {0x0B9AD,27,0x100}, {0x0B9C8,1,0x080}, - {0x0B9C9,27,0x100}, {0x0B9E4,1,0x080}, - {0x0B9E5,27,0x100}, {0x0BA00,1,0x080}, - {0x0BA01,27,0x100}, {0x0BA1C,1,0x080}, - {0x0BA1D,27,0x100}, {0x0BA38,1,0x080}, - {0x0BA39,27,0x100}, {0x0BA54,1,0x080}, - {0x0BA55,27,0x100}, {0x0BA70,1,0x080}, - {0x0BA71,27,0x100}, {0x0BA8C,1,0x080}, - {0x0BA8D,27,0x100}, {0x0BAA8,1,0x080}, - {0x0BAA9,27,0x100}, {0x0BAC4,1,0x080}, - {0x0BAC5,27,0x100}, {0x0BAE0,1,0x080}, - {0x0BAE1,27,0x100}, {0x0BAFC,1,0x080}, - {0x0BAFD,27,0x100}, {0x0BB18,1,0x080}, - {0x0BB19,27,0x100}, {0x0BB34,1,0x080}, - {0x0BB35,27,0x100}, {0x0BB50,1,0x080}, - {0x0BB51,27,0x100}, {0x0BB6C,1,0x080}, - {0x0BB6D,27,0x100}, {0x0BB88,1,0x080}, - {0x0BB89,27,0x100}, {0x0BBA4,1,0x080}, - {0x0BBA5,27,0x100}, {0x0BBC0,1,0x080}, - {0x0BBC1,27,0x100}, {0x0BBDC,1,0x080}, - {0x0BBDD,27,0x100}, {0x0BBF8,1,0x080}, - {0x0BBF9,27,0x100}, {0x0BC14,1,0x080}, - {0x0BC15,27,0x100}, {0x0BC30,1,0x080}, - {0x0BC31,27,0x100}, {0x0BC4C,1,0x080}, - {0x0BC4D,27,0x100}, {0x0BC68,1,0x080}, - {0x0BC69,27,0x100}, {0x0BC84,1,0x080}, - {0x0BC85,27,0x100}, {0x0BCA0,1,0x080}, - {0x0BCA1,27,0x100}, {0x0BCBC,1,0x080}, - {0x0BCBD,27,0x100}, {0x0BCD8,1,0x080}, - {0x0BCD9,27,0x100}, {0x0BCF4,1,0x080}, - {0x0BCF5,27,0x100}, {0x0BD10,1,0x080}, - {0x0BD11,27,0x100}, {0x0BD2C,1,0x080}, - {0x0BD2D,27,0x100}, {0x0BD48,1,0x080}, - {0x0BD49,27,0x100}, {0x0BD64,1,0x080}, - {0x0BD65,27,0x100}, {0x0BD80,1,0x080}, - {0x0BD81,27,0x100}, {0x0BD9C,1,0x080}, - {0x0BD9D,27,0x100}, {0x0BDB8,1,0x080}, - {0x0BDB9,27,0x100}, {0x0BDD4,1,0x080}, - {0x0BDD5,27,0x100}, {0x0BDF0,1,0x080}, - {0x0BDF1,27,0x100}, {0x0BE0C,1,0x080}, - {0x0BE0D,27,0x100}, {0x0BE28,1,0x080}, - {0x0BE29,27,0x100}, {0x0BE44,1,0x080}, - {0x0BE45,27,0x100}, {0x0BE60,1,0x080}, - {0x0BE61,27,0x100}, {0x0BE7C,1,0x080}, - {0x0BE7D,27,0x100}, {0x0BE98,1,0x080}, - {0x0BE99,27,0x100}, {0x0BEB4,1,0x080}, - {0x0BEB5,27,0x100}, {0x0BED0,1,0x080}, - {0x0BED1,27,0x100}, {0x0BEEC,1,0x080}, - {0x0BEED,27,0x100}, {0x0BF08,1,0x080}, - {0x0BF09,27,0x100}, {0x0BF24,1,0x080}, - {0x0BF25,27,0x100}, {0x0BF40,1,0x080}, - {0x0BF41,27,0x100}, {0x0BF5C,1,0x080}, - {0x0BF5D,27,0x100}, {0x0BF78,1,0x080}, - {0x0BF79,27,0x100}, {0x0BF94,1,0x080}, - {0x0BF95,27,0x100}, {0x0BFB0,1,0x080}, - {0x0BFB1,27,0x100}, {0x0BFCC,1,0x080}, - {0x0BFCD,27,0x100}, {0x0BFE8,1,0x080}, - {0x0BFE9,27,0x100}, {0x0C004,1,0x080}, - {0x0C005,27,0x100}, {0x0C020,1,0x080}, - {0x0C021,27,0x100}, {0x0C03C,1,0x080}, - {0x0C03D,27,0x100}, {0x0C058,1,0x080}, - {0x0C059,27,0x100}, {0x0C074,1,0x080}, - {0x0C075,27,0x100}, {0x0C090,1,0x080}, - {0x0C091,27,0x100}, {0x0C0AC,1,0x080}, - {0x0C0AD,27,0x100}, {0x0C0C8,1,0x080}, - {0x0C0C9,27,0x100}, {0x0C0E4,1,0x080}, - {0x0C0E5,27,0x100}, {0x0C100,1,0x080}, - {0x0C101,27,0x100}, {0x0C11C,1,0x080}, - {0x0C11D,27,0x100}, {0x0C138,1,0x080}, - {0x0C139,27,0x100}, {0x0C154,1,0x080}, - {0x0C155,27,0x100}, {0x0C170,1,0x080}, - {0x0C171,27,0x100}, {0x0C18C,1,0x080}, - {0x0C18D,27,0x100}, {0x0C1A8,1,0x080}, - {0x0C1A9,27,0x100}, {0x0C1C4,1,0x080}, - {0x0C1C5,27,0x100}, {0x0C1E0,1,0x080}, - {0x0C1E1,27,0x100}, {0x0C1FC,1,0x080}, - {0x0C1FD,27,0x100}, {0x0C218,1,0x080}, - {0x0C219,27,0x100}, {0x0C234,1,0x080}, - {0x0C235,27,0x100}, {0x0C250,1,0x080}, - {0x0C251,27,0x100}, {0x0C26C,1,0x080}, - {0x0C26D,27,0x100}, {0x0C288,1,0x080}, - {0x0C289,27,0x100}, {0x0C2A4,1,0x080}, - {0x0C2A5,27,0x100}, {0x0C2C0,1,0x080}, - {0x0C2C1,27,0x100}, {0x0C2DC,1,0x080}, - {0x0C2DD,27,0x100}, {0x0C2F8,1,0x080}, - {0x0C2F9,27,0x100}, {0x0C314,1,0x080}, - {0x0C315,27,0x100}, {0x0C330,1,0x080}, - {0x0C331,27,0x100}, {0x0C34C,1,0x080}, - {0x0C34D,27,0x100}, {0x0C368,1,0x080}, - {0x0C369,27,0x100}, {0x0C384,1,0x080}, - {0x0C385,27,0x100}, {0x0C3A0,1,0x080}, - {0x0C3A1,27,0x100}, {0x0C3BC,1,0x080}, - {0x0C3BD,27,0x100}, {0x0C3D8,1,0x080}, - {0x0C3D9,27,0x100}, {0x0C3F4,1,0x080}, - {0x0C3F5,27,0x100}, {0x0C410,1,0x080}, - {0x0C411,27,0x100}, {0x0C42C,1,0x080}, - {0x0C42D,27,0x100}, {0x0C448,1,0x080}, - {0x0C449,27,0x100}, {0x0C464,1,0x080}, - {0x0C465,27,0x100}, {0x0C480,1,0x080}, - {0x0C481,27,0x100}, {0x0C49C,1,0x080}, - {0x0C49D,27,0x100}, {0x0C4B8,1,0x080}, - {0x0C4B9,27,0x100}, {0x0C4D4,1,0x080}, - {0x0C4D5,27,0x100}, {0x0C4F0,1,0x080}, - {0x0C4F1,27,0x100}, {0x0C50C,1,0x080}, - {0x0C50D,27,0x100}, {0x0C528,1,0x080}, - {0x0C529,27,0x100}, {0x0C544,1,0x080}, - {0x0C545,27,0x100}, {0x0C560,1,0x080}, - {0x0C561,27,0x100}, {0x0C57C,1,0x080}, - {0x0C57D,27,0x100}, {0x0C598,1,0x080}, - {0x0C599,27,0x100}, {0x0C5B4,1,0x080}, - {0x0C5B5,27,0x100}, {0x0C5D0,1,0x080}, - {0x0C5D1,27,0x100}, {0x0C5EC,1,0x080}, - {0x0C5ED,27,0x100}, {0x0C608,1,0x080}, - {0x0C609,27,0x100}, {0x0C624,1,0x080}, - {0x0C625,27,0x100}, {0x0C640,1,0x080}, - {0x0C641,27,0x100}, {0x0C65C,1,0x080}, - {0x0C65D,27,0x100}, {0x0C678,1,0x080}, - {0x0C679,27,0x100}, {0x0C694,1,0x080}, - {0x0C695,27,0x100}, {0x0C6B0,1,0x080}, - {0x0C6B1,27,0x100}, {0x0C6CC,1,0x080}, - {0x0C6CD,27,0x100}, {0x0C6E8,1,0x080}, - {0x0C6E9,27,0x100}, {0x0C704,1,0x080}, - {0x0C705,27,0x100}, {0x0C720,1,0x080}, - {0x0C721,27,0x100}, {0x0C73C,1,0x080}, - {0x0C73D,27,0x100}, {0x0C758,1,0x080}, - {0x0C759,27,0x100}, {0x0C774,1,0x080}, - {0x0C775,27,0x100}, {0x0C790,1,0x080}, - {0x0C791,27,0x100}, {0x0C7AC,1,0x080}, - {0x0C7AD,27,0x100}, {0x0C7C8,1,0x080}, - {0x0C7C9,27,0x100}, {0x0C7E4,1,0x080}, - {0x0C7E5,27,0x100}, {0x0C800,1,0x080}, - {0x0C801,27,0x100}, {0x0C81C,1,0x080}, - {0x0C81D,27,0x100}, {0x0C838,1,0x080}, - {0x0C839,27,0x100}, {0x0C854,1,0x080}, - {0x0C855,27,0x100}, {0x0C870,1,0x080}, - {0x0C871,27,0x100}, {0x0C88C,1,0x080}, - {0x0C88D,27,0x100}, {0x0C8A8,1,0x080}, - {0x0C8A9,27,0x100}, {0x0C8C4,1,0x080}, - {0x0C8C5,27,0x100}, {0x0C8E0,1,0x080}, - {0x0C8E1,27,0x100}, {0x0C8FC,1,0x080}, - {0x0C8FD,27,0x100}, {0x0C918,1,0x080}, - {0x0C919,27,0x100}, {0x0C934,1,0x080}, - {0x0C935,27,0x100}, {0x0C950,1,0x080}, - {0x0C951,27,0x100}, {0x0C96C,1,0x080}, - {0x0C96D,27,0x100}, {0x0C988,1,0x080}, - {0x0C989,27,0x100}, {0x0C9A4,1,0x080}, - {0x0C9A5,27,0x100}, {0x0C9C0,1,0x080}, - {0x0C9C1,27,0x100}, {0x0C9DC,1,0x080}, - {0x0C9DD,27,0x100}, {0x0C9F8,1,0x080}, - {0x0C9F9,27,0x100}, {0x0CA14,1,0x080}, - {0x0CA15,27,0x100}, {0x0CA30,1,0x080}, - {0x0CA31,27,0x100}, {0x0CA4C,1,0x080}, - {0x0CA4D,27,0x100}, {0x0CA68,1,0x080}, - {0x0CA69,27,0x100}, {0x0CA84,1,0x080}, - {0x0CA85,27,0x100}, {0x0CAA0,1,0x080}, - {0x0CAA1,27,0x100}, {0x0CABC,1,0x080}, - {0x0CABD,27,0x100}, {0x0CAD8,1,0x080}, - {0x0CAD9,27,0x100}, {0x0CAF4,1,0x080}, - {0x0CAF5,27,0x100}, {0x0CB10,1,0x080}, - {0x0CB11,27,0x100}, {0x0CB2C,1,0x080}, - {0x0CB2D,27,0x100}, {0x0CB48,1,0x080}, - {0x0CB49,27,0x100}, {0x0CB64,1,0x080}, - {0x0CB65,27,0x100}, {0x0CB80,1,0x080}, - {0x0CB81,27,0x100}, {0x0CB9C,1,0x080}, - {0x0CB9D,27,0x100}, {0x0CBB8,1,0x080}, - {0x0CBB9,27,0x100}, {0x0CBD4,1,0x080}, - {0x0CBD5,27,0x100}, {0x0CBF0,1,0x080}, - {0x0CBF1,27,0x100}, {0x0CC0C,1,0x080}, - {0x0CC0D,27,0x100}, {0x0CC28,1,0x080}, - {0x0CC29,27,0x100}, {0x0CC44,1,0x080}, - {0x0CC45,27,0x100}, {0x0CC60,1,0x080}, - {0x0CC61,27,0x100}, {0x0CC7C,1,0x080}, - {0x0CC7D,27,0x100}, {0x0CC98,1,0x080}, - {0x0CC99,27,0x100}, {0x0CCB4,1,0x080}, - {0x0CCB5,27,0x100}, {0x0CCD0,1,0x080}, - {0x0CCD1,27,0x100}, {0x0CCEC,1,0x080}, - {0x0CCED,27,0x100}, {0x0CD08,1,0x080}, - {0x0CD09,27,0x100}, {0x0CD24,1,0x080}, - {0x0CD25,27,0x100}, {0x0CD40,1,0x080}, - {0x0CD41,27,0x100}, {0x0CD5C,1,0x080}, - {0x0CD5D,27,0x100}, {0x0CD78,1,0x080}, - {0x0CD79,27,0x100}, {0x0CD94,1,0x080}, - {0x0CD95,27,0x100}, {0x0CDB0,1,0x080}, - {0x0CDB1,27,0x100}, {0x0CDCC,1,0x080}, - {0x0CDCD,27,0x100}, {0x0CDE8,1,0x080}, - {0x0CDE9,27,0x100}, {0x0CE04,1,0x080}, - {0x0CE05,27,0x100}, {0x0CE20,1,0x080}, - {0x0CE21,27,0x100}, {0x0CE3C,1,0x080}, - {0x0CE3D,27,0x100}, {0x0CE58,1,0x080}, - {0x0CE59,27,0x100}, {0x0CE74,1,0x080}, - {0x0CE75,27,0x100}, {0x0CE90,1,0x080}, - {0x0CE91,27,0x100}, {0x0CEAC,1,0x080}, - {0x0CEAD,27,0x100}, {0x0CEC8,1,0x080}, - {0x0CEC9,27,0x100}, {0x0CEE4,1,0x080}, - {0x0CEE5,27,0x100}, {0x0CF00,1,0x080}, - {0x0CF01,27,0x100}, {0x0CF1C,1,0x080}, - {0x0CF1D,27,0x100}, {0x0CF38,1,0x080}, - {0x0CF39,27,0x100}, {0x0CF54,1,0x080}, - {0x0CF55,27,0x100}, {0x0CF70,1,0x080}, - {0x0CF71,27,0x100}, {0x0CF8C,1,0x080}, - {0x0CF8D,27,0x100}, {0x0CFA8,1,0x080}, - {0x0CFA9,27,0x100}, {0x0CFC4,1,0x080}, - {0x0CFC5,27,0x100}, {0x0CFE0,1,0x080}, - {0x0CFE1,27,0x100}, {0x0CFFC,1,0x080}, - {0x0CFFD,27,0x100}, {0x0D018,1,0x080}, - {0x0D019,27,0x100}, {0x0D034,1,0x080}, - {0x0D035,27,0x100}, {0x0D050,1,0x080}, - {0x0D051,27,0x100}, {0x0D06C,1,0x080}, - {0x0D06D,27,0x100}, {0x0D088,1,0x080}, - {0x0D089,27,0x100}, {0x0D0A4,1,0x080}, - {0x0D0A5,27,0x100}, {0x0D0C0,1,0x080}, - {0x0D0C1,27,0x100}, {0x0D0DC,1,0x080}, - {0x0D0DD,27,0x100}, {0x0D0F8,1,0x080}, - {0x0D0F9,27,0x100}, {0x0D114,1,0x080}, - {0x0D115,27,0x100}, {0x0D130,1,0x080}, - {0x0D131,27,0x100}, {0x0D14C,1,0x080}, - {0x0D14D,27,0x100}, {0x0D168,1,0x080}, - {0x0D169,27,0x100}, {0x0D184,1,0x080}, - {0x0D185,27,0x100}, {0x0D1A0,1,0x080}, - {0x0D1A1,27,0x100}, {0x0D1BC,1,0x080}, - {0x0D1BD,27,0x100}, {0x0D1D8,1,0x080}, - {0x0D1D9,27,0x100}, {0x0D1F4,1,0x080}, - {0x0D1F5,27,0x100}, {0x0D210,1,0x080}, - {0x0D211,27,0x100}, {0x0D22C,1,0x080}, - {0x0D22D,27,0x100}, {0x0D248,1,0x080}, - {0x0D249,27,0x100}, {0x0D264,1,0x080}, - {0x0D265,27,0x100}, {0x0D280,1,0x080}, - {0x0D281,27,0x100}, {0x0D29C,1,0x080}, - {0x0D29D,27,0x100}, {0x0D2B8,1,0x080}, - {0x0D2B9,27,0x100}, {0x0D2D4,1,0x080}, - {0x0D2D5,27,0x100}, {0x0D2F0,1,0x080}, - {0x0D2F1,27,0x100}, {0x0D30C,1,0x080}, - {0x0D30D,27,0x100}, {0x0D328,1,0x080}, - {0x0D329,27,0x100}, {0x0D344,1,0x080}, - {0x0D345,27,0x100}, {0x0D360,1,0x080}, - {0x0D361,27,0x100}, {0x0D37C,1,0x080}, - {0x0D37D,27,0x100}, {0x0D398,1,0x080}, - {0x0D399,27,0x100}, {0x0D3B4,1,0x080}, - {0x0D3B5,27,0x100}, {0x0D3D0,1,0x080}, - {0x0D3D1,27,0x100}, {0x0D3EC,1,0x080}, - {0x0D3ED,27,0x100}, {0x0D408,1,0x080}, - {0x0D409,27,0x100}, {0x0D424,1,0x080}, - {0x0D425,27,0x100}, {0x0D440,1,0x080}, - {0x0D441,27,0x100}, {0x0D45C,1,0x080}, - {0x0D45D,27,0x100}, {0x0D478,1,0x080}, - {0x0D479,27,0x100}, {0x0D494,1,0x080}, - {0x0D495,27,0x100}, {0x0D4B0,1,0x080}, - {0x0D4B1,27,0x100}, {0x0D4CC,1,0x080}, - {0x0D4CD,27,0x100}, {0x0D4E8,1,0x080}, - {0x0D4E9,27,0x100}, {0x0D504,1,0x080}, - {0x0D505,27,0x100}, {0x0D520,1,0x080}, - {0x0D521,27,0x100}, {0x0D53C,1,0x080}, - {0x0D53D,27,0x100}, {0x0D558,1,0x080}, - {0x0D559,27,0x100}, {0x0D574,1,0x080}, - {0x0D575,27,0x100}, {0x0D590,1,0x080}, - {0x0D591,27,0x100}, {0x0D5AC,1,0x080}, - {0x0D5AD,27,0x100}, {0x0D5C8,1,0x080}, - {0x0D5C9,27,0x100}, {0x0D5E4,1,0x080}, - {0x0D5E5,27,0x100}, {0x0D600,1,0x080}, - {0x0D601,27,0x100}, {0x0D61C,1,0x080}, - {0x0D61D,27,0x100}, {0x0D638,1,0x080}, - {0x0D639,27,0x100}, {0x0D654,1,0x080}, - {0x0D655,27,0x100}, {0x0D670,1,0x080}, - {0x0D671,27,0x100}, {0x0D68C,1,0x080}, - {0x0D68D,27,0x100}, {0x0D6A8,1,0x080}, - {0x0D6A9,27,0x100}, {0x0D6C4,1,0x080}, - {0x0D6C5,27,0x100}, {0x0D6E0,1,0x080}, - {0x0D6E1,27,0x100}, {0x0D6FC,1,0x080}, - {0x0D6FD,27,0x100}, {0x0D718,1,0x080}, - {0x0D719,27,0x100}, {0x0D734,1,0x080}, - {0x0D735,27,0x100}, {0x0D750,1,0x080}, - {0x0D751,27,0x100}, {0x0D76C,1,0x080}, - {0x0D76D,27,0x100}, {0x0D788,1,0x080}, - {0x0D789,27,0x100}, {0x0FB1E,1,0x002}, - {0x0FE00,16,0x002}, {0x0FE20,7,0x002}, - {0x0FEFF,1,0x001}, {0x0FF9E,2,0x002}, - {0x0FFF9,3,0x001}, {0x101FD,1,0x002}, - {0x10A01,3,0x002}, {0x10A05,2,0x002}, - {0x10A0C,4,0x002}, {0x10A38,3,0x002}, - {0x10A3F,1,0x002}, {0x1D165,1,0x002}, - {0x1D166,1,0x008}, {0x1D167,3,0x002}, - {0x1D16D,1,0x008}, {0x1D16E,5,0x002}, - {0x1D173,8,0x001}, {0x1D17B,8,0x002}, - {0x1D185,7,0x002}, {0x1D1AA,4,0x002}, - {0x1D242,3,0x002}, {0xE0001,1,0x001}, - {0xE0020,96,0x001}, {0xE0100,240,0x002}, -}; - -static int -grapheme_cmp(const void *p1, const void *p2) -{ - const struct grapheme_table_t *k = p1; - const struct grapheme_table_t *v = p2; - OnigCodePoint c = k->codepoint_min; - if (c < v->codepoint_min) - return -1; - if (v->codepoint_min + v->num_codepoints <= c) - return 1; - return 0; -} - -static unsigned int -get_grapheme_properties(OnigCodePoint c) -{ - struct grapheme_table_t key, *found; - key.codepoint_min = c; - found = bsearch(&key, grapheme_table, sizeof(grapheme_table)/sizeof(*grapheme_table), - sizeof(*grapheme_table), grapheme_cmp); - if (found) - return found->properties; - return 0; -} - -/* Stream-Safe Text Format assumed - * http://unicode.org/reports/tr15/ */ -#define MAX_BYTES_LENGTH 128 - -static OnigCodePoint mbc_to_code0(const UChar* p, const UChar* end, int len); - -static int -grapheme_boundary_p(int props1, int props2) -{ - if (props2 & GRAPHEME_BIT_CONTROL) - return 1; - if (((props1 & GRAPHEME_BIT_L) && (props2 & (GRAPHEME_BIT_L| - GRAPHEME_BIT_V| - GRAPHEME_BIT_LV| - GRAPHEME_BIT_LVT))) || - ((props1 & (GRAPHEME_BIT_LV| - GRAPHEME_BIT_V)) && (props2 & (GRAPHEME_BIT_V| - GRAPHEME_BIT_T))) || - ((props1 & (GRAPHEME_BIT_LVT| - GRAPHEME_BIT_T)) && (props2 & GRAPHEME_BIT_T))) - return 0; - if (props2 & (GRAPHEME_BIT_EXTEND| - GRAPHEME_BIT_SPACINGMARK)) - return 0; - if (props1 & GRAPHEME_BIT_PREPEND) - return 0; - return 1; -} - -static int -comb_char_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) -{ - /* - * this implements extended grapheme clusters ("user-perceived characters") - * http://www.unicode.org/reports/tr29/ - */ - int r1, l1, r2, l2; - OnigCodePoint c1, c2; - unsigned int p1, p2; - r1 = mbc_enc_len(p, e, enc); - if (!ONIGENC_MBCLEN_CHARFOUND_P(r1)) - return r1; - l1 = ONIGENC_MBCLEN_CHARFOUND_LEN(r1); - c1 = mbc_to_code0(p, e, l1); - p1 = get_grapheme_properties(c1); - - if (p + l1 == e) - return r1; - - while (p + l1 < e && l1 < MAX_BYTES_LENGTH-4) { - if (p1 & GRAPHEME_BIT_CONTROL) - return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(l1); - r2 = mbc_enc_len(p+l1, e, enc); - if (ONIGENC_MBCLEN_INVALID_P(r2)) - return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(l1); - if (ONIGENC_MBCLEN_NEEDMORE_P(r2)) - return r2; - l2 = ONIGENC_MBCLEN_CHARFOUND_LEN(r2); - c2 = mbc_to_code0(p+l1, e, l2); - p2 = get_grapheme_properties(c2); - if (grapheme_boundary_p(p1, p2)) - return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(l1); - l1 += l2; - p1 = p2; - } - /* if p+l1==e, charfound AND needmore */ - return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(l1); -} - static int is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc) { @@ -918,11 +267,12 @@ is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc) } static OnigCodePoint -mbc_to_code0(const UChar* p, const UChar* end, int len) +mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) { - int c; + int c, len; OnigCodePoint n; + len = enclen(enc, p, end); c = *p++; if (len > 1) { len--; @@ -943,24 +293,6 @@ mbc_to_code0(const UChar* p, const UChar* end, int len) } } -static OnigCodePoint -mbc_to_code(const UChar* p, const UChar* end, int *precise_ret, OnigEncoding enc) -{ - int len; - int ret; - - ret = mbc_enc_len(p, end, enc); - if (precise_ret) - *precise_ret = ret; - if (ONIGENC_MBCLEN_CHARFOUND_P(ret)) - len = ONIGENC_MBCLEN_CHARFOUND_LEN(ret); - else if (ONIGENC_MBCLEN_NEEDMORE_P(ret)) - len = end-p+ONIGENC_MBCLEN_NEEDMORE_LEN(ret); - else - len = 1; - return mbc_to_code0(p, end, len); -} - static int code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED) { @@ -1084,30 +416,6 @@ left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, Onig return (UChar* )p; } -static UChar* -left_adjust_combchar_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED) -{ - const UChar *p = left_adjust_char_head(start, s, end, enc); - const UChar *q; - OnigCodePoint c1, c2; - unsigned int p1, p2; - - c2 = mbc_to_code(p, end, NULL, enc); - p2 = get_grapheme_properties(c2); - - while (start < p) { - q = left_adjust_char_head(start, p-1, end, enc); - c1 = mbc_to_code(q, end, NULL, enc); - p1 = get_grapheme_properties(c1); - if (grapheme_boundary_p(p1, p2)) - break; - c2 = c1; - p2 = p1; - p = q; - } - return (UChar *)p; -} - static int get_case_fold_codes_by_str(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[], @@ -1117,9 +425,9 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, } OnigEncodingDefine(utf_8, UTF_8) = { - comb_char_enc_len, + mbc_enc_len, "UTF-8", /* name */ - MAX_BYTES_LENGTH, /* max byte length */ + 6, /* max byte length */ 1, /* min byte length */ is_mbc_newline, mbc_to_code, @@ -1131,8 +439,8 @@ OnigEncodingDefine(utf_8, UTF_8) = { onigenc_unicode_property_name_to_ctype, onigenc_unicode_is_code_ctype, get_ctype_code_range, - left_adjust_combchar_head, - onigenc_always_false_is_allowed_reverse_match + left_adjust_char_head, + onigenc_always_true_is_allowed_reverse_match }; ENC_ALIAS("CP65001", "UTF-8") diff --git a/encoding.c b/encoding.c index 29d97de1c8..78887de730 100644 --- a/encoding.c +++ b/encoding.c @@ -735,16 +735,20 @@ rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc) int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc) { - unsigned int c; - int l, l2; + unsigned int c, l; if (e <= p) return -1; + if (rb_enc_asciicompat(enc)) { + c = (unsigned char)*p; + if (!ISASCII(c)) + return -1; + if (len) *len = 1; + return c; + } l = rb_enc_precise_mbclen(p, e, enc); if (!MBCLEN_CHARFOUND_P(l)) return -1; - c = rb_enc_mbc_precise_codepoint(p, e, &l2, enc); - if (l != l2) - return -1; + c = rb_enc_mbc_to_codepoint(p, e, enc); if (!rb_enc_isascii(c, enc)) return -1; if (len) *len = l; @@ -755,12 +759,11 @@ unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc) { int r; - OnigCodePoint c; if (e <= p) rb_raise(rb_eArgError, "empty string"); - c = rb_enc_mbc_precise_codepoint(p, e, &r, enc); + r = rb_enc_precise_mbclen(p, e, enc); if (MBCLEN_CHARFOUND_P(r)) - return c; + return rb_enc_mbc_to_codepoint(p, e, enc); else rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc)); } diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index acf10cb072..f2f7ba19b5 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -122,13 +122,12 @@ int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc); /* -> code or raise exception */ unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc); #define rb_enc_mbc_to_codepoint(p, e, enc) ONIGENC_MBC_TO_CODE(enc,(UChar*)(p),(UChar*)(e)) -#define rb_enc_mbc_precise_codepoint(p, e, prec_ret, enc) ONIGENC_MBC_PRECISE_CODEPOINT(enc,(UChar*)(p),(UChar*)(e),(prec_ret)) /* -> codelen>0 or raise exception */ -int rb_enc_codelen(int codepoint, rb_encoding *enc); +int rb_enc_codelen(int code, rb_encoding *enc); -/* codepoint,ptr,encoding -> write buf */ -#define rb_enc_mbcput(codepoint,buf,enc) ONIGENC_CODE_TO_MBC((enc),(codepoint),(UChar*)(buf)) +/* code,ptr,encoding -> write buf */ +#define rb_enc_mbcput(c,buf,enc) ONIGENC_CODE_TO_MBC(enc,c,(UChar*)(buf)) /* start, ptr, end, encoding -> prev_char */ #define rb_enc_prev_char(s,p,e,enc) (char *)onigenc_get_prev_char_head(enc,(UChar*)(s),(UChar*)(p),(UChar*)(e)) diff --git a/include/ruby/oniguruma.h b/include/ruby/oniguruma.h index 9d9da1deab..bc6905edaa 100644 --- a/include/ruby/oniguruma.h +++ b/include/ruby/oniguruma.h @@ -155,7 +155,7 @@ typedef struct OnigEncodingTypeST { int max_enc_len; int min_enc_len; int (*is_mbc_newline)(const OnigUChar* p, const OnigUChar* end, struct OnigEncodingTypeST* enc); - OnigCodePoint (*mbc_to_code)(const OnigUChar* p, const OnigUChar* end, int *precise_ret, struct OnigEncodingTypeST* enc); + OnigCodePoint (*mbc_to_code)(const OnigUChar* p, const OnigUChar* end, struct OnigEncodingTypeST* enc); int (*code_to_mbclen)(OnigCodePoint code, struct OnigEncodingTypeST* enc); int (*code_to_mbc)(OnigCodePoint code, OnigUChar *buf, struct OnigEncodingTypeST* enc); int (*mbc_case_fold)(OnigCaseFoldType flag, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to, struct OnigEncodingTypeST* enc); @@ -249,8 +249,7 @@ int onigenc_mbclen_approximate P_((const OnigUChar* p,const OnigUChar* e, struct #define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc) #define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len) #define ONIGENC_IS_MBC_NEWLINE(enc,p,end) (enc)->is_mbc_newline((p),(end),enc) -#define ONIGENC_MBC_TO_CODE(enc,p,end) (enc)->mbc_to_code((p),(end),NULL,enc) -#define ONIGENC_MBC_PRECISE_CODEPOINT(enc,p,end,prec_ret) (enc)->mbc_to_code((p),(end),(prec_ret),enc) +#define ONIGENC_MBC_TO_CODE(enc,p,end) (enc)->mbc_to_code((p),(end),enc) #define ONIGENC_CODE_TO_MBCLEN(enc,code) (enc)->code_to_mbclen(code,enc) #define ONIGENC_CODE_TO_MBC(enc,code,buf) (enc)->code_to_mbc(code,buf,enc) #define ONIGENC_PROPERTY_NAME_TO_CTYPE(enc,p,end) \ @@ -615,11 +615,8 @@ onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED, const UChar* e ARG_UN extern OnigCodePoint onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED, - int *precise_ret, OnigEncoding enc ARG_UNUSED) { - if (precise_ret) - *precise_ret = ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1); return (OnigCodePoint )(*p); } @@ -671,20 +668,12 @@ onigenc_ascii_is_code_ctype(OnigCodePoint code, unsigned int ctype, } extern OnigCodePoint -onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end, int *precise_ret) +onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end) { - int c, i, len, ret; + int c, i, len; OnigCodePoint n; - ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc, p, end); - if (precise_ret) - *precise_ret = ret; - if (ONIGENC_MBCLEN_CHARFOUND_P(ret)) - len = ONIGENC_MBCLEN_CHARFOUND_LEN(ret); - else if (ONIGENC_MBCLEN_NEEDMORE_P(ret)) - len = end-p+ONIGENC_MBCLEN_NEEDMORE_LEN(ret); - else - len = 1; + len = enclen(enc, p, end); n = (OnigCodePoint )(*p++); if (len == 1) return n; @@ -122,7 +122,7 @@ ONIG_EXTERN int onigenc_is_mbc_newline_0x0a P_((const UChar* p, const UChar* end /* methods for single byte encoding */ ONIG_EXTERN int onigenc_ascii_mbc_case_fold P_((OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower, OnigEncoding enc)); ONIG_EXTERN int onigenc_single_byte_mbc_enc_len P_((const UChar* p, const UChar* e, OnigEncoding enc)); -ONIG_EXTERN OnigCodePoint onigenc_single_byte_mbc_to_code P_((const UChar* p, const UChar* end, int *precise_ret, OnigEncoding enc)); +ONIG_EXTERN OnigCodePoint onigenc_single_byte_mbc_to_code P_((const UChar* p, const UChar* end, OnigEncoding enc)); ONIG_EXTERN int onigenc_single_byte_code_to_mbclen P_((OnigCodePoint code, OnigEncoding enc)); ONIG_EXTERN int onigenc_single_byte_code_to_mbc P_((OnigCodePoint code, UChar *buf, OnigEncoding enc)); ONIG_EXTERN UChar* onigenc_single_byte_left_adjust_char_head P_((const UChar* start, const UChar* s, const OnigUChar* end, OnigEncoding enc)); @@ -131,7 +131,7 @@ ONIG_EXTERN int onigenc_always_false_is_allowed_reverse_match P_((const UChar* s ONIG_EXTERN int onigenc_ascii_is_code_ctype P_((OnigCodePoint code, unsigned int ctype, OnigEncoding enc)); /* methods for multi byte encoding */ -ONIG_EXTERN OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, const UChar* p, const UChar* end, int *precise_ret)); +ONIG_EXTERN OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, const UChar* p, const UChar* end)); ONIG_EXTERN int onigenc_mbn_mbc_case_fold P_((OnigEncoding enc, OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower)); ONIG_EXTERN int onigenc_mb2_code_to_mbclen P_((OnigCodePoint code, OnigEncoding enc)); ONIG_EXTERN int onigenc_mb2_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); @@ -977,24 +977,25 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, }\ } while(0) -#define STRING_CMP_IC(case_fold_flag,s1,ps2,len,text_end) do {\ - if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len, text_end) == 0) \ +#define STRING_CMP_IC(case_fold_flag,s1,ps2,len) do {\ + if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len) == 0) \ goto fail; \ } while(0) static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, - UChar* s1, UChar** ps2, int mblen, const UChar* text_end) + UChar* s1, UChar** ps2, int mblen) { UChar buf1[ONIGENC_MBC_CASE_FOLD_MAXLEN]; UChar buf2[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - UChar *p1, *p2, *end1, *s2; + UChar *p1, *p2, *end1, *s2, *end2; int len1, len2; s2 = *ps2; end1 = s1 + mblen; + end2 = s2 + mblen; while (s1 < end1) { len1 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s1, end1, buf1); - len2 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s2, text_end, buf2); + len2 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s2, end2, buf2); if (len1 != len2) return 0; p1 = buf1; p2 = buf2; @@ -1018,8 +1019,8 @@ static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, }\ } while(0) -#define STRING_CMP_VALUE_IC(case_fold_flag,s1,ps2,len,text_end,is_fail) do {\ - if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len, text_end) == 0) \ +#define STRING_CMP_VALUE_IC(case_fold_flag,s1,ps2,len,is_fail) do {\ + if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len) == 0) \ is_fail = 1; \ else \ is_fail = 0; \ @@ -1125,7 +1126,7 @@ static int backref_match_at_nested_level(regex_t* reg if (ignore_case != 0) { if (string_cmp_ic(reg->enc, case_fold_flag, - pstart, &ss, (int )(pend - pstart), send) == 0) + pstart, &ss, (int )(pend - pstart)) == 0) return 0; /* or goto next_mem; */ } else { @@ -1441,8 +1442,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s++) goto fail; DATA_ENSURE(0); p++; - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; MOP_OUT; break; @@ -1465,8 +1464,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; q++; } } - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; MOP_OUT; break; @@ -1477,8 +1474,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s) goto fail; sprev = s; p++; s++; - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; MOP_OUT; continue; break; @@ -1492,8 +1487,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s) goto fail; sprev = s; p++; s++; - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; MOP_OUT; continue; break; @@ -1509,8 +1502,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s) goto fail; sprev = s; p++; s++; - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; MOP_OUT; continue; break; @@ -1528,8 +1519,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s) goto fail; sprev = s; p++; s++; - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; MOP_OUT; continue; break; @@ -1541,8 +1530,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p++ != *s++) goto fail; } sprev = s - 1; - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; MOP_OUT; continue; break; @@ -1570,8 +1557,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } } } - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; + MOP_OUT; continue; break; @@ -1582,8 +1568,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; if (*p != *s) goto fail; p++; s++; - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; MOP_OUT; break; @@ -1598,8 +1582,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; if (*p != *s) goto fail; p++; s++; - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; MOP_OUT; continue; break; @@ -1619,8 +1601,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; if (*p != *s) goto fail; p++; s++; - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; MOP_OUT; continue; break; @@ -1635,8 +1615,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; } sprev = s - 2; - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; MOP_OUT; continue; break; @@ -1653,8 +1631,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; } sprev = s - 3; - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; MOP_OUT; continue; break; @@ -1669,8 +1645,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; } sprev = s - tlen; - if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) - goto fail; MOP_OUT; continue; break; @@ -2225,7 +2199,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, n = pend - pstart; DATA_ENSURE(n); sprev = s; - STRING_CMP_IC(case_fold_flag, pstart, &s, n, end); + STRING_CMP_IC(case_fold_flag, pstart, &s, n); while (sprev + (len = enclen(encode, sprev, end)) < s) sprev += len; @@ -2297,7 +2271,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, DATA_ENSURE(n); sprev = s; swork = s; - STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, end, is_fail); + STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); if (is_fail) continue; s = swork; while (sprev + (len = enclen(encode, sprev, end)) < s) @@ -2806,7 +2780,7 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end, if (target_end == t || memcmp(t, p, target_end - t) == 0) return s; } - s += enclen(enc, s, text_end); + s += enclen(enc, s, end); } return (UChar* )NULL; @@ -807,6 +807,24 @@ rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr) return c; } +#ifdef NONASCII_MASK +#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80) +static inline VALUE +count_utf8_lead_bytes_with_word(const VALUE *s) +{ + VALUE d = *s; + d |= ~(d>>1); + d >>= 6; + d &= NONASCII_MASK >> 7; + d += (d>>8); + d += (d>>16); +#if SIZEOF_VALUE == 8 + d += (d>>32); +#endif + return (d&0xF); +} +#endif + static long str_strlen(VALUE str, rb_encoding *enc) { @@ -817,6 +835,32 @@ str_strlen(VALUE str, rb_encoding *enc) if (!enc) enc = STR_ENC_GET(str); p = RSTRING_PTR(str); e = RSTRING_END(str); +#ifdef NONASCII_MASK + if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && + enc == rb_utf8_encoding()) { + VALUE len = 0; + if (sizeof(VALUE) * 2 < e - p) { + const VALUE *s, *t; + const VALUE lowbits = sizeof(VALUE) - 1; + s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); + t = (const VALUE*)(~lowbits & (VALUE)e); + while (p < (const char *)s) { + if (is_utf8_lead_byte(*p)) len++; + p++; + } + while (s < t) { + len += count_utf8_lead_bytes_with_word(s); + s++; + } + p = (const char *)s; + } + while (p < e) { + if (is_utf8_lead_byte(*p)) len++; + p++; + } + return (long)len; + } +#endif n = rb_enc_strlen_cr(p, e, enc, &cr); if (cr) { ENC_CODERANGE_SET(str, cr); @@ -1183,6 +1227,44 @@ str_offset(const char *p, const char *e, int nth, rb_encoding *enc, int singleby return pp - p; } +#ifdef NONASCII_MASK +static char * +str_utf8_nth(const char *p, const char *e, int nth) +{ + if (sizeof(VALUE) * 2 < nth) { + const VALUE *s, *t; + const VALUE lowbits = sizeof(VALUE) - 1; + s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); + t = (const VALUE*)(~lowbits & (VALUE)e); + while (p < (const char *)s) { + if (is_utf8_lead_byte(*p)) nth--; + p++; + } + do { + nth -= count_utf8_lead_bytes_with_word(s); + s++; + } while (s < t && sizeof(VALUE) <= nth); + p = (char *)s; + } + while (p < e) { + if (is_utf8_lead_byte(*p)) { + if (nth == 0) break; + nth--; + } + p++; + } + return (char *)p; +} + +static int +str_utf8_offset(const char *p, const char *e, int nth) +{ + const char *pp = str_utf8_nth(p, e, nth); + if (!pp) return e - p; + return pp - p; +} +#endif + /* byte offset to char offset */ long rb_str_sublen(VALUE str, long pos) @@ -1256,6 +1338,13 @@ rb_str_substr(VALUE str, long beg, long len) if (len == 0) { p = 0; } +#ifdef NONASCII_MASK + else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && + enc == rb_utf8_encoding()) { + p = str_utf8_nth(s, e, beg); + len = str_utf8_offset(p, e, len); + } +#endif else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { int char_sz = rb_enc_mbmaxlen(enc); @@ -1998,6 +2087,7 @@ rb_str_cmp_m(VALUE str1, VALUE str2) static VALUE rb_str_casecmp(VALUE str1, VALUE str2) { + long len; rb_encoding *enc; char *p1, *p1end, *p2, *p2end; @@ -2012,8 +2102,8 @@ rb_str_casecmp(VALUE str1, VALUE str2) if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) { while (p1 < p1end && p2 < p2end) { if (*p1 != *p2) { - unsigned int c1 = TOUPPER(*p1 & 0xff); - unsigned int c2 = TOUPPER(*p2 & 0xff); + unsigned int c1 = rb_enc_toupper(*p1 & 0xff, enc); + unsigned int c2 = rb_enc_toupper(*p2 & 0xff, enc); if (c1 > c2) return INT2FIX(1); if (c1 < c2) return INT2FIX(-1); } @@ -2023,42 +2113,18 @@ rb_str_casecmp(VALUE str1, VALUE str2) } else { while (p1 < p1end && p2 < p2end) { - int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc); - int l2, c2 = rb_enc_ascget(p2, p1end, &l2, enc); - - if (0 <= c1) { - if (0 <= c2) { - if (c1 != c2) { - c1 = TOUPPER(c1); - c2 = TOUPPER(c2); - if (c1 > c2) return INT2FIX(1); - if (c1 < c2) return INT2FIX(-1); - } - } - else { - return INT2FIX(-1); - } - } - else { - if (0 <= c2) { - return INT2FIX(1); - } - else { - int l, r; - l1 = rb_enc_mbclen(p1, p1end, enc); - l2 = rb_enc_mbclen(p2, p2end, enc); - l = l1; - if (l2 < l) - l = l2; - r = memcmp(p1, p2, l); - if (r != 0) - return INT2FIX(r < 0 ? -1 : 1); - if (l1 != l2) - return INT2FIX(l1 < l2 ? -1 : 1); - } - } - p1 += l1; - p2 += l2; + unsigned int c1 = rb_enc_codepoint(p1, p1end, enc); + unsigned int c2 = rb_enc_codepoint(p2, p2end, enc); + + if (c1 != c2) { + c1 = rb_enc_toupper(c1, enc); + c2 = rb_enc_toupper(c2, enc); + if (c1 > c2) return INT2FIX(1); + if (c1 < c2) return INT2FIX(-1); + } + len = rb_enc_codelen(c1, enc); + p1 += len; + p2 += len; } } if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0); @@ -2388,7 +2454,7 @@ enc_succ_char(char *p, int len, rb_encoding *enc) if (i < 0) return NEIGHBOR_WRAPPED; ++((unsigned char*)p)[i]; - rb_enc_mbc_precise_codepoint(p, p+len, &l, enc); + l = rb_enc_precise_mbclen(p, p+len, enc); if (MBCLEN_CHARFOUND_P(l)) { l = MBCLEN_CHARFOUND_LEN(l); if (l == len) { @@ -2401,7 +2467,7 @@ enc_succ_char(char *p, int len, rb_encoding *enc) if (MBCLEN_INVALID_P(l) && i < len-1) { int len2, l2; for (len2 = len-1; 0 < len2; len2--) { - rb_enc_mbc_precise_codepoint(p, p+len2, &l2, enc); + l2 = rb_enc_precise_mbclen(p, p+len2, enc); if (!MBCLEN_INVALID_P(l2)) break; } @@ -2420,7 +2486,7 @@ enc_pred_char(char *p, int len, rb_encoding *enc) if (i < 0) return NEIGHBOR_WRAPPED; --((unsigned char*)p)[i]; - rb_enc_mbc_precise_codepoint(p, p+len, &l, enc); + l = rb_enc_precise_mbclen(p, p+len, enc); if (MBCLEN_CHARFOUND_P(l)) { l = MBCLEN_CHARFOUND_LEN(l); if (l == len) { @@ -2433,7 +2499,7 @@ enc_pred_char(char *p, int len, rb_encoding *enc) if (MBCLEN_INVALID_P(l) && i < len-1) { int len2, l2; for (len2 = len-1; 0 < len2; len2--) { - rb_enc_mbc_precise_codepoint(p, p+len2, &l2, enc); + l2 = rb_enc_precise_mbclen(p, p+len2, enc); if (!MBCLEN_INVALID_P(l2)) break; } @@ -2540,7 +2606,7 @@ rb_str_succ(VALUE orig) VALUE str; char *sbeg, *s, *e, *last_alnum = 0; int c = -1; - int l; + long l; char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1"; int carry_pos = 0, carry_len = 1; enum neighbor_char neighbor = NEIGHBOR_FOUND; @@ -2562,8 +2628,7 @@ rb_str_succ(VALUE orig) break; } } - rb_enc_mbc_precise_codepoint(s, e, &l, enc); - if (l <= 0) continue; + if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; neighbor = enc_succ_alnum_char(s, l, enc, carry); switch (neighbor) { case NEIGHBOR_NOT_CHAR: @@ -2582,14 +2647,11 @@ rb_str_succ(VALUE orig) s = e; while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { enum neighbor_char neighbor; - int l2; - rb_enc_mbc_precise_codepoint(s, e, &l, enc); - if (l <= 0) continue; + if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; neighbor = enc_succ_char(s, l, enc); if (neighbor == NEIGHBOR_FOUND) return str; - rb_enc_mbc_precise_codepoint(s, s+l, &l2, enc); - if (l2 != l) { + if (rb_enc_precise_mbclen(s, s+l, enc) != l) { /* wrapped to \0...\0. search next valid char. */ enc_succ_char(s, l, enc); } diff --git a/test/ruby/enc/test_utf16.rb b/test/ruby/enc/test_utf16.rb index 64c1dede75..30dbb2e886 100644 --- a/test/ruby/enc/test_utf16.rb +++ b/test/ruby/enc/test_utf16.rb @@ -368,16 +368,4 @@ EOT r = Regexp.new(Regexp.escape(s)) assert(r =~ s, "#{encdump(r)} =~ #{encdump(s)}") end - - def test_casecmp - assert_equal(0, "\0A".force_encoding("UTF-16BE").casecmp("\0a".force_encoding("UTF-16BE"))) - assert_not_equal(0, "\0A".force_encoding("UTF-16LE").casecmp("\0a".force_encoding("UTF-16LE"))) - assert_not_equal(0, "A\0".force_encoding("UTF-16BE").casecmp("a\0".force_encoding("UTF-16BE"))) - assert_equal(0, "A\0".force_encoding("UTF-16LE").casecmp("a\0".force_encoding("UTF-16LE"))) - - ary = ["ab".force_encoding("UTF-16LE"), "ba".force_encoding("UTF-16LE")] - e = ary.sort {|x,y| x <=> y } - a = ary.sort {|x,y| x.casecmp(y) } - assert_equal(e, a) - end end diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb index f4fc38ebfa..372aaae1f3 100644 --- a/test/ruby/test_m17n.rb +++ b/test/ruby/test_m17n.rb @@ -1293,22 +1293,7 @@ class TestM17N < Test::Unit::TestCase "moved from btest/knownbug, [ruby-dev:33807]") end - def test_combchar - assert_equal(1, "\u{304B 3099}".length) - assert_equal("\u{304B 3099}", "\u{304B 3099}"[0]) - assert_equal(nil, "\u{304B 3099}"[1]) - assert_equal(nil, "\u{304B 3099}".index("\u3099")) - end - - def test_combchar_regexp - assert_match(/\A.\z/, "\u304B\u3099") - assert_nil(/\u3099/ =~ "\u304B\u3099") - assert_nil(/a|b/ =~ "a\u3099") - assert_nil(/\u0100|\u0111/ =~ "\u0100\u3099") - end - def test_combchar_codepoint assert_equal([0x30BB, 0x309A], "\u30BB\u309A".codepoints.to_a) end - end |