From 2c8f2871a8aeff592369a993b1d69557160cfa61 Mon Sep 17 00:00:00 2001 From: Josef Haider <4835525+djoooooe@users.noreply.github.com> Date: Sat, 18 Mar 2023 07:43:54 +0100 Subject: Fix handling of 6-byte codepoints in left_adjust_char_head in CESU-8 encoding --- enc/cesu_8.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'enc') diff --git a/enc/cesu_8.c b/enc/cesu_8.c index decbb928f4..75f62df280 100644 --- a/enc/cesu_8.c +++ b/enc/cesu_8.c @@ -42,6 +42,8 @@ #define VALID_CODE_LIMIT 0x0010ffff #define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80) +#define utf16_is_high_surrogate(v) ((v >> 10) == 0x36) +#define utf16_is_low_surrogate(v) ((v >> 10) == 0x37) static const int EncLen_CESU8[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -283,6 +285,12 @@ is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc) return 0; } +static int +utf8_decode_3byte_sequence(const UChar* p) +{ + return ((p[0] & 0xF) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f); +} + static OnigCodePoint mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) { @@ -295,11 +303,11 @@ mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) case 2: return ((p[0] & 0x1F) << 6) | (p[1] & 0x3f); case 3: - return ((p[0] & 0xF) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f); + return utf8_decode_3byte_sequence(p); case 6: { - int high = ((p[0] & 0xF) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f); - int low = ((p[3] & 0xF) << 12) | ((p[4] & 0x3f) << 6) | (p[5] & 0x3f); + int high = utf8_decode_3byte_sequence(p); + int low = utf8_decode_3byte_sequence(p + 3); return ((high & 0x03ff) << 10) + (low & 0x03ff) + 0x10000; } } @@ -410,7 +418,6 @@ get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out, return onigenc_unicode_ctype_code_range(ctype, ranges); } - static UChar* left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED) { @@ -420,6 +427,14 @@ left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, Onig p = s; while (!utf8_islead(*p) && p > start) p--; + + if (p > start && s - p == 2 && utf16_is_low_surrogate(utf8_decode_3byte_sequence(p))) { + const UChar *p_surrogate_pair = p - 1; + while (!utf8_islead(*p_surrogate_pair) && p_surrogate_pair > start) p_surrogate_pair--; + if (p - p_surrogate_pair == 3 && utf16_is_high_surrogate(utf8_decode_3byte_sequence(p_surrogate_pair))) { + return (UChar* )p_surrogate_pair; + } + } return (UChar* )p; } -- cgit v1.2.1