/*
 * Provenance: commit 3b73a03497e4fd67459960318308c9265bcd7805
 * ("UTF-8 changes yo", Vicent Martí, Wed, 25 Apr 2012 16:26:12 -0700).
 * The code below was added to src/win32/utf-conv.c between
 * gitwin_set_utf8() and gitwin_to_utf16().
 */

/*
 * Split a supplementary code point (U+10000..U+10FFFF) into its UTF-16
 * surrogate pair.  0xd7c0 == 0xd800 - (0x10000 >> 10).  The explicit
 * 16-bit mask keeps the result a single UTF-16 code unit even where
 * wchar_t is wider than 16 bits.
 */
#define U16_LEAD(c) ((wchar_t)((((c) >> 10) + 0xd7c0) & 0xffff))
#define U16_TRAIL(c) ((wchar_t)(((c) & 0x3ff) | 0xdc00))

/**
 * Leniently convert a UTF-8 byte string to a NUL-terminated UTF-16 string.
 *
 * The input is NOT validated: continuation bytes are consumed without
 * checking that they are of the form 10xxxxxx, and a byte below 0xc0 in
 * lead position (ASCII or a stray trail byte) is copied through as a
 * single code unit so that decoding resynchronizes on the next byte.
 * If the input ends in the middle of a multi-byte sequence, U+FFFD is
 * emitted for the incomplete sequence and conversion stops.
 *
 * @param dest      output buffer; must have room for at least
 *                  srcLength + 1 elements (no source byte produces more
 *                  than one code unit, plus the terminating NUL)
 * @param src       UTF-8 input; does not need to be NUL-terminated
 * @param srcLength number of bytes to read from src (may be 0, in which
 *                  case only the terminator is written)
 */
void git__utf8_to_16(wchar_t *dest, const char *src, size_t srcLength)
{
	wchar_t *pDest = dest;
	uint32_t ch;
	const uint8_t *pSrc = (const uint8_t *)src;
	const uint8_t *pSrcLimit = pSrc + srcLength;

	assert(dest && src);

	if ((pSrcLimit - pSrc) >= 4) {
		pSrcLimit -= 3; /* temporarily reduce pSrcLimit */

		/*
		 * Fast path: while pSrc is below the reduced limit, at least 4
		 * bytes (up to pSrc+3) are readable, so the longest sequence
		 * can be decoded without per-byte bounds checks.
		 */
		do {
			ch = *pSrc++;
			if (ch < 0xc0) {
				/*
				 * ASCII, or a trail byte in lead position which is
				 * treated like a single-byte sequence for better
				 * character boundary resynchronization after
				 * illegal sequences.
				 */
				*pDest++ = (wchar_t)ch;
			} else if (ch < 0xe0) { /* U+0080..U+07FF */
				/* 0x3080 = (0xc0 << 6) + 0x80 */
				ch = (ch << 6) + (uint32_t)*pSrc++;
				*pDest++ = (wchar_t)((ch - 0x3080) & 0xffff);
			} else if (ch < 0xf0) { /* U+0800..U+FFFF */
				/*
				 * No (ch & 0xf) on the lead byte: its upper bits
				 * land above bit 15 after << 12 and are removed by
				 * the 16-bit mask.
				 * 0x2080 = (0x80 << 6) + 0x80
				 */
				ch = (ch << 12) + ((uint32_t)*pSrc++ << 6);
				ch += (uint32_t)*pSrc++;
				*pDest++ = (wchar_t)((ch - 0x2080) & 0xffff);
			} else /* f0..f4 */ { /* U+10000..U+10FFFF */
				/* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
				ch = (ch << 18) + ((uint32_t)*pSrc++ << 12);
				ch += (uint32_t)*pSrc++ << 6;
				ch += (uint32_t)*pSrc++;
				ch -= 0x3c82080;
				*pDest++ = U16_LEAD(ch);
				*pDest++ = U16_TRAIL(ch);
			}
		} while (pSrc < pSrcLimit);

		pSrcLimit += 3; /* restore original pSrcLimit */
	}

	/*
	 * Tail: fewer than 4 bytes remain, so every multi-byte sequence is
	 * bounds-checked before its continuation bytes are read.  The
	 * continuation bytes are consumed by the *pSrc++ reads themselves;
	 * the pointer must not be advanced any further afterwards.
	 */
	while (pSrc < pSrcLimit) {
		ch = *pSrc++;
		if (ch < 0xc0) {
			/* ASCII or stray trail byte: copy through (see above) */
			*pDest++ = (wchar_t)ch;
			continue;
		} else if (ch < 0xe0) { /* U+0080..U+07FF */
			if (pSrc < pSrcLimit) {
				/* 0x3080 = (0xc0 << 6) + 0x80 */
				ch = (ch << 6) + (uint32_t)*pSrc++;
				*pDest++ = (wchar_t)((ch - 0x3080) & 0xffff);
				continue;
			}
		} else if (ch < 0xf0) { /* U+0800..U+FFFF */
			if ((pSrcLimit - pSrc) >= 2) {
				/* 0x2080 = (0x80 << 6) + 0x80 */
				ch = (ch << 12) + ((uint32_t)*pSrc++ << 6);
				ch += (uint32_t)*pSrc++;
				*pDest++ = (wchar_t)((ch - 0x2080) & 0xffff);
				continue;
			}
		} else /* f0..f4 */ { /* U+10000..U+10FFFF */
			if ((pSrcLimit - pSrc) >= 3) {
				/* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
				ch = (ch << 18) + ((uint32_t)*pSrc++ << 12);
				ch += (uint32_t)*pSrc++ << 6;
				ch += (uint32_t)*pSrc++;
				ch -= 0x3c82080;
				*pDest++ = U16_LEAD(ch);
				*pDest++ = U16_TRAIL(ch);
				continue;
			}
		}

		/* truncated character at the end */
		*pDest++ = 0xfffd;
		break;
	}

	*pDest = 0x0;
}