diff options
Diffstat (limited to 'src/win32/utf-conv.c')
-rw-r--r-- | src/win32/utf-conv.c | 175 |
1 files changed, 116 insertions, 59 deletions
diff --git a/src/win32/utf-conv.c b/src/win32/utf-conv.c index d4dbfbab9..b9ccfb5e5 100644 --- a/src/win32/utf-conv.c +++ b/src/win32/utf-conv.c @@ -8,74 +8,131 @@ #include "common.h" #include "utf-conv.h" -#define U16_LEAD(c) (wchar_t)(((c)>>10)+0xd7c0) -#define U16_TRAIL(c) (wchar_t)(((c)&0x3ff)|0xdc00) +#ifndef WC_ERR_INVALID_CHARS +# define WC_ERR_INVALID_CHARS 0x80 +#endif -#if 0 -void git__utf8_to_16(wchar_t *dest, size_t length, const char *src) +GIT_INLINE(DWORD) get_wc_flags(void) { - wchar_t *pDest = dest; - uint32_t ch; - const uint8_t* pSrc = (uint8_t*) src; - - assert(dest && src && length); - - length--; - - while(*pSrc && length > 0) { - ch = *pSrc++; - length--; - - if(ch < 0xc0) { - /* - * ASCII, or a trail byte in lead position which is treated like - * a single-byte sequence for better character boundary - * resynchronization after illegal sequences. - */ - *pDest++ = (wchar_t)ch; - continue; - } else if(ch < 0xe0) { /* U+0080..U+07FF */ - if (pSrc[0]) { - /* 0x3080 = (0xc0 << 6) + 0x80 */ - *pDest++ = (wchar_t)((ch << 6) + *pSrc++ - 0x3080); - continue; - } - } else if(ch < 0xf0) { /* U+0800..U+FFFF */ - if (pSrc[0] && pSrc[1]) { - /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ - /* 0x2080 = (0x80 << 6) + 0x80 */ - ch = (ch << 12) + (*pSrc++ << 6); - *pDest++ = (wchar_t)(ch + *pSrc++ - 0x2080); - continue; - } - } else /* f0..f4 */ { /* U+10000..U+10FFFF */ - if (length >= 1 && pSrc[0] && pSrc[1] && pSrc[2]) { - /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ - ch = (ch << 18) + (*pSrc++ << 12); - ch += *pSrc++ << 6; - ch += *pSrc++ - 0x3c82080; - *(pDest++) = U16_LEAD(ch); - *(pDest++) = U16_TRAIL(ch); - length--; /* two bytes for this character */ - continue; - } - } - - /* truncated character at the end */ - *pDest++ = 0xfffd; - break; + static char inited = 0; + static DWORD flags; + + /* Invalid code point check supported on Vista+ only */ + if (!inited) { + flags = git_has_win32_version(6, 0, 0) ? WC_ERR_INVALID_CHARS : 0; + inited = 1; } - *pDest++ = 0x0; + return flags; } -#endif -int git__utf8_to_16(wchar_t * dest, size_t dest_size, const char *src) +/** + * Converts a UTF-8 string to wide characters. + * + * @param dest The buffer to receive the wide string. + * @param dest_size The size of the buffer, in characters. + * @param src The UTF-8 string to convert. + * @return The length of the wide string, in characters (not counting the NULL terminator), or < 0 for failure + */ +int git__utf8_to_16(wchar_t *dest, size_t dest_size, const char *src) { - return MultiByteToWideChar(CP_UTF8, 0, src, -1, dest, (int)dest_size); + /* Length of -1 indicates NULL termination of the input string. Subtract 1 from the result to + * turn 0 into -1 (an error code) and to not count the NULL terminator as part of the string's + * length. MultiByteToWideChar never returns int's minvalue, so underflow is not possible */ + return MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src, -1, dest, (int)dest_size) - 1; } +/** + * Converts a wide string to UTF-8. + * + * @param dest The buffer to receive the UTF-8 string. + * @param dest_size The size of the buffer, in bytes. + * @param src The wide string to convert. + * @return The length of the UTF-8 string, in bytes (not counting the NULL terminator), or < 0 for failure + */ int git__utf16_to_8(char *dest, size_t dest_size, const wchar_t *src) { - return WideCharToMultiByte(CP_UTF8, 0, src, -1, dest, (int)dest_size, NULL, NULL); + /* Length of -1 indicates NULL termination of the input string. Subtract 1 from the result to + * turn 0 into -1 (an error code) and to not count the NULL terminator as part of the string's + * length. WideCharToMultiByte never returns int's minvalue, so underflow is not possible */ + return WideCharToMultiByte(CP_UTF8, get_wc_flags(), src, -1, dest, (int)dest_size, NULL, NULL) - 1; +} + +/** + * Converts a UTF-8 string to wide characters. + * Memory is allocated to hold the converted string. + * The caller is responsible for freeing the string with git__free. + * + * @param dest Receives a pointer to the wide string. + * @param src The UTF-8 string to convert. + * @return The length of the wide string, in characters (not counting the NULL terminator), or < 0 for failure + */ +int git__utf8_to_16_alloc(wchar_t **dest, const char *src) +{ + int utf16_size; + + *dest = NULL; + + /* Length of -1 indicates NULL termination of the input string */ + utf16_size = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src, -1, NULL, 0); + + if (!utf16_size) + return -1; + + *dest = git__malloc(utf16_size * sizeof(wchar_t)); + + if (!*dest) + return -1; + + utf16_size = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src, -1, *dest, utf16_size); + + if (!utf16_size) { + git__free(*dest); + *dest = NULL; + } + + /* Subtract 1 from the result to turn 0 into -1 (an error code) and to not count the NULL + * terminator as part of the string's length. MultiByteToWideChar never returns int's minvalue, + * so underflow is not possible */ + return utf16_size - 1; +} + +/** + * Converts a wide string to UTF-8. + * Memory is allocated to hold the converted string. + * The caller is responsible for freeing the string with git__free. + * + * @param dest Receives a pointer to the UTF-8 string. + * @param src The wide string to convert. + * @return The length of the UTF-8 string, in bytes (not counting the NULL terminator), or < 0 for failure + */ +int git__utf16_to_8_alloc(char **dest, const wchar_t *src) +{ + int utf8_size; + DWORD dwFlags = get_wc_flags(); + + *dest = NULL; + + /* Length of -1 indicates NULL termination of the input string */ + utf8_size = WideCharToMultiByte(CP_UTF8, dwFlags, src, -1, NULL, 0, NULL, NULL); + + if (!utf8_size) + return -1; + + *dest = git__malloc(utf8_size); + + if (!*dest) + return -1; + + utf8_size = WideCharToMultiByte(CP_UTF8, dwFlags, src, -1, *dest, utf8_size, NULL, NULL); + + if (!utf8_size) { + git__free(*dest); + *dest = NULL; + } + + /* Subtract 1 from the result to turn 0 into -1 (an error code) and to not count the NULL + * terminator as part of the string's length. MultiByteToWideChar never returns int's minvalue, + * so underflow is not possible */ + return utf8_size - 1; } |