diff options
Diffstat (limited to 'src/VBox/Runtime/common/string/utf-16.cpp')
-rw-r--r-- | src/VBox/Runtime/common/string/utf-16.cpp | 578 |
1 files changed, 0 insertions, 578 deletions
diff --git a/src/VBox/Runtime/common/string/utf-16.cpp b/src/VBox/Runtime/common/string/utf-16.cpp index c1259a6dd0f..f5b85686942 100644 --- a/src/VBox/Runtime/common/string/utf-16.cpp +++ b/src/VBox/Runtime/common/string/utf-16.cpp @@ -170,145 +170,6 @@ RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2) RT_EXPORT_SYMBOL(RTUtf16Cmp); -RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2) -{ - if (pwsz1 == pwsz2) - return 0; - if (!pwsz1) - return -1; - if (!pwsz2) - return 1; - - PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */ - for (;;) - { - register RTUTF16 wc1 = *pwsz1; - register RTUTF16 wc2 = *pwsz2; - register int iDiff = wc1 - wc2; - if (iDiff) - { - /* unless they are *both* surrogate pairs, there is no chance they'll be identical. */ - if ( wc1 < 0xd800 - || wc2 < 0xd800 - || wc1 > 0xdfff - || wc2 > 0xdfff) - { - /* simple UCS-2 char */ - iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2); - if (iDiff) - iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2); - } - else - { - /* a damned pair */ - RTUNICP uc1; - RTUNICP uc2; - if (wc1 >= 0xdc00) - { - if (pwsz1Start == pwsz1) - return iDiff; - uc1 = pwsz1[-1]; - if (uc1 < 0xd800 || uc1 >= 0xdc00) - return iDiff; - uc1 = 0x10000 + (((uc1 & 0x3ff) << 10) | (wc1 & 0x3ff)); - uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff)); - } - else - { - uc1 = *++pwsz1; - if (uc1 < 0xdc00 || uc1 >= 0xe000) - return iDiff; - uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1 & 0x3ff)); - uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff)); - } - iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2); - if (iDiff) - iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */ - } - if (iDiff) - return iDiff; - } - if (!wc1) - return 0; - pwsz1++; - pwsz2++; - } -} -RT_EXPORT_SYMBOL(RTUtf16ICmp); - - -RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz) -{ - PRTUTF16 pwc = pwsz; - for (;;) - { - RTUTF16 wc = *pwc; - if (!wc) - break; - if (wc < 0xd800 || wc >= 0xdc00) - { - RTUNICP ucFolded = RTUniCpToLower(wc); - if (ucFolded < 0x10000) - *pwc++ = RTUniCpToLower(wc); - } - else - { - /* surrogate */ - RTUTF16 wc2 = pwc[1]; - if (wc2 >= 0xdc00 && wc2 <= 0xdfff) - { - RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff)); - RTUNICP ucFolded = RTUniCpToLower(uc); - if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */ - { - uc -= 0x10000; - *pwc++ = 0xd800 | (uc >> 10); - *pwc++ = 0xdc00 | (uc & 0x3ff); - } - } - else /* invalid encoding. */ - pwc++; - } - } - return pwsz; -} -RT_EXPORT_SYMBOL(RTUtf16ToLower); - - -RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz) -{ - PRTUTF16 pwc = pwsz; - for (;;) - { - RTUTF16 wc = *pwc; - if (!wc) - break; - if (wc < 0xd800 || wc >= 0xdc00) - *pwc++ = RTUniCpToUpper(wc); - else - { - /* surrogate */ - RTUTF16 wc2 = pwc[1]; - if (wc2 >= 0xdc00 && wc2 <= 0xdfff) - { - RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff)); - RTUNICP ucFolded = RTUniCpToUpper(uc); - if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */ - { - uc -= 0x10000; - *pwc++ = 0xd800 | (uc >> 10); - *pwc++ = 0xdc00 | (uc & 0x3ff); - } - } - else /* invalid encoding. */ - pwc++; - } - } - return pwsz; -} -RT_EXPORT_SYMBOL(RTUtf16ToUpper); - - RTDECL(int) RTUtf16ValidateEncoding(PCRTUTF16 pwsz) { return RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0); @@ -785,442 +646,3 @@ RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint) } RT_EXPORT_SYMBOL(RTUtf16PutCpInternal); - -/** - * Validate the UTF-16 encoding and calculates the length of a Latin1 encoding. - * - * @returns iprt status code. - * @param pwsz The UTF-16 string. - * @param cwc The max length of the UTF-16 string to consider. - * @param pcch Where to store the length (excluding '\\0') of the Latin1 string. (cch == cb, btw) - */ -static int rtUtf16CalcLatin1Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch) -{ - int rc = VINF_SUCCESS; - size_t cch = 0; - while (cwc > 0) - { - RTUTF16 wc = *pwsz++; cwc--; - if (!wc) - break; - else if (RT_LIKELY(wc < 0x100)) - ++cch; - else - { - if (wc < 0xd800 || wc > 0xdfff) - { - if (wc >= 0xfffe) - { - RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc)); - rc = VERR_CODE_POINT_ENDIAN_INDICATOR; - break; - } - } - else - { - if (wc >= 0xdc00) - { - RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc)); - rc = VERR_INVALID_UTF16_ENCODING; - break; - } - if (cwc <= 0) - { - RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc)); - rc = VERR_INVALID_UTF16_ENCODING; - break; - } - wc = *pwsz++; cwc--; - if (wc < 0xdc00 || wc > 0xdfff) - { - RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc)); - rc = VERR_INVALID_UTF16_ENCODING; - break; - } - } - - rc = VERR_NO_TRANSLATION; - break; - } - } - - /* done */ - *pcch = cch; - return rc; -} - - -/** - * Recodes an valid UTF-16 string as Latin1. - * - * @returns iprt status code. - * @param pwsz The UTF-16 string. - * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding - * will stop when cwc or '\\0' is reached. - * @param psz Where to store the Latin1 string. - * @param cch The size of the Latin1 buffer, excluding the terminator. - */ -static int rtUtf16RecodeAsLatin1(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch) -{ - unsigned char *pch = (unsigned char *)psz; - int rc = VINF_SUCCESS; - while (cwc > 0) - { - RTUTF16 wc = *pwsz++; cwc--; - if (!wc) - break; - if (RT_LIKELY(wc < 0x100)) - { - if (RT_UNLIKELY(cch < 1)) - { - RTStrAssertMsgFailed(("Buffer overflow! 1\n")); - rc = VERR_BUFFER_OVERFLOW; - break; - } - cch--; - *pch++ = (unsigned char)wc; - } - else - { - if (wc < 0xd800 || wc > 0xdfff) - { - if (wc >= 0xfffe) - { - RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc)); - rc = VERR_CODE_POINT_ENDIAN_INDICATOR; - break; - } - } - else - { - if (wc >= 0xdc00) - { - RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc)); - rc = VERR_INVALID_UTF16_ENCODING; - break; - } - if (cwc <= 0) - { - RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc)); - rc = VERR_INVALID_UTF16_ENCODING; - break; - } - RTUTF16 wc2 = *pwsz++; cwc--; - if (wc2 < 0xdc00 || wc2 > 0xdfff) - { - RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc)); - rc = VERR_INVALID_UTF16_ENCODING; - break; - } - } - - rc = VERR_NO_TRANSLATION; - break; - } - } - - /* done */ - *pch = '\0'; - return rc; -} - - -RTDECL(int) RTUtf16ToLatin1Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag) -{ - /* - * Validate input. - */ - Assert(VALID_PTR(ppszString)); - Assert(VALID_PTR(pwszString)); - *ppszString = NULL; - - /* - * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it. - */ - size_t cch; - int rc = rtUtf16CalcLatin1Length(pwszString, RTSTR_MAX, &cch); - if (RT_SUCCESS(rc)) - { - /* - * Allocate buffer and recode it. - */ - char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag); - if (pszResult) - { - rc = rtUtf16RecodeAsLatin1(pwszString, RTSTR_MAX, pszResult, cch); - if (RT_SUCCESS(rc)) - { - *ppszString = pszResult; - return rc; - } - - RTMemFree(pszResult); - } - else - rc = VERR_NO_STR_MEMORY; - } - return rc; -} -RT_EXPORT_SYMBOL(RTUtf16ToLatin1Tag); - - -RTDECL(int) RTUtf16ToLatin1ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag) -{ - /* - * Validate input. - */ - AssertPtr(pwszString); - AssertPtr(ppsz); - AssertPtrNull(pcch); - - /* - * Validate the UTF-16 string and calculate the length of the Latin1 encoding of it. - */ - size_t cchResult; - int rc = rtUtf16CalcLatin1Length(pwszString, cwcString, &cchResult); - if (RT_SUCCESS(rc)) - { - if (pcch) - *pcch = cchResult; - - /* - * Check buffer size / Allocate buffer and recode it. - */ - bool fShouldFree; - char *pszResult; - if (cch > 0 && *ppsz) - { - fShouldFree = false; - if (cch <= cchResult) - return VERR_BUFFER_OVERFLOW; - pszResult = *ppsz; - } - else - { - *ppsz = NULL; - fShouldFree = true; - cch = RT_MAX(cch, cchResult + 1); - pszResult = (char *)RTMemAllocTag(cch, pszTag); - } - if (pszResult) - { - rc = rtUtf16RecodeAsLatin1(pwszString, cwcString, pszResult, cch - 1); - if (RT_SUCCESS(rc)) - { - *ppsz = pszResult; - return rc; - } - - if (fShouldFree) - RTMemFree(pszResult); - } - else - rc = VERR_NO_STR_MEMORY; - } - return rc; -} -RT_EXPORT_SYMBOL(RTUtf16ToLatin1ExTag); - - -RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz) -{ - size_t cch; - int rc = rtUtf16CalcLatin1Length(pwsz, RTSTR_MAX, &cch); - return RT_SUCCESS(rc) ? cch : 0; -} -RT_EXPORT_SYMBOL(RTUtf16CalcLatin1Len); - - -RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch) -{ - size_t cch; - int rc = rtUtf16CalcLatin1Length(pwsz, cwc, &cch); - if (pcch) - *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0; - return rc; -} -RT_EXPORT_SYMBOL(RTUtf16CalcLatin1LenEx); - - -/** - * Calculates the UTF-16 length of a Latin1 string. In fact this is just the - * original length, but the function saves us nasty comments to that effect - * all over the place. - * - * @returns IPRT status code. - * @param psz Pointer to the Latin1 string. - * @param cch The max length of the string. (btw cch = cb) - * Use RTSTR_MAX if all of the string is to be examined.s - * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters. - */ -static int rtLatin1CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc) -{ - *pcwc = RTStrNLen(psz, cch); - return VINF_SUCCESS; -} - - -/** - * Recodes a Latin1 string as UTF-16. This is just a case of expanding it to - * sixteen bits, as Unicode is a superset of Latin1. - * - * Since we know the input is valid, we do *not* perform length checks. - * - * @returns iprt status code. - * @param psz The Latin1 string to recode. - * @param cch The number of chars (the type char, so bytes if you like) to process of the Latin1 string. - * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'. - * @param pwsz Where to store the UTF-16 string. - * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0'). - */ -static int rtLatin1RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc) -{ - int rc = VINF_SUCCESS; - const unsigned char *puch = (const unsigned char *)psz; - PRTUTF16 pwc = pwsz; - while (cch-- > 0) - { - /* read the next char and check for terminator. */ - const unsigned char uch = *puch; - if (!uch) - break; - - /* check for output overflow */ - if (RT_UNLIKELY(cwc < 1)) - { - rc = VERR_BUFFER_OVERFLOW; - break; - } - - /* expand the code point */ - *pwc++ = uch; - cwc--; - puch++; - } - - /* done */ - *pwc = '\0'; - return rc; -} - - -RTDECL(int) RTLatin1ToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag) -{ - /* - * Validate input. - */ - Assert(VALID_PTR(ppwszString)); - Assert(VALID_PTR(pszString)); - *ppwszString = NULL; - - /* - * Validate the input and calculate the length of the UTF-16 string. - */ - size_t cwc; - int rc = rtLatin1CalcUtf16Length(pszString, RTSTR_MAX, &cwc); - if (RT_SUCCESS(rc)) - { - /* - * Allocate buffer. - */ - PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag); - if (pwsz) - { - /* - * Encode the UTF-16 string. - */ - rc = rtLatin1RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc); - if (RT_SUCCESS(rc)) - { - *ppwszString = pwsz; - return rc; - } - RTMemFree(pwsz); - } - else - rc = VERR_NO_UTF16_MEMORY; - } - return rc; -} -RT_EXPORT_SYMBOL(RTLatin1ToUtf16Tag); - - -RTDECL(int) RTLatin1ToUtf16ExTag(const char *pszString, size_t cchString, - PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag) -{ - /* - * Validate input. - */ - Assert(VALID_PTR(pszString)); - Assert(VALID_PTR(ppwsz)); - Assert(!pcwc || VALID_PTR(pcwc)); - - /* - * Validate the input and calculate the length of the UTF-16 string. - */ - size_t cwcResult; - int rc = rtLatin1CalcUtf16Length(pszString, cchString, &cwcResult); - if (RT_SUCCESS(rc)) - { - if (pcwc) - *pcwc = cwcResult; - - /* - * Check buffer size / Allocate buffer. - */ - bool fShouldFree; - PRTUTF16 pwszResult; - if (cwc > 0 && *ppwsz) - { - fShouldFree = false; - if (cwc <= cwcResult) - return VERR_BUFFER_OVERFLOW; - pwszResult = *ppwsz; - } - else - { - *ppwsz = NULL; - fShouldFree = true; - cwc = RT_MAX(cwcResult + 1, cwc); - pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag); - } - if (pwszResult) - { - /* - * Encode the UTF-16 string. - */ - rc = rtLatin1RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1); - if (RT_SUCCESS(rc)) - { - *ppwsz = pwszResult; - return rc; - } - if (fShouldFree) - RTMemFree(pwszResult); - } - else - rc = VERR_NO_UTF16_MEMORY; - } - return rc; -} -RT_EXPORT_SYMBOL(RTLatin1ToUtf16ExTag); - - -RTDECL(size_t) RTLatin1CalcUtf16Len(const char *psz) -{ - size_t cwc; - int rc = rtLatin1CalcUtf16Length(psz, RTSTR_MAX, &cwc); - return RT_SUCCESS(rc) ? cwc : 0; -} -RT_EXPORT_SYMBOL(RTLatin1CalcUtf16Len); - - -RTDECL(int) RTLatin1CalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc) -{ - size_t cwc; - int rc = rtLatin1CalcUtf16Length(psz, cch, &cwc); - if (pcwc) - *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0; - return rc; -} -RT_EXPORT_SYMBOL(RTLatin1CalcUtf16LenEx); |