summaryrefslogtreecommitdiff
path: root/subversion/libsvn_subr/utf.c
diff options
context:
space:
mode:
Diffstat (limited to 'subversion/libsvn_subr/utf.c')
-rw-r--r--subversion/libsvn_subr/utf.c338
1 files changed, 255 insertions, 83 deletions
diff --git a/subversion/libsvn_subr/utf.c b/subversion/libsvn_subr/utf.c
index 4f9102d..7d20d24 100644
--- a/subversion/libsvn_subr/utf.c
+++ b/subversion/libsvn_subr/utf.c
@@ -59,6 +59,12 @@ static const char *SVN_APR_UTF8_CHARSET = "UTF-8";
static svn_mutex__t *xlate_handle_mutex = NULL;
static svn_boolean_t assume_native_charset_is_utf8 = FALSE;
+#if defined(WIN32)
+typedef svn_subr__win32_xlate_t xlate_handle_t;
+#else
+typedef apr_xlate_t xlate_handle_t;
+#endif
+
/* The xlate handle cache is a global hash table with linked lists of xlate
* handles. In multi-threaded environments, a thread "borrows" an xlate
* handle from the cache during a translation and puts it back afterwards.
@@ -69,7 +75,7 @@ static svn_boolean_t assume_native_charset_is_utf8 = FALSE;
* is the number of simultanous handles in use for that key. */
typedef struct xlate_handle_node_t {
- apr_xlate_t *handle;
+ xlate_handle_t *handle;
/* FALSE if the handle is not valid, since its pool is being
destroyed. */
svn_boolean_t valid;
@@ -172,7 +178,7 @@ get_xlate_key(const char *topage,
topage = "APR_DEFAULT_CHARSET";
return apr_pstrcat(pool, "svn-utf-", frompage, "to", topage,
- "-xlate-handle", (char *)NULL);
+ "-xlate-handle", SVN_VA_NULL);
}
/* Atomically replace the content in *MEM with NEW_VALUE and return
@@ -184,16 +190,10 @@ static APR_INLINE void*
atomic_swap(void * volatile * mem, void *new_value)
{
#if APR_HAS_THREADS
-#if APR_VERSION_AT_LEAST(1,3,0)
/* Cast is necessary because of APR bug:
https://issues.apache.org/bugzilla/show_bug.cgi?id=50731 */
return apr_atomic_xchgptr((volatile void **)mem, new_value);
#else
- /* old APRs don't support atomic swaps. Simply return the
- * input to the caller for further proccessing. */
- return new_value;
-#endif
-#else
/* no threads - no sync. necessary */
void *old_value = (void*)*mem;
*mem = new_value;
@@ -211,7 +211,7 @@ xlate_alloc_handle(xlate_handle_node_t **ret,
apr_pool_t *pool)
{
apr_status_t apr_err;
- apr_xlate_t *handle;
+ xlate_handle_t *handle;
const char *name;
/* The error handling doesn't support the following cases, since we don't
@@ -223,7 +223,7 @@ xlate_alloc_handle(xlate_handle_node_t **ret,
/* Try to create a handle. */
#if defined(WIN32)
- apr_err = svn_subr__win32_xlate_open((win32_xlate_t **)&handle, topage,
+ apr_err = svn_subr__win32_xlate_open(&handle, topage,
frompage, pool);
name = "win32-xlate: ";
#else
@@ -257,7 +257,7 @@ xlate_alloc_handle(xlate_handle_node_t **ret,
later. APR_STRERR will be in the local encoding, not in UTF-8, though.
*/
svn_strerror(apr_err, apr_strerr, sizeof(apr_strerr));
- return svn_error_createf(SVN_ERR_PLUGIN_LOAD_FAILURE,
+ return svn_error_createf(SVN_ERR_PLUGIN_LOAD_FAILURE,
svn_error_create(apr_err, NULL, apr_strerr),
"%s%s", name, errstr);
}
@@ -480,58 +480,6 @@ get_uton_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
}
-/* Copy LEN bytes of SRC, converting non-ASCII and zero bytes to ?\nnn
- sequences, allocating the result in POOL. */
-static const char *
-fuzzy_escape(const char *src, apr_size_t len, apr_pool_t *pool)
-{
- const char *src_orig = src, *src_end = src + len;
- apr_size_t new_len = 0;
- char *new;
- const char *new_orig;
-
- /* First count how big a dest string we'll need. */
- while (src < src_end)
- {
- if (! svn_ctype_isascii(*src) || *src == '\0')
- new_len += 5; /* 5 slots, for "?\XXX" */
- else
- new_len += 1; /* one slot for the 7-bit char */
-
- src++;
- }
-
- /* Allocate that amount, plus one slot for '\0' character. */
- new = apr_palloc(pool, new_len + 1);
-
- new_orig = new;
-
- /* And fill it up. */
- while (src_orig < src_end)
- {
- if (! svn_ctype_isascii(*src_orig) || src_orig == '\0')
- {
- /* This is the same format as svn_xml_fuzzy_escape uses, but that
- function escapes different characters. Please keep in sync!
- ### If we add another fuzzy escape somewhere, we should abstract
- ### this out to a common function. */
- apr_snprintf(new, 6, "?\\%03u", (unsigned char) *src_orig);
- new += 5;
- }
- else
- {
- *new = *src_orig;
- new += 1;
- }
-
- src_orig++;
- }
-
- *new = '\0';
-
- return new_orig;
-}
-
/* Convert SRC_LENGTH bytes of SRC_DATA in NODE->handle, store the result
in *DEST, which is allocated in POOL. */
static svn_error_t *
@@ -544,9 +492,8 @@ convert_to_stringbuf(xlate_handle_node_t *node,
#ifdef WIN32
apr_status_t apr_err;
- apr_err = svn_subr__win32_xlate_to_stringbuf((win32_xlate_t *) node->handle,
- src_data, src_length,
- dest, pool);
+ apr_err = svn_subr__win32_xlate_to_stringbuf(node->handle, src_data,
+ src_length, dest, pool);
#else
apr_size_t buflen = src_length * 2;
apr_status_t apr_err;
@@ -609,8 +556,8 @@ convert_to_stringbuf(xlate_handle_node_t *node,
(pool, _("Can't convert string from '%s' to '%s':"),
node->frompage, node->topage);
- err = svn_error_create(apr_err, NULL, fuzzy_escape(src_data,
- src_length, pool));
+ err = svn_error_create(
+ apr_err, NULL, svn_utf__fuzzy_escape(src_data, src_length, pool));
return svn_error_create(apr_err, err, errstr);
}
/* Else, exited due to success. Trim the result buffer down to the
@@ -691,7 +638,7 @@ invalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
valid_txt = apr_pstrcat(pool, valid_txt,
apr_psprintf(pool, " %02x",
(unsigned char)last[i-valid]),
- (char *)NULL);
+ SVN_VA_NULL);
/* 4 invalid octets will guarantee that the faulty octet is displayed */
invalid = data + len - last;
@@ -701,7 +648,7 @@ invalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
invalid_txt = apr_pstrcat(pool, invalid_txt,
apr_psprintf(pool, " %02x",
(unsigned char)last[i]),
- (char *)NULL);
+ SVN_VA_NULL);
return svn_error_createf(APR_EINVAL, NULL,
_("Valid UTF-8 data\n(hex:%s)\n"
@@ -986,18 +933,6 @@ svn_utf_cstring_from_utf8_ex2(const char **dest,
return err;
}
-
-svn_error_t *
-svn_utf_cstring_from_utf8_ex(const char **dest,
- const char *src,
- const char *topage,
- const char *convset_key,
- apr_pool_t *pool)
-{
- return svn_utf_cstring_from_utf8_ex2(dest, src, topage, pool);
-}
-
-
const char *
svn_utf__cstring_from_utf8_fuzzy(const char *src,
apr_pool_t *pool,
@@ -1007,7 +942,7 @@ svn_utf__cstring_from_utf8_fuzzy(const char *src,
const char *escaped, *converted;
svn_error_t *err;
- escaped = fuzzy_escape(src, strlen(src), pool);
+ escaped = svn_utf__fuzzy_escape(src, strlen(src), pool);
/* Okay, now we have a *new* UTF-8 string, one that's guaranteed to
contain only 7-bit bytes :-). Recode to native... */
@@ -1084,3 +1019,240 @@ svn_utf_cstring_from_utf8_string(const char **dest,
return err;
}
+
+
+/* Store the UCS-4 code point VALUE as the element at index OFFSET of the
+   apr_int32_t array held in BUF.  Before the store, BUF is resized via
+   svn_membuf__resize() to the byte size of (OFFSET + 1) elements. */
+static void
+membuf_insert_ucs4(svn_membuf_t *buf, apr_size_t offset, apr_int32_t value)
+{
+  apr_int32_t *codes;
+
+  svn_membuf__resize(buf, (offset + 1) * sizeof(apr_int32_t));
+  codes = (apr_int32_t *)buf->data;
+  codes[offset] = value;
+}
+
+/* TODO: Use compiler intrinsics for byte swaps. */
+/* Exchange the two low-order bytes of X; all higher bits of the result
+   are zero.  X is evaluated twice, so it must not have side effects. */
+#define SWAP_SHORT(x) ((((x) & 0xff) << 8) | (((x) >> 8) & 0xff))
+/* Reverse the order of the four low-order bytes of X.  X is evaluated
+   four times, so it must not have side effects. */
+#define SWAP_LONG(x) ((((x) & 0xff) << 24) | (((x) & 0xff00) << 8) \
+ | (((x) >> 8) & 0xff00) | (((x) >> 24) & 0xff))
+
+/* True if the UTF-16 code unit C lies in the lead-surrogate range
+   0xd800-0xdbff.  C is evaluated twice. */
+#define IS_UTF16_LEAD_SURROGATE(c) ((c) >= 0xd800 && (c) <= 0xdbff)
+/* True if the UTF-16 code unit C lies in the trail-surrogate range
+   0xdc00-0xdfff.  C is evaluated twice. */
+#define IS_UTF16_TRAIL_SURROGATE(c) ((c) >= 0xdc00 && (c) <= 0xdfff)
+
+/* Convert the UTF-16 string UTF16STR, consisting of UTF16LEN 16-bit code
+   units, to UTF-8 and return it in *RESULT, allocated in RESULT_POOL.
+
+   If UTF16LEN is SVN_UTF__UNKNOWN_LENGTH, UTF16STR is scanned up to (and
+   not including) the first zero code unit to determine its length.
+
+   BIG_ENDIAN tells whether the code units in UTF16STR are stored in
+   big-endian byte order; they are byte-swapped when that differs from
+   the byte order of the host.
+
+   Valid surrogate pairs are combined into a single code point.  A lead
+   surrogate that is not followed by a trail surrogate is passed on
+   unchanged as its own code point.
+
+   The intermediate UCS-4 buffer is allocated in SCRATCH_POOL.  Any error
+   returned by svn_utf__encode_ucs4_string() is propagated. */
+svn_error_t *
+svn_utf__utf16_to_utf8(const svn_string_t **result,
+                       const apr_uint16_t *utf16str,
+                       apr_size_t utf16len,
+                       svn_boolean_t big_endian,
+                       apr_pool_t *result_pool,
+                       apr_pool_t *scratch_pool)
+{
+  /* On a big-endian host the least significant byte (0x5a) of the probe
+     value is stored last. */
+  static const apr_uint16_t endiancheck = 0xa55a;
+  const svn_boolean_t arch_big_endian =
+    (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a');
+  const svn_boolean_t swap_order = (!big_endian != !arch_big_endian);
+
+  apr_uint16_t lead_surrogate;
+  apr_size_t length;
+  apr_size_t offset;
+  svn_membuf_t ucs4buf;
+  svn_membuf_t resultbuf;
+  svn_string_t *res;
+
+  if (utf16len == SVN_UTF__UNKNOWN_LENGTH)
+    {
+      /* Stop ON the terminator, not one past it, so that the terminating
+         zero code unit is not counted as part of the string. */
+      const apr_uint16_t *endp = utf16str;
+      while (*endp)
+        ++endp;
+      utf16len = (endp - utf16str);
+    }
+
+  svn_membuf__create(&ucs4buf, utf16len * sizeof(apr_int32_t), scratch_pool);
+
+  for (lead_surrogate = 0, length = 0, offset = 0;
+       offset < utf16len; ++offset)
+    {
+      const apr_uint16_t code =
+        (swap_order ? SWAP_SHORT(utf16str[offset]) : utf16str[offset]);
+
+      if (lead_surrogate)
+        {
+          if (IS_UTF16_TRAIL_SURROGATE(code))
+            {
+              /* Combine the lead and trail surrogates into a 32-bit code. */
+              membuf_insert_ucs4(&ucs4buf, length++,
+                                 (0x010000
+                                  + (((lead_surrogate & 0x03ff) << 10)
+                                     | (code & 0x03ff))));
+              lead_surrogate = 0;
+              continue;
+            }
+          else
+            {
+              /* If we didn't find a surrogate pair, just dump the
+                 lead surrogate into the stream. */
+              membuf_insert_ucs4(&ucs4buf, length++, lead_surrogate);
+              lead_surrogate = 0;
+            }
+        }
+
+      if ((offset + 1) < utf16len && IS_UTF16_LEAD_SURROGATE(code))
+        {
+          /* Store a lead surrogate that is followed by at least one
+             code for the next iteration. */
+          lead_surrogate = code;
+          continue;
+        }
+      else
+        membuf_insert_ucs4(&ucs4buf, length++, code);
+    }
+
+  /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes
+     per code point for encoding. The buffer will grow as
+     necessary. */
+  svn_membuf__create(&resultbuf, length * 2, result_pool);
+  SVN_ERR(svn_utf__encode_ucs4_string(
+              &resultbuf, ucs4buf.data, length, &length));
+
+  res = apr_palloc(result_pool, sizeof(*res));
+  res->data = resultbuf.data;
+  res->len = length;
+  *result = res;
+  return SVN_NO_ERROR;
+}
+
+
+/* Convert the UTF-32 string UTF32STR, consisting of UTF32LEN 32-bit code
+   units, to UTF-8 and return it in *RESULT, allocated in RESULT_POOL.
+
+   If UTF32LEN is SVN_UTF__UNKNOWN_LENGTH, UTF32STR is scanned up to (and
+   not including) the first zero code unit to determine its length.
+
+   BIG_ENDIAN tells whether the code units in UTF32STR are stored in
+   big-endian byte order.  When that differs from the byte order of the
+   host, a byte-swapped copy of the input is made in SCRATCH_POOL and
+   converted instead.
+
+   Any error returned by svn_utf__encode_ucs4_string() is propagated. */
+svn_error_t *
+svn_utf__utf32_to_utf8(const svn_string_t **result,
+                       const apr_int32_t *utf32str,
+                       apr_size_t utf32len,
+                       svn_boolean_t big_endian,
+                       apr_pool_t *result_pool,
+                       apr_pool_t *scratch_pool)
+{
+  /* On a big-endian host the least significant byte (0x5a) of the probe
+     value is stored last. */
+  static const apr_int32_t endiancheck = 0xa5cbbc5a;
+  const svn_boolean_t arch_big_endian =
+    (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a');
+  const svn_boolean_t swap_order = (!big_endian != !arch_big_endian);
+
+  apr_size_t length;
+  svn_membuf_t resultbuf;
+  svn_string_t *res;
+
+  if (utf32len == SVN_UTF__UNKNOWN_LENGTH)
+    {
+      /* Stop ON the terminator, not one past it, so that the terminating
+         zero code unit is not counted as part of the string. */
+      const apr_int32_t *endp = utf32str;
+      while (*endp)
+        ++endp;
+      utf32len = (endp - utf32str);
+    }
+
+  if (swap_order)
+    {
+      apr_size_t offset;
+      svn_membuf_t ucs4buf;
+
+      svn_membuf__create(&ucs4buf, utf32len * sizeof(apr_int32_t),
+                         scratch_pool);
+
+      for (offset = 0; offset < utf32len; ++offset)
+        {
+          /* Swap on the unsigned representation: shifting a signed value
+             so that a bit moves into the sign position is undefined. */
+          const apr_uint32_t raw = (apr_uint32_t)utf32str[offset];
+          const apr_int32_t code = (apr_int32_t)SWAP_LONG(raw);
+          membuf_insert_ucs4(&ucs4buf, offset, code);
+        }
+      utf32str = ucs4buf.data;
+    }
+
+  /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes
+     per code point for encoding. The buffer will grow as
+     necessary. */
+  svn_membuf__create(&resultbuf, utf32len * 2, result_pool);
+  SVN_ERR(svn_utf__encode_ucs4_string(
+              &resultbuf, utf32str, utf32len, &length));
+
+  res = apr_palloc(result_pool, sizeof(*res));
+  res->data = resultbuf.data;
+  res->len = length;
+  *result = res;
+  return SVN_NO_ERROR;
+}
+
+
+#ifdef WIN32
+
+
+/* Convert the NUL-terminated UTF-8 string SRC to UTF-16 using
+   MultiByteToWideChar() and return the NUL-terminated result in *RESULT.
+   If PREFIX is not NULL, its contents are copied in front of the
+   converted string.
+
+   When both SRC and PREFIX are empty (or PREFIX is NULL), *RESULT is set
+   to a static empty string; otherwise the result is allocated in
+   RESULT_POOL.
+
+   Returns an error wrapping the OS error code if the conversion fails. */
+svn_error_t *
+svn_utf__win32_utf8_to_utf16(const WCHAR **result,
+                             const char *src,
+                             const WCHAR *prefix,
+                             apr_pool_t *result_pool)
+{
+  const int utf8_count = strlen(src);
+  const int prefix_len = (prefix ? lstrlenW(prefix) : 0);
+  WCHAR *wide_str;
+  int wide_count = 0;
+
+  if (0 == prefix_len + utf8_count)
+    {
+      *result = L"";
+      return SVN_NO_ERROR;
+    }
+
+  /* MultiByteToWideChar() fails when asked to convert zero bytes, so an
+     empty SRC (with a non-empty PREFIX) must bypass the conversion and
+     yield just a copy of PREFIX. */
+  if (utf8_count)
+    {
+      /* First pass: ask for the required number of wide characters. */
+      wide_count = MultiByteToWideChar(CP_UTF8, 0, src, utf8_count, NULL, 0);
+      if (wide_count == 0)
+        return svn_error_wrap_apr(apr_get_os_error(),
+                                  _("Conversion to UTF-16 failed"));
+    }
+
+  wide_str = apr_palloc(result_pool,
+                        (prefix_len + wide_count + 1) * sizeof(*wide_str));
+  if (prefix_len)
+    memcpy(wide_str, prefix, prefix_len * sizeof(*wide_str));
+
+  /* Second pass: convert directly behind the prefix. */
+  if (utf8_count
+      && 0 == MultiByteToWideChar(CP_UTF8, 0, src, utf8_count,
+                                  wide_str + prefix_len, wide_count))
+    return svn_error_wrap_apr(apr_get_os_error(),
+                              _("Conversion to UTF-16 failed"));
+
+  wide_str[prefix_len + wide_count] = 0;
+  *result = wide_str;
+
+  return SVN_NO_ERROR;
+}
+
+/* Convert the NUL-terminated UTF-16 string SRC to UTF-8 using
+   WideCharToMultiByte() and return the NUL-terminated result in *RESULT.
+   If PREFIX is not NULL, its contents are copied in front of the
+   converted string.
+
+   When both SRC and PREFIX are empty (or PREFIX is NULL), *RESULT is set
+   to a static empty string; otherwise the result is allocated in
+   RESULT_POOL.
+
+   Returns an error wrapping the OS error code if the conversion fails. */
+svn_error_t *
+svn_utf__win32_utf16_to_utf8(const char **result,
+                             const WCHAR *src,
+                             const char *prefix,
+                             apr_pool_t *result_pool)
+{
+  const int wide_count = lstrlenW(src);
+  const int prefix_len = (prefix ? strlen(prefix) : 0);
+  char *utf8_str;
+  int utf8_count = 0;
+
+  if (0 == prefix_len + wide_count)
+    {
+      *result = "";
+      return SVN_NO_ERROR;
+    }
+
+  /* WideCharToMultiByte() fails when asked to convert zero characters,
+     so an empty SRC (with a non-empty PREFIX) must bypass the conversion
+     and yield just a copy of PREFIX. */
+  if (wide_count)
+    {
+      /* First pass: ask for the required number of bytes. */
+      utf8_count = WideCharToMultiByte(CP_UTF8, 0, src, wide_count,
+                                       NULL, 0, NULL, FALSE);
+      if (utf8_count == 0)
+        return svn_error_wrap_apr(apr_get_os_error(),
+                                  _("Conversion from UTF-16 failed"));
+    }
+
+  utf8_str = apr_palloc(result_pool,
+                        (prefix_len + utf8_count + 1) * sizeof(*utf8_str));
+  if (prefix_len)
+    memcpy(utf8_str, prefix, prefix_len * sizeof(*utf8_str));
+
+  /* Second pass: convert directly behind the prefix. */
+  if (wide_count
+      && 0 == WideCharToMultiByte(CP_UTF8, 0, src, wide_count,
+                                  utf8_str + prefix_len, utf8_count,
+                                  NULL, FALSE))
+    return svn_error_wrap_apr(apr_get_os_error(),
+                              _("Conversion from UTF-16 failed"));
+
+  utf8_str[prefix_len + utf8_count] = 0;
+  *result = utf8_str;
+
+  return SVN_NO_ERROR;
+}
+
+#endif /* WIN32 */