summary | refs | log | tree | commit | diff
path: root/subversion/libsvn_subr/utf8proc.c
diff options
context:
space:
mode:
Diffstat (limited to 'subversion/libsvn_subr/utf8proc.c')
-rw-r--r-- subversion/libsvn_subr/utf8proc.c | 530
1 files changed, 530 insertions, 0 deletions
diff --git a/subversion/libsvn_subr/utf8proc.c b/subversion/libsvn_subr/utf8proc.c
new file mode 100644
index 0000000..1e705f5
--- /dev/null
+++ b/subversion/libsvn_subr/utf8proc.c
@@ -0,0 +1,530 @@
+/*
+ * utf8proc.c: Wrappers for the utf8proc library
+ *
+ * ====================================================================
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * ====================================================================
+ */
+
+
+
+#include <apr_fnmatch.h>
+
+#include "private/svn_string_private.h"
+#include "private/svn_utf_private.h"
+#include "svn_private_config.h"
+
+#define UTF8PROC_INLINE
+/* Somehow utf8proc thinks it is nice to use strlen as an argument name,
+ while this function is already defined via apr.h */
+#define strlen svn__strlen_var
+#include "utf8proc/utf8proc.c"
+#undef strlen
+
+
+
+/* Return the version of the utf8proc sources this file was compiled
+ * with, assembled from the UTF8PROC_VERSION_MAJOR, _MINOR and _PATCH
+ * macros and separated by dots.  The returned string has static
+ * storage duration.
+ */
+const char *
+svn_utf__utf8proc_compiled_version(void)
+{
+  /* Adjacent string literals are concatenated at compile time. */
+  static const char compiled_version[] =
+    APR_STRINGIFY(UTF8PROC_VERSION_MAJOR)
+    "." APR_STRINGIFY(UTF8PROC_VERSION_MINOR)
+    "." APR_STRINGIFY(UTF8PROC_VERSION_PATCH);
+
+  return compiled_version;
+}
+
+/* Return the version string reported by utf8proc_version(). */
+const char *
+svn_utf__utf8proc_runtime_version(void)
+{
+  const char *runtime_version;
+
+  /* utf8proc.c is included directly into this file, so its convenience
+     wrappers that we never call would otherwise trigger "unused static
+     function" warnings.  Mentioning them here silences those. */
+  SVN_UNUSED(utf8proc_NFD);
+  SVN_UNUSED(utf8proc_NFC);
+  SVN_UNUSED(utf8proc_NFKD);
+  SVN_UNUSED(utf8proc_NFKC);
+
+  runtime_version = utf8proc_version();
+  return runtime_version;
+}
+
+
+
+/* Decompose the UTF-8 STRING into UCS-4 code points stored in BUFFER,
+ * growing BUFFER as needed.  When LENGTH is SVN_UTF__UNKNOWN_LENGTH,
+ * STRING is treated as NUL-terminated; otherwise only its first LENGTH
+ * bytes are examined.  TRANSFORM_FLAGS are utf8proc options applied in
+ * addition to UTF8PROC_DECOMPOSE and UTF8PROC_STABLE.
+ *
+ * On success, return the number of UCS-4 characters now available at
+ * BUFFER->data.  A negative return value is a utf8proc error code; it
+ * may mean that STRING is not valid UTF-8, or that it was long enough
+ * to cause an overflow.
+ */
+static ssize_t
+unicode_decomposition(int transform_flags,
+                      const char *string, apr_size_t length,
+                      svn_membuf_t *buffer)
+{
+  int options = UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | transform_flags;
+  ssize_t capacity = 0;
+  ssize_t needed = 0;
+
+  if (length == SVN_UTF__UNKNOWN_LENGTH)
+    options |= UTF8PROC_NULLTERM;
+
+  do
+    {
+      /* On every pass after the first, NEEDED is the size that the
+         previous attempt asked for; grow the buffer to fit it. */
+      if (needed > capacity)
+        svn_membuf__ensure(buffer, needed * sizeof(apr_int32_t));
+
+      /* Re-read the buffer on each pass: growing it may have moved it. */
+      capacity = buffer->size / sizeof(apr_int32_t);
+      needed = utf8proc_decompose((const void*) string, length,
+                                  buffer->data, capacity, options);
+    }
+  while (needed > capacity);
+
+  /* Either an error code (negative) or a length that fit. */
+  return needed;
+}
+
+/* Store the decomposed UCS-4 form of the UTF-8 STRING in BUFFER, using
+ * unicode_decomposition() with no extra transformation flags.  When
+ * LENGTH is SVN_UTF__UNKNOWN_LENGTH, STRING is treated as
+ * NUL-terminated; otherwise only its first LENGTH bytes are examined.
+ * On success, BUFFER->data holds the UCS-4 characters and
+ * *RESULT_LENGTH is set to their count.
+ *
+ * On failure, return an SVN_ERR_UTF8PROC_ERROR whose message is taken
+ * from utf8proc; *RESULT_LENGTH is left untouched in that case.
+ */
+static svn_error_t *
+decompose_normalized(apr_size_t *result_length,
+                     const char *string, apr_size_t length,
+                     svn_membuf_t *buffer)
+{
+  const ssize_t ucs4len = unicode_decomposition(0, string, length, buffer);
+
+  if (ucs4len >= 0)
+    {
+      *result_length = ucs4len;
+      return SVN_NO_ERROR;
+    }
+
+  /* Negative values are utf8proc error codes. */
+  return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
+                          gettext(utf8proc_errmsg(ucs4len)));
+}
+
+/* Store a composed, NUL-terminated UTF-8 form of the UTF-8 STRING in
+ * BUFFER.  When LENGTH is SVN_UTF__UNKNOWN_LENGTH, STRING is treated
+ * as NUL-terminated; otherwise only its first LENGTH bytes are
+ * examined.  On success, BUFFER->data points at the resulting string
+ * and *RESULT_LENGTH is set to the value returned by
+ * utf8proc_reencode().
+ *
+ * The work is done in two steps: decompose STRING to UCS-4, then
+ * re-encode the UCS-4 array in place with UTF8PROC_COMPOSE.  A failure
+ * in either step is returned as an SVN_ERR_UTF8PROC_ERROR whose
+ * message is taken from utf8proc.
+ */
+static svn_error_t *
+normalize_cstring(apr_size_t *result_length,
+                  const char *string, apr_size_t length,
+                  svn_membuf_t *buffer)
+{
+  ssize_t utf8len;
+  const ssize_t ucs4len = unicode_decomposition(0, string, length, buffer);
+
+  if (ucs4len < 0)
+    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
+                            gettext(utf8proc_errmsg(ucs4len)));
+
+  /* One extra byte beyond the UCS-4 array, for the re-encoded output. */
+  svn_membuf__resize(buffer, ucs4len * sizeof(apr_int32_t) + 1);
+  utf8len = utf8proc_reencode(buffer->data, ucs4len,
+                              UTF8PROC_COMPOSE | UTF8PROC_STABLE);
+  if (utf8len < 0)
+    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
+                            gettext(utf8proc_errmsg(utf8len)));
+
+  *result_length = utf8len;
+  return SVN_NO_ERROR;
+}
+
+/* Three-way comparison of two known-length UCS-4 arrays: BUFA holding
+ * LENA characters and BUFB holding LENB characters.
+ *
+ * The arrays are compared element by element over their common
+ * prefix; the first differing pair decides the result (the difference
+ * BUFA[i] - BUFB[i] is returned).  If the common prefix is identical,
+ * the shorter array sorts first.  Return 0 when both arrays are equal,
+ * a negative value when BUFA sorts before BUFB, and a positive value
+ * otherwise.
+ */
+static int
+ucs4cmp(const apr_int32_t *bufa, apr_size_t lena,
+        const apr_int32_t *bufb, apr_size_t lenb)
+{
+  apr_size_t common = lena;
+  apr_size_t pos = 0;
+
+  if (lenb < common)
+    common = lenb;
+
+  while (pos < common)
+    {
+      const int delta = bufa[pos] - bufb[pos];
+      if (delta != 0)
+        return delta;
+      ++pos;
+    }
+
+  /* Common prefix is identical; the lengths decide. */
+  if (lena < lenb)
+    return -1;
+  if (lena > lenb)
+    return 1;
+  return 0;
+}
+
+/* Compare the UTF-8 strings STR1 (LEN1 bytes) and STR2 (LEN2 bytes)
+ * after Unicode decomposition and store the three-way result in
+ * *RESULT: 0 if equal, negative if STR1 sorts first, positive
+ * otherwise.  A length of SVN_UTF__UNKNOWN_LENGTH means the
+ * corresponding string is NUL-terminated.  BUF1 and BUF2 are used as
+ * scratch space for the decomposed forms.
+ *
+ * Return an error if either string cannot be decomposed.
+ */
+svn_error_t *
+svn_utf__normcmp(int *result,
+                 const char *str1, apr_size_t len1,
+                 const char *str2, apr_size_t len2,
+                 svn_membuf_t *buf1, svn_membuf_t *buf2)
+{
+  apr_size_t ucs4len1;
+  apr_size_t ucs4len2;
+  svn_boolean_t empty1;
+  svn_boolean_t empty2;
+
+  /* Work out whether either input is the empty string. */
+  if (0 == len1)
+    empty1 = TRUE;
+  else if (len1 == SVN_UTF__UNKNOWN_LENGTH)
+    empty1 = !*str1;
+  else
+    empty1 = FALSE;
+
+  if (0 == len2)
+    empty2 = TRUE;
+  else if (len2 == SVN_UTF__UNKNOWN_LENGTH)
+    empty2 = !*str2;
+  else
+    empty2 = FALSE;
+
+  /* Short-circuit: an empty string equals another empty string and
+     sorts before anything else; no decomposition is needed. */
+  if (empty1 || empty2)
+    {
+      if (empty1 && empty2)
+        *result = 0;
+      else if (empty1)
+        *result = -1;
+      else
+        *result = 1;
+      return SVN_NO_ERROR;
+    }
+
+  SVN_ERR(decompose_normalized(&ucs4len1, str1, len1, buf1));
+  SVN_ERR(decompose_normalized(&ucs4len2, str2, len2, buf2));
+  *result = ucs4cmp(buf1->data, ucs4len1, buf2->data, ucs4len2);
+  return SVN_NO_ERROR;
+}
+
+/* Normalize the UTF-8 string STR (LEN bytes, or NUL-terminated when
+ * LEN is SVN_UTF__UNKNOWN_LENGTH) via normalize_cstring() and set
+ * *RESULT to the NUL-terminated outcome.  *RESULT points into BUF, so
+ * it is only usable for as long as BUF's contents are left alone.
+ *
+ * Return an error if normalization fails.
+ */
+svn_error_t*
+svn_utf__normalize(const char **result,
+                   const char *str, apr_size_t len,
+                   svn_membuf_t *buf)
+{
+  /* The helper insists on reporting a length; it is not needed here. */
+  apr_size_t normalized_len;
+
+  SVN_ERR(normalize_cstring(&normalized_len, str, len, buf));
+  *result = buf->data;
+  return SVN_NO_ERROR;
+}
+
+/* Encode the single UCS-4 code point UCS4CHR as UTF-8 and append it to
+ * BUFFER.  On entry, *LENGTH is the number of bytes already used in
+ * BUFFER; on success it is advanced by the number of bytes written.
+ * The output is *not* NUL-terminated.
+ *
+ * Return an SVN_ERR_UTF8PROC_ERROR if utf8proc refuses to encode
+ * UCS4CHR, i.e. the code point is invalid.
+ */
+static svn_error_t *
+encode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length)
+{
+  apr_size_t written;
+  uint8_t *dest;
+
+  /* Make sure at least 4 bytes are available past *LENGTH. */
+  if (buffer->size - *length < 4)
+    svn_membuf__resize(buffer, buffer->size + 4);
+
+  /* Compute the destination only after the possible resize above. */
+  dest = (uint8_t*)buffer->data + *length;
+  written = utf8proc_encode_char(ucs4chr, dest);
+  if (written == 0)
+    return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL,
+                             _("Invalid Unicode character U+%04lX"),
+                             (long)ucs4chr);
+
+  *length += written;
+  return SVN_NO_ERROR;
+}
+
+/* Encode the LENGTH UCS-4 code points at UCS4STR as a NUL-terminated
+ * UTF-8 string in BUFFER, overwriting BUFFER from its start.  On
+ * success, *RESULT_LENGTH is the number of UTF-8 bytes written, not
+ * counting the terminator.
+ *
+ * Return an error as soon as a code point cannot be encoded; BUFFER
+ * is not NUL-terminated in that case.
+ */
+svn_error_t *
+svn_utf__encode_ucs4_string(svn_membuf_t *buffer,
+                            const apr_int32_t *ucs4str,
+                            apr_size_t length,
+                            apr_size_t *result_length)
+{
+  apr_size_t i;
+
+  *result_length = 0;
+  for (i = 0; i < length; ++i)
+    {
+      SVN_ERR(encode_ucs4(buffer, ucs4str[i], result_length));
+    }
+
+  /* Make room for, and write, the terminating NUL. */
+  svn_membuf__resize(buffer, *result_length + 1);
+  ((char*)buffer->data)[*result_length] = '\0';
+  return SVN_NO_ERROR;
+}
+
+
+/* Set *MATCH to TRUE if STRING matches PATTERN and to FALSE otherwise.
+ * Both are first decomposed with decompose_normalized() and re-encoded
+ * as UTF-8, then compared with apr_fnmatch() (flags 0).
+ *
+ * PATTERN_LEN, STRING_LEN and ESCAPE_LEN are byte lengths; the value
+ * SVN_UTF__UNKNOWN_LENGTH is forwarded to the decomposition routines,
+ * which then treat the corresponding string as NUL-terminated.
+ *
+ * If SQL_LIKE is false, PATTERN is used as a glob pattern as-is and
+ * ESCAPE must be NULL.  If SQL_LIKE is true, PATTERN is an SQL LIKE
+ * pattern and is translated to a glob pattern: '%' becomes '*', '_'
+ * becomes '?', '[' and '\' are backslash-escaped, and the optional
+ * one-character ESCAPE token becomes a backslash.
+ *
+ * PATTERN_BUF receives the UTF-8 glob pattern, STRING_BUF the UTF-8
+ * form of STRING, and TEMP_BUF is scratch space for the intermediate
+ * UCS-4 data.
+ *
+ * Errors: SVN_ERR_UTF8_GLOB if ESCAPE is given in glob mode, or if it
+ * is not exactly one character in the range U+0000..U+00FF;
+ * SVN_ERR_UTF8PROC_ERROR if any decomposition or encoding step fails.
+ */
+svn_error_t *
+svn_utf__glob(svn_boolean_t *match,
+ const char *pattern, apr_size_t pattern_len,
+ const char *string, apr_size_t string_len,
+ const char *escape, apr_size_t escape_len,
+ svn_boolean_t sql_like,
+ svn_membuf_t *pattern_buf,
+ svn_membuf_t *string_buf,
+ svn_membuf_t *temp_buf)
+{
+ apr_size_t patternbuf_len;
+ apr_size_t tempbuf_len;
+
+ /* If we're in GLOB mode, we don't do custom escape chars. */
+ if (escape && !sql_like)
+ return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
+ _("Cannot use a custom escape token"
+ " in glob matching mode"));
+
+ /* Convert the pattern to NFD UTF-8. We can't use the UCS-4 result
+ because apr_fnmatch can't handle it.*/
+ SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf));
+ if (!sql_like)
+ SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data,
+ tempbuf_len, &patternbuf_len));
+ else
+ {
+ /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */
+ const apr_int32_t *like = temp_buf->data;
+ apr_int32_t ucs4esc;
+ svn_boolean_t escaped;
+ apr_size_t i;
+
+ if (!escape)
+ ucs4esc = -1; /* Definitely an invalid UCS-4 character. */
+ else
+ {
+ /* Decompose the escape token into a one-element UCS-4 buffer; a
+ result other than 1 means it was not a single character. */
+ const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH
+ ? UTF8PROC_NULLTERM : 0);
+ ssize_t result =
+ utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1,
+ UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm);
+ if (result < 0)
+ return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
+ gettext(utf8proc_errmsg(result)));
+ if (result == 0 || result > 1)
+ return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
+ _("Escape token must be one character"));
+ /* Only code points that fit in the low 8 bits are accepted. */
+ if ((ucs4esc & 0xFF) != ucs4esc)
+ return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL,
+ _("Invalid escape character U+%04lX"),
+ (long)ucs4esc);
+ }
+
+ /* Walk the UCS-4 LIKE pattern, building the UTF-8 glob pattern in
+ PATTERN_BUF. ESCAPED is TRUE while the previous output byte was a
+ backslash that must be followed by a literal character. */
+ patternbuf_len = 0;
+ svn_membuf__ensure(pattern_buf, tempbuf_len + 1);
+ for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like)
+ {
+ if (*like == ucs4esc && !escaped)
+ {
+ /* The LIKE escape token maps to the glob backslash escape. */
+ svn_membuf__resize(pattern_buf, patternbuf_len + 1);
+ ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
+ escaped = TRUE;
+ }
+ else if (escaped)
+ {
+ /* Character following an escape: emit it literally. */
+ SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
+ escaped = FALSE;
+ }
+ else
+ {
+ if ((*like == '[' || *like == '\\') && !escaped)
+ {
+ /* Escape brackets and backslashes which are always
+ literals in LIKE patterns. */
+ svn_membuf__resize(pattern_buf, patternbuf_len + 1);
+ ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
+ escaped = TRUE;
+ /* Step back so that the loop increment revisits this same
+ character, which the ESCAPED branch above then emits. */
+ --i; --like;
+ continue;
+ }
+
+ /* Replace LIKE wildcards with their GLOB equivalents. */
+ if (*like == '%' || *like == '_')
+ {
+ const char wildcard = (*like == '%' ? '*' : '?');
+ svn_membuf__resize(pattern_buf, patternbuf_len + 1);
+ ((char*)pattern_buf->data)[patternbuf_len++] = wildcard;
+ }
+ else
+ SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
+ }
+ }
+ /* NUL-terminate the finished glob pattern. */
+ svn_membuf__resize(pattern_buf, patternbuf_len + 1);
+ ((char*)pattern_buf->data)[patternbuf_len] = '\0';
+ }
+
+ /* Now normalize the string */
+ /* TEMPBUF_LEN is reused: first as the UCS-4 length passed in by value,
+ then as the location receiving the UTF-8 length, which is unused. */
+ SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf));
+ SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data,
+ tempbuf_len, &tempbuf_len));
+
+ /* A zero return from apr_fnmatch() is treated as a match. */
+ *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);
+ return SVN_NO_ERROR;
+}
+
+/* Return TRUE if the NUL-terminated UTF-8 STRING is byte-for-byte
+ * identical to the output of normalize_cstring() for it, and FALSE
+ * otherwise.  A STRING that cannot be normalized at all (for example
+ * because it is not valid UTF-8) also yields FALSE; the error is
+ * cleared.  Temporary memory is allocated from SCRATCH_POOL.
+ */
+svn_boolean_t
+svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool)
+{
+  const apr_size_t input_len = strlen(string);
+  apr_size_t normalized_len;
+  svn_membuf_t scratch;
+  svn_error_t *err;
+
+  /* Start with room for one UCS-4 character per input byte. */
+  svn_membuf__create(&scratch, input_len * sizeof(apr_int32_t),
+                     scratch_pool);
+  err = normalize_cstring(&normalized_len, string, input_len, &scratch);
+  if (err)
+    {
+      svn_error_clear(err);
+      return FALSE;
+    }
+
+  /* A different length can never be the same string. */
+  if (input_len != normalized_len)
+    return FALSE;
+
+  return 0 == strcmp(string, scratch.data);
+}
+
+/* Return an ASCII-only, human-readable approximation of the first
+ * LENGTH bytes of SRC, allocated in POOL.  SRC is expected to be
+ * UTF-8 but may contain invalid sequences.
+ *
+ * SRC is decomposed with lossy compatibility flags, then each
+ * resulting code point is rendered as follows:
+ *   - U+0001..U+007E are copied as-is;
+ *   - U+0000 becomes the two characters "\0";
+ *   - a byte that is not part of a valid UTF-8 sequence becomes "?\XX"
+ *     (two uppercase hex digits);
+ *   - combining marks are dropped;
+ *   - any other valid code point becomes "{U+XXXX}", and a
+ *     well-formed sequence that encodes an invalid code point becomes
+ *     "{U?XXXX}" (six hex digits when the value exceeds 0xFFFF).
+ *
+ * This function does not return errors; unexpected utf8proc failures
+ * trigger SVN_ERR_ASSERT_NO_RETURN.
+ */
+const char *
+svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool)
+{
+  /* Hexadecimal digits for code conversion. */
+  static const char digits[] = "0123456789ABCDEF";
+
+  /* Flags used for Unicode decomposition. */
+  static const int decomp_flags = (
+      UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP
+      | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK);
+
+  svn_stringbuf_t *result;
+  svn_membuf_t buffer;
+  ssize_t decomp_length;
+  ssize_t len;
+
+  /* Decompose to a non-reversible compatibility format. */
+  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool);
+  decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer);
+  if (decomp_length < 0)
+    {
+      svn_membuf_t part;
+      apr_size_t done, prev;
+
+      /* The only other error we can receive here indicates an integer
+         overflow due to the length of the input string. Not very
+         likely, but we certainly shouldn't continue in that case. */
+      SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8);
+
+      /* Break the decomposition into parts that are valid UTF-8, and
+         bytes that are not. Represent the invalid bytes in the target
+         array by their negative value. This works because utf8proc
+         will not generate Unicode code points with values larger than
+         U+10FFFF. */
+      svn_membuf__create(&part, sizeof(apr_int32_t), pool);
+      decomp_length = 0;
+      done = prev = 0;
+      while (done < length)
+        {
+          apr_int32_t uc;
+
+          /* Advance DONE over the longest run of valid characters. */
+          while (done < length)
+            {
+              len = utf8proc_iterate((uint8_t*)src + done, length - done, &uc);
+              if (len < 0)
+                break;
+              done += len;
+            }
+
+          /* Decompose the valid part */
+          if (done > prev)
+            {
+              len = unicode_decomposition(
+                  decomp_flags, src + prev, done - prev, &part);
+
+              /* The run was already validated, so an error is not
+                 expected.  A length of zero, however, is legitimate:
+                 DECOMP_FLAGS contains UTF8PROC_STRIPMARK, so a run
+                 made only of combining marks decomposes to nothing. */
+              SVN_ERR_ASSERT_NO_RETURN(len >= 0);
+              svn_membuf__resize(
+                  &buffer, (decomp_length + len) * sizeof(apr_int32_t));
+              memcpy((apr_int32_t*)buffer.data + decomp_length,
+                     part.data, len * sizeof(apr_int32_t));
+              decomp_length += len;
+              prev = done;
+            }
+
+          /* What follows could be a valid UTF-8 sequence, but not
+             a valid Unicode character. */
+          if (done < length)
+            {
+              const char *last;
+
+              /* Determine the length of the UTF-8 sequence */
+              const char *const p = src + done;
+              len = utf8proc_utf8class[(uint8_t)*p];
+
+              /* Check if the multi-byte sequence is valid UTF-8. */
+              if (len > 1 && len <= (apr_ssize_t)(length - done))
+                last = svn_utf__last_valid(p, len);
+              else
+                last = NULL;
+
+              /* Might not be a valid UTF-8 sequence at all */
+              if (!last || last - p < len)
+                {
+                  /* Record the single offending byte, negated. */
+                  uc = -((apr_int32_t)(*p & 0xff));
+                  len = 1;
+                }
+              else
+                {
+                  switch (len)
+                    {
+                      /* Decode the UTF-8 sequence without validation. */
+                    case 2:
+                      uc = ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
+                      break;
+                    case 3:
+                      uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
+                            + (p[2] & 0x3f));
+                      break;
+                    case 4:
+                      uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
+                            + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f));
+                      break;
+                    default:
+                      SVN_ERR_ASSERT_NO_RETURN(
+                          !"Unexpected invalid UTF-8 byte");
+                    }
+
+                }
+
+              /* Append the byte marker or raw code point and skip it. */
+              svn_membuf__resize(
+                  &buffer, (decomp_length + 1) * sizeof(apr_int32_t));
+              ((apr_int32_t*)buffer.data)[decomp_length++] = uc;
+              done += len;
+              prev = done;
+            }
+        }
+    }
+
+  /* Scan the result, deleting any combining diacriticals and
+     inserting placeholders where any non-ASCII characters remain. */
+  result = svn_stringbuf_create_ensure(decomp_length, pool);
+  for (len = 0; len < decomp_length; ++len)
+    {
+      const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len];
+      if (cp > 0 && cp < 127)
+        svn_stringbuf_appendbyte(result, (char)cp);
+      else if (cp == 0)
+        svn_stringbuf_appendcstr(result, "\\0");
+      else if (cp < 0)
+        {
+          /* Negative values are the invalid bytes recorded above. */
+          const apr_int32_t rcp = ((-cp) & 0xff);
+          svn_stringbuf_appendcstr(result, "?\\");
+          svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]);
+          svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]);
+        }
+      else
+        {
+          if (utf8proc_codepoint_valid(cp))
+            {
+              const utf8proc_property_t *prop = utf8proc_get_property(cp);
+              if (prop->combining_class != 0)
+                continue;           /* Combining mark; ignore */
+              svn_stringbuf_appendcstr(result, "{U+");
+            }
+          else
+            svn_stringbuf_appendcstr(result, "{U?");
+
+          /* Four hex digits, or six for values above 0xFFFF. */
+          if (cp > 0xffff)
+            {
+              svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]);
+              svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]);
+            }
+          svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]);
+          svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]);
+          svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]);
+          svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]);
+          svn_stringbuf_appendbyte(result, '}');
+        }
+    }
+
+  return result->data;
+}