1 files changed, 530 insertions, 0 deletions
diff --git a/subversion/libsvn_subr/utf8proc.c b/subversion/libsvn_subr/utf8proc.c
new file mode 100644
index 0000000..1e705f5
--- /dev/null
+++ b/subversion/libsvn_subr/utf8proc.c
@@ -0,0 +1,530 @@
+/*
+ * utf8proc.c:  Wrappers for the utf8proc library
+ *
+ * ====================================================================
+ *    Licensed to the Apache Software Foundation (ASF) under one
+ *    or more contributor license agreements.  See the NOTICE file
+ *    distributed with this work for additional information
+ *    regarding copyright ownership.  The ASF licenses this file
+ *    to you under the Apache License, Version 2.0 (the
+ *    "License"); you may not use this file except in compliance
+ *    with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing,
+ *    software distributed under the License is distributed on an
+ *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *    KIND, either express or implied.  See the License for the
+ *    specific language governing permissions and limitations
+ *    under the License.
+ * ====================================================================
+ */
+
+
+
+#include <apr_fnmatch.h>
+
+#include "private/svn_string_private.h"
+#include "private/svn_utf_private.h"
+#include "svn_private_config.h"
+
+#define UTF8PROC_INLINE
+/* Somehow utf8proc thinks it is nice to use strlen as an argument name,
+   while this function is already defined via apr.h */
+#define strlen svn__strlen_var
+#include "utf8proc/utf8proc.c"
+#undef strlen
+
+
+
+const char *
+svn_utf__utf8proc_compiled_version(void)
+{
+  static const char utf8proc_version[] =
+                                  APR_STRINGIFY(UTF8PROC_VERSION_MAJOR) "."
+                                  APR_STRINGIFY(UTF8PROC_VERSION_MINOR) "."
+                                  APR_STRINGIFY(UTF8PROC_VERSION_PATCH);
+  return utf8proc_version;
+}
+
+const char *
+svn_utf__utf8proc_runtime_version(void)
+{
+  /* Unused static function warning removal hack. */
+  SVN_UNUSED(utf8proc_NFD);
+  SVN_UNUSED(utf8proc_NFC);
+  SVN_UNUSED(utf8proc_NFKD);
+  SVN_UNUSED(utf8proc_NFKC);
+
+  return utf8proc_version();
+}
+
+
+
+/* Fill the given BUFFER with decomposed UCS-4 representation of the
+ * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING
+ * is NUL-terminated; otherwise look only at the first LENGTH bytes in
+ * STRING. Upon return, BUFFER->data points at an array of UCS-4
+ * characters, and return the length of the array. TRANSFORM_FLAGS
+ * define exactly how the decomposition is performed.
+ *
+ * A negative return value is an utf8proc error code and may indicate
+ * that STRING contains invalid UTF-8 or was so long that an overflow
+ * occurred.
+ */
+static ssize_t
+unicode_decomposition(int transform_flags,
+                      const char *string, apr_size_t length,
+                      svn_membuf_t *buffer)
+{
+  const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH
+                        ? UTF8PROC_NULLTERM : 0);
+
+  for (;;)
+    {
+      apr_int32_t *const ucs4buf = buffer->data;
+      const ssize_t ucs4len = buffer->size / sizeof(*ucs4buf);
+      const ssize_t result =
+        utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len,
+                           UTF8PROC_DECOMPOSE | UTF8PROC_STABLE
+                           | transform_flags | nullterm);
+
+      if (result < 0 || result <= ucs4len)
+        return result;
+
+      /* Increase the decomposition buffer size and retry */
+      svn_membuf__ensure(buffer, result * sizeof(*ucs4buf));
+    }
+}
+
+/* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8
+ * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
+ * NUL-terminated; otherwise look only at the first LENGTH bytes in
+ * STRING. Upon return, BUFFER->data points at an array of UCS-4
+ * characters and *RESULT_LENGTH contains the length of the array.
+ *
+ * A returned error may indicate that STRING contains invalid UTF-8 or
+ * invalid Unicode codepoints. Any error message comes from utf8proc.
+ */
+static svn_error_t *
+decompose_normalized(apr_size_t *result_length,
+                     const char *string, apr_size_t length,
+                     svn_membuf_t *buffer)
+{
+  ssize_t result = unicode_decomposition(0, string, length, buffer);
+  if (result < 0)
+    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
+                            gettext(utf8proc_errmsg(result)));
+  *result_length = result;
+  return SVN_NO_ERROR;
+}
+
+/* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8
+ * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
+ * NUL-terminated; otherwise look only at the first LENGTH bytes in
+ * STRING. Upon return, BUFFER->data points at a NUL-terminated string
+ * of UTF-8 characters.
+ *
+ * A returned error may indicate that STRING contains invalid UTF-8 or
+ * invalid Unicode codepoints. Any error message comes from utf8proc.
+ */
+static svn_error_t *
+normalize_cstring(apr_size_t *result_length,
+                  const char *string, apr_size_t length,
+                  svn_membuf_t *buffer)
+{
+  ssize_t result = unicode_decomposition(0, string, length, buffer);
+  if (result >= 0)
+    {
+      svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1);
+      result = utf8proc_reencode(buffer->data, result,
+                                 UTF8PROC_COMPOSE | UTF8PROC_STABLE);
+    }
+  if (result < 0)
+    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
+                            gettext(utf8proc_errmsg(result)));
+  *result_length = result;
+  return SVN_NO_ERROR;
+}
+
+/* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of
+ * length LENB. Return 0 if they're equal, a negative value if BUFA is
+ * less than BUFB, otherwise a positive value.
+ *
+ * Yes, this is strcmp for known-length UCS-4 strings.
+ */
+static int
+ucs4cmp(const apr_int32_t *bufa, apr_size_t lena,
+        const apr_int32_t *bufb, apr_size_t lenb)
+{
+  const apr_size_t len = (lena < lenb ? lena : lenb);
+  apr_size_t i;
+
+  for (i = 0; i < len; ++i)
+    {
+      const int diff = bufa[i] - bufb[i];
+      if (diff)
+        return diff;
+    }
+  return (lena == lenb ? 0 : (lena < lenb ? -1 : 1));
+}
+
+svn_error_t *
+svn_utf__normcmp(int *result,
+                 const char *str1, apr_size_t len1,
+                 const char *str2, apr_size_t len2,
+                 svn_membuf_t *buf1, svn_membuf_t *buf2)
+{
+  apr_size_t buflen1;
+  apr_size_t buflen2;
+
+  /* Shortcut-circuit the decision if at least one of the strings is empty. */
+  const svn_boolean_t empty1 =
+    (0 == len1 || (len1 == SVN_UTF__UNKNOWN_LENGTH && !*str1));
+  const svn_boolean_t empty2 =
+    (0 == len2 || (len2 == SVN_UTF__UNKNOWN_LENGTH && !*str2));
+  if (empty1 || empty2)
+    {
+      *result = (empty1 == empty2 ? 0 : (empty1 ? -1 : 1));
+      return SVN_NO_ERROR;
+    }
+
+  SVN_ERR(decompose_normalized(&buflen1, str1, len1, buf1));
+  SVN_ERR(decompose_normalized(&buflen2, str2, len2, buf2));
+  *result = ucs4cmp(buf1->data, buflen1, buf2->data, buflen2);
+  return SVN_NO_ERROR;
+}
+
+svn_error_t*
+svn_utf__normalize(const char **result,
+                   const char *str, apr_size_t len,
+                   svn_membuf_t *buf)
+{
+  apr_size_t result_length;
+  SVN_ERR(normalize_cstring(&result_length, str, len, buf));
+  *result = (const char*)(buf->data);
+  return SVN_NO_ERROR;
+}
+
+/* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER.
+ * Assume BUFFER is already filled to *LENGTH and return the new size there.
+ * This function does *not* nul-terminate the stringbuf!
+ *
+ * A returned error indicates that the codepoint is invalid.
+ */
+static svn_error_t *
+encode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length)
+{
+  apr_size_t utf8len;
+
+  if (buffer->size - *length < 4)
+    svn_membuf__resize(buffer, buffer->size + 4);
+
+  utf8len = utf8proc_encode_char(ucs4chr, ((uint8_t*)buffer->data + *length));
+  if (!utf8len)
+    return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL,
+                             _("Invalid Unicode character U+%04lX"),
+                             (long)ucs4chr);
+  *length += utf8len;
+  return SVN_NO_ERROR;
+}
+
+svn_error_t *
+svn_utf__encode_ucs4_string(svn_membuf_t *buffer,
+                            const apr_int32_t *ucs4str,
+                            apr_size_t length,
+                            apr_size_t *result_length)
+{
+  *result_length = 0;
+  while (length-- > 0)
+    SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length));
+  svn_membuf__resize(buffer, *result_length + 1);
+  ((char*)buffer->data)[*result_length] = '\0';
+  return SVN_NO_ERROR;
+}
+
+
+svn_error_t *
+svn_utf__glob(svn_boolean_t *match,
+              const char *pattern, apr_size_t pattern_len,
+              const char *string, apr_size_t string_len,
+              const char *escape, apr_size_t escape_len,
+              svn_boolean_t sql_like,
+              svn_membuf_t *pattern_buf,
+              svn_membuf_t *string_buf,
+              svn_membuf_t *temp_buf)
+{
+  apr_size_t patternbuf_len;
+  apr_size_t tempbuf_len;
+
+  /* If we're in GLOB mode, we don't do custom escape chars. */
+  if (escape && !sql_like)
+    return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
+                            _("Cannot use a custom escape token"
+                              " in glob matching mode"));
+
+  /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result
+     because apr_fnmatch can't handle it.*/
+  SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf));
+  if (!sql_like)
+    SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data,
+                                        tempbuf_len, &patternbuf_len));
+  else
+    {
+      /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */
+      const apr_int32_t *like = temp_buf->data;
+      apr_int32_t ucs4esc;
+      svn_boolean_t escaped;
+      apr_size_t i;
+
+      if (!escape)
+        ucs4esc = -1;           /* Definitely an invalid UCS-4 character. */
+      else
+        {
+          const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH
+                                ? UTF8PROC_NULLTERM : 0);
+          ssize_t result =
+            utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1,
+                               UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm);
+          if (result < 0)
+            return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
+                                    gettext(utf8proc_errmsg(result)));
+          if (result == 0 || result > 1)
+            return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
+                                    _("Escape token must be one character"));
+          if ((ucs4esc & 0xFF) != ucs4esc)
+            return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL,
+                                     _("Invalid escape character U+%04lX"),
+                                     (long)ucs4esc);
+        }
+
+      patternbuf_len = 0;
+      svn_membuf__ensure(pattern_buf, tempbuf_len + 1);
+      for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like)
+        {
+          if (*like == ucs4esc && !escaped)
+            {
+              svn_membuf__resize(pattern_buf, patternbuf_len + 1);
+              ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
+              escaped = TRUE;
+            }
+          else if (escaped)
+            {
+              SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
+              escaped = FALSE;
+            }
+          else
+            {
+              if ((*like == '[' || *like == '\\') && !escaped)
+                {
+                  /* Escape brackets and backslashes which are always
+                     literals in LIKE patterns. */
+                  svn_membuf__resize(pattern_buf, patternbuf_len + 1);
+                  ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
+                  escaped = TRUE;
+                  --i; --like;
+                  continue;
+                }
+
+              /* Replace LIKE wildcards with their GLOB equivalents. */
+              if (*like == '%' || *like == '_')
+                {
+                  const char wildcard = (*like == '%' ? '*' : '?');
+                  svn_membuf__resize(pattern_buf, patternbuf_len + 1);
+                  ((char*)pattern_buf->data)[patternbuf_len++] = wildcard;
+                }
+              else
+                SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
+            }
+        }
+      svn_membuf__resize(pattern_buf, patternbuf_len + 1);
+      ((char*)pattern_buf->data)[patternbuf_len] = '\0';
+    }
+
+  /* Now normalize the string */
+  SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf));
+  SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data,
+                                      tempbuf_len, &tempbuf_len));
+
+  *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);
+  return SVN_NO_ERROR;
+}
+
+svn_boolean_t
+svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool)
+{
+  svn_error_t *err;
+  svn_membuf_t buffer;
+  apr_size_t result_length;
+  const apr_size_t length = strlen(string);
+  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
+  err = normalize_cstring(&result_length, string, length, &buffer);
+  if (err)
+    {
+      svn_error_clear(err);
+      return FALSE;
+    }
+  return (length == result_length && 0 == strcmp(string, buffer.data));
+}
+
+const char *
+svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool)
+{
+  /* Hexadecimal digits for code conversion. */
+  static const char digits[] = "0123456789ABCDEF";
+
+  /* Flags used for Unicode decomposition. */
+  static const int decomp_flags = (
+      UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP
+      | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK);
+
+  svn_stringbuf_t *result;
+  svn_membuf_t buffer;
+  ssize_t decomp_length;
+  ssize_t len;
+
+  /* Decompose to a non-reversible compatibility format. */
+  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool);
+  decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer);
+  if (decomp_length < 0)
+    {
+      svn_membuf_t part;
+      apr_size_t done, prev;
+
+      /* The only other error we can receive here indicates an integer
+         overflow due to the length of the input string. Not very
+         likely, but we certainly shouldn't continue in that case. */
+      SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8);
+
+      /* Break the decomposition into parts that are valid UTF-8, and
+         bytes that are not. Represent the invalid bytes in the target
+         erray by their negative value. This works because utf8proc
+         will not generate Unicode code points with values larger than
+         U+10FFFF. */
+      svn_membuf__create(&part, sizeof(apr_int32_t), pool);
+      decomp_length = 0;
+      done = prev = 0;
+      while (done < length)
+        {
+          apr_int32_t uc;
+
+          while (done < length)
+            {
+              len = utf8proc_iterate((uint8_t*)src + done, length - done, &uc);
+              if (len < 0)
+                break;
+              done += len;
+            }
+
+          /* Decompose the valid part */
+          if (done > prev)
+            {
+              len = unicode_decomposition(
+                  decomp_flags, src + prev, done - prev, &part);
+              SVN_ERR_ASSERT_NO_RETURN(len > 0);
+              svn_membuf__resize(
+                  &buffer, (decomp_length + len) * sizeof(apr_int32_t));
+              memcpy((apr_int32_t*)buffer.data + decomp_length,
+                     part.data, len * sizeof(apr_int32_t));
+              decomp_length += len;
+              prev = done;
+            }
+
+          /* What follows could be a valid UTF-8 sequence, but not
+             a valid Unicode character. */
+          if (done < length)
+            {
+              const char *last;
+
+              /* Determine the length of the UTF-8 sequence */
+              const char *const p = src + done;
+              len = utf8proc_utf8class[(uint8_t)*p];
+
+              /* Check if the multi-byte sequence is valid UTF-8. */
+              if (len > 1 && len <= (apr_ssize_t)(length - done))
+                last = svn_utf__last_valid(p, len);
+              else
+                last = NULL;
+
+              /* Might not be a valid UTF-8 sequence at all */
+              if (!last || (last && last - p < len))
+                {
+                  uc = -((apr_int32_t)(*p & 0xff));
+                  len = 1;
+                }
+              else
+                {
+                  switch (len)
+                    {
+                      /* Decode the UTF-8 sequence without validation. */
+                    case 2:
+                      uc = ((p[0] & 0x1f) <<  6) + (p[1] & 0x3f);
+                      break;
+                    case 3:
+                      uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) <<  6)
+                            + (p[2] & 0x3f));
+                      break;
+                    case 4:
+                      uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
+                            + ((p[2] & 0x3f) <<  6) + (p[3] & 0x3f));
+                      break;
+                    default:
+                      SVN_ERR_ASSERT_NO_RETURN(
+                          !"Unexpected invalid UTF-8 byte");
+                    }
+
+                }
+
+              svn_membuf__resize(
+                  &buffer, (decomp_length + 1) * sizeof(apr_int32_t));
+              ((apr_int32_t*)buffer.data)[decomp_length++] = uc;
+              done += len;
+              prev = done;
+            }
+        }
+    }
+
+  /* Scan the result and deleting any combining diacriticals and
+     inserting placeholders where any non-ascii characters remain.  */
+  result = svn_stringbuf_create_ensure(decomp_length, pool);
+  for (len = 0; len < decomp_length; ++len)
+    {
+      const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len];
+      if (cp > 0 && cp < 127)
+        svn_stringbuf_appendbyte(result, (char)cp);
+      else if (cp == 0)
+        svn_stringbuf_appendcstr(result, "\\0");
+      else if (cp < 0)
+        {
+          const apr_int32_t rcp = ((-cp) & 0xff);
+          svn_stringbuf_appendcstr(result, "?\\");
+          svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]);
+          svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]);
+        }
+      else
+        {
+          if (utf8proc_codepoint_valid(cp))
+            {
+              const utf8proc_property_t *prop = utf8proc_get_property(cp);
+              if (prop->combining_class != 0)
+                continue;           /* Combining mark; ignore */
+              svn_stringbuf_appendcstr(result, "{U+");
+            }
+          else
+            svn_stringbuf_appendcstr(result, "{U?");
+          if (cp > 0xffff)
+            {
+              svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]);
+              svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]);
+            }
+          svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]);
+          svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]);
+          svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]);
+          svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]);
+          svn_stringbuf_appendbyte(result, '}');
+        }
+    }
+
+  return result->data;
+}