summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Vicent Marti <tanoku@gmail.com> 2014-12-16 13:03:02 +0100
committer: Edward Thomson <ethomson@microsoft.com> 2014-12-16 10:24:18 -0600
commit: 8e35527de25ac156f3600e2ce49b0c3483c258c4 (patch)
tree: 50bea03490fcdd44ced99b0e63f828610cf52abf
parent: 11d67b754d47967642570f796601e8850f001d73 (diff)
download: libgit2-8e35527de25ac156f3600e2ce49b0c3483c258c4.tar.gz
path: Use UTF8 iteration for HFS chars
-rw-r--r-- src/path.c 126
-rw-r--r-- src/util.c 76
-rw-r--r-- src/util.h 11
3 files changed, 132 insertions, 81 deletions
diff --git a/src/path.c b/src/path.c
index 724d9ede2..b9c9729c1 100644
--- a/src/path.c
+++ b/src/path.c
@@ -1282,93 +1282,57 @@ GIT_INLINE(bool) verify_dospath(
component[last] != ':');
}
-GIT_INLINE(bool) verify_dotgit_hfs(const char *component, size_t len)
+/*
+ * Decode the next significant character of a path component.
+ *
+ * Every UTF-8 sequence consumed is stepped over by advancing *in and
+ * reducing *len. The code points listed in the switch below are skipped
+ * and never returned.
+ *
+ * Returns the code point with ASCII 'A'-'Z' folded to lowercase, 0 once
+ * *len reaches zero (a decoded NUL byte also yields 0), or -1 when
+ * git__utf8_iterate() rejects the input.
+ */
+static int32_t next_hfs_char(const char **in, size_t *len)
{
- const unsigned char *c;
- int git = 0, ign = 0;
- unsigned char one, two;
-
- while (len) {
- switch (*(c = (const unsigned char *)component++)) {
- case '.':
- if (ign || git++ != 0)
- return true;
- break;
- case 'g':
- case 'G':
- if (ign || git++ != 1)
- return true;
- break;
- case 'i':
- case 'I':
- if (ign || git++ != 2)
- return true;
- break;
- case 't':
- case 'T':
- if (ign || git++ != 3)
- return true;
- break;
-
- case 0xe2:
- case 0xef:
- if (ign++ != 0)
- return true;
- one = *c;
- break;
-
- case 0x80:
- case 0x81:
- if (ign++ != 1 || one != 0xe2)
- return true;
- two = *c;
- break;
-
- case 0xbb:
- if (ign++ != 1 || one != 0xef)
- return true;
- two = *c;
- break;
-
- case 0x8c:
- case 0x8d:
- case 0x8e:
- case 0x8f:
- if (ign != 2 || two != 0x80)
- return true;
- ign = 0;
- break;
-
- case 0xaa:
- case 0xab:
- case 0xac:
- case 0xad:
- case 0xae:
- if (ign != 2 || (two != 0x80 && two != 0x81))
- return true;
- ign = 0;
- break;
-
- case 0xaf:
- if (ign != 2 || two != 0x81)
- return true;
- ign = 0;
- break;
-
- case 0xbf:
- if (ign != 2 || two != 0xbb)
- return true;
- ign = 0;
- break;
+ while (*len) {
+ int32_t codepoint;
+ int cp_len = git__utf8_iterate((const uint8_t *)(*in), (int)(*len), &codepoint);
+ if (cp_len < 0)
+ return -1;
- default:
- return true;
+ (*in) += cp_len;
+ (*len) -= cp_len;
+
+ /* these code points are ignored completely */
+ switch (codepoint) {
+ case 0x200c: /* ZERO WIDTH NON-JOINER */
+ case 0x200d: /* ZERO WIDTH JOINER */
+ case 0x200e: /* LEFT-TO-RIGHT MARK */
+ case 0x200f: /* RIGHT-TO-LEFT MARK */
+ case 0x202a: /* LEFT-TO-RIGHT EMBEDDING */
+ case 0x202b: /* RIGHT-TO-LEFT EMBEDDING */
+ case 0x202c: /* POP DIRECTIONAL FORMATTING */
+ case 0x202d: /* LEFT-TO-RIGHT OVERRIDE */
+ case 0x202e: /* RIGHT-TO-LEFT OVERRIDE */
+ case 0x206a: /* INHIBIT SYMMETRIC SWAPPING */
+ case 0x206b: /* ACTIVATE SYMMETRIC SWAPPING */
+ case 0x206c: /* INHIBIT ARABIC FORM SHAPING */
+ case 0x206d: /* ACTIVATE ARABIC FORM SHAPING */
+ case 0x206e: /* NATIONAL DIGIT SHAPES */
+ case 0x206f: /* NOMINAL DIGIT SHAPES */
+ case 0xfeff: /* ZERO WIDTH NO-BREAK SPACE */
+ continue;
}
- len--;
+ /* fold into lowercase -- only 'A'-'Z' are folded, which is
+ * perfectly fine, because the git folder name can only be
+ * composed of ascii characters. This is done by hand rather
+ * than with tolower(): that function is only defined for
+ * values representable as unsigned char (or EOF), which a
+ * code point may exceed, and its result depends on the locale.
+ */
+ return (codepoint >= 'A' && codepoint <= 'Z') ? codepoint + ('a' - 'A') : codepoint;
}
+ return 0; /* no bytes left -- end of string */
+}
+
+/*
+ * Check a path component against ".git", using next_hfs_char() to
+ * read it.
+ *
+ * Returns false only when the characters yielded are exactly '.', 'g',
+ * 'i', 't' followed by the end of the component. Returns true for
+ * anything else, including input that fails to decode.
+ */
+static bool verify_dotgit_hfs(const char *path, size_t len)
+{
+ if (next_hfs_char(&path, &len) != '.' ||
+ next_hfs_char(&path, &len) != 'g' ||
+ next_hfs_char(&path, &len) != 'i' ||
+ next_hfs_char(&path, &len) != 't' ||
+ next_hfs_char(&path, &len) != 0)
+ return true;
- return (ign || git != 4);
+ return false;
}
GIT_INLINE(bool) verify_char(unsigned char c, unsigned int flags)
diff --git a/src/util.c b/src/util.c
index 6b0efbea5..7ee3e2ff9 100644
--- a/src/util.c
+++ b/src/util.c
@@ -664,3 +664,79 @@ void git__insertsort_r(
if (freeswap)
git__free(swapel);
}
+/*
+ * Lookup table indexed by a byte value (git__utf8_charlen() indexes it
+ * with str[0]). Each entry is the total number of bytes in the UTF-8
+ * sequence that byte introduces (1 to 4), or 0 when the byte cannot
+ * start a sequence: 0x80-0xBF and 0xF8-0xFF.
+ */
+static const int8_t utf8proc_utf8class[256] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
+};
+/*
+ * Return the length in bytes of the UTF-8 sequence starting at str.
+ *
+ * str_len is the number of bytes available; a negative value disables
+ * the bounds check. The result is the sequence length (1 to 4) on
+ * success, -1 if str[0] cannot start a sequence, -str_len if the
+ * sequence would need more than str_len bytes, or -i if byte i is not
+ * a continuation byte (10xxxxxx).
+ *
+ * NOTE(review): str[0] is read before str_len is consulted, and for
+ * str_len == 0 the "too short" result -str_len is 0 rather than a
+ * negative value -- confirm callers never pass an empty buffer.
+ */
+int git__utf8_charlen(const uint8_t *str, int str_len)
+{
+ int length, i;
+
+ length = utf8proc_utf8class[str[0]]; /* expected length from the lead byte */
+ if (!length)
+ return -1;
+
+ if (str_len >= 0 && length > str_len)
+ return -str_len;
+
+ for (i = 1; i < length; i++) {
+ if ((str[i] & 0xC0) != 0x80) /* top two bits must be 10 */
+ return -i;
+ }
+
+ return length;
+}
+/*
+ * Decode the UTF-8 sequence at str into *dst and return its byte length.
+ *
+ * *dst is set to -1 up front and only overwritten on success; every
+ * failure returns -1. On top of what git__utf8_charlen() checks, this
+ * rejects overlong encodings, 0xD800-0xDFFF, 0xFDD0-0xFDEF, values of
+ * 0x110000 and above, and any code point whose low 16 bits are 0xFFFE
+ * or 0xFFFF.
+ */
+int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst)
+{
+ int length;
+ int32_t uc = -1;
+
+ *dst = -1;
+ length = git__utf8_charlen(str, str_len);
+ if (length < 0)
+ return -1;
+
+ switch (length) { /* any other length leaves uc at -1 */
+ case 1:
+ uc = str[0];
+ break;
+ case 2:
+ uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
+ if (uc < 0x80) uc = -1; /* would have fit in one byte */
+ break;
+ case 3:
+ uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
+ + (str[2] & 0x3F);
+ if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
+ (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
+ break;
+ case 4:
+ uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
+ + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
+ if (uc < 0x10000 || uc >= 0x110000) uc = -1;
+ break;
+ }
+
+ if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) /* low 16 bits 0xFFFE or 0xFFFF */
+ return -1;
+
+ *dst = uc;
+ return length;
+}
diff --git a/src/util.h b/src/util.h
index 17cc08987..7cfc0d644 100644
--- a/src/util.h
+++ b/src/util.h
@@ -368,6 +368,17 @@ extern int git__date_rfc2822_fmt(char *out, size_t len, const git_time *date);
extern size_t git__unescape(char *str);
/*
+ * Iterate through a UTF-8 string, yielding one
+ * codepoint at a time.
+ *
+ * @param str current position in the string
+ * @param str_len size left in the string; -1 if the string is NUL-terminated
+ * @param dst pointer where to store the current codepoint; set to -1 on failure
+ * @return length in bytes of the read codepoint; -1 if the codepoint was invalid
+ */
+extern int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst);
+
+/*
* Safely zero-out memory, making sure that the compiler
* doesn't optimize away the operation.
*/