url: Don't pass UTF-8 hostname to libidn unless it's valid UTF-8. draft6

- Rename utf8len function to utf8_strict_codepoint_count.
author: Jay Satiro <raysatiro@yahoo.com> 2015-07-08 03:53:30 -0400
committer: Jay Satiro <raysatiro@yahoo.com> 2015-07-08 03:53:30 -0400
commit: c9fb473ed2b44b765eebbf738a511f2f46371e89 (patch)
tree: cda7343463cb2a99099bf6d877f40c8945918ea7
parent: 288f6917bbd3decf94ad6a725e7b90def965c9c3 (diff)
download: curl-c9fb473ed2b44b765eebbf738a511f2f46371e89.tar.gz
4 files changed, 53 insertions, 26 deletions
diff --git a/lib/non-ascii.c b/lib/non-ascii.c
index 97f30ccc4..a28685e49 100644
--- a/lib/non-ascii.c
+++ b/lib/non-ascii.c
@@ -341,16 +341,21 @@ CURLcode Curl_convert_form(struct SessionHandle *data, struct FormData *form)
 #endif /* CURL_DOES_CONVERSIONS */
 
 #ifdef USE_LIBIDN
-/* utf8len: Count the number of UTF-8 characters.
+/* utf8_strict_codepoint_count:
+Count the number of Unicode codepoints encoded in a UTF-8 string.
 
 This function also tests for valid UTF-8 in accordance with the Unicode
 Standard, Section Conformance 3.9, Table 3-7, Well-Formed UTF-8 Byte Sequences.
 http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404
 
-Success: Returns the number of unicode codepoints encoded in a UTF-8 string.
+We use this function to validate UTF-8 strings before passing them to libidn,
+so the conformance check must remain strict. If any byte sequence is not
+well-formed, the function returns an error (-1).
+
+Success: Returns the number of Unicode codepoints encoded in a UTF-8 string.
 Failure: Returns -1 if 'str' is NULL or points to invalid UTF-8.
 */
-curl_off_t utf8len(const char *str)
+curl_off_t utf8_strict_codepoint_count(const char *str)
 {
   const unsigned char *ch = (const unsigned char*)str;
   const curl_off_t error = -1;
diff --git a/lib/non-ascii.h b/lib/non-ascii.h
index 48dc21ac3..fbdfb4070 100644
--- a/lib/non-ascii.h
+++ b/lib/non-ascii.h
@@ -61,7 +61,13 @@ CURLcode Curl_convert_form(struct SessionHandle *data, struct FormData *form);
 #endif
 
 #ifdef USE_LIBIDN
-curl_off_t utf8len(const char *str);
+/*
+ * utf8_strict_codepoint_count returns the number of Unicode codepoints in a
+ * UTF-8 string, or -1 if 'str' is NULL or contains invalid UTF-8. Note that a
+ * UTF-8 BOM is a codepoint and is counted as such.
+ * Refer to the comment block above this function's definition for details.
+ */
+curl_off_t utf8_strict_codepoint_count(const char *str);
 #endif
 
 #endif /* HEADER_CURL_NON_ASCII_H */
diff --git a/lib/url.c b/lib/url.c
index 169273b37..d30f063ad 100644
--- a/lib/url.c
+++ b/lib/url.c
@@ -3712,7 +3712,7 @@ static void fix_hostname(struct SessionHandle *data,
      * are also disallowed. This is a security measure; unsanitized UTF-8
      * could be used to encode embedded null bytes and other undesirable stuff.
      */
-    if(utf8len(utf8) < 0) {
+    if(utf8_strict_codepoint_count(utf8) < 0) {
       infof(data, "Hostname contains invalid UTF-8 sequence\n");
       rc = IDNA_STRINGPREP_ERROR;
     }
diff --git a/tests/unit/unit1603.c b/tests/unit/unit1603.c
index 0f835df75..00f68b0e3 100644
--- a/tests/unit/unit1603.c
+++ b/tests/unit/unit1603.c
@@ -41,36 +41,45 @@ static void unit_stop(void)
 UNITTEST_START
 
 #ifdef USE_LIBIDN
-  fail_unless( utf8len(NULL) == -1, "null string should be an error" );
+  fail_unless( utf8_strict_codepoint_count(NULL) == -1,
+      "null string should be an error" );
 
-  fail_unless( utf8len("") == 0, "empty string should get utf8len == 0" );
+  fail_unless( utf8_strict_codepoint_count("") == 0,
+      "empty string should get utf8_strict_codepoint_count == 0" );
 
-  fail_unless( utf8len("\r\n") == 2, "ordinary ascii should get utf8len =="
+  fail_unless( utf8_strict_codepoint_count("\r\n") == 2,
+      "ordinary ascii should get utf8_strict_codepoint_count =="
       " strlen, even if it contains control chars");
 
   /* Mixture of normal and double-byte sequences as used in latin langs. */
-  fail_unless( utf8len("\xC2\xA9 2001, Chang\xC3\xA9 Corp.") == 20,
-      "utf8len should handle valid latin 1");
+  fail_unless( utf8_strict_codepoint_count(
+            "\xC2\xA9 2001, Chang\xC3\xA9 Corp.") == 20,
+      "utf8_strict_codepoint_count should handle valid latin 1");
 
   /* Japanese, Russian, Greek 2- and 3-byte sequences -- with a little ASCII */
-  fail_unless( utf8len("\xE5\xA4\x89\xE3\x82\x8F\xD1\x81\xD0\xB2 ascii "
-      "\xD1\x8F\xD0\xB7\xCF\x8E\xCF\x81\xCE\xB1") == 16,
-      "utf8len should support a mix of several interesting languages");
+  fail_unless( utf8_strict_codepoint_count(
+            "\xE5\xA4\x89\xE3\x82\x8F\xD1\x81\xD0\xB2 ascii "
+            "\xD1\x8F\xD0\xB7\xCF\x8E\xCF\x81\xCE\xB1") == 16,
+      "utf8_strict_codepoint_count should support a mix of several interesting"
+      " languages");
 
   /* overlong encoding of the Euro sign */
-  fail_unless( utf8len("\xF0\x82\x82\xAC") == -1,
-      "utf8len should reject overlong encodings");
+  fail_unless( utf8_strict_codepoint_count(
+            "\xF0\x82\x82\xAC") == -1,
+      "utf8_strict_codepoint_count should reject overlong encodings");
 
   /* overlong encoding of embedded null */
-  fail_unless( utf8len("with embedded null \xC0\x80 <<there") == -1,
-      "utf8len must disallow embedded null with overlong encoding, which is"
-      " known as 'modified utf8' in some circles but which is dangerous when"
-      " passed to libidn");
+  fail_unless( utf8_strict_codepoint_count(
+            "with embedded null \xC0\x80 <<there") == -1,
+      "utf8_strict_codepoint_count must disallow embedded null with overlong"
+      " encoding, which is known as 'modified utf8' in some circles but which"
+      " is dangerous when passed to libidn");
 
   /* surrogate pair */
-  fail_unless(utf8len("\xED\xA0\x81\xED\xB0\x80") == -1,
-      "utf8len must disallow CESU-8-style surrogate pairs (see"
-      " http://j.mp/1HzJPBY)");
+  fail_unless(utf8_strict_codepoint_count(
+            "\xED\xA0\x81\xED\xB0\x80") == -1,
+      "utf8_strict_codepoint_count must disallow CESU-8-style surrogate pairs"
+      " (see http://j.mp/1HzJPBY)");
 
   /* invalid trail bytes, per table 3.7 in the Unicode Standard v7, Section
      Conformance 3.9, Table 3-7, Well-Formed UTF-8 Byte Sequences.
@@ -82,10 +91,17 @@ UNITTEST_START
      seems prudent to prove that the table and our algorithm and our named
      scenarios all have the same scope...
   */
-  fail_unless(utf8len("\xE0\x9F\xB1") == -1, "bad 2nd byte");
-  fail_unless(utf8len("\xED\xA0\xB1") == -1, "bad 2nd byte");
-  fail_unless(utf8len("\xF0\x85\xB1\xB1") == -1, "bad 2nd byte");
-  fail_unless(utf8len("\xF4\x90\xB1\xB1") == -1, "bad 2nd byte");
+  fail_unless(utf8_strict_codepoint_count("\xE0\x9F\xB1") == -1,
+      "bad 2nd byte");
+
+  fail_unless(utf8_strict_codepoint_count("\xED\xA0\xB1") == -1,
+      "bad 2nd byte");
+
+  fail_unless(utf8_strict_codepoint_count("\xF0\x85\xB1\xB1") == -1,
+      "bad 2nd byte");
+
+  fail_unless(utf8_strict_codepoint_count("\xF4\x90\xB1\xB1") == -1,
+      "bad 2nd byte");
 
   /* Up to this point, we've just proved that our validation logic is
    * accurate. Now we need to prove that it actually gets invoked when we
author	Jay Satiro <raysatiro@yahoo.com>	2015-07-08 03:53:30 -0400
committer	Jay Satiro <raysatiro@yahoo.com>	2015-07-08 03:53:30 -0400
commit	c9fb473ed2b44b765eebbf738a511f2f46371e89 (patch)
tree	cda7343463cb2a99099bf6d877f40c8945918ea7
parent	288f6917bbd3decf94ad6a725e7b90def965c9c3 (diff)
download	curl-c9fb473ed2b44b765eebbf738a511f2f46371e89.tar.gz