url: Don't pass UTF-8 hostname to libidn unless it's valid UTF-8. draft5

- Add a unit test. There is some question about whether we should be returning a better error (currently, the validation error, IDNA_STRINGPREP_ERROR, gets lost because the only action we take on invalid input is to not convert non-ascii hostnames, which eventually causes a dns failure; that's what gets reported). The unit test for the fix thus tests dns failure instead of looking for a specific error from utf8 validation.
author: Daniel Hardman <daniel.hardman@gmail.com> 2015-07-08 03:04:46 -0400
committer: Jay Satiro <raysatiro@yahoo.com> 2015-07-08 03:13:31 -0400
commit: 288f6917bbd3decf94ad6a725e7b90def965c9c3 (patch)
tree: 7381f83eff22fc2539b7517da6a02d3bac8a3d12
parent: bd2ecf6a552ccffe8797b81f18522338fa7e1a71 (diff)
download: curl-288f6917bbd3decf94ad6a725e7b90def965c9c3.tar.gz
6 files changed, 147 insertions, 4 deletions
diff --git a/lib/non-ascii.c b/lib/non-ascii.c
index adb41de88..97f30ccc4 100644
--- a/lib/non-ascii.c
+++ b/lib/non-ascii.c
@@ -347,7 +347,7 @@ This function also tests for valid UTF-8 in accordance with the Unicode
 Standard, Section Conformance 3.9, Table 3-7, Well-Formed UTF-8 Byte Sequences.
 http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404
 
-Success: Returns the number of UTF-8 characters.
+Success: Returns the number of unicode codepoints encoded in a UTF-8 string.
 Failure: Returns -1 if 'str' is NULL or points to invalid UTF-8.
 */
 curl_off_t utf8len(const char *str)
diff --git a/lib/url.c b/lib/url.c
index feb4e4b95..169273b37 100644
--- a/lib/url.c
+++ b/lib/url.c
@@ -3671,7 +3671,7 @@ static bool tld_check_name(struct SessionHandle *data,
 
   return TRUE;
 }
-#endif
+#endif /* USE_LIBIDN */
 
 /*
  * Perform any necessary IDN conversion of hostname
@@ -3705,8 +3705,13 @@ static void fix_hostname(struct SessionHandle *data,
   if(stringprep_check_version(LIBIDN_REQUIRED_VERSION)) {
     char *ace_hostname = NULL;
     int rc;
-    /* Don't pass UTF-8 hostname to libidn unless it's valid UTF-8 */
     char *utf8 = stringprep_locale_to_utf8(host->name);
+    /* Don't pass UTF-8 hostname to libidn unless it's valid UTF-8. This check
+     * is quite picky; in addition to 5- and 6-byte sequences and invalid lead
+     * and continuation bytes, CESU-8 and so-called 'Modified UTF-8' sequences
+     * are also disallowed. This is a security measure; unsanitized UTF-8
+     * could be used to encode embedded null bytes and other undesirable stuff.
+     */
     if(utf8len(utf8) < 0) {
       infof(data, "Hostname contains invalid UTF-8 sequence\n");
       rc = IDNA_STRINGPREP_ERROR;
diff --git a/tests/data/Makefile.inc b/tests/data/Makefile.inc
index 51823fe88..05602f8fa 100644
--- a/tests/data/Makefile.inc
+++ b/tests/data/Makefile.inc
@@ -155,7 +155,7 @@ test1520 \
 \
 test1525 test1526 test1527 test1528 test1529 test1530 test1531 \
 \
-test1600 test1601 test1602 \
+test1600 test1601 test1602 test1603 \
 \
 test1800 test1801 \
 \
diff --git a/tests/data/test1603 b/tests/data/test1603
new file mode 100644
index 000000000..7a78ffe4c
--- /dev/null
+++ b/tests/data/test1603
@@ -0,0 +1,26 @@
+<testcase>
+<info>
+<keywords>
+unittest
+idn
+</keywords>
+</info>
+
+#
+# Client-side
+<client>
+<server>
+none
+</server>
+<features>
+unittest
+</features>
+ <name>
+Internal sanitization of idn input testing
+ </name>
+<tool>
+unit1603
+</tool>
+</client>
+
+</testcase>
diff --git a/tests/unit/Makefile.inc b/tests/unit/Makefile.inc
index 9073b34e6..e1c1821e1 100644
--- a/tests/unit/Makefile.inc
+++ b/tests/unit/Makefile.inc
@@ -8,6 +8,7 @@ UNITFILES = curlcheck.h \
 UNITPROGS = unit1300 unit1301 unit1302 unit1303 unit1304 unit1305 unit1307	\
  unit1308 unit1309 unit1330 unit1394 unit1395 unit1396 unit1397 unit1398	\
  unit1600 unit1601 unit1602
+ unit1600 unit1601 unit1602 unit1603
 
 unit1300_SOURCES = unit1300.c $(UNITFILES)
 unit1300_CPPFLAGS = $(AM_CPPFLAGS)
@@ -66,3 +67,6 @@ unit1601_CPPFLAGS = $(AM_CPPFLAGS)
 unit1602_SOURCES = unit1602.c $(UNITFILES)
 unit1602_CPPFLAGS = $(AM_CPPFLAGS)
 
+unit1603_SOURCES = unit1603.c $(UNITFILES)
+unit1603_CPPFLAGS = $(AM_CPPFLAGS)
+
diff --git a/tests/unit/unit1603.c b/tests/unit/unit1603.c
new file mode 100644
index 000000000..0f835df75
--- /dev/null
+++ b/tests/unit/unit1603.c
@@ -0,0 +1,108 @@
+/***************************************************************************
+ *                                  _   _ ____  _
+ *  Project                     ___| | | |  _ \| |
+ *                             / __| | | | |_) | |
+ *                            | (__| |_| |  _ <| |___
+ *                             \___|\___/|_| \_\_____|
+ *
+ * Copyright (C) 1998 - 2015, Daniel Stenberg, <daniel@haxx.se>, et al.
+ *
+ * This software is licensed as described in the file COPYING, which
+ * you should have received as part of this distribution. The terms
+ * are also available at http://curl.haxx.se/docs/copyright.html.
+ *
+ * You may opt to use, copy, modify, merge, publish, distribute and/or sell
+ * copies of the Software, and permit persons to whom the Software is
+ * furnished to do so, under the terms of the COPYING file.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ***************************************************************************/
+#include "curlcheck.h"
+
+#include "urldata.h"
+#include "non-ascii.h"
+
+CURL *easy;
+
+static CURLcode unit_setup(void)
+{
+  easy = curl_easy_init();
+  return CURLE_OK;
+}
+
+static void unit_stop(void)
+{
+  curl_easy_cleanup(easy);
+}
+
+
+UNITTEST_START
+
+#ifdef USE_LIBIDN
+  fail_unless( utf8len(NULL) == -1, "null string should be an error" );
+
+  fail_unless( utf8len("") == 0, "empty string should get utf8len == 0" );
+
+  fail_unless( utf8len("\r\n") == 2, "ordinary ascii should get utf8len =="
+      " strlen, even if it contains control chars");
+
+  /* Mixture of normal and double-byte sequences as used in latin langs. */
+  fail_unless( utf8len("\xC2\xA9 2001, Chang\xC3\xA9 Corp.") == 20,
+      "utf8len should handle valid latin 1");
+
+  /* Japanese, Russian, Greek 2- and 3-byte sequences -- with a little ASCII */
+  fail_unless( utf8len("\xE5\xA4\x89\xE3\x82\x8F\xD1\x81\xD0\xB2 ascii "
+      "\xD1\x8F\xD0\xB7\xCF\x8E\xCF\x81\xCE\xB1") == 16,
+      "utf8len should support a mix of several interesting languages");
+
+  /* overlong encoding of the Euro sign */
+  fail_unless( utf8len("\xF0\x82\x82\xAC") == -1,
+      "utf8len should reject overlong encodings");
+
+  /* overlong encoding of embedded null */
+  fail_unless( utf8len("with embedded null \xC0\x80 <<there") == -1,
+      "utf8len must disallow embedded null with overlong encoding, which is"
+      " known as 'modified utf8' in some circles but which is dangerous when"
+      " passed to libidn");
+
+  /* surrogate pair */
+  fail_unless(utf8len("\xED\xA0\x81\xED\xB0\x80") == -1,
+      "utf8len must disallow CESU-8-style surrogate pairs (see"
+      " http://j.mp/1HzJPBY)");
+
+  /* invalid trail bytes, per table 3.7 in the Unicode Standard v7, Section
+     Conformance 3.9, Table 3-7, Well-Formed UTF-8 Byte Sequences.
+
+       http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404
+
+     These cases catch the same issues as the ones tested above, but the table
+     doesn't make it obvious which sort of malformation it's preventing. It
+     seems prudent to prove that the table and our algorithm and our named
+     scenarios all have the same scope...
+  */
+  fail_unless(utf8len("\xE0\x9F\xB1") == -1, "bad 2nd byte");
+  fail_unless(utf8len("\xED\xA0\xB1") == -1, "bad 2nd byte");
+  fail_unless(utf8len("\xF0\x85\xB1\xB1") == -1, "bad 2nd byte");
+  fail_unless(utf8len("\xF4\x90\xB1\xB1") == -1, "bad 2nd byte");
+
+  /* Up to this point, we've just proved that our validation logic is
+   * accurate. Now we need to prove that it actually gets invoked when we
+   * use libidn. We do this by setting up a connection to a url with a
+   * hostname that contains invalid utf8, and verifying that we get back
+   * an error that shows that our validation logic rejected it.
+   */
+  struct SessionHandle * curl;
+  CURLcode ret = Curl_open(&curl);
+  fail_unless(ret == 0, "serious error");
+  ret = curl_easy_setopt(curl, CURLOPT_URL, "http://x\xC0\x80.com/");
+  fail_unless(ret == 0, "shouldn't complain yet");
+  ret = curl_easy_perform(curl);
+  fail_unless(ret != 0, "should get error about invalid hostname");
+  /*printf("%s\n", curl_easy_strerror(ret));*/
+#else
+  /*printf("test skipped; libidn not active\n");*/
+#endif
+
+UNITTEST_STOP
author	Daniel Hardman <daniel.hardman@gmail.com>	2015-07-08 03:04:46 -0400
committer	Jay Satiro <raysatiro@yahoo.com>	2015-07-08 03:13:31 -0400
commit	288f6917bbd3decf94ad6a725e7b90def965c9c3 (patch)
tree	7381f83eff22fc2539b7517da6a02d3bac8a3d12
parent	bd2ecf6a552ccffe8797b81f18522338fa7e1a71 (diff)
download	curl-288f6917bbd3decf94ad6a725e7b90def965c9c3.tar.gz