url: Properly parse certain oddly formed urls

In cases where there are multiple @-chars in a url, Node currently parses the hostname and auth sections differently than web browsers. This part of the bug is serious, and should be landed in v0.10, and also ported to v0.8, and releases made as soon as possible. The less serious issue is that there are many other sorts of malformed urls which Node either accepts when it should reject, or interprets differently than web browsers. For example, `http://a.com*foo` is interpreted by Node like `http://a.com/*foo` when web browsers treat this as `http://a.com%3Bfoo/`. In general, *only* the `hostEndingChars` should be the characters that delimit the host portion of the URL. Most of the current `nonHostChars` that appear in the hostname should be escaped, but some of them (such as `;` and `%` when it does not introduce a hex pair) should raise an error. We need to have a broader discussion about whether it's best to throw in these cases, and potentially break extant programs, or return an object that has every field set to `null` so that any attempt to read the hostname/auth/etc. will appear to be empty.
author: isaacs <i@izs.me> 2013-06-03 13:39:57 -0700
committer: isaacs <i@izs.me> 2013-06-03 16:29:10 -0700
commit: ba0ed00b5f31d736c9a99acc9d1eac88e5decf85 (patch)
tree: e9661e3fe6d2757d80647cee08db3d93c3d49590
parent: 4dc5b13861d2385b1bda5944fa89cbb2621ac95e (diff)
download: node-ba0ed00b5f31d736c9a99acc9d1eac88e5decf85.tar.gz
2 files changed, 110 insertions, 57 deletions
diff --git a/lib/url.js b/lib/url.js
index 77686ed0f..a27b76da3 100644
--- a/lib/url.js
+++ b/lib/url.js
@@ -48,7 +48,7 @@ var protocolPattern = /^([a-z0-9.+-]+:)/i,
     // them.
     nonHostChars = ['%', '/', '?', ';', '#']
       .concat(unwise).concat(autoEscape),
-    nonAuthChars = ['/', '@', '?', '#'].concat(delims),
+    hostEndingChars = ['/', '?', '#'],
     hostnameMaxLen = 255,
     hostnamePartPattern = /^[a-z0-9A-Z_-]{0,63}$/,
     hostnamePartStart = /^([a-z0-9A-Z_-]{0,63})(.*)$/,
@@ -90,8 +90,6 @@ var protocolPattern = /^([a-z0-9.+-]+:)/i,
     querystring = require('querystring');
 
 function urlParse(url, parseQueryString, slashesDenoteHost) {
-  if (url && typeof(url) === 'object' && url.href) return url;
-
   if (typeof url !== 'string') {
     throw new TypeError("Parameter 'url' must be a string, not " + typeof url);
   }
@@ -125,57 +123,66 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
 
   if (!hostlessProtocol[proto] &&
       (slashes || (proto && !slashedProtocol[proto]))) {
+
     // there's a hostname.
     // the first instance of /, ?, ;, or # ends the host.
-    // don't enforce full RFC correctness, just be unstupid about it.
-
+    //
     // If there is an @ in the hostname, then non-host chars *are* allowed
-    // to the left of the first @ sign, unless some non-auth character
+    // to the left of the last @ sign, unless some host-ending character
     // comes *before* the @-sign.
     // URLs are obnoxious.
-    var atSign = rest.indexOf('@');
-    if (atSign !== -1) {
-      var auth = rest.slice(0, atSign);
-
-      // there *may be* an auth
-      var hasAuth = true;
-      for (var i = 0, l = nonAuthChars.length; i < l; i++) {
-        if (auth.indexOf(nonAuthChars[i]) !== -1) {
-          // not a valid auth.  Something like http://foo.com/bar@baz/
-          hasAuth = false;
-          break;
-        }
-      }
+    //
+    // ex:
+    // http://a@b@c/ => user:a@b host:c
+    // http://a@b?@c => user:a host:c path:/?@c
+
+    // v0.12 TODO(isaacs): This is not quite how Chrome does things.
+    // Review our test case against browsers more comprehensively.
+
+    // find the first instance of any hostEndingChars
+    var hostEnd = -1;
+    for (var i = 0; i < hostEndingChars.length; i++) {
+      var hec = rest.indexOf(hostEndingChars[i]);
+      if (hec !== -1 && (hostEnd === -1 || hec < hostEnd))
+        hostEnd = hec;
+    }
 
-      if (hasAuth) {
-        // pluck off the auth portion.
-        out.auth = decodeURIComponent(auth);
-        rest = rest.substr(atSign + 1);
-      }
+    // at this point, either we have an explicit point where the
+    // auth portion cannot go past, or the last @ char is the decider.
+    var auth, atSign;
+    if (hostEnd === -1) {
+      // atSign can be anywhere.
+      atSign = rest.lastIndexOf('@');
+    } else {
+      // atSign must be in auth portion.
+      // http://a@b/c@d => host:b auth:a path:/c@d
+      atSign = rest.lastIndexOf('@', hostEnd);
     }
 
-    var firstNonHost = -1;
-    for (var i = 0, l = nonHostChars.length; i < l; i++) {
-      var index = rest.indexOf(nonHostChars[i]);
-      if (index !== -1 &&
-          (firstNonHost < 0 || index < firstNonHost)) firstNonHost = index;
+    // Now we have a portion which is definitely the auth.
+    // Pull that off.
+    if (atSign !== -1) {
+      auth = rest.slice(0, atSign);
+      rest = rest.slice(atSign + 1);
+      out.auth = decodeURIComponent(auth);
     }
 
-    if (firstNonHost !== -1) {
-      out.host = rest.substr(0, firstNonHost);
-      rest = rest.substr(firstNonHost);
-    } else {
-      out.host = rest;
-      rest = '';
+    // the host is the remaining to the left of the first non-host char
+    hostEnd = -1;
+    for (var i = 0; i < nonHostChars.length; i++) {
+      var hec = rest.indexOf(nonHostChars[i]);
+      if (hec !== -1 && (hostEnd === -1 || hec < hostEnd))
+        hostEnd = hec;
     }
+    // if we still have not hit it, then the entire thing is a host.
+    if (hostEnd === -1)
+      hostEnd = rest.length;
+
+    out.host = rest.slice(0, hostEnd);
+    rest = rest.slice(hostEnd);
 
     // pull out port.
-    var p = parseHost(out.host);
-    var keys = Object.keys(p);
-    for (var i = 0, l = keys.length; i < l; i++) {
-      var key = keys[i];
-      out[key] = p[key];
-    }
+    parseHost(out);
 
     // we've indicated that there is a hostname,
     // so even if it's empty, it has to be present.
@@ -187,9 +194,7 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
         out.hostname[out.hostname.length - 1] === ']';
 
     // validate a little.
-    if (out.hostname.length > hostnameMaxLen) {
-      out.hostname = '';
-    } else if (!ipv6Hostname) {
+    if (!ipv6Hostname) {
       var hostparts = out.hostname.split(/\./);
       for (var i = 0, l = hostparts.length; i < l; i++) {
         var part = hostparts[i];
@@ -225,8 +230,12 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
       }
     }
 
-    // hostnames are always lower case.
-    out.hostname = out.hostname.toLowerCase();
+    if (out.hostname.length > hostnameMaxLen) {
+      out.hostname = '';
+    } else {
+      // hostnames are always lower case.
+      out.hostname = out.hostname.toLowerCase();
+    }
 
     if (!ipv6Hostname) {
       // IDNA Support: Returns a puny coded representation of "domain".
@@ -243,11 +252,13 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
       out.hostname = newOut.join('.');
     }
 
-    out.host = (out.hostname || '') +
-        ((out.port) ? ':' + out.port : '');
+    var p = out.port ? ':' + out.port : '';
+    var h = out.hostname || '';
+    out.host = h + p;
     out.href += out.host;
 
     // strip [ and ] from the hostname
+    // the host field still retains them, though
     if (ipv6Hostname) {
       out.hostname = out.hostname.substr(1, out.hostname.length - 2);
       if (rest[0] !== '/') {
@@ -302,8 +313,9 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
 
   //to support http.request
   if (out.pathname || out.search) {
-    out.path = (out.pathname ? out.pathname : '') +
-               (out.search ? out.search : '');
+    var p = out.pathname || '';
+    var s = out.search || '';
+    out.path = p + s;
   }
 
   // finally, reconstruct the href based on what has been validated.
@@ -332,9 +344,9 @@ function urlFormat(obj) {
       host = false,
       query = '';
 
-  if (obj.host !== undefined) {
+  if (obj.host) {
     host = auth + obj.host;
-  } else if (obj.hostname !== undefined) {
+  } else if (obj.hostname) {
     host = auth + (obj.hostname.indexOf(':') === -1 ?
         obj.hostname :
         '[' + obj.hostname + ']');
@@ -365,6 +377,11 @@ function urlFormat(obj) {
   if (hash && hash.charAt(0) !== '#') hash = '#' + hash;
   if (search && search.charAt(0) !== '?') search = '?' + search;
 
+  pathname = pathname.replace(/[?#]/g, function(match) {
+    return encodeURIComponent(match);
+  });
+  search = search.replace('#', '%23');
+
   return protocol + host + pathname + search + hash;
 }
 
@@ -610,16 +627,15 @@ function urlResolveObject(source, relative) {
   return source;
 }
 
-function parseHost(host) {
-  var out = {};
+function parseHost(obj) {
+  var host = obj.host;
   var port = portPattern.exec(host);
   if (port) {
     port = port[0];
     if (port !== ':') {
-      out.port = port.substr(1);
+      obj.port = port.substr(1);
     }
     host = host.substr(0, host.length - port.length);
   }
-  if (host) out.hostname = host;
-  return out;
+  if (host) obj.hostname = host;
 }
diff --git a/test/simple/test-url.js b/test/simple/test-url.js
index 764a541b6..f6c8d02d1 100644
--- a/test/simple/test-url.js
+++ b/test/simple/test-url.js
@@ -741,6 +741,43 @@ var parseTests = {
     'path': '/test',
   },
 
+  'http://a@b@c/': {
+    protocol: 'http:',
+    slashes: true,
+    auth: 'a@b',
+    host: 'c',
+    hostname: 'c',
+    href: 'http://a%40b@c/',
+    path: '/',
+    pathname: '/'
+  },
+
+  'http://a@b?@c': {
+    protocol: 'http:',
+    slashes: true,
+    auth: 'a',
+    host: 'b',
+    hostname: 'b',
+    href: 'http://a@b/?@c',
+    path: '/?@c',
+    pathname: '/',
+    search: '?@c',
+    query: '@c'
+  },
+
+  'http://a\r" \t\n<\'b:b@c\r\nd/e?f':{
+    protocol: 'http:',
+    slashes: true,
+    auth: 'a\r" \t\n<\'b:b',
+    host: 'c',
+    hostname: 'c',
+    search: '?f',
+    query: 'f',
+    pathname: '%0D%0Ad/e',
+    path: '%0D%0Ad/e?f',
+    href: 'http://a%0D%22%20%09%0A%3C\'b:b@c/%0D%0Ad/e?f'
+  }
+
 };
 
 for (var u in parseTests) {
author	isaacs <i@izs.me>	2013-06-03 13:39:57 -0700
committer	isaacs <i@izs.me>	2013-06-03 16:29:10 -0700
commit	ba0ed00b5f31d736c9a99acc9d1eac88e5decf85 (patch)
tree	e9661e3fe6d2757d80647cee08db3d93c3d49590
parent	4dc5b13861d2385b1bda5944fa89cbb2621ac95e (diff)
download	node-ba0ed00b5f31d736c9a99acc9d1eac88e5decf85.tar.gz