[core] server.http-parseopts URL normalization opt (fixes #1720)

server.http-parseopts = ( ... ) URL normalization options Note: *not applied* to CONNECT method Note: In a future release, URL normalization likely enabled by default (normalize URL, reject control chars, remove . and .. path segments) To prepare for this change, lighttpd.conf configurations should explicitly select desired behavior by enabling or disabling: server.http-parseopts = ( "url-normalize" => "enable", ... ) server.http-parseopts = ( "url-normalize" => "disable" ) x-ref: "lighttpd ... compares URIs to patterns in the (1) url.redirect and (2) url.rewrite configuration settings before performing URL decoding, which might allow remote attackers to bypass intended access restrictions, and obtain sensitive information or possibly modify data." https://www.cvedetails.com/cve/CVE-2008-4359/ "Rewrite/redirect rules and URL encoding" https://redmine.lighttpd.net/issues/1720
author: Glenn Strauss <gstrauss@gluelogic.com> 2018-05-01 00:20:26 -0400
committer: Glenn Strauss <gstrauss@gluelogic.com> 2018-08-12 14:43:22 -0400
commit: 3eb7902e10ba75b3f2eb159e244d0d8e5037ccd2 (patch)
tree: 3915619c5c0c93733c3f00d670e559ef319c9df7 /src/burl.c
parent: 6ccccaaa38bdf545dafbd2e31950e756fc6ac775 (diff)
download: lighttpd-git-3eb7902e10ba75b3f2eb159e244d0d8e5037ccd2.tar.gz
1 files changed, 357 insertions, 0 deletions
diff --git a/src/burl.c b/src/burl.c
new file mode 100644
index 00000000..3eadb6ce
--- /dev/null
+++ b/src/burl.c
@@ -0,0 +1,357 @@
+#include "first.h"
+#include "burl.h"
+
+#include <string.h>
+
+#include "buffer.h"
+
+static const char hex_chars_uc[] = "0123456789ABCDEF"; /* indexed by nibble value 0-15; yields uppercase hex digit */
+
+/* Lookup table indexed by byte value (0x00 - 0xFF).
+ * Entry is 1 if the byte must not appear literally in the normalized URL
+ * (it has to be percent-encoded), 0 if the byte may appear as-is.
+ * Entries are 1 for
+ * everything except: ! $ & ' ( ) * + , - . / 0-9 : ; = ? @ A-Z _ a-z ~
+ * Note that '%' and '#' are flagged 1 here; the normalization routines in
+ * this file test for those two characters explicitly after consulting
+ * this table. */
+static const char encoded_chars_http_uri_reqd[] = {
+  /*
+  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
+  */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  00 -  0F control chars */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  10 -  1F */
+  1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /*  20 -  2F space " # % */
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,  /*  30 -  3F < > */
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /*  40 -  4F */
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,  /*  50 -  5F [ \ ] ^ */
+  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /*  60 -  6F ` */
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,  /*  70 -  7F { | } DEL */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  80 -  8F */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  90 -  9F */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  A0 -  AF */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  B0 -  BF */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  C0 -  CF */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  D0 -  DF */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  E0 -  EF */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  F0 -  FF */
+};
+
+
+/* Convert hex digit character c to its numeric value, stored into n.
+ * Evaluates non-zero if c is a hex digit [0-9A-Fa-f]; n then holds 0 - 15.
+ * Evaluates to 0 if c is not a hex digit; n is then left holding an
+ * out-of-range intermediate value and must not be used.
+ * c (char) and n (nibble) MUST be unsigned integer types
+ * (the "<= 9" and "<= 5" range checks rely on a value below '0' or 'A'
+ *  wrapping around to a large unsigned number when stored into n)
+ * (c)&0xdf folds ASCII lowercase letters onto uppercase before the A-F test.
+ * c is evaluated more than once; do not pass an expression with side effects */
+#define li_cton(c,n) \
+  (((n) = (c) - '0') <= 9 || (((n) = ((c)&0xdf) - 'A') <= 5 ? ((n) += 10) : 0))
+
+/* Evaluates non-zero if byte value b is 0xC0, 0xC1, or 0xF5 - 0xFF,
+ * i.e. a byte value which never occurs in valid UTF-8.
+ * b (byte) MUST be unsigned integer type
+ * https://en.wikipedia.org/wiki/UTF-8
+ * reject overlong encodings of 7-bit ASCII and invalid UTF-8
+ * (but does not detect other overlong multi-byte encodings)
+ * b is evaluated more than once; do not pass an expression with side effects */
+#define li_utf8_invalid_byte(b) ((b) >= 0xF5 || ((b)|0x1) == 0xC1)
+
+
+/* Return 1 if c is an "unreserved" URL character, i.e. alphanumeric
+ * (per light_isalnum()) or one of '-' '.' '_' '~'; return 0 otherwise. */
+static int burl_is_unreserved (const int c)
+{
+    if (light_isalnum(c)) return 1;
+    switch (c) {
+      case '-':
+      case '.':
+      case '_':
+      case '~':
+        return 1;
+      default:
+        return 0;
+    }
+}
+
+/* Slow path of burl_normalize_basic_unreserved(): rewrite URL b from offset i
+ * onward into scratch buffer t, then copy the result from t back into b.
+ *
+ * For each remaining byte of b:
+ * - a byte which needs no encoding (per encoded_chars_http_uri_reqd) is copied
+ * - a valid "%XX" escape is decoded if XX is an unreserved character
+ *   (burl_is_unreserved()); otherwise the escape is kept and its hex digits
+ *   are emitted in uppercase
+ * - an unencoded '#' ends processing; '#' and everything after it is dropped
+ * - any other byte (including '%' not followed by two hex digits) is
+ *   percent-encoded using uppercase hex digits
+ *
+ * b  : URL to normalize; overwritten with the normalized result
+ * t  : scratch buffer; prior contents are overwritten
+ * i  : offset in b of first byte to rewrite; bytes before i are copied as-is
+ * qs : state so far: offset of first '?', -1 if no '?' seen yet, or -2
+ *
+ * Returns offset of the first unencoded '?' in the rewritten URL, -1 if
+ * there is none, or -2 if a byte matching li_utf8_invalid_byte() was found
+ * (whether literal or percent-encoded).
+ *
+ * NOTE(review): s[i+1] and s[i+2] are read without an explicit bounds check;
+ * this appears to rely on b->ptr being NUL-terminated so that li_cton()
+ * fails on the terminator and short-circuits the && chain -- confirm. */
+static int burl_normalize_basic_unreserved_fix (buffer *b, buffer *t, int i, int qs)
+{
+    int j = i; /* j: write offset into p;  i: read offset into s */
+    const int used = (int)buffer_string_length(b);
+    const unsigned char * const s = (unsigned char *)b->ptr;
+    unsigned char * const p = /* sized for worst case: each byte -> "%XX" */
+      (unsigned char *)buffer_string_prepare_copy(t,i+(used-i)*3+1);
+    unsigned int n1, n2;
+    memcpy(p, s, (size_t)i); /* prefix already verified by caller */
+    for (; i < used; ++i, ++j) {
+        if (!encoded_chars_http_uri_reqd[s[i]]) {
+            if (s[i] == '?' && -1 == qs) qs = j; /* first '?' only */
+            p[j] = s[i];
+        }
+        else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
+            const unsigned int x = (n1 << 4) | n2; /* decoded byte value */
+            if (burl_is_unreserved(x)) {
+                p[j] = x; /* needlessly encoded; store decoded */
+            }
+            else {
+                p[j]   = '%';
+                p[++j] = hex_chars_uc[n1]; /*(s[i+1] & 0xdf)*/
+                p[++j] = hex_chars_uc[n2]; /*(s[i+2] & 0xdf)*/
+                if (li_utf8_invalid_byte(x)) qs = -2;
+            }
+            i+=2; /* skip the two hex digits just consumed */
+        }
+        else if (s[i] == '#') break; /* ignore fragment */
+        else { /* byte requires encoding, or '%' without two hex digits */
+            p[j]   = '%';
+            p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
+            p[++j] = hex_chars_uc[s[i] & 0xF];
+            if (li_utf8_invalid_byte(s[i])) qs = -2;
+        }
+    }
+    buffer_commit(t, (size_t)j); /* j bytes were written through p */
+    buffer_copy_buffer(b, t);
+    return qs;
+}
+
+/* Normalize percent-encoding of URL b, decoding only "%XX" escapes of
+ * unreserved characters (see burl_is_unreserved()).
+ *
+ * Scans b in place for as long as the URL does not need to change length:
+ * - bytes which need no encoding (per encoded_chars_http_uri_reqd) are left
+ *   alone; the offset of the first '?' is recorded
+ * - a valid "%XX" escape of a character which is not unreserved is kept,
+ *   and its hex digits are uppercased in place
+ * - an unencoded '#' truncates b at that offset (fragment is discarded)
+ * - anything else (a byte which must be encoded, a '%' not followed by two
+ *   hex digits, or a "%XX" escape of an unreserved character) is handed,
+ *   along with the rest of the URL, to burl_normalize_basic_unreserved_fix()
+ *
+ * b : URL to normalize; modified
+ * t : scratch buffer passed through to the _fix() routine
+ *
+ * Returns offset of the first unencoded '?', -1 if there is none, or -2 if a
+ * byte matching li_utf8_invalid_byte() was found. */
+static int burl_normalize_basic_unreserved (buffer *b, buffer *t)
+{
+    const unsigned char * const s = (unsigned char *)b->ptr;
+    const int used = (int)buffer_string_length(b);
+    unsigned int n1, n2, x;
+    int qs = -1; /* no '?' seen yet */
+
+    for (int i = 0; i < used; ++i) {
+        if (!encoded_chars_http_uri_reqd[s[i]]) {
+            if (s[i] == '?' && -1 == qs) qs = i;
+        }
+        else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)
+                 && !burl_is_unreserved((x = (n1 << 4) | n2))) {
+            if (li_utf8_invalid_byte(x)) qs = -2;
+            if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */
+            if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */
+            i+=2; /* skip the two hex digits */
+        }
+        else if (s[i] == '#') { /* ignore fragment */
+            buffer_string_set_length(b, (size_t)i);
+            break;
+        }
+        else { /* length of URL must change; rewrite remainder via t */
+            qs = burl_normalize_basic_unreserved_fix(b, t, i, qs);
+            break;
+        }
+    }
+
+    return qs;
+}
+
+/* Slow path of burl_normalize_basic_required(): rewrite URL b from offset i
+ * onward into scratch buffer t, then copy the result from t back into b.
+ *
+ * For each remaining byte of b:
+ * - a byte which needs no encoding (per encoded_chars_http_uri_reqd) is copied
+ * - a valid "%XX" escape is decoded when the decoded byte needs no encoding
+ *   per encoded_chars_http_uri_reqd and is not one of the delimiters kept
+ *   encoded for the current part of the URL:
+ *     '/' and '?'        while qs < 0  (no '?' recorded yet)
+ *     '&' '=' and ';'    once  qs >= 0 (after the first '?')
+ *   otherwise the escape is kept and its hex digits are emitted in uppercase
+ * - an unencoded '#' ends processing; '#' and everything after it is dropped
+ * - any other byte (including '%' not followed by two hex digits) is
+ *   percent-encoded using uppercase hex digits
+ *
+ * b  : URL to normalize; overwritten with the normalized result
+ * t  : scratch buffer; prior contents are overwritten
+ * i  : offset in b of first byte to rewrite; bytes before i are copied as-is
+ * qs : state so far: offset of first '?', -1 if no '?' seen yet, or -2
+ *
+ * Returns offset of the first unencoded '?' in the rewritten URL, -1 if
+ * there is none, or -2 if a byte matching li_utf8_invalid_byte() was found
+ * (whether literal or percent-encoded).
+ *
+ * NOTE(review): s[i+1] and s[i+2] are read without an explicit bounds check;
+ * this appears to rely on b->ptr being NUL-terminated so that li_cton()
+ * fails on the terminator and short-circuits the && chain -- confirm. */
+static int burl_normalize_basic_required_fix (buffer *b, buffer *t, int i, int qs)
+{
+    int j = i; /* j: write offset into p;  i: read offset into s */
+    const int used = (int)buffer_string_length(b);
+    const unsigned char * const s = (unsigned char *)b->ptr;
+    unsigned char * const p = /* sized for worst case: each byte -> "%XX" */
+      (unsigned char *)buffer_string_prepare_copy(t,i+(used-i)*3+1);
+    unsigned int n1, n2;
+    memcpy(p, s, (size_t)i); /* prefix already verified by caller */
+    for (; i < used; ++i, ++j) {
+        if (!encoded_chars_http_uri_reqd[s[i]]) {
+            if (s[i] == '?' && -1 == qs) qs = j; /* first '?' only */
+            p[j] = s[i];
+        }
+        else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
+            const unsigned int x = (n1 << 4) | n2; /* decoded byte value */
+            if (!encoded_chars_http_uri_reqd[x]
+                && (qs < 0 ? (x!='/'&&x!='?') : (x!='&'&&x!='='&&x!=';'))) {
+                p[j] = x; /* needlessly encoded; store decoded */
+            }
+            else {
+                p[j]   = '%';
+                p[++j] = hex_chars_uc[n1]; /*(s[i+1] & 0xdf)*/
+                p[++j] = hex_chars_uc[n2]; /*(s[i+2] & 0xdf)*/
+                if (li_utf8_invalid_byte(x)) qs = -2;
+            }
+            i+=2; /* skip the two hex digits just consumed */
+        }
+        else if (s[i] == '#') break; /* ignore fragment */
+        else { /* byte requires encoding, or '%' without two hex digits */
+            p[j]   = '%';
+            p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
+            p[++j] = hex_chars_uc[s[i] & 0xF];
+            if (li_utf8_invalid_byte(s[i])) qs = -2;
+        }
+    }
+    buffer_commit(t, (size_t)j); /* j bytes were written through p */
+    buffer_copy_buffer(b, t);
+    return qs;
+}
+
+/* Normalize percent-encoding of URL b, keeping encoded only what is required
+ * to be encoded.
+ *
+ * Scans b in place for as long as the URL does not need to change length:
+ * - bytes which need no encoding (per encoded_chars_http_uri_reqd) are left
+ *   alone; the offset of the first '?' is recorded
+ * - a valid "%XX" escape is kept, and its hex digits uppercased in place,
+ *   when the decoded byte must be encoded per encoded_chars_http_uri_reqd
+ *   or is a delimiter kept encoded for the current part of the URL:
+ *     '/' and '?'        while qs < 0  (no '?' recorded yet)
+ *     '&' '=' and ';'    once  qs >= 0 (after the first '?')
+ * - an unencoded '#' truncates b at that offset (fragment is discarded)
+ * - anything else (a byte which must be encoded, a '%' not followed by two
+ *   hex digits, or a "%XX" escape which should be decoded) is handed,
+ *   along with the rest of the URL, to burl_normalize_basic_required_fix()
+ *
+ * b : URL to normalize; modified
+ * t : scratch buffer passed through to the _fix() routine
+ *
+ * Returns offset of the first unencoded '?', -1 if there is none, or -2 if a
+ * byte matching li_utf8_invalid_byte() was found. */
+static int burl_normalize_basic_required (buffer *b, buffer *t)
+{
+    const unsigned char * const s = (unsigned char *)b->ptr;
+    const int used = (int)buffer_string_length(b);
+    unsigned int n1, n2, x;
+    int qs = -1; /* no '?' seen yet */
+
+    for (int i = 0; i < used; ++i) {
+        if (!encoded_chars_http_uri_reqd[s[i]]) {
+            if (s[i] == '?' && -1 == qs) qs = i;
+        }
+        else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)
+                 && (encoded_chars_http_uri_reqd[(x = (n1 << 4) | n2)]
+                     ||(qs < 0 ? (x=='/'||x=='?') : (x=='&'||x=='='||x==';')))){
+            if (li_utf8_invalid_byte(x)) qs = -2;
+            if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */
+            if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */
+            i+=2; /* skip the two hex digits */
+        }
+        else if (s[i] == '#') { /* ignore fragment */
+            buffer_string_set_length(b, (size_t)i);
+            break;
+        }
+        else { /* length of URL must change; rewrite remainder via t */
+            qs = burl_normalize_basic_required_fix(b, t, i, qs);
+            break;
+        }
+    }
+
+    return qs;
+}
+
+
+/* Return 1 if b contains a percent-encoded control character, 0 otherwise.
+ * A '%' followed by a character less than '2' (i.e. "%0X" or "%1X"), or the
+ * sequence "%7F" (DEL), counts as a control character.
+ * (hex digits are compared against uppercase 'F' only) */
+static int burl_contains_ctrls (const buffer *b)
+{
+    const char * const url = b->ptr;
+    const int len = (int)buffer_string_length(b);
+    int pos = 0;
+    while (pos < len) {
+        if (url[pos] == '%') {
+            const char hi = url[pos+1];
+            if (hi < '2') return 1;                        /* %00 - %1F */
+            if (hi == '7' && url[pos+2] == 'F') return 1;  /* %7F (DEL) */
+        }
+        ++pos;
+    }
+    return 0;
+}
+
+
+/* Replace each "%20" with '+' in b, in place, starting at offset i, and
+ * shorten b accordingly.  Bytes before offset i are not touched. */
+static void burl_normalize_qs20_to_plus_fix (buffer *b, int i)
+{
+    char * const str = b->ptr;
+    const int len = (int)buffer_string_length(b);
+    int src = i; /* read offset */
+    int dst = i; /* write offset; never ahead of src */
+    while (src < len) {
+        if (str[src] == '%' && str[src+1] == '2' && str[src+2] == '0') {
+            str[dst] = '+';
+            src += 3;
+        }
+        else {
+            str[dst] = str[src];
+            src += 1;
+        }
+        dst += 1;
+    }
+    buffer_string_set_length(b, dst);
+}
+
+
+/* Convert "%20" to '+' in the part of b following offset qs.
+ * Does nothing if qs < 0.  b is modified only if a "%20" is found, in which
+ * case the rewrite from that offset on is done by
+ * burl_normalize_qs20_to_plus_fix(). */
+static void burl_normalize_qs20_to_plus (buffer *b, int qs)
+{
+    if (qs < 0) return;
+
+    const char * const str = b->ptr;
+    const int len = (int)buffer_string_length(b);
+    int pos = qs + 1;
+    while (pos < len
+           && !(str[pos] == '%' && str[pos+1] == '2' && str[pos+2] == '0')) {
+        ++pos;
+    }
+    if (pos == len) return; /* scan reached end without finding "%20" */
+
+    burl_normalize_qs20_to_plus_fix(b, pos);
+}
+
+
+/* Decode each "%2F" to '/' in the path portion of b, in place, starting at
+ * offset i, then move the query string (if any) down to follow the
+ * shortened path and set the new length of b.
+ *
+ * b  : URL; modified in place
+ * qs : offset of '?' in b, or < 0 if b has no query string
+ * i  : offset of the first "%2F" in the path (bytes before i are untouched)
+ *
+ * Returns the offset of '?' in the rewritten URL, or qs unchanged if qs < 0.
+ * (Each decoded "%2F" shortens the path by two bytes, so the '?' moves and
+ *  the offset passed in is no longer valid afterwards.) */
+static int burl_normalize_2F_to_slash_fix (buffer *b, int qs, int i)
+{
+    char * const s = b->ptr;
+    const int blen = (int)buffer_string_length(b);
+    const int used = qs < 0 ? blen : qs; /* end of path portion */
+    int j = i; /* j: write offset;  i: read offset;  j <= i */
+    for (; i < used; ++i, ++j) {
+        s[j] = s[i];
+        if (s[i] == '%' && s[i+1] == '2' && s[i+2] == 'F') {
+            s[j] = '/';
+            i+=2;
+        }
+    }
+    if (qs >= 0) {
+        const int qslen = blen - qs; /* length of "?..." incl. the '?' */
+        memmove(s+j, s+qs, (size_t)qslen); /* ranges may overlap */
+        qs = j; /* '?' now immediately follows the shortened path */
+        j += qslen;
+    }
+    buffer_string_set_length(b, (size_t)j);
+    return qs;
+}
+
+
+/* Look for "%2F" in the path portion of b (the bytes before offset qs, or
+ * all of b if qs < 0).
+ * ("%2F" must already have been uppercased during normalization)
+ *
+ * If none is found, b is left untouched and qs is returned.
+ * If one is found and flags has HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE
+ * set, the result of burl_normalize_2F_to_slash_fix() is returned;
+ * otherwise -2 is returned
+ * (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT). */
+static int burl_normalize_2F_to_slash (buffer *b, int qs, int flags)
+{
+    const char * const path = b->ptr;
+    int pathlen;
+    if (qs < 0)
+        pathlen = (int)buffer_string_length(b);
+    else
+        pathlen = qs;
+
+    int pos = 0;
+    while (pos < pathlen
+           && !(path[pos] == '%' && path[pos+1] == '2' && path[pos+2] == 'F')) {
+        ++pos;
+    }
+    if (pos >= pathlen) return qs; /* no "%2F" in path */
+
+    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE)
+        return burl_normalize_2F_to_slash_fix(b, qs, pos);
+
+    return -2;
+}
+
+
+/* Detect "." and ".." path segments and empty path segments ("//") in the
+ * path portion of b (the bytes before offset qs, or all of b if qs < 0),
+ * and either reject the URL or simplify the path.
+ *
+ * b     : URL; modified if the path is simplified
+ * t     : scratch buffer; holds the query string while path is simplified
+ * qs    : offset of '?' in b, or < 0 if b has no query string
+ * flags : if HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT is set, a path
+ *         which would need simplifying is rejected instead
+ *
+ * Returns -2 if rejected; otherwise the offset of '?' in the (possibly
+ * rewritten) URL, or qs unchanged if qs < 0. */
+static int burl_normalize_path (buffer *b, buffer *t, int qs, int flags)
+{
+    const unsigned char * const s = (unsigned char *)b->ptr;
+    const int used = (int)buffer_string_length(b);
+    int path_simplify = 0;
+    for (int i = 0, len = qs < 0 ? used : qs; i < len; ++i) {
+        /* s[i] is the first byte of the path or the byte following a '/'.
+         * Is this segment "." or ".." ?  (if s[i+1] is also '.', the ++i
+         * steps onto the second dot before the terminator is checked) */
+        if (s[i] == '.' && (s[i+1] != '.' || ++i)
+            && (s[i+1] == '/' || s[i+1] == '?' || s[i+1] == '\0')) {
+            path_simplify = 1;
+            break;
+        }
+        /* advance to the next '/' (or end of path).  Do not skip s[i] before
+         * testing it: s[i] may itself be '/' (the leading '/' of the path),
+         * in which case a following '/' must be detected here and a
+         * following "." or ".." segment examined on the next iteration */
+        while (i < len && s[i] != '/') ++i;
+        if (s[i] == '/' && s[i+1] == '/') { /*(s[len] != '/')*/
+            path_simplify = 1;
+            break;
+        }
+    }
+
+    if (path_simplify) {
+        if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT) return -2;
+        if (qs >= 0) {
+            /* set query string aside so that only the path is simplified */
+            buffer_copy_string_len(t, b->ptr+qs, used - qs);
+            buffer_string_set_length(b, qs);
+        }
+
+        buffer_path_simplify(b, b);
+
+        if (qs >= 0) {
+            /* path length may have changed; '?' follows the new path */
+            qs = (int)buffer_string_length(b);
+            buffer_append_string_len(b, CONST_BUF_LEN(t));
+        }
+    }
+
+    return qs;
+}
+
+
+/* Normalize URL b according to the HTTP_PARSEOPT_URL_NORMALIZE_* bits in
+ * flags.  Steps are applied in this order:
+ *  1. (Windows/Cygwin builds, if ..._PATH_BACKSLASH_TRANS) '\\' -> '/' in
+ *     the part of b before the first '?'
+ *  2. percent-encoding normalization: burl_normalize_basic_required() if
+ *     ..._REQUIRED is set, else burl_normalize_basic_unreserved()
+ *  3. (if ..._CTRLS_REJECT) reject URL containing encoded control chars
+ *  4. (if ..._PATH_2F_DECODE or ..._PATH_2F_REJECT) handle "%2F" in path
+ *  5. (if ..._PATH_DOTSEG_REMOVE or ..._PATH_DOTSEG_REJECT) handle "." and
+ *     ".." path segments
+ *  6. (if ..._QUERY_20_PLUS) "%20" -> '+' in query string
+ *
+ * b     : URL to normalize; modified
+ * t     : scratch buffer; contents are overwritten
+ * flags : bitwise-OR of HTTP_PARSEOPT_URL_NORMALIZE_* options
+ *
+ * Returns -2 if the URL is rejected by any step; otherwise the offset of
+ * '?' (start of query string) in the normalized URL, or -1 if there is no
+ * query string. */
+int burl_normalize (buffer *b, buffer *t, int flags)
+{
+    int qs;
+
+  /* _WIN32 is the macro predefined by Windows compilers in general;
+   * __WIN32 is an additional spelling provided by some GCC-based toolchains */
+  #if defined(_WIN32) || defined(__WIN32) || defined(__CYGWIN__)
+    /* Windows and Cygwin treat '\\' as '/' if '\\' is present in path;
+     * convert to '/' for consistency before percent-encoding
+     * normalization which will convert '\\' to "%5C" in the URL.
+     * (Clients still should not be sending '\\' unencoded in requests.) */
+    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_BACKSLASH_TRANS) {
+        for (char *p = b->ptr; *p != '?' && *p != '\0'; ++p) {
+            if (*p == '\\') *p = '/';
+        }
+    }
+  #endif
+
+    qs = (flags & HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED)
+      ? burl_normalize_basic_required(b, t)
+      : burl_normalize_basic_unreserved(b, t);
+    if (-2 == qs) return -2;
+
+    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_CTRLS_REJECT) {
+        if (burl_contains_ctrls(b)) return -2;
+    }
+
+    if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE
+                |HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)) {
+        qs = burl_normalize_2F_to_slash(b, qs, flags);
+        if (-2 == qs) return -2;
+    }
+
+    if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE
+                |HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT)) {
+        qs = burl_normalize_path(b, t, qs, flags);
+        if (-2 == qs) return -2;
+    }
+
+    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_QUERY_20_PLUS) {
+        if (qs >= 0) burl_normalize_qs20_to_plus(b, qs);
+    }
+
+    return qs;
+}
author	Glenn Strauss <gstrauss@gluelogic.com>	2018-05-01 00:20:26 -0400
committer	Glenn Strauss <gstrauss@gluelogic.com>	2018-08-12 14:43:22 -0400
commit	3eb7902e10ba75b3f2eb159e244d0d8e5037ccd2 (patch)
tree	3915619c5c0c93733c3f00d670e559ef319c9df7 /src/burl.c
parent	6ccccaaa38bdf545dafbd2e31950e756fc6ac775 (diff)
download	lighttpd-git-3eb7902e10ba75b3f2eb159e244d0d8e5037ccd2.tar.gz