summaryrefslogtreecommitdiff
path: root/src/burl.c
diff options
context:
space:
mode:
author: Glenn Strauss <gstrauss@gluelogic.com> 2018-05-01 00:20:26 -0400
committer: Glenn Strauss <gstrauss@gluelogic.com> 2018-08-12 14:43:22 -0400
commit: 3eb7902e10ba75b3f2eb159e244d0d8e5037ccd2 (patch)
tree: 3915619c5c0c93733c3f00d670e559ef319c9df7 /src/burl.c
parent: 6ccccaaa38bdf545dafbd2e31950e756fc6ac775 (diff)
download: lighttpd-git-3eb7902e10ba75b3f2eb159e244d0d8e5037ccd2.tar.gz
[core] server.http-parseopts URL normalization opt (fixes #1720)
server.http-parseopts = ( ... ) URL normalization options

Note: *not applied* to CONNECT method

Note: In a future release, URL normalization will likely be enabled by default
(normalize URL, reject control chars, remove . and .. path segments).
To prepare for this change, lighttpd.conf configurations should explicitly
select the desired behavior by enabling or disabling:
  server.http-parseopts = ( "url-normalize" => "enable", ... )
  server.http-parseopts = ( "url-normalize" => "disable" )

x-ref:
  "lighttpd ... compares URIs to patterns in the (1) url.redirect and
   (2) url.rewrite configuration settings before performing URL decoding,
   which might allow remote attackers to bypass intended access restrictions,
   and obtain sensitive information or possibly modify data."
  https://www.cvedetails.com/cve/CVE-2008-4359/
  "Rewrite/redirect rules and URL encoding"
  https://redmine.lighttpd.net/issues/1720
Diffstat (limited to 'src/burl.c')
-rw-r--r-- src/burl.c | 357
1 files changed, 357 insertions, 0 deletions
diff --git a/src/burl.c b/src/burl.c
new file mode 100644
index 00000000..3eadb6ce
--- /dev/null
+++ b/src/burl.c
@@ -0,0 +1,357 @@
+#include "first.h"
+#include "burl.h"
+
+#include <string.h>
+
+#include "buffer.h"
+
+static const char hex_chars_uc[] = "0123456789ABCDEF"; /* uppercase hex digit for each nibble value 0-15 */
+
+/* Lookup table indexed by byte value (0x00 - 0xFF).
+ * Entry is 1 if the byte is not allowed to appear literally in a normalized
+ * URL (it gets percent-encoded, or for '%' and '#' handled specially by the
+ * normalization routines), and 0 if the byte may be copied through as-is.
+ * Marked 1: everything except: ! $ & ' ( ) * + , - . / 0-9 : ; = ? @ A-Z _ a-z ~
+ * (each row's trailing comment names the chars marked 1 in that row) */
+static const char encoded_chars_http_uri_reqd[] = {
+ /*
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00 - 0F control chars */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10 - 1F */
+ 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 20 - 2F space " # % */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 30 - 3F < > */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40 - 4F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 50 - 5F [ \ ] ^ */
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60 - 6F ` */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, /* 70 - 7F { | } DEL */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 80 - 8F */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 90 - 9F */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* A0 - AF */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* B0 - BF */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* C0 - CF */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* D0 - DF */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* E0 - EF */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* F0 - FF */
+};
+
+
+/* li_cton: convert hex digit character c to its numeric value, stored in n.
+ * Evaluates to 1 if c is a hex digit [0-9A-Fa-f] (n is then 0-15), else 0.
+ * c (char) and n (nibble) MUST be unsigned integer types: with unsigned n,
+ * a character below the tested range wraps to a large value and so fails
+ * the <= comparison.  ((c)&0xdf folds lowercase a-f onto A-F.)
+ * Note: n is assigned even when c is not a hex digit (its value is then
+ * meaningless), and both c and n are evaluated more than once, so the
+ * arguments must not have side effects. */
+#define li_cton(c,n) \
+ (((n) = (c) - '0') <= 9 || (((n) = ((c)&0xdf) - 'A') <= 5 ? ((n) += 10) : 0))
+
+/* b (byte) MUST be unsigned integer type
+ * https://en.wikipedia.org/wiki/UTF-8
+ * reject overlong encodings of 7-bit ASCII and invalid UTF-8
+ * (but does not detect other overlong multi-byte encodings)
+ * True for bytes 0xC0 and 0xC1 (tested as ((b)|0x1) == 0xC1) and for
+ * bytes 0xF5 - 0xFF.  b is evaluated twice; avoid side effects in it. */
+#define li_utf8_invalid_byte(b) ((b) >= 0xF5 || ((b)|0x1) == 0xC1)
+
+
+/* Return 1 if c is an "unreserved" URL character, i.e. alphanumeric
+ * (as decided by light_isalnum()) or one of '-' '.' '_' '~'; else return 0. */
+static int burl_is_unreserved (const int c)
+{
+    switch (c) {
+      case '-':
+      case '.':
+      case '_':
+      case '~':
+        return 1;
+      default:
+        return light_isalnum(c) ? 1 : 0;
+    }
+}
+
+
+/* Rewrite path of "unreserved" normalization.
+ * Rebuilds b into scratch buffer t starting at offset i (bytes before i are
+ * copied unchanged), then copies the result back into b:
+ * - bytes allowed literally are copied as-is
+ * - valid "%XX" is decoded if XX is an unreserved char, otherwise kept
+ *   with uppercase hex digits
+ * - '#' ends processing (fragment is dropped)
+ * - any other byte (including '%' not followed by two hex digits) is
+ *   percent-encoded
+ * qs is the query-string offset known so far (-1 if none yet).
+ * Returns the offset of the first '?' in the rewritten string, -1 if there
+ * is none, or -2 if an invalid UTF-8 byte (li_utf8_invalid_byte) was seen. */
+static int burl_normalize_basic_unreserved_fix (buffer *b, buffer *t, int i, int qs)
+{
+    const unsigned char * const src = (unsigned char *)b->ptr;
+    const int len = (int)buffer_string_length(b);
+    /* worst case: each remaining byte expands to "%XX" (3 bytes) */
+    unsigned char * const dst =
+      (unsigned char *)buffer_string_prepare_copy(t, i+(len-i)*3+1);
+    unsigned int hi, lo;
+    int out = i; /* write offset into dst; i is the read offset into src */
+
+    memcpy(dst, src, (size_t)i);
+
+    while (i < len) {
+        const unsigned int c = src[i];
+
+        if (c == '#') break; /* ignore fragment */
+
+        if (!encoded_chars_http_uri_reqd[c]) {
+            if (c == '?' && -1 == qs) qs = out; /* first '?' begins query */
+            dst[out++] = (unsigned char)c;
+            ++i;
+        }
+        else if (c == '%' && li_cton(src[i+1], hi) && li_cton(src[i+2], lo)) {
+            const unsigned int x = (hi << 4) | lo;
+            if (burl_is_unreserved(x)) {
+                dst[out++] = (unsigned char)x;
+            }
+            else {
+                dst[out++] = '%';
+                dst[out++] = hex_chars_uc[hi];
+                dst[out++] = hex_chars_uc[lo];
+                if (li_utf8_invalid_byte(x)) qs = -2;
+            }
+            i += 3;
+        }
+        else {
+            dst[out++] = '%';
+            dst[out++] = hex_chars_uc[(c >> 4) & 0xF];
+            dst[out++] = hex_chars_uc[c & 0xF];
+            if (li_utf8_invalid_byte(c)) qs = -2;
+            ++i;
+        }
+    }
+
+    buffer_commit(t, (size_t)out);
+    buffer_copy_buffer(b, t);
+    return qs;
+}
+
+
+/* "Unreserved" normalization, fast path.
+ * Scans b in place.  Literal-safe bytes are accepted; valid "%XX" sequences
+ * which do not encode an unreserved char are accepted after uppercasing
+ * their hex digits in place.  A '#' truncates b (fragment is dropped).
+ * The first byte needing any other change hands the remainder of the work
+ * to burl_normalize_basic_unreserved_fix(), using t as scratch space.
+ * Returns the offset of the first '?', -1 if there is none, or -2 if an
+ * invalid UTF-8 byte (li_utf8_invalid_byte) was seen. */
+static int burl_normalize_basic_unreserved (buffer *b, buffer *t)
+{
+    const unsigned char * const src = (unsigned char *)b->ptr;
+    const int len = (int)buffer_string_length(b);
+    int qs = -1;
+
+    for (int pos = 0; pos < len; ++pos) {
+        const unsigned int c = src[pos];
+        unsigned int hi, lo;
+
+        if (!encoded_chars_http_uri_reqd[c]) {
+            if (c == '?' && -1 == qs) qs = pos;
+            continue;
+        }
+
+        if (c == '%' && li_cton(src[pos+1], hi) && li_cton(src[pos+2], lo)) {
+            const unsigned int x = (hi << 4) | lo;
+            if (!burl_is_unreserved(x)) {
+                /* stays encoded; only hex case may need adjusting */
+                if (li_utf8_invalid_byte(x)) qs = -2;
+                if (src[pos+1] >= 'a') b->ptr[pos+1] &= 0xdf; /* uppercase hex */
+                if (src[pos+2] >= 'a') b->ptr[pos+2] &= 0xdf; /* uppercase hex */
+                pos += 2;
+                continue;
+            }
+        }
+
+        if (c == '#') { /* ignore fragment */
+            buffer_string_set_length(b, (size_t)pos);
+            return qs;
+        }
+
+        /* byte must be encoded or decoded: string length changes */
+        return burl_normalize_basic_unreserved_fix(b, t, pos, qs);
+    }
+
+    return qs;
+}
+
+
+/* Return nonzero if octet x (0x00 - 0xFF), found percent-encoded in the URL,
+ * must remain percent-encoded under "required" normalization:
+ * - octets which may not appear literally (encoded_chars_http_uri_reqd[]), or
+ * - delimiters whose decoding would change how the URL is interpreted:
+ *   '/' and '?' while still in the path (qs < 0);
+ *   '&' '=' ';' '+' once inside the query-string (qs >= 0).
+ *   ('+' conventionally stands for a space in form-encoded query strings,
+ *    so "%2B" must not be turned into a literal '+')
+ * Shared by the scan and rewrite routines below so that both agree. */
+static int burl_normalize_required_keep_encoded (const unsigned int x, const int qs)
+{
+    if (encoded_chars_http_uri_reqd[x]) return 1;
+    return (qs < 0)
+      ? (x == '/' || x == '?')
+      : (x == '&' || x == '=' || x == ';' || x == '+');
+}
+
+
+/* Rewrite path of "required" normalization.
+ * Rebuilds b into scratch buffer t starting at offset i (bytes before i are
+ * copied unchanged), then copies the result back into b:
+ * - bytes allowed literally are copied as-is
+ * - valid "%XX" is decoded unless burl_normalize_required_keep_encoded()
+ *   says it must stay encoded, in which case hex digits are uppercased
+ * - '#' ends processing (fragment is dropped)
+ * - any other byte (including '%' not followed by two hex digits) is
+ *   percent-encoded
+ * qs is the query-string offset known so far (-1 if none yet).
+ * Returns the offset of the first '?' in the rewritten string, -1 if there
+ * is none, or -2 if an invalid UTF-8 byte (li_utf8_invalid_byte) was seen. */
+static int burl_normalize_basic_required_fix (buffer *b, buffer *t, int i, int qs)
+{
+    int j = i;
+    const int used = (int)buffer_string_length(b);
+    const unsigned char * const s = (unsigned char *)b->ptr;
+    /* worst case: each remaining byte expands to "%XX" (3 bytes) */
+    unsigned char * const p =
+      (unsigned char *)buffer_string_prepare_copy(t,i+(used-i)*3+1);
+    unsigned int n1, n2;
+    memcpy(p, s, (size_t)i);
+    for (; i < used; ++i, ++j) {
+        if (!encoded_chars_http_uri_reqd[s[i]]) {
+            if (s[i] == '?' && -1 == qs) qs = j;
+            p[j] = s[i];
+        }
+        else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
+            const unsigned int x = (n1 << 4) | n2;
+            if (!burl_normalize_required_keep_encoded(x, qs)) {
+                p[j] = x;
+            }
+            else {
+                p[j] = '%';
+                p[++j] = hex_chars_uc[n1]; /*(s[i+1] & 0xdf)*/
+                p[++j] = hex_chars_uc[n2]; /*(s[i+2] & 0xdf)*/
+                if (li_utf8_invalid_byte(x)) qs = -2;
+            }
+            i+=2;
+        }
+        else if (s[i] == '#') break; /* ignore fragment */
+        else {
+            p[j] = '%';
+            p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
+            p[++j] = hex_chars_uc[s[i] & 0xF];
+            if (li_utf8_invalid_byte(s[i])) qs = -2;
+        }
+    }
+    buffer_commit(t, (size_t)j);
+    buffer_copy_buffer(b, t);
+    return qs;
+}
+
+
+/* "Required" normalization, fast path.
+ * Scans b in place.  Literal-safe bytes are accepted; valid "%XX" sequences
+ * which must stay encoded (burl_normalize_required_keep_encoded()) are
+ * accepted after uppercasing their hex digits in place.  A '#' truncates b
+ * (fragment is dropped).  The first byte needing any other change hands the
+ * remainder of the work to burl_normalize_basic_required_fix(), using t as
+ * scratch space.
+ * Returns the offset of the first '?', -1 if there is none, or -2 if an
+ * invalid UTF-8 byte (li_utf8_invalid_byte) was seen. */
+static int burl_normalize_basic_required (buffer *b, buffer *t)
+{
+    const unsigned char * const s = (unsigned char *)b->ptr;
+    const int used = (int)buffer_string_length(b);
+    unsigned int n1, n2, x;
+    int qs = -1;
+
+    for (int i = 0; i < used; ++i) {
+        if (!encoded_chars_http_uri_reqd[s[i]]) {
+            if (s[i] == '?' && -1 == qs) qs = i;
+        }
+        else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)
+                 && burl_normalize_required_keep_encoded((x = (n1 << 4) | n2),
+                                                         qs)) {
+            if (li_utf8_invalid_byte(x)) qs = -2;
+            if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */
+            if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */
+            i+=2;
+        }
+        else if (s[i] == '#') { /* ignore fragment */
+            buffer_string_set_length(b, (size_t)i);
+            break;
+        }
+        else {
+            /* byte must be encoded or decoded: string length changes */
+            qs = burl_normalize_basic_required_fix(b, t, i, qs);
+            break;
+        }
+    }
+
+    return qs;
+}
+
+
+/* Return 1 if b contains a percent-encoded control character, else 0.
+ * Matches '%' followed by a char below '2' (i.e. "%0x" and "%1x" when the
+ * '%' is followed by hex digits) and the sequence "%7F" (DEL).
+ * NOTE(review): relies on the caller having normalized b first so that hex
+ * digits are uppercase -- confirm at call sites. */
+static int burl_contains_ctrls (const buffer *b)
+{
+    const char * const str = b->ptr;
+    const int len = (int)buffer_string_length(b);
+
+    for (int pos = 0; pos < len; ++pos) {
+        if (str[pos] != '%') continue;
+        if (str[pos+1] < '2') return 1;
+        if (str[pos+1] == '7' && str[pos+2] == 'F') return 1;
+    }
+
+    return 0;
+}
+
+
+/* Replace every "%20" at or after offset i in b with '+', compacting the
+ * string in place, and set the new (shorter) string length on b.
+ * i is the offset at which the caller found the first "%20". */
+static void burl_normalize_qs20_to_plus_fix (buffer *b, int i)
+{
+    char * const str = b->ptr;
+    const int len = (int)buffer_string_length(b);
+    int rd = i; /* read offset */
+    int wr = i; /* write offset; never ahead of rd */
+
+    while (rd < len) {
+        if (str[rd] == '%' && str[rd+1] == '2' && str[rd+2] == '0') {
+            str[wr++] = '+';
+            rd += 3;
+        }
+        else {
+            str[wr++] = str[rd++];
+        }
+    }
+
+    buffer_string_set_length(b, wr);
+}
+
+
+/* Convert "%20" to '+' within the query-string portion of b.
+ * qs is the offset of the '?' in b, or negative if b has no query-string
+ * (in which case nothing is done).
+ * The query is scanned for the first "%20"; only if one is found is the
+ * in-place rewrite burl_normalize_qs20_to_plus_fix() invoked.  The rewrite
+ * is triggered strictly by a match inside the buffer, so a qs offset at or
+ * beyond the end of the string is harmless (no-op) rather than causing the
+ * string length to be extended past its terminator. */
+static void burl_normalize_qs20_to_plus (buffer *b, int qs)
+{
+    if (qs < 0) return; /* no query-string */
+
+    const char * const s = b->ptr;
+    const int used = (int)buffer_string_length(b);
+    for (int i = qs+1; i < used; ++i) {
+        if (s[i] == '%' && s[i+1] == '2' && s[i+2] == '0') {
+            burl_normalize_qs20_to_plus_fix(b, i);
+            return;
+        }
+    }
+}
+
+
+/* Decode every "%2F" in the path portion of b to '/', compacting in place.
+ * b  : URL; modified in place and shortened
+ * qs : offset of '?' in b, or negative if there is no query-string
+ * i  : offset of the first "%2F" in the path (found by the caller)
+ * The query-string (from '?' onward) is moved down unchanged to follow the
+ * shortened path.
+ * Returns the offset of '?' in the modified string (which moves left by two
+ * bytes for each "%2F" decoded), or the original negative qs if b has no
+ * query-string. */
+static int burl_normalize_2F_to_slash_fix (buffer *b, int qs, int i)
+{
+    char * const s = b->ptr;
+    const int blen = (int)buffer_string_length(b);
+    const int used = qs < 0 ? blen : qs; /* end of path */
+    int j = i;
+    for (; i < used; ++i, ++j) {
+        s[j] = s[i];
+        if (s[i] == '%' && s[i+1] == '2' && s[i+2] == 'F') {
+            s[j] = '/';
+            i+=2;
+        }
+    }
+    if (qs >= 0) {
+        const int qslen = blen - qs;
+        memmove(s+j, s+qs, (size_t)qslen);
+        qs = j; /* '?' now immediately follows the compacted path */
+        j += qslen;
+    }
+    buffer_string_set_length(b, j);
+    return qs;
+}
+
+
+/* Look for "%2F" in the path portion of b (up to qs, or the whole string
+ * when qs is negative).
+ * ("%2F" must already have been uppercased during normalization)
+ * If none is present, qs is returned unchanged.  Otherwise, when flags has
+ * HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE set, the result of
+ * burl_normalize_2F_to_slash_fix() is returned; if not, -2 is returned
+ * (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT). */
+static int burl_normalize_2F_to_slash (buffer *b, int qs, int flags)
+{
+    const char * const str = b->ptr;
+    const int end = (qs >= 0) ? qs : (int)buffer_string_length(b);
+    int pos = 0;
+
+    while (pos < end
+           && !(str[pos] == '%' && str[pos+1] == '2' && str[pos+2] == 'F'))
+        ++pos;
+
+    if (pos >= end) return qs; /* no "%2F" in path */
+
+    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE)
+        return burl_normalize_2F_to_slash_fix(b, qs, pos);
+
+    return -2; /*(flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)*/
+}
+
+
+/* Detect, and then remove or reject, "." and ".." path segments and empty
+ * segments ("//") in the path portion of b.
+ * b     : URL; path may be rewritten in place
+ * t     : scratch buffer; used to hold the query-string during rewrite
+ * qs    : offset of '?' in b, or negative if there is no query-string
+ * flags : if HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT is set, a path
+ *         needing simplification is rejected; otherwise it is simplified
+ *         with buffer_path_simplify()
+ * Returns -2 on reject, else the (possibly updated) offset of '?', or the
+ * original negative qs if there is no query-string. */
+static int burl_normalize_path (buffer *b, buffer *t, int qs, int flags)
+{
+    const unsigned char * const s = (unsigned char *)b->ptr;
+    const int used = (int)buffer_string_length(b);
+    int path_simplify = 0;
+    for (int i = 0, len = qs < 0 ? used : qs; i < len; ++i) {
+        /* i is at the beginning of a segment (or on the leading '/');
+         * match segment "." or ".." terminated by '/', '?', or end */
+        if (s[i] == '.' && (s[i+1] != '.' || ++i)
+            && (s[i+1] == '/' || s[i+1] == '?' || s[i+1] == '\0')) {
+            path_simplify = 1;
+            break;
+        }
+        /* advance to next '/' (or end of path) without first stepping off
+         * the current position, so that a leading '/' is itself examined
+         * and the first segment after it is checked on the next iteration
+         * (unconditionally skipping ahead here would miss "/../", "/./",
+         *  and a leading "//") */
+        while (i < len && s[i] != '/') ++i;
+        if (s[i] == '/' && s[i+1] == '/') { /*(s[len] != '/')*/
+            path_simplify = 1;
+            break;
+        }
+    }
+
+    if (path_simplify) {
+        if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT) return -2;
+        if (qs >= 0) {
+            /* set query-string aside; simplify only the path */
+            buffer_copy_string_len(t, b->ptr+qs, used - qs);
+            buffer_string_set_length(b, qs);
+        }
+
+        buffer_path_simplify(b, b);
+
+        if (qs >= 0) {
+            /* re-attach query-string at the end of the simplified path */
+            qs = (int)buffer_string_length(b);
+            buffer_append_string_len(b, CONST_BUF_LEN(t));
+        }
+    }
+
+    return qs;
+}
+
+
+/* Normalize URL in b according to the HTTP_PARSEOPT_URL_NORMALIZE_* bits set
+ * in flags; t is a scratch buffer used by the individual steps.
+ * Steps, in order: (Windows/Cygwin builds only) optional '\\' to '/'
+ * translation in the path; percent-encoding normalization ("required" or
+ * "unreserved" variant); optional reject of encoded control chars; optional
+ * "%2F" decode or reject; optional dot-segment removal or reject; optional
+ * "%20" to '+' conversion in the query-string.
+ * Returns -2 if any step rejects the URL; otherwise the offset of the
+ * query-string '?' as reported by the steps above, or -1 if none. */
+int burl_normalize (buffer *b, buffer *t, int flags)
+{
+    int qs;
+
+  #if defined(__WIN32) || defined(__CYGWIN__)
+    /* Windows and Cygwin treat '\\' as '/' if '\\' is present in path;
+     * convert to '/' for consistency before percent-encoding
+     * normalization which will convert '\\' to "%5C" in the URL.
+     * (Clients still should not be sending '\\' unencoded in requests.) */
+    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_BACKSLASH_TRANS) {
+        char *p = b->ptr;
+        while (*p != '?' && *p != '\0') {
+            if (*p == '\\') *p = '/';
+            ++p;
+        }
+    }
+  #endif
+
+    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED)
+        qs = burl_normalize_basic_required(b, t);
+    else
+        qs = burl_normalize_basic_unreserved(b, t);
+    if (-2 == qs) return -2;
+
+    if ((flags & HTTP_PARSEOPT_URL_NORMALIZE_CTRLS_REJECT)
+        && burl_contains_ctrls(b))
+        return -2;
+
+    if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE
+                |HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)) {
+        qs = burl_normalize_2F_to_slash(b, qs, flags);
+        if (-2 == qs) return -2;
+    }
+
+    if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE
+                |HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT)) {
+        qs = burl_normalize_path(b, t, qs, flags);
+        if (-2 == qs) return -2;
+    }
+
+    if ((flags & HTTP_PARSEOPT_URL_NORMALIZE_QUERY_20_PLUS) && qs >= 0)
+        burl_normalize_qs20_to_plus(b, qs);
+
+    return qs;
+}