diff options
Diffstat (limited to 'src/waitress/parser.py')
-rw-r--r-- | src/waitress/parser.py | 414 |
1 files changed, 414 insertions, 0 deletions
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""HTTP Request Parser

This server uses asyncore to accept connections and do initial
processing but threads to do work.
"""
import re
from io import BytesIO

from waitress.buffers import OverflowableBuffer
from waitress.compat import tostr, unquote_bytes_to_wsgi, urlparse
from waitress.receiver import ChunkedReceiver, FixedStreamReceiver
from waitress.utilities import (
    BadRequest,
    RequestEntityTooLarge,
    RequestHeaderFieldsTooLarge,
    ServerNotImplemented,
    find_double_newline,
)
from .rfc7230 import HEADER_FIELD


class ParsingError(Exception):
    pass


class TransferEncodingNotImplemented(Exception):
    pass


class HTTPRequestParser(object):
    """A structure that collects the HTTP request.

    Once the stream is completed, the instance is passed to
    a server task constructor.
    """

    completed = False  # Set once request is completed.
    empty = False  # Set if no request was made.
    expect_continue = False  # client sent "Expect: 100-continue" header
    headers_finished = False  # True when headers have been read
    header_plus = b""
    chunked = False
    content_length = 0
    header_bytes_received = 0
    body_bytes_received = 0
    body_rcv = None
    version = "1.0"
    error = None
    connection_close = False

    # Other attributes: first_line, header, headers, command, uri, version,
    # path, query, fragment

    def __init__(self, adj):
        """
        adj is an Adjustments object.
        """
        # headers is a mapping containing keys translated to uppercase
        # with dashes turned into underscores.
        self.headers = {}
        self.adj = adj

    def received(self, data):
        """
        Receives the HTTP stream for one request. Returns the number of
        bytes consumed. Sets the completed flag once both the header and the
        body have been received.
        """
        if self.completed:
            return 0  # Can't consume any more.

        datalen = len(data)
        br = self.body_rcv
        if br is None:
            # In header.
            max_header = self.adj.max_request_header_size

            s = self.header_plus + data
            index = find_double_newline(s)
            consumed = 0

            if index >= 0:
                # If the headers have ended, and we also have part of the body
                # message in data we still want to validate we aren't going
                # over our limit for received headers.
                self.header_bytes_received += index
                consumed = datalen - (len(s) - index)
            else:
                self.header_bytes_received += datalen
                consumed = datalen

            # If the first line + headers is over the max length, we return a
            # RequestHeaderFieldsTooLarge error rather than continuing to
            # attempt to parse the headers.
            if self.header_bytes_received >= max_header:
                self.parse_header(b"GET / HTTP/1.0\r\n")
                self.error = RequestHeaderFieldsTooLarge(
                    "exceeds max_header of %s" % max_header
                )
                self.completed = True
                return consumed

            if index >= 0:
                # Header finished.
                header_plus = s[:index]

                # Remove preceding blank lines. This is suggested by
                # https://tools.ietf.org/html/rfc7230#section-3.5 to support
                # clients sending an extra CR LF after another request when
                # using HTTP pipelining
                header_plus = header_plus.lstrip()

                if not header_plus:
                    self.empty = True
                    self.completed = True
                else:
                    try:
                        self.parse_header(header_plus)
                    except ParsingError as e:
                        self.error = BadRequest(e.args[0])
                        self.completed = True
                    except TransferEncodingNotImplemented as e:
                        self.error = ServerNotImplemented(e.args[0])
                        self.completed = True
                    else:
                        if self.body_rcv is None:
                            # no content-length header and not a t-e: chunked
                            # request
                            self.completed = True

                        if self.content_length > 0:
                            max_body = self.adj.max_request_body_size
                            # we won't accept this request if the content-length
                            # is too large

                            if self.content_length >= max_body:
                                self.error = RequestEntityTooLarge(
                                    "exceeds max_body of %s" % max_body
                                )
                                self.completed = True
                self.headers_finished = True

                return consumed

            # Header not finished yet.
            self.header_plus = s

            return datalen
        else:
            # In body.
            consumed = br.received(data)
            self.body_bytes_received += consumed
            max_body = self.adj.max_request_body_size

            if self.body_bytes_received >= max_body:
                # this will only be raised during t-e: chunked requests
                self.error = RequestEntityTooLarge("exceeds max_body of %s" % max_body)
                self.completed = True
            elif br.error:
                # garbage in chunked encoding input probably
                self.error = br.error
                self.completed = True
            elif br.completed:
                # The request (with the body) is ready to use.
                self.completed = True

                if self.chunked:
                    # We've converted the chunked transfer encoding request
                    # body into a normal request body, so we know its content
                    # length; set the header here. We already popped the
                    # TRANSFER_ENCODING header in parse_header, so this will
                    # appear to the client to be an entirely non-chunked HTTP
                    # request with a valid content-length.
                    self.headers["CONTENT_LENGTH"] = str(len(br))

            return consumed

    def parse_header(self, header_plus):
        """
        Parses the header_plus block of text (the headers plus the
        first line of the request).
        """
        index = header_plus.find(b"\r\n")
        if index >= 0:
            first_line = header_plus[:index].rstrip()
            header = header_plus[index + 2 :]
        else:
            raise ParsingError("HTTP message header invalid")

        if b"\r" in first_line or b"\n" in first_line:
            raise ParsingError("Bare CR or LF found in HTTP message")

        self.first_line = first_line  # for testing

        lines = get_header_lines(header)

        headers = self.headers
        for line in lines:
            header = HEADER_FIELD.match(line)

            if not header:
                raise ParsingError("Invalid header")

            key, value = header.group("name", "value")

            if b"_" in key:
                # TODO(xistence): Should we drop this request instead?
                continue

            # Only strip off whitespace that is considered valid whitespace by
            # RFC7230, don't strip the rest
            value = value.strip(b" \t")
            key1 = tostr(key.upper().replace(b"-", b"_"))
            # If a header already exists, we append subsequent values
            # separated by a comma. Applications already need to handle
            # the comma separated values, as HTTP front ends might do
            # the concatenation for you (behavior specified in RFC2616).
            try:
                headers[key1] += tostr(b", " + value)
            except KeyError:
                headers[key1] = tostr(value)

        # command, uri, version will be bytes
        command, uri, version = crack_first_line(first_line)
        version = tostr(version)
        command = tostr(command)
        self.command = command
        self.version = version
        (
            self.proxy_scheme,
            self.proxy_netloc,
            self.path,
            self.query,
            self.fragment,
        ) = split_uri(uri)
        self.url_scheme = self.adj.url_scheme
        connection = headers.get("CONNECTION", "")

        if version == "1.0":
            if connection.lower() != "keep-alive":
                self.connection_close = True

        if version == "1.1":
            # since the server buffers data from chunked transfers and clients
            # never need to deal with chunked requests, downstream clients
            # should not see the HTTP_TRANSFER_ENCODING header; we pop it
            # here
            te = headers.pop("TRANSFER_ENCODING", "")

            # NB: We can not just call bare strip() here because it will also
            # remove other non-printable characters that we explicitly do not
            # want removed so that if someone attempts to smuggle a request
            # with these characters we don't fall prey to it.
            #
            # For example \x85 is stripped by default, but it is not considered
            # valid whitespace to be stripped by RFC7230.
            encodings = [
                encoding.strip(" \t").lower() for encoding in te.split(",") if encoding
            ]

            for encoding in encodings:
                # Out of the transfer-codings listed in
                # https://tools.ietf.org/html/rfc7230#section-4 we only support
                # chunked at this time.

                # Note: the identity transfer-coding was removed in RFC7230:
                # https://tools.ietf.org/html/rfc7230#appendix-A.2 and is thus
                # not supported
                if encoding not in {"chunked"}:
                    raise TransferEncodingNotImplemented(
                        "Transfer-Encoding requested is not supported."
                    )

            if encodings and encodings[-1] == "chunked":
                self.chunked = True
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = ChunkedReceiver(buf)
            elif encodings:  # pragma: nocover
                raise TransferEncodingNotImplemented(
                    "Transfer-Encoding requested is not supported."
                )

            expect = headers.get("EXPECT", "").lower()
            self.expect_continue = expect == "100-continue"
            if connection.lower() == "close":
                self.connection_close = True

        if not self.chunked:
            try:
                cl = int(headers.get("CONTENT_LENGTH", 0))
            except ValueError:
                raise ParsingError("Content-Length is invalid")

            self.content_length = cl
            if cl > 0:
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = FixedStreamReceiver(cl, buf)

    def get_body_stream(self):
        body_rcv = self.body_rcv
        if body_rcv is not None:
            return body_rcv.getfile()
        else:
            return BytesIO()

    def close(self):
        body_rcv = self.body_rcv
        if body_rcv is not None:
            body_rcv.getbuf().close()


def split_uri(uri):
    # urlsplit handles byte input by returning bytes on py3, so
    # scheme, netloc, path, query, and fragment are bytes

    scheme = netloc = path = query = fragment = b""

    # urlsplit below will treat this as a scheme-less netloc, thereby losing
    # the original intent of the request. Here we shamelessly stole 4 lines of
    # code from the CPython stdlib to parse out the fragment and query but
    # leave the path alone. See
    # https://github.com/python/cpython/blob/8c9e9b0cd5b24dfbf1424d1f253d02de80e8f5ef/Lib/urllib/parse.py#L465-L468
    # and https://github.com/Pylons/waitress/issues/260

    if uri[:2] == b"//":
        path = uri

        if b"#" in path:
            path, fragment = path.split(b"#", 1)

        if b"?" in path:
            path, query = path.split(b"?", 1)
    else:
        try:
            scheme, netloc, path, query, fragment = urlparse.urlsplit(uri)
        except UnicodeError:
            raise ParsingError("Bad URI")

    return (
        tostr(scheme),
        tostr(netloc),
        unquote_bytes_to_wsgi(path),
        tostr(query),
        tostr(fragment),
    )


def get_header_lines(header):
    """
    Splits the header into lines, putting multi-line headers together.
    """
    r = []
    lines = header.split(b"\r\n")
    for line in lines:
        if not line:
            continue

        if b"\r" in line or b"\n" in line:
            raise ParsingError('Bare CR or LF found in header line "%s"' % tostr(line))

        if line.startswith((b" ", b"\t")):
            if not r:
                # https://corte.si/posts/code/pathod/pythonservers/index.html
                raise ParsingError('Malformed header line "%s"' % tostr(line))
            r[-1] += line
        else:
            r.append(line)
    return r


first_line_re = re.compile(
    b"([^ ]+) "
    b"((?:[^ :?#]+://[^ ?#/]*(?:[0-9]{1,5})?)?[^ ]+)"
    b"(( HTTP/([0-9.]+))$|$)"
)


def crack_first_line(line):
    m = first_line_re.match(line)
    if m is not None and m.end() == len(line):
        if m.group(3):
            version = m.group(5)
        else:
            version = b""
        method = m.group(1)

        # the request methods that are currently defined are all uppercase:
        # https://www.iana.org/assignments/http-methods/http-methods.xhtml and
        # the request method is case sensitive according to
        # https://tools.ietf.org/html/rfc7231#section-4.1

        # By disallowing anything but uppercase methods we save poor
        # unsuspecting souls from sending lowercase HTTP methods to waitress
        # and having the request complete, while servers like nginx drop the
        # request onto the floor.
        if method != method.upper():
            raise ParsingError('Malformed HTTP method "%s"' % tostr(method))
        uri = m.group(2)
        return method, uri, version
    else:
        return b"", b"", b""