summaryrefslogtreecommitdiff
path: root/src/waitress/parser.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/waitress/parser.py')
-rw-r--r--src/waitress/parser.py414
1 files changed, 414 insertions, 0 deletions
diff --git a/src/waitress/parser.py b/src/waitress/parser.py
new file mode 100644
index 0000000..765fe59
--- /dev/null
+++ b/src/waitress/parser.py
@@ -0,0 +1,414 @@
+##############################################################################
+#
+# Copyright (c) 2001, 2002 Zope Foundation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""HTTP Request Parser
+
+This server uses asyncore to accept connections and do initial
+processing but threads to do work.
+"""
+import re
+from io import BytesIO
+
+from waitress.buffers import OverflowableBuffer
+from waitress.compat import tostr, unquote_bytes_to_wsgi, urlparse
+from waitress.receiver import ChunkedReceiver, FixedStreamReceiver
+from waitress.utilities import (
+ BadRequest,
+ RequestEntityTooLarge,
+ RequestHeaderFieldsTooLarge,
+ ServerNotImplemented,
+ find_double_newline,
+)
+from .rfc7230 import HEADER_FIELD
+
+
class ParsingError(Exception):
    """Raised by the parsing code in this module for malformed request data.

    ``HTTPRequestParser.received`` catches it and records a ``BadRequest``
    error built from the first exception argument.
    """
+
+
class TransferEncodingNotImplemented(Exception):
    """Raised when a request names a Transfer-Encoding other than chunked.

    ``HTTPRequestParser.received`` catches it and records a
    ``ServerNotImplemented`` error built from the first exception argument.
    """
+
+
class HTTPRequestParser(object):
    """A structure that collects the HTTP request.

    Once the stream is completed, the instance is passed to
    a server task constructor.

    Feed raw bytes to ``received`` until ``completed`` is set; afterwards
    either ``error`` holds an error object or the parsed request is
    available through the attributes below.
    """

    completed = False  # Set once request is completed.
    empty = False  # Set if no request was made.
    expect_continue = False  # client sent "Expect: 100-continue" header
    headers_finished = False  # True when headers have been read
    header_plus = b""  # request line + headers buffered so far
    chunked = False  # True for "Transfer-Encoding: chunked" requests
    content_length = 0
    header_bytes_received = 0
    body_bytes_received = 0
    body_rcv = None  # body receiver; None while still in the headers
    version = "1.0"
    error = None
    connection_close = False

    # Other attributes: first_line, header, headers, command, uri, version,
    # path, query, fragment

    def __init__(self, adj):
        """
        adj is an Adjustments object.
        """
        # headers is a mapping containing keys translated to uppercase
        # with dashes turned into underscores.
        self.headers = {}
        self.adj = adj

    @staticmethod
    def _parse_content_length(value):
        """
        Returns ``value`` converted to an int, or None unless it consists
        solely of ASCII digits (the ``1*DIGIT`` grammar of RFC 7230).

        A bare ``int()`` would also accept a sign, surrounding whitespace,
        underscores and non-ASCII digits, which other HTTP implementations
        may interpret differently.
        """
        # strip() returning an empty string means every character was one
        # of the ten ASCII digits.
        if not value or value.strip("0123456789"):
            return None
        return int(value)

    def received(self, data):
        """
        Receives the HTTP stream for one request.  Returns the number of
        bytes consumed.  Sets the completed flag once both the header and the
        body have been received.

        Parsing problems are not raised to the caller: they are stored in
        ``self.error`` and the request is marked completed.
        """
        if self.completed:
            return 0  # Can't consume any more.

        datalen = len(data)
        br = self.body_rcv
        if br is None:
            # In header.
            max_header = self.adj.max_request_header_size

            s = self.header_plus + data
            index = find_double_newline(s)
            consumed = 0

            if index >= 0:
                # If the headers have ended, and we also have part of the body
                # message in data we still want to validate we aren't going
                # over our limit for received headers.
                #
                # index is an offset into s, which already contains every
                # header byte buffered by earlier calls, so it is the total
                # header size; adding it to the running count would count
                # the buffered bytes twice.
                self.header_bytes_received = index
                consumed = datalen - (len(s) - index)
            else:
                self.header_bytes_received = len(s)
                consumed = datalen

            # If the first line + headers is over the max length, we return a
            # RequestHeaderFieldsTooLarge error rather than continuing to
            # attempt to parse the headers.
            if self.header_bytes_received >= max_header:
                # Parse a placeholder request line so the request attributes
                # exist even though the real headers are discarded.
                self.parse_header(b"GET / HTTP/1.0\r\n")
                self.error = RequestHeaderFieldsTooLarge(
                    "exceeds max_header of %s" % max_header
                )
                self.completed = True
                return consumed

            if index >= 0:
                # Header finished.
                header_plus = s[:index]

                # Remove preceding blank lines. This is suggested by
                # https://tools.ietf.org/html/rfc7230#section-3.5 to support
                # clients sending an extra CR LF after another request when
                # using HTTP pipelining
                header_plus = header_plus.lstrip()

                if not header_plus:
                    self.empty = True
                    self.completed = True
                else:
                    try:
                        self.parse_header(header_plus)
                    except ParsingError as e:
                        self.error = BadRequest(e.args[0])
                        self.completed = True
                    except TransferEncodingNotImplemented as e:
                        self.error = ServerNotImplemented(e.args[0])
                        self.completed = True
                    else:
                        if self.body_rcv is None:
                            # no content-length header and not a t-e: chunked
                            # request
                            self.completed = True

                        if self.content_length > 0:
                            max_body = self.adj.max_request_body_size
                            # we won't accept this request if the content-length
                            # is too large

                            if self.content_length >= max_body:
                                self.error = RequestEntityTooLarge(
                                    "exceeds max_body of %s" % max_body
                                )
                                self.completed = True
                    self.headers_finished = True

                return consumed

            # Header not finished yet.
            self.header_plus = s

            return datalen
        else:
            # In body.
            consumed = br.received(data)
            self.body_bytes_received += consumed
            max_body = self.adj.max_request_body_size

            if self.body_bytes_received >= max_body:
                # this will only be raised during t-e: chunked requests
                self.error = RequestEntityTooLarge("exceeds max_body of %s" % max_body)
                self.completed = True
            elif br.error:
                # garbage in chunked encoding input probably
                self.error = br.error
                self.completed = True
            elif br.completed:
                # The request (with the body) is ready to use.
                self.completed = True

                if self.chunked:
                    # We've converted the chunked transfer encoding request
                    # body into a normal request body, so we know its content
                    # length; set the header here.  We already popped the
                    # TRANSFER_ENCODING header in parse_header, so this will
                    # appear to the client to be an entirely non-chunked HTTP
                    # request with a valid content-length.
                    self.headers["CONTENT_LENGTH"] = str(br.__len__())

            return consumed

    def parse_header(self, header_plus):
        """
        Parses the header_plus block of text (the headers plus the
        first line of the request).

        Fills in ``first_line``, ``headers``, ``command``, ``version``, the
        URI components and the connection/body related flags, and creates
        ``body_rcv`` when the request announces a body.

        Raises ParsingError for malformed input and
        TransferEncodingNotImplemented for a Transfer-Encoding other than
        chunked.
        """
        index = header_plus.find(b"\r\n")
        if index >= 0:
            first_line = header_plus[:index].rstrip()
            header = header_plus[index + 2 :]
        else:
            raise ParsingError("HTTP message header invalid")

        if b"\r" in first_line or b"\n" in first_line:
            raise ParsingError("Bare CR or LF found in HTTP message")

        self.first_line = first_line  # for testing

        lines = get_header_lines(header)

        headers = self.headers
        for line in lines:
            field = HEADER_FIELD.match(line)

            if not field:
                raise ParsingError("Invalid header")

            key, value = field.group("name", "value")

            if b"_" in key:
                # TODO(xistence): Should we drop this request instead?
                continue

            # Only strip off whitespace that is considered valid whitespace by
            # RFC7230, don't strip the rest
            value = value.strip(b" \t")
            key1 = tostr(key.upper().replace(b"-", b"_"))
            # If a header already exists, we append subsequent values
            # separated by a comma. Applications already need to handle
            # the comma separated values, as HTTP front ends might do
            # the concatenation for you (behavior specified in RFC2616).
            try:
                headers[key1] += tostr(b", " + value)
            except KeyError:
                headers[key1] = tostr(value)

        # command, uri, version will be bytes
        command, uri, version = crack_first_line(first_line)
        version = tostr(version)
        command = tostr(command)
        self.command = command
        self.version = version
        (
            self.proxy_scheme,
            self.proxy_netloc,
            self.path,
            self.query,
            self.fragment,
        ) = split_uri(uri)
        self.url_scheme = self.adj.url_scheme
        connection = headers.get("CONNECTION", "")

        if version == "1.0":
            if connection.lower() != "keep-alive":
                self.connection_close = True

        if version == "1.1":
            # since the server buffers data from chunked transfers and clients
            # never need to deal with chunked requests, downstream clients
            # should not see the HTTP_TRANSFER_ENCODING header; we pop it
            # here
            te = headers.pop("TRANSFER_ENCODING", "")

            # NB: We can not just call bare strip() here because it will also
            # remove other non-printable characters that we explicitly do not
            # want removed so that if someone attempts to smuggle a request
            # with these characters we don't fall prey to it.
            #
            # For example \x85 is stripped by default, but it is not considered
            # valid whitespace to be stripped by RFC7230.
            encodings = [
                encoding.strip(" \t").lower() for encoding in te.split(",") if encoding
            ]

            for encoding in encodings:
                # Out of the transfer-codings listed in
                # https://tools.ietf.org/html/rfc7230#section-4 we only support
                # chunked at this time.

                # Note: the identity transfer-coding was removed in RFC7230:
                # https://tools.ietf.org/html/rfc7230#appendix-A.2 and is thus
                # not supported
                if encoding not in {"chunked"}:
                    raise TransferEncodingNotImplemented(
                        "Transfer-Encoding requested is not supported."
                    )

            if encodings and encodings[-1] == "chunked":
                self.chunked = True
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = ChunkedReceiver(buf)
            elif encodings:  # pragma: nocover
                raise TransferEncodingNotImplemented(
                    "Transfer-Encoding requested is not supported."
                )

            expect = headers.get("EXPECT", "").lower()
            self.expect_continue = expect == "100-continue"
            if connection.lower() == "close":
                self.connection_close = True

        if not self.chunked:
            # Reject anything that is not purely ASCII digits (signs,
            # whitespace, underscores, repeated values joined by ", ").
            cl = self._parse_content_length(headers.get("CONTENT_LENGTH", "0"))
            if cl is None:
                raise ParsingError("Content-Length is invalid")

            self.content_length = cl
            if cl > 0:
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = FixedStreamReceiver(cl, buf)

    def get_body_stream(self):
        """
        Returns a file-like object for the request body: the receiver's
        file when a body receiver exists, otherwise an empty BytesIO.
        """
        body_rcv = self.body_rcv
        if body_rcv is not None:
            return body_rcv.getfile()
        else:
            return BytesIO()

    def close(self):
        """
        Closes the buffer behind the body receiver, if there is one.
        """
        body_rcv = self.body_rcv
        if body_rcv is not None:
            body_rcv.getbuf().close()
+
+
def split_uri(uri):
    """
    Splits the request target ``uri`` (bytes) into its components.

    Returns a ``(scheme, netloc, path, query, fragment)`` tuple in which the
    path has been passed through ``unquote_bytes_to_wsgi`` and the other
    members through ``tostr``.

    Raises ParsingError("Bad URI") when ``urlsplit`` rejects the target.
    """
    # urlsplit handles byte input by returning bytes on py3, so
    # scheme, netloc, path, query, and fragment are bytes

    scheme = netloc = path = query = fragment = b""

    # A target starting with "//" would be treated by urlsplit as a
    # scheme-less netloc, thereby losing the original intent of the request.
    # Here we shamelessly stole 4 lines of code from the CPython stdlib to
    # parse out the fragment and query but leave the path alone. See
    # https://github.com/python/cpython/blob/8c9e9b0cd5b24dfbf1424d1f253d02de80e8f5ef/Lib/urllib/parse.py#L465-L468
    # and https://github.com/Pylons/waitress/issues/260

    if uri[:2] == b"//":
        path = uri

        if b"#" in path:
            path, fragment = path.split(b"#", 1)

        if b"?" in path:
            path, query = path.split(b"?", 1)
    else:
        try:
            scheme, netloc, path, query, fragment = urlparse.urlsplit(uri)
        except ValueError:
            # UnicodeError (undecodable bytes) is a ValueError subclass, so
            # it is still covered; urlsplit additionally raises a plain
            # ValueError for things like unbalanced brackets in the netloc,
            # which must also be reported as a bad request rather than
            # escaping as an unhandled exception.
            raise ParsingError("Bad URI")

    return (
        tostr(scheme),
        tostr(netloc),
        unquote_bytes_to_wsgi(path),
        tostr(query),
        tostr(fragment),
    )
+
+
def get_header_lines(header):
    """
    Splits the header block on CRLF and returns the list of header lines,
    with continuation lines (those starting with a space or tab) appended
    to the line they continue. Empty lines are skipped.

    Raises ParsingError when a line contains a bare CR or LF, or when the
    very first line is a continuation line.
    """
    merged = []
    for raw in header.split(b"\r\n"):
        if not raw:
            continue

        if any(ch in raw for ch in (b"\r", b"\n")):
            raise ParsingError('Bare CR or LF found in header line "%s"' % tostr(raw))

        # raw is non-empty here, so its first byte decides whether it
        # continues the previous header.
        is_continuation = raw[:1] in (b" ", b"\t")
        if not is_continuation:
            merged.append(raw)
        elif merged:
            merged[-1] = merged[-1] + raw
        else:
            # https://corte.si/posts/code/pathod/pythonservers/index.html
            raise ParsingError('Malformed header line "%s"' % tostr(raw))
    return merged
+
+
# Request line pattern, one group per literal below:
#   1: the method (a run of non-space bytes followed by one space)
#   2: the request target, optionally starting with "<scheme>://<authority>"
#   3: either " HTTP/<digits and dots>" (group 5 is the version) or nothing
first_line_re = re.compile(
    b"([^ ]+) "
    b"((?:[^ :?#]+://[^ ?#/]*(?:[0-9]{1,5})?)?[^ ]+)"
    b"(( HTTP/([0-9.]+))$|$)"
)


def crack_first_line(line):
    """
    Splits the request line into ``(method, uri, version)``, all bytes.

    ``version`` is empty when the line carries no " HTTP/x.y" suffix. When
    the line does not match the request line pattern in full, three empty
    byte strings are returned.

    Raises ParsingError when the method is not entirely uppercase.
    """
    matched = first_line_re.match(line)
    # "$" also matches just before a trailing newline, so additionally
    # require the match to reach the real end of the line.
    if matched is None or matched.end() != len(line):
        return b"", b"", b""

    method, uri, version_part, _, version_digits = matched.groups()
    version = version_digits if version_part else b""

    # the request methods that are currently defined are all uppercase:
    # https://www.iana.org/assignments/http-methods/http-methods.xhtml and
    # the request method is case sensitive according to
    # https://tools.ietf.org/html/rfc7231#section-4.1

    # By disallowing anything but uppercase methods we save poor
    # unsuspecting souls from sending lowercase HTTP methods to waitress
    # and having the request complete, while servers like nginx drop the
    # request onto the floor.
    if method.upper() != method:
        raise ParsingError('Malformed HTTP method "%s"' % tostr(method))

    return method, uri, version