diff options
Diffstat (limited to 'pypers/europython05/Quixote-2.0/http_request.py')
-rwxr-xr-x | pypers/europython05/Quixote-2.0/http_request.py | 759 |
1 files changed, 759 insertions, 0 deletions
"""quixote.http_request
$HeadURL: svn+ssh://svn.mems-exchange.org/repos/trunk/quixote/http_request.py $
$Id: http_request.py 26337 2005-03-11 17:06:05Z dbinger $

Provides the HTTPRequest class and related code for parsing HTTP
requests, such as the Upload class.
"""

import re
import string
import tempfile
import urllib
import rfc822
from cStringIO import StringIO

from quixote.http_response import HTTPResponse
from quixote.errors import RequestError


# Various regexes for parsing specific bits of HTTP, all from RFC 2616.

# These are needed by 'get_encoding()', to parse the "Accept-Encoding"
# header.  LWS is linear whitespace; the latter two assume that LWS
# has been removed.
_http_lws_re = re.compile(r"(\r\n)?[ \t]+")
_http_list_re = re.compile(r",+")
_http_encoding_re = re.compile(r"([^;]+)(;q=([\d.]+))?$")

# These are needed by 'guess_browser_version()', for parsing the
# "User-Agent" header.
#   token = 1*<any CHAR except CTLs or separators>
#   CHAR = any 7-bit US ASCII character (0-127)
#   separators are ( ) < > @ , ; : \ " / [ ] ? = { }
#
# The user_agent RE is a simplification; it only looks for one "product",
# possibly followed by a comment.
_http_token_pat = r"[\w!#$%&'*+.^`|~-]+"
_http_product_pat = r'(%s)(?:/(%s))?' % (_http_token_pat, _http_token_pat)
_http_product_re = re.compile(_http_product_pat)
_comment_delim_re = re.compile(r';\s*')


def get_content_type(environ):
    """(environ: dict) -> string | None

    Return the CONTENT_TYPE value from 'environ' with any parameters
    (everything from the first ';' on) removed, or None if the variable
    is missing or empty.
    """
    ctype = environ.get("CONTENT_TYPE")
    if ctype:
        return ctype.split(";")[0]
    else:
        return None

def _decode_string(s, charset):
    """(s: string, charset: string) -> string | unicode

    Decode 's' using 'charset'.  Strings declared as 'iso-8859-1' are
    returned unchanged (not decoded).  Raises RequestError if the
    charset is unknown or 's' is not valid in that charset.
    """
    if charset == 'iso-8859-1':
        return s
    try:
        return s.decode(charset)
    except LookupError:
        raise RequestError('unknown charset %r' % charset)
    except UnicodeDecodeError:
        raise RequestError('invalid %r encoded string' % charset)

def parse_header(line):
    """Parse a Content-type like header.

    Return the main content-type and a dictionary of options.

    The main value and the option names are lower-cased; option values
    keep their case and lose one pair of surrounding double quotes.
    Parameters without an '=' are ignored.
    """
    plist = [part.strip() for part in line.split(';')]
    key = plist.pop(0).lower()
    pdict = {}
    for p in plist:
        i = p.find('=')
        if i >= 0:
            name = p[:i].strip().lower()
            value = p[i+1:].strip()
            if len(value) >= 2 and value[0] == value[-1] == '"':
                value = value[1:-1]
            pdict[name] = value
    return key, pdict

def parse_content_disposition(full_cdisp):
    """(full_cdisp: string) -> (name: string, filename: string | None)

    Parse a Content-Disposition header value.  Raises RequestError
    unless the disposition is 'form-data' with a non-empty "name"
    parameter.
    """
    (cdisp, cdisp_params) = parse_header(full_cdisp)
    name = cdisp_params.get('name')
    if not (cdisp == 'form-data' and name):
        raise RequestError('expected Content-Disposition: form-data '
                           'with a "name" parameter: got %r' % full_cdisp)
    return (name, cdisp_params.get('filename'))

def parse_query(qs, charset):
    """(qs: string) -> {key:string, string|[string]}

    Parse a query given as a string argument and return a dictionary.
    Names and values are URL-unquoted ('+' becomes a space) and then
    decoded with 'charset'.  A name that occurs more than once maps to
    the list of its values, in order of appearance.
    """
    fields = {}
    for chunk in filter(None, qs.split('&')):
        if '=' not in chunk:
            name = chunk
            value = ''
        else:
            name, value = chunk.split('=', 1)
        name = urllib.unquote(name.replace('+', ' '))
        value = urllib.unquote(value.replace('+', ' '))
        name = _decode_string(name, charset)
        value = _decode_string(value, charset)
        _add_field_value(fields, name, value)
    return fields

def _add_field_value(fields, name, value):
    """Store 'value' under 'name' in the 'fields' dictionary (in place).

    The first value for a name is stored as-is; on a repeated name the
    entry is turned into a list and the new value is appended.
    """
    if name in fields:
        values = fields[name]
        if not isinstance(values, list):
            fields[name] = values = [values]
        values.append(value)
    else:
        fields[name] = value


class HTTPRequest:
    """
    Model a single HTTP request and all associated data: environment
    variables, form variables, cookies, etc.

    To access environment variables associated with the request, use
    get_environ(): eg. request.get_environ('SERVER_PORT', 80).

    To access form variables, use get_field(), eg.
    request.get_field("name").

    To access cookies, use get_cookie().

    Various bits and pieces of the requested URL can be accessed with
    get_url(), get_path(), get_server()

    The HTTPResponse object corresponding to this request is available
    in the 'response' attribute.  This is rarely needed: eg. to send an
    error response, you should raise one of the exceptions in errors.py;
    to send a redirect, you should use the request's redirect() method,
    which lets you specify relative URLs.  However, if you need to tweak
    the response object in other ways, you can do so via 'response'.
    Just keep in mind that Quixote discards the original response object
    when handling an exception.
    """

    DEFAULT_CHARSET = 'iso-8859-1'

    def __init__(self, stdin, environ):
        """(stdin: file-like, environ: dict)

        'stdin' is the stream the request body is read from and
        'environ' is the CGI-style environment of the request.  Note
        that 'environ' may be modified (PATH_INFO fix-up for IIS).
        """
        self.stdin = stdin
        self.environ = environ
        self.form = {}
        self.session = None
        self.response = HTTPResponse()

        # The strange treatment of SERVER_PORT_SECURE is because IIS
        # sets this environment variable to "0" for non-SSL requests
        # (most web servers -- well, Apache at least -- simply don't set
        # it in that case).
        if (environ.get('HTTPS', 'off').lower() == 'on' or
            environ.get('SERVER_PORT_SECURE', '0') != '0'):
            self.scheme = "https"
        else:
            self.scheme = "http"

        k = self.environ.get('HTTP_COOKIE', '')
        if k:
            self.cookies = parse_cookies(k)
        else:
            self.cookies = {}

        # IIS breaks PATH_INFO because it leaves in the path to
        # the script, so SCRIPT_NAME is "/cgi-bin/q.py" and PATH_INFO
        # is "/cgi-bin/q.py/foo/bar".  The following code fixes
        # PATH_INFO to the expected value "/foo/bar".
        web_server = environ.get('SERVER_SOFTWARE', 'unknown')
        if web_server.find('Microsoft-IIS') != -1:
            script = environ['SCRIPT_NAME']
            path = environ['PATH_INFO']
            if path.startswith(script):
                path = path[len(script):]
                self.environ['PATH_INFO'] = path

    def process_inputs(self):
        """Fill 'self.form' from the query string and the request body.

        The body is only parsed for the content types
        'application/x-www-form-urlencoded' and 'multipart/form-data'.
        Raises RequestError if the Content-Length header is not a
        non-negative integer or if the body is malformed.
        """
        query = self.get_query()
        if query:
            self.form.update(parse_query(query, self.DEFAULT_CHARSET))
        length = self.environ.get('CONTENT_LENGTH') or "0"
        try:
            length = int(length)
        except ValueError:
            raise RequestError('invalid content-length header')
        if length < 0:
            # A negative length would make stdin.read() read until EOF.
            raise RequestError('invalid content-length header')
        ctype = self.environ.get("CONTENT_TYPE")
        if ctype:
            ctype, ctype_params = parse_header(ctype)
            if ctype == 'application/x-www-form-urlencoded':
                self._process_urlencoded(length, ctype_params)
            elif ctype == 'multipart/form-data':
                self._process_multipart(length, ctype_params)

    def _process_urlencoded(self, length, params):
        """Read 'length' bytes of URL-encoded form data into 'self.form'.

        'params' are the Content-Type parameters; its 'charset' entry,
        if any, overrides DEFAULT_CHARSET.  Raises RequestError if the
        body is shorter than 'length'.
        """
        query = self.stdin.read(length)
        if len(query) != length:
            raise RequestError('unexpected end of request body')
        charset = params.get('charset', self.DEFAULT_CHARSET)
        self.form.update(parse_query(query, charset))

    def _process_multipart(self, length, params):
        """Read a multipart/form-data body of 'length' bytes into
        'self.form'.

        'params' are the Content-Type parameters and must contain a
        'boundary'.  Raises RequestError if the boundary is missing or
        the body ends prematurely.
        """
        boundary = params.get('boundary')
        if not boundary:
            raise RequestError('multipart/form-data missing boundary')
        charset = params.get('charset')
        mimeinput = MIMEInput(self.stdin, boundary, length)
        try:
            for line in mimeinput.readpart():
                pass # discard lines up to first boundary
            while mimeinput.moreparts():
                self._process_multipart_body(mimeinput, charset)
        except EOFError:
            raise RequestError('unexpected end of multipart/form-data')

    def _process_multipart_body(self, mimeinput, charset):
        """Read one part from 'mimeinput' and add it to 'self.form'.

        Parts with a "filename" parameter are stored as Upload objects;
        all other parts are decoded to strings using the part's own
        charset, else 'charset', else DEFAULT_CHARSET.  Raises
        RequestError if the part lacks a usable Content-Disposition
        header.
        """
        headers = StringIO()
        lines = mimeinput.readpart()
        # The header section ends at the first empty line; the rest of
        # the 'lines' generator is the body of the part.
        for line in lines:
            headers.write(line)
            if line == '\r\n':
                break
        headers.seek(0)
        headers = rfc822.Message(headers)
        ctype, ctype_params = parse_header(headers.get('content-type', ''))
        if ctype and 'charset' in ctype_params:
            charset = ctype_params['charset']
        cdisp, cdisp_params = parse_header(headers.get('content-disposition',
                                                       ''))
        if not cdisp:
            raise RequestError('expected Content-Disposition header')
        name = cdisp_params.get('name')
        filename = cdisp_params.get('filename')
        if not (cdisp == 'form-data' and name):
            raise RequestError('expected Content-Disposition: form-data '
                               'with a "name" parameter: got %r' %
                               headers.get('content-disposition', ''))
        # FIXME: should really handle Content-Transfer-Encoding and other
        # MIME complexity here.  See RFC2048 for the full horror story.
        if filename:
            # it might be large file upload so use a temporary file
            upload = Upload(filename, ctype, charset)
            upload.receive(lines)
            _add_field_value(self.form, name, upload)
        else:
            value = _decode_string(''.join(lines),
                                   charset or self.DEFAULT_CHARSET)
            _add_field_value(self.form, name, value)

    def get_header(self, name, default=None):
        """get_header(name : string, default : string = None) -> string

        Return the named HTTP header, or an optional default argument
        (or None) if the header is not found.  Note that both original
        and CGI-ified header names are recognized, e.g. 'Content-Type',
        'CONTENT_TYPE' and 'HTTP_CONTENT_TYPE' should all return the
        Content-Type header, if available.
        """
        environ = self.environ
        name = name.replace("-", "_").upper()
        val = environ.get(name)
        if val is not None:
            return val
        if name[:5] != 'HTTP_':
            name = 'HTTP_' + name
        elif name[5:] in ('CONTENT_TYPE', 'CONTENT_LENGTH'):
            # CGI passes these two headers without the HTTP_ prefix.
            name = name[5:]
        return environ.get(name, default)

    def get_cookie(self, cookie_name, default=None):
        """Return the value of the named cookie, or 'default'."""
        return self.cookies.get(cookie_name, default)

    def get_cookies(self):
        """Return the dictionary of all cookies sent with the request."""
        return self.cookies

    def get_field(self, name, default=None):
        """Return the value of the named form field, or 'default'."""
        return self.form.get(name, default)

    def get_fields(self):
        """Return the dictionary of all form fields."""
        return self.form

    def get_method(self):
        """Returns the HTTP method for this request
        """
        return self.environ.get('REQUEST_METHOD', 'GET')

    def formiter(self):
        """Return an iterator over the (name, value) pairs of the form."""
        return self.form.iteritems()

    def get_scheme(self):
        """Return the URL scheme of the request: "http" or "https"."""
        return self.scheme

    # The following environment variables are useful for reconstructing
    # the original URL, all of which are specified by CGI 1.1:
    #
    #   SERVER_NAME            "www.example.com"
    #   SCRIPT_NAME            "/q"
    #   PATH_INFO              "/debug/dump_sessions"
    #   QUERY_STRING           "session_id=10.27.8.40...."

    def get_server(self):
        """get_server() -> string

        Return the server name with an optional port number, eg.
        "www.example.com" or "foo.bar.com:8000".
        """
        http_host = self.environ.get("HTTP_HOST")
        if http_host:
            return http_host
        server_name = self.environ["SERVER_NAME"].strip()
        server_port = self.environ.get("SERVER_PORT")
        if (not server_port or
            (self.get_scheme() == "http" and server_port == "80") or
            (self.get_scheme() == "https" and server_port == "443")):
            return server_name
        else:
            return server_name + ":" + server_port

    def get_path(self, n=0):
        """get_path(n : int = 0) -> string

        Return the path of the current request, chopping off 'n' path
        components from the right.  Eg. if the path is "/bar/baz/qux",
        n=0 would return "/bar/baz/qux" and n=2 would return "/bar".
        Note that the query string, if any, is not included.

        A path with a trailing slash should just be considered as having
        an empty last component.  Eg. if the path is "/bar/baz/", then:
          get_path(0) == "/bar/baz/"
          get_path(1) == "/bar/baz"
          get_path(2) == "/bar"

        If 'n' is negative, then components from the left of the path
        are returned.  Continuing the above example,
          get_path(-1) = "/bar"
          get_path(-2) = "/bar/baz"
          get_path(-3) = "/bar/baz/"

        Raises ValueError if absolute value of n is larger than the number of
        path components."""

        path_info = self.environ.get('PATH_INFO', '')
        path = self.environ['SCRIPT_NAME'] + path_info
        if n == 0:
            return path
        path_comps = path.split('/')
        if abs(n) > len(path_comps)-1:
            raise ValueError("n=%d too big for path '%s'" % (n, path))
        if n > 0:
            return '/'.join(path_comps[:-n])
        else:
            # path_comps[0] is the empty string before the leading
            # slash, hence the extra component.
            return '/'.join(path_comps[:-n+1])

    def get_query(self):
        """() -> string

        Return the query component of the URL.
        """
        return self.environ.get('QUERY_STRING', '')

    def get_url(self, n=0):
        """get_url(n : int = 0) -> string

        Return the URL of the current request, chopping off 'n' path
        components from the right.  Eg. if the URL is
        "http://foo.com/bar/baz/qux", n=2 would return
        "http://foo.com/bar".  Does not include the query string (if
        any).
        """
        return "%s://%s%s" % (self.get_scheme(), self.get_server(),
                              urllib.quote(self.get_path(n)))

    def get_environ(self, key, default=None):
        """get_environ(key : string) -> string

        Fetch a CGI environment variable from the request environment.
        See http://hoohoo.ncsa.uiuc.edu/cgi/env.html
        for the variables specified by the CGI standard.
        """
        return self.environ.get(key, default)

    def get_encoding(self, encodings):
        """get_encoding(encodings : [string]) -> string

        Parse the "Accept-encoding" header. 'encodings' is a list of
        encodings supported by the server sorted in order of preference.
        The return value is one of 'encodings' or None if the client
        does not accept any of the encodings.
        """
        accept_encoding = self.get_header("accept-encoding") or ""
        found_encodings = self._parse_pref_header(accept_encoding)
        if found_encodings:
            for encoding in encodings:
                if encoding in found_encodings:
                    return encoding
        return None

    def get_accepted_types(self):
        """get_accepted_types() : {string:float}
        Return a dictionary mapping MIME types the client will accept
        to the corresponding quality value (1.0 if no value was specified).
        """
        accept_types = self.environ.get('HTTP_ACCEPT', "")
        return self._parse_pref_header(accept_types)


    def _parse_pref_header(self, S):
        """_parse_pref_header(S:string) : {string:float}
        Parse a list of HTTP preferences (content types, encodings) and
        return a dictionary mapping strings to the quality value.
        Entries named "*", entries with an unparsable quality value and
        entries with a quality of zero are left out.
        """

        found = {}
        # remove all linear whitespace
        S = _http_lws_re.sub("", S)
        for coding in _http_list_re.split(S):
            m = _http_encoding_re.match(coding)
            if m:
                encoding = m.group(1).lower()
                q = m.group(3) or 1.0
                try:
                    q = float(q)
                except ValueError:
                    continue
                if encoding == "*":
                    continue # stupid, ignore it
                if q > 0:
                    found[encoding] = q
        return found

    def dump(self):
        """Return a multi-line string listing the form fields, cookies
        and environment variables of the request, each sorted by name.
        """
        result=[]
        row='%-15s %s'

        result.append("Form:")
        L = self.form.items()
        L.sort()
        for k,v in L:
            result.append(row % (k,v))

        result.append("")
        result.append("Cookies:")
        L = self.cookies.items()
        L.sort()
        for k,v in L:
            result.append(row % (k,v))


        result.append("")
        result.append("Environment:")
        L = self.environ.items()
        L.sort()
        for k,v in L:
            result.append(row % (k,v))
        return "\n".join(result)

    def guess_browser_version(self):
        """guess_browser_version() -> (name : string, version : string)

        Examine the User-agent request header to try to figure out what
        the current browser is.  Returns either (name, version) where
        each element is a string, (None, None) if we couldn't parse the
        User-agent header at all, or (name, None) if we got the name but
        couldn't figure out the version.

        Handles Microsoft's little joke of pretending to be Mozilla,
        eg. if the "User-Agent" header is
          Mozilla/5.0 (compatible; MSIE 5.5)
        returns ("MSIE", "5.5").  Konqueror does the same thing, and
        it's handled the same way.
        """
        ua = self.get_header('user-agent')
        if ua is None:
            return (None, None)

        # The syntax for "User-Agent" in RFC 2616 is fairly simple:
        #
        #   User-Agent      = "User-Agent" ":" 1*( product | comment )
        #   product         = token ["/" product-version ]
        #   product-version = token
        #   comment         = "(" *( ctext | comment ) ")"
        #   ctext           = <any TEXT excluding "(" and ")">
        #   token           = 1*<any CHAR except CTLs or tspecials>
        #   tspecials       = "(" | ")" | "<" | ">" | "@" | "," | ";" | ":" |
        #                     "\" | <"> | "/" | "[" | "]" | "?" | "=" | "{" |
        #                     "}" | SP | HT
        #
        # This function handles the most-commonly-used subset of this syntax,
        # namely
        #   User-Agent = "User-Agent" ":" product 1*SP [comment]
        # ie. one product string followed by an optional comment;
        # anything after that first comment is ignored.  This should be
        # enough to distinguish Mozilla/Netscape, MSIE, Opera, and
        # Konqueror.

        m = _http_product_re.match(ua)
        if not m:
            import sys
            sys.stderr.write("couldn't parse User-Agent header: %r\n" % ua)
            return (None, None)

        name, version = m.groups()
        ua = ua[m.end():].lstrip()

        if ua.startswith('('):
            # we need to handle nested comments since MSIE uses them
            depth = 1
            chars = []
            for c in ua[1:]:
                if c == '(':
                    depth += 1
                elif c == ')':
                    depth -= 1
                    if depth == 0:
                        break
                elif depth == 1:
                    # nested comments are discarded
                    chars.append(c)
            comment = ''.join(chars)
        else:
            comment = ''
        if comment:
            comment_chunks = _comment_delim_re.split(comment)
        else:
            comment_chunks = []

        if ("compatible" in comment_chunks and
            len(comment_chunks) > 1 and comment_chunks[1]):
            # A-ha!  Someone is kidding around, pretending to be what
            # they are not.  Most likely MSIE masquerading as Mozilla,
            # but lots of other clients (eg. Konqueror) do the same.
            real_ua = comment_chunks[1]
            if "/" in real_ua:
                (name, version) = real_ua.split("/", 1)
            else:
                if real_ua.startswith("MSIE") and ' ' in real_ua:
                    (name, version) = real_ua.split(" ", 1)
                else:
                    name = real_ua
                    version = None
            return (name, version)

        # Either nobody is pulling our leg, or we didn't find anything
        # that looks vaguely like a user agent in the comment.  So use
        # what we found outside the comment, ie. what the spec says we
        # should use (sigh).
        return (name, version)

    # guess_browser_version ()


# See RFC 2109 for details.  Note that this parser is more liberal.
_COOKIE_RE = re.compile(r"""
                \s*
                (?P<name>[^=;,\s]+)
                \s*
                (
                    =
                    \s*
                    (
                        (?P<qvalue> "(\\[\x00-\x7f] | [^"])*")
                        |
                        (?P<value> [^";,\s]*)
                    )
                )?
                \s*
                [;,]?
                """, re.VERBOSE)

def parse_cookies(text):
    """(text: string) -> {name:string, value:string}

    Parse the value of a Cookie header.  Names starting with '$'
    (per-cookie attributes such as $Path) are dropped.  Quoted values
    lose their quotes and backslash escapes; a cookie without a value
    maps to the empty string.  If a name occurs more than once the last
    value wins.
    """
    result = {}
    for m in _COOKIE_RE.finditer(text):
        name = m.group('name')
        if name[0] == '$':
            # discard, we don't handle per cookie attributes (e.g. $Path)
            continue
        qvalue = m.group('qvalue')
        if qvalue:
            # undo the backslash escapes, then strip the enclosing quotes
            value = re.sub(r'\\(.)', r'\1', qvalue)[1:-1]
        else:
            value = m.group('value') or ''
        result[name] = value
    return result

# 'string.letters' only exists on Python 2; fall back to the ASCII
# letters on interpreters that do not provide it.
SAFE_CHARS = (getattr(string, 'letters', string.ascii_letters) +
              string.digits + "-@&+=_., ")
_safe_trans = None

def make_safe_filename(s):
    """(s: string) -> string

    Return 's' with every character that is not in SAFE_CHARS replaced
    by an underscore.  The translation table is built on first use and
    cached in the module-level '_safe_trans'.
    """
    global _safe_trans
    if _safe_trans is None:
        _safe_trans = ["_"] * 256
        for c in SAFE_CHARS:
            _safe_trans[ord(c)] = c
        _safe_trans = "".join(_safe_trans)

    return s.translate(_safe_trans)


class Upload:
    r"""
    Represents a single uploaded file.  Uploaded files live in the
    filesystem, *not* in memory.

      fp
        an open file containing the content of the upload.  The file pointer
        points to the beginning of the file
      orig_filename
        the complete filename supplied by the user-agent in the
        request that uploaded this file.  Depending on the browser,
        this might have the complete path of the original file
        on the client system, in the client system's syntax -- eg.
        "C:\foo\bar\upload_this" or "/foo/bar/upload_this" or
        "foo:bar:upload_this".
      base_filename
        the base component of orig_filename, shorn of MS-DOS,
        Mac OS, and Unix path components and with "unsafe"
        characters neutralized (see make_safe_filename())
      content_type
        the content type provided by the user-agent in the request
        that uploaded this file.
      charset
        the charset provided by the user-agent
    """

    def __init__(self, orig_filename, content_type=None, charset=None):
        """(orig_filename: string, content_type: string = None,
            charset: string = None)

        Record the metadata of the upload.  'fp' stays None until
        receive() is called.
        """
        if orig_filename:
            self.orig_filename = orig_filename
            bspos = orig_filename.rfind("\\")
            cpos = orig_filename.rfind(":")
            spos = orig_filename.rfind("/")
            if bspos != -1:                 # eg. "\foo\bar" or "D:\ding\dong"
                filename = orig_filename[bspos+1:]
            elif cpos != -1:                # eg. "C:foo" or ":ding:dong:foo"
                filename = orig_filename[cpos+1:]
            elif spos != -1:                # eg. "foo/bar/baz" or "/tmp/blah"
                filename = orig_filename[spos+1:]
            else:
                filename = orig_filename

            self.base_filename = make_safe_filename(filename)
        else:
            self.orig_filename = None
            self.base_filename = None
        self.content_type = content_type
        self.charset = charset
        self.fp = None

    def receive(self, lines):
        """Write the chunks of the iterable 'lines' to a new temporary
        file and leave 'fp' positioned at its beginning."""
        self.fp = tempfile.TemporaryFile("w+b")
        for line in lines:
            self.fp.write(line)
        self.fp.seek(0)

    def __str__(self):
        return str(self.orig_filename)

    def __repr__(self):
        return "<%s at %x: %s>" % (self.__class__.__name__, id(self), self)

    def read(self, n):
        """Read and return up to 'n' bytes of the uploaded content."""
        return self.fp.read(n)

    def readline(self):
        """Read and return the next line of the uploaded content (the
        empty string at end of file)."""
        return self.fp.readline()

    def __iter__(self):
        return iter(self.fp)

    def close(self):
        """Close the underlying temporary file."""
        self.fp.close()


class LineInput:
    r"""
    A wrapper for an input stream that has the following properties:

        * lines are terminated by \r\n

        * lines shorter than 'maxlength' are always returned unbroken

        * lines longer than 'maxlength' are broken but the pair of
          characters \r\n are never split

        * no more than 'length' characters are read from the underlying
          stream

        * if the underlying stream does not produce at least 'length'
          characters then EOFError is raised

    """
    def __init__(self, fp, length):
        self.fp = fp
        self.length = length    # characters still to be read from 'fp'
        self.buf = ''           # characters read but not yet returned

    def readline(self, maxlength=4096):
        """(maxlength: int = 4096) -> string

        Return the next line or, for over-long lines, the next fragment.
        Returns the empty string once 'length' characters have been
        consumed.  Raises EOFError if the underlying stream ends early.
        """
        # fill buffer
        n = min(self.length, maxlength - len(self.buf))
        if n > 0:
            self.length -= n
            assert self.length >= 0
            chunk = self.fp.read(n)
            if len(chunk) != n:
                raise EOFError('unexpected end of input')
            self.buf += chunk
        # split into lines
        buf = self.buf
        i = buf.find('\r\n')
        if i >= 0:
            i += 2
            self.buf = buf[i:]
            return buf[:i]
        elif buf.endswith('\r'):
            # avoid splitting CR LF pairs
            self.buf = '\r'
            return buf[:-1]
        else:
            self.buf = ''
            return buf

class MIMEInput:
    """
    Split a MIME input stream into parts.  Note that this class does not
    handle headers, transfer encoding, etc.
    """

    def __init__(self, fp, boundary, length):
        self.lineinput = LineInput(fp, length)
        # matches a boundary line; group 1 is set for the final boundary
        self.pat = re.compile(r'--%s(--)?[ \t]*\r\n' % re.escape(boundary))
        self.done = False

    def moreparts(self):
        """Return true if there are more parts to be read."""
        return not self.done

    def readpart(self):
        """Generate all the lines up to a MIME boundary.  Note that you
        must exhaust the generator before calling this function again."""
        assert not self.done
        last_line = ''
        while 1:
            line = self.lineinput.readline()
            if not line:
                # Hit EOF -- nothing more to read.  This should *not* happen
                # in a well-formed MIME message.
                raise EOFError('MIME boundary not found (end of input)')
            # A boundary is only recognized at the start of a line, ie.
            # at the very beginning or right after a complete line.
            if last_line.endswith('\r\n') or last_line == '':
                m = self.pat.match(line)
                if m:
                    # If we hit the boundary line, return now. Forget
                    # the current line *and* the CRLF ending of the
                    # previous line.
                    if m.group(1):
                        # hit final boundary
                        self.done = True
                    yield last_line[:-2]
                    return
            if last_line:
                yield last_line
            last_line = line