diff options
author | Marc Abramowitz <marc@marc-abramowitz.com> | 2015-04-30 17:39:24 -0700 |
---|---|---|
committer | Marc Abramowitz <marc@marc-abramowitz.com> | 2015-04-30 17:39:24 -0700 |
commit | fa100c92c06d3a8a61a0dda1a2e06018437b09c6 (patch) | |
tree | a1cc50f93fbf257685c3849e03496c5e33949281 /paste/util/mimeparse.py | |
download | paste-git-test_wsgirequest_charset_use_UTF-8_instead_of_iso-8859-1.tar.gz |
test_wsgirequest_charset: Use UTF-8 instead of iso-8859-1 [test_wsgirequest_charset_use_UTF-8_instead_of_iso-8859-1]
because it seems that the de facto standard for encoding URIs is to use UTF-8.
I've been reading about url encoding and it seems like perhaps using an
encoding other than UTF-8 is very non-standard and not well-supported (this
test is trying to use `iso-8859-1`).
From http://en.wikipedia.org/wiki/Percent-encoding
> For a non-ASCII character, it is typically converted to its byte sequence in
> UTF-8, and then each byte value is represented as above.
> The generic URI syntax mandates that new URI schemes that provide for the
> representation of character data in a URI must, in effect, represent
> characters from the unreserved set without translation, and should convert
> all other characters to bytes according to UTF-8, and then percent-encode
> those values. This requirement was introduced in January 2005 with the
> publication of RFC 3986.
From http://tools.ietf.org/html/rfc3986:
> Non-ASCII characters must first be encoded according to UTF-8 [STD63], and
> then each octet of the corresponding UTF-8 sequence must be percent-encoded
> to be represented as URI characters. URI producing applications must not use
> percent-encoding in host unless it is used to represent a UTF-8 character
> sequence.
From http://tools.ietf.org/html/rfc3987:
> Conversions from URIs to IRIs MUST NOT use any character encoding other than
> UTF-8 in steps 3 and 4, even if it might be possible to guess from the
> context that another character encoding than UTF-8 was used in the URI. For
> example, the URI "http://www.example.org/r%E9sum%E9.html" might with some
> guessing be interpreted to contain two e-acute characters encoded as
> iso-8859-1. It must not be converted to an IRI containing these e-acute
> characters. Otherwise, in the future the IRI will be mapped to
> "http://www.example.org/r%C3%A9sum%C3%A9.html", which is a different URI from
> "http://www.example.org/r%E9sum%E9.html".
See issue #7, which I think this at least partially fixes.
Diffstat (limited to 'paste/util/mimeparse.py')
-rw-r--r-- | paste/util/mimeparse.py | 160 |
1 files changed, 160 insertions, 0 deletions
# diff --git a/paste/util/mimeparse.py b/paste/util/mimeparse.py
# new file mode 100644
# index 0000000..b796c8b
# --- /dev/null
# +++ b/paste/util/mimeparse.py
# @@ -0,0 +1,160 @@
"""MIME-Type Parser

This module provides basic functions for handling mime-types. It can handle
matching mime-types against a list of media-ranges. See section 14.1 of
the HTTP specification [RFC 2616] for a complete explanation.

   http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.1

Based on mimeparse 0.1.2 by Joe Gregorio:

    http://code.google.com/p/mimeparse/

Contents:
    - parse_mime_type():   Parses a mime-type into its component parts.
    - parse_media_range(): Media-ranges are mime-types with wild-cards and
                           a 'q' quality parameter.
    - fitness_and_quality_parsed(): Scores a mime-type against pre-parsed
                           media-ranges, returning (fitness, quality).
    - quality():           Determines the quality ('q') of a mime-type when
                           compared against a list of media-ranges.
    - quality_parsed():    Just like quality() except the second parameter
                           must be pre-parsed.
    - best_match():        Choose the best-matching mime-type from a list of
                           candidates.
    - desired_matches():   Filter against a list of desired mime-types in
                           the order the server prefers.

"""


def parse_mime_type(mime_type):
    """Carves up a mime-type and returns a tuple of the
    (type, subtype, params) where 'params' is a dictionary
    of all the parameters for the media range.
    For example, the media range 'application/xhtml;q=0.5' would
    get parsed into:

       ('application', 'xhtml', {'q': '0.5'})

    A missing or blank type or subtype is replaced by '*'. Parameters
    without an '=' sign, or with an empty key or value, are dropped.
    """
    parts = mime_type.split(';')
    full_type, raw_params = parts[0], parts[1:]

    # partition() yields an empty subtype when there is no '/', which the
    # "or '*'" fallback below turns into a wildcard.
    main_type, _, subtype = full_type.partition('/')
    main_type = main_type.strip() or '*'
    subtype = subtype.strip() or '*'

    params = {}
    for raw_param in raw_params:
        key, _, value = raw_param.partition('=')
        key, value = key.strip(), value.strip()
        if key and value:
            params[key] = value
    return main_type, subtype, params


def parse_media_range(range):
    """Carves up a media range and returns a tuple of the
    (type, subtype, params) where 'params' is a dictionary
    of all the parameters for the media range.
    For example, the media range 'application/*;q=0.5' would
    get parsed into:

       ('application', '*', {'q': '0.5'})

    In addition this function also guarantees that there
    is a value for 'q' in the params dictionary, filling it
    in with a proper default if necessary: a 'q' that is missing,
    not a number, or outside the range 0..1 is replaced by '1'.
    """
    main_type, subtype, params = parse_mime_type(range)
    try:
        q_is_valid = 0 <= float(params['q']) <= 1
    except (KeyError, ValueError):
        q_is_valid = False
    if not q_is_valid:
        params['q'] = '1'
    return main_type, subtype, params


def _parse_ranges(header):
    """Parse a comma-separated list of media ranges into a list of
    (type, subtype, params) tuples as produced by parse_media_range().

    Empty list elements (for example those produced by a stray trailing
    comma) are ignored, so they are not mistaken for a '*/*' wildcard.
    A header with no non-empty element at all is treated as '*/*',
    which is what parsing a blank header has always produced.
    """
    elements = [element for element in header.split(',') if element.strip()]
    if not elements:
        elements = ['*/*']
    return [parse_media_range(element) for element in elements]


def fitness_and_quality_parsed(mime_type, parsed_ranges):
    """Find the best match for a given mime-type against
    a list of media_ranges that have already been
    parsed by parse_media_range(). Returns a tuple of
    the fitness value and the value of the 'q' quality
    parameter of the best match, or (-1, 0) if no match
    was found. Just as for quality_parsed(), 'parsed_ranges'
    must be a list of parsed media ranges.

    Fitness is 100 for an equal type, plus 10 for an equal subtype,
    plus 1 for every non-'q' parameter of 'mime_type' that the range
    carries with the same value. On equal fitness the earliest range
    wins."""
    best_fitness, best_fit_q = -1, 0
    target_type, target_subtype, target_params = parse_media_range(mime_type)
    for range_type, range_subtype, range_params in parsed_ranges:
        type_matches = (range_type == target_type
                        or range_type == '*' or target_type == '*')
        subtype_matches = (range_subtype == target_subtype
                           or range_subtype == '*' or target_subtype == '*')
        if not (type_matches and subtype_matches):
            continue

        fitness = 0
        if range_type == target_type:
            fitness += 100
        if range_subtype == target_subtype:
            fitness += 10
        for key, value in target_params.items():
            if key != 'q' and key in range_params and range_params[key] == value:
                fitness += 1

        # Strictly greater: the first range reaching a fitness keeps it.
        if fitness > best_fitness:
            best_fitness, best_fit_q = fitness, range_params['q']
    return best_fitness, float(best_fit_q)


def quality_parsed(mime_type, parsed_ranges):
    """Find the best match for a given mime-type against
    a list of media_ranges that have already been
    parsed by parse_media_range(). Returns the
    'q' quality parameter of the best match, 0 if no
    match was found. This function behaves the same as quality()
    except that 'parsed_ranges' must be a list of
    parsed media ranges."""
    return fitness_and_quality_parsed(mime_type, parsed_ranges)[1]


def quality(mime_type, ranges):
    """Returns the quality 'q' of a mime-type when compared
    against the media-ranges in ranges. For example:

    >>> quality('text/html','text/*;q=0.3, text/html;q=0.7, text/html;level=1, text/html;level=2;q=0.4, */*;q=0.5')
    0.7

    Empty elements in 'ranges' are ignored; an entirely blank 'ranges'
    string is treated as '*/*'.
    """
    return quality_parsed(mime_type, _parse_ranges(ranges))


def best_match(supported, header):
    """Takes a list of supported mime-types and finds the best
    match for all the media-ranges listed in header. In case of
    ambiguity, whatever comes first in the list will be chosen.
    The value of header must be a string that conforms to the format
    of the HTTP Accept: header. The value of 'supported' is a list
    of mime-types.

    Candidates are ranked by their (fitness, quality) pair as returned
    by fitness_and_quality_parsed(), so a more specific match is
    preferred over a less specific one before 'q' is compared.
    Returns '' if 'supported' is empty or the winner has a quality of 0.

    >>> best_match(['application/xbel+xml', 'text/xml'], 'text/*;q=0.5,*/*; q=0.1')
    'text/xml'
    """
    if not supported:
        return ''
    parsed_header = _parse_ranges(header)
    # The negated index makes max() prefer the earliest entry on a tie.
    (_fitness, best_q), negated_index = max(
        (fitness_and_quality_parsed(mime_type, parsed_header), -index)
        for index, mime_type in enumerate(supported))
    return supported[-negated_index] if best_q else ''


def desired_matches(desired, header):
    """Takes a list of desired mime-types in the order the server prefers to
    send them regardless of the browsers preference.

    Browsers (such as Firefox) technically want XML over HTML depending on how
    one reads the specification. This function is provided for a server to
    declare a set of desired mime-types it supports, and returns a subset of
    the desired list in the same order should each one be Accepted by the
    browser.

    >>> desired_matches(['text/html', 'application/xml'],
    ...     'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png')
    ['text/html', 'application/xml']
    >>> desired_matches(['text/html', 'application/xml'], 'application/xml,application/json')
    ['application/xml']
    """
    parsed_ranges = _parse_ranges(header)
    return [mimetype for mimetype in desired
            if quality_parsed(mimetype, parsed_ranges)]