diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-09 10:38:14 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-09 10:38:14 -0500 |
commit | e4f4a83c3963397bfc47b5e04c11245efaecab10 (patch) | |
tree | 27942e7d8a2bc849f2b586bade20ae0b6f29bce9 /bs4/dammit.py | |
parent | 02b01471645ad4b315d848d7c5d5a75ec38b1370 (diff) | |
download | beautifulsoup4-e4f4a83c3963397bfc47b5e04c11245efaecab10.tar.gz |
Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags like <meta charset="utf-8" />. [bug=837268]
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- | bs4/dammit.py | 6 |
1 files changed, 4 insertions, 2 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 09ac89e..0c4bf17 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -27,8 +27,10 @@ try:
 except ImportError:
     pass
 
-xml_encoding_re = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
-html_meta_re = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode(), re.I)
+xml_encoding_re = re.compile(
+    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+html_meta_re = re.compile(
+    '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
 
 class EntitySubstitution(object):
 