diff options
author | Bastien Nocera <hadess@hadess.net> | 2021-03-04 15:42:06 +0100 |
---|---|---|
committer | Bastien Nocera <hadess@hadess.net> | 2021-03-04 17:36:00 +0100 |
commit | 0f99bf8bc71b33c283c0559e22d8a198d5691089 (patch) | |
tree | ca864c27436be0cab6f86f492989322b50d74bfd | |
parent | a19de02ba7586b576c42d8f0758eeaef94652d2b (diff) | |
download | totem-pl-parser-0f99bf8bc71b33c283c0559e22d8a198d5691089.tar.gz |
plparser: Detect character encoding when UTF-8 validation fails
Use uchardet when available to detect the encoding of XML data when the
declared encoding doesn't match the data passed.
-rw-r--r-- | meson.build | 20 | ||||
-rw-r--r-- | meson_options.txt | 2 | ||||
-rw-r--r-- | plparse/totem-pl-parser.c | 38 |
3 files changed, 57 insertions, 3 deletions
```diff
diff --git a/meson.build b/meson.build
index df547ad..0aa06cf 100644
--- a/meson.build
+++ b/meson.build
@@ -136,6 +136,22 @@ foreach cflag: test_cflags
   endif
 endforeach
+# uchardet dependency
+enable_uchardet = get_option('enable-uchardet')
+have_uchardet = false
+if enable_uchardet != 'no'
+  uchardet_dep = dependency('uchardet', required: false)
+  if enable_uchardet == 'yes' and not uchardet_dep.found()
+    error('uchardet support requested but not available.')
+  endif
+  if uchardet_dep.found()
+    cdata.set('HAVE_UCHARDET', true,
+      description: 'uchardet available in the system')
+    have_uchardet = true
+    totem_pl_parser_deps += [uchardet_dep]
+  endif
+endif
+
 # quvi dependency
 enable_quvi = get_option('enable-quvi')
 have_quvi = false
@@ -238,7 +254,9 @@ message('''
     Quvi video link parsing : @0@
     ISO detection with libarchive : @1@
     AmazonAMZ decoding with libgcrypt : @2@
+    uchardet encoding detection : @3@
 '''.format(have_quvi.to_string('yes', 'no'),
            have_libarchive.to_string('yes', 'no'),
-           have_libgcrypt.to_string('yes', 'no')))
+           have_libgcrypt.to_string('yes', 'no'),
+           have_uchardet.to_string('yes', 'no')))
diff --git a/meson_options.txt b/meson_options.txt
index 81a02ba..24df404 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -4,6 +4,8 @@ option('enable-libarchive', type: 'combo', choices : ['yes', 'no', 'auto'], valu
        description : 'Enable libarchive support.')
 option('enable-libgcrypt', type: 'combo', choices : ['yes', 'no', 'auto'], value : 'auto',
        description : 'Enable libgcrypt support.')
+option('enable-uchardet', type: 'combo', choices : ['yes', 'no', 'auto'], value : 'auto',
+       description : 'Enable uchardet support.')
 option('enable-gtk-doc', type: 'boolean', value: 'false',
        description : 'Generate the API reference (depends on GTK-Doc)')
 option('introspection', type: 'boolean', value: 'true',
diff --git a/plparse/totem-pl-parser.c b/plparse/totem-pl-parser.c
index 9869701..ee6f12f 100644
--- a/plparse/totem-pl-parser.c
+++ b/plparse/totem-pl-parser.c
@@ -132,6 +132,9 @@
 #ifndef TOTEM_PL_PARSER_MINI
 #include <gobject/gvaluecollector.h>
+#ifdef HAVE_UCHARDET
+#include <uchardet.h>
+#endif
 #include "totem-pl-parser.h"
 #include "totemplparser-marshal.h"
@@ -1846,6 +1849,34 @@ totem_pl_parser_cleanup_xml (char *contents)
 	}
 }
+#ifdef HAVE_UCHARDET
+static char *
+guess_text_encoding (const char *text,
+		     gsize len)
+{
+	uchardet_t handle;
+	char *encoding = NULL;
+	int ret;
+
+	handle = uchardet_new ();
+	ret = uchardet_handle_data (handle, text, len);
+	if (ret == 0) {
+		uchardet_data_end (handle);
+		encoding = g_strdup (uchardet_get_charset (handle));
+	}
+
+	uchardet_delete (handle);
+	return encoding;
+}
+#else
+static char *
+guess_text_encoding (const char *text,
+		     gsize len)
+{
+	return NULL;
+}
+#endif /* HAVE_UCHARDET */
+
 xml_node_t *
 totem_pl_parser_parse_xml_relaxed (char *contents,
 				   gsize size)
@@ -1879,8 +1910,11 @@ totem_pl_parser_parse_xml_relaxed (char *contents,
 			return doc;
 		g_debug ("Document %s pretended to be in UTF-8 but didn't validate",
 			 encoding ? "explicitly" : "implicitly");
-		/* FIXME detect encoding using uchardet */
-		return NULL;
+		g_free (encoding);
+		encoding = guess_text_encoding (contents, size);
+		if (!encoding)
+			return NULL;
+		/* fall-through with the detected encoding */
 	}
 	xml_parser_free_tree (doc);
```