summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Bastien Nocera <hadess@hadess.net> 2021-03-04 15:42:06 +0100
committer Bastien Nocera <hadess@hadess.net> 2021-03-04 17:36:00 +0100
commit 0f99bf8bc71b33c283c0559e22d8a198d5691089 (patch)
tree ca864c27436be0cab6f86f492989322b50d74bfd
parent a19de02ba7586b576c42d8f0758eeaef94652d2b (diff)
download totem-pl-parser-0f99bf8bc71b33c283c0559e22d8a198d5691089.tar.gz
plparser: Detect character encoding when UTF-8 validation fails
Use uchardet when available to detect the encoding of XML data when the declared encoding doesn't match the data passed.
-rw-r--r-- meson.build 20
-rw-r--r-- meson_options.txt 2
-rw-r--r-- plparse/totem-pl-parser.c 38
3 files changed, 57 insertions, 3 deletions
diff --git a/meson.build b/meson.build
index df547ad..0aa06cf 100644
--- a/meson.build
+++ b/meson.build
@@ -136,6 +136,22 @@ foreach cflag: test_cflags
endif
endforeach
+# uchardet dependency
+enable_uchardet = get_option('enable-uchardet')
+have_uchardet = false
+if enable_uchardet != 'no'
+ uchardet_dep = dependency('uchardet', required: false)
+ if enable_uchardet == 'yes' and not uchardet_dep.found()
+ error('uchardet support requested but not available.')
+ endif
+ if uchardet_dep.found()
+ cdata.set('HAVE_UCHARDET', true,
+ description: 'uchardet available in the system')
+ have_uchardet = true
+ totem_pl_parser_deps += [uchardet_dep]
+ endif
+endif
+
# quvi dependency
enable_quvi = get_option('enable-quvi')
have_quvi = false
@@ -238,7 +254,9 @@ message('''
Quvi video link parsing : @0@
ISO detection with libarchive : @1@
AmazonAMZ decoding with libgcrypt : @2@
+ uchardet encoding detection : @3@
'''.format(have_quvi.to_string('yes', 'no'),
have_libarchive.to_string('yes', 'no'),
- have_libgcrypt.to_string('yes', 'no')))
+ have_libgcrypt.to_string('yes', 'no'),
+ have_uchardet.to_string('yes', 'no')))
diff --git a/meson_options.txt b/meson_options.txt
index 81a02ba..24df404 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -4,6 +4,8 @@ option('enable-libarchive', type: 'combo', choices : ['yes', 'no', 'auto'], valu
description : 'Enable libarchive support.')
option('enable-libgcrypt', type: 'combo', choices : ['yes', 'no', 'auto'], value : 'auto',
description : 'Enable libgcrypt support.')
+option('enable-uchardet', type: 'combo', choices : ['yes', 'no', 'auto'], value : 'auto',
+ description : 'Enable uchardet support.')
option('enable-gtk-doc', type: 'boolean', value: 'false',
description : 'Generate the API reference (depends on GTK-Doc)')
option('introspection', type: 'boolean', value: 'true',
diff --git a/plparse/totem-pl-parser.c b/plparse/totem-pl-parser.c
index 9869701..ee6f12f 100644
--- a/plparse/totem-pl-parser.c
+++ b/plparse/totem-pl-parser.c
@@ -132,6 +132,9 @@
#ifndef TOTEM_PL_PARSER_MINI
#include <gobject/gvaluecollector.h>
+#ifdef HAVE_UCHARDET
+#include <uchardet.h>
+#endif
#include "totem-pl-parser.h"
#include "totemplparser-marshal.h"
@@ -1846,6 +1849,34 @@ totem_pl_parser_cleanup_xml (char *contents)
}
}
+#ifdef HAVE_UCHARDET
+/*
+ * Guess the character encoding of @text (@len bytes) using uchardet.
+ *
+ * Returns a newly allocated charset name, to be released with g_free(),
+ * or NULL when uchardet could not process the data or did not detect
+ * any encoding.
+ */
+static char *
+guess_text_encoding (const char *text,
+		     gsize       len)
+{
+	uchardet_t handle;
+	char *encoding = NULL;
+
+	handle = uchardet_new ();
+	if (uchardet_handle_data (handle, text, len) == 0) {
+		const char *charset;
+
+		uchardet_data_end (handle);
+		charset = uchardet_get_charset (handle);
+		/* uchardet reports a failed detection as an empty string;
+		 * map that to NULL so callers' NULL checks catch it. The
+		 * name is owned by the handle, so copy it before deleting. */
+		if (charset != NULL && *charset != '\0')
+			encoding = g_strdup (charset);
+	}
+
+	uchardet_delete (handle);
+	return encoding;
+}
+#else
+/*
+ * Fallback used when uchardet support is not compiled in: no detection
+ * is attempted, so the result is always NULL.
+ */
+static char *
+guess_text_encoding (const char *text, gsize len)
+{
+	/* Arguments are intentionally unused in this build. */
+	(void) text;
+	(void) len;
+
+	return NULL;
+}
+#endif /* HAVE_UCHARDET */
+
xml_node_t *
totem_pl_parser_parse_xml_relaxed (char *contents,
gsize size)
@@ -1879,8 +1910,11 @@ totem_pl_parser_parse_xml_relaxed (char *contents,
return doc;
g_debug ("Document %s pretended to be in UTF-8 but didn't validate",
encoding ? "explicitly" : "implicitly");
- /* FIXME detect encoding using uchardet */
- return NULL;
+ g_free (encoding);
+ encoding = guess_text_encoding (contents, size);
+ if (!encoding)
+ return NULL;
+ /* fall-through with the detected encoding */
}
xml_parser_free_tree (doc);