summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Pierre Le Marre <dev@wismill.eu>, 2023-05-13 17:26:24 +0200
committer: Ran Benita <ran@unusedvar.com>, 2023-05-13 22:02:46 +0300
commit: 183761ac24544b355aaf362e62d05fa1c184baf8 (patch)
tree: 0fb328d8876d92997fca57acfbb4a76dc6ae7d58 /src
parent: 5fbffaf035f0c0edbcf7b2e747ccab9a234101ff (diff)
download: xorg-lib-libxkbcommon-183761ac24544b355aaf362e62d05fa1c184baf8.tar.gz
Do not interpret or emit invalid Unicode encoding forms
Surrogates are invalid in both UTF-32 and UTF-8. See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875 and https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G31703
Diffstat (limited to 'src')
-rw-r--r-- src/keysym-utf.c | 16
-rw-r--r-- src/utf8.c | 16
2 files changed, 27 insertions, 5 deletions
diff --git a/src/keysym-utf.c b/src/keysym-utf.c
index a9d46d1..0bb9a4f 100644
--- a/src/keysym-utf.c
+++ b/src/keysym-utf.c
@@ -41,6 +41,8 @@
#include "utils.h"
#include "utf8.h"
+#define NO_KEYSYM_UNICODE_CONVERSION 0
+
/* We don't use the uint32_t types here, to save some space. */
struct codepair {
uint16_t keysym;
@@ -847,7 +849,7 @@ bin_search(const struct codepair *table, size_t length, xkb_keysym_t keysym)
}
/* no matching Unicode value found in table */
- return 0;
+ return NO_KEYSYM_UNICODE_CONVERSION;
}
XKB_EXPORT uint32_t
@@ -871,6 +873,13 @@ xkb_keysym_to_utf32(xkb_keysym_t keysym)
return keysym & 0x7f;
/* also check for directly encoded Unicode codepoints */
+
+ /* Exclude surrogates: they are invalid in UTF-32.
+ * See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875
+ * for further details.
+ */
+ if (0x0100d800 <= keysym && keysym <= 0x0100dfff)
+ return NO_KEYSYM_UNICODE_CONVERSION;
/*
* In theory, this is supposed to start from 0x100100, such that the ASCII
* range, which is already covered by 0x00-0xff, can't be encoded in two
@@ -900,7 +909,8 @@ xkb_utf32_to_keysym(uint32_t ucs)
return XKB_KEY_Delete;
/* Unicode non-symbols and code points outside Unicode planes */
- if ((ucs >= 0xfdd0 && ucs <= 0xfdef) ||
+ if ((ucs >= 0xd800 && ucs <= 0xdfff) ||
+ (ucs >= 0xfdd0 && ucs <= 0xfdef) ||
ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
return XKB_KEY_NoSymbol;
@@ -948,7 +958,7 @@ xkb_keysym_to_utf8(xkb_keysym_t keysym, char *buffer, size_t size)
codepoint = xkb_keysym_to_utf32(keysym);
- if (codepoint == 0)
+ if (codepoint == NO_KEYSYM_UNICODE_CONVERSION)
return 0;
return utf32_to_utf8(codepoint, buffer);
diff --git a/src/utf8.c b/src/utf8.c
index 15aa237..d37ba8e 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -32,6 +32,11 @@
#include "utf8.h"
+/* Conformant encoding form conversion from UTF-32 to UTF-8.
+ *
+ * See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875
+ * for further details.
+*/
int
utf32_to_utf8(uint32_t unichar, char *buffer)
{
@@ -47,6 +52,10 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
length = 2;
head = 0xc0;
}
+ /* Handle surrogates */
+ else if (0xd800 <= unichar && unichar <= 0xdfff) {
+ goto ill_formed_code_unit_subsequence;
+ }
else if (unichar <= 0xffff) {
length = 3;
head = 0xe0;
@@ -56,8 +65,7 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
head = 0xf0;
}
else {
- buffer[0] = '\0';
- return 0;
+ goto ill_formed_code_unit_subsequence;
}
for (count = length - 1, shift = 0; count > 0; count--, shift += 6)
@@ -67,6 +75,10 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
buffer[length] = '\0';
return length + 1;
+
+ill_formed_code_unit_subsequence:
+ buffer[0] = '\0';
+ return 0;
}
bool