summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDave Beckett <dave@dajobe.org>2022-06-06 18:19:49 -0700
committerDave Beckett <dave@dajobe.org>2022-06-06 22:07:24 -0700
commit3cca62a33da68143b687c9e486eefc7c7cbb4586 (patch)
tree1d0fc6d5a5bbef9bf224a161671fbd316af702c4
parent94a2bb5086db948d83a039fbe6852a6a47f9b588 (diff)
downloadraptor-3cca62a33da68143b687c9e486eefc7c7cbb4586.tar.gz
Fix internal ICU string NFC check to convert to UTF-16 first
(raptor_nfc_icu_check, raptor_unicode_check_utf8_nfc_string): Changed signature of these internal functions to not have error_p which wasn't even consistently used. Instead return -1 on failure (note that -1 is non-zero and therefore "truthy" in C, so callers that test the result with `!` treat a failure the same as "is NFC"). (raptor_nfc_icu_check): Do a UTF-8 (raptor) to UTF-16 conversion before trying to do an NFC normalization check. Update callers of above internal functions to remove error_p argument which was unused in all callers. Update rdfxmla tests to allow tests to throw warnings.
-rw-r--r--src/raptor_internal.h4
-rw-r--r--src/raptor_nfc_icu.c80
-rw-r--r--src/raptor_rdfxml.c10
-rw-r--r--src/raptor_unicode.c13
-rw-r--r--tests/rdfxml/Makefile.am2
5 files changed, 58 insertions, 51 deletions
diff --git a/src/raptor_internal.h b/src/raptor_internal.h
index be8698dc..e6f98e94 100644
--- a/src/raptor_internal.h
+++ b/src/raptor_internal.h
@@ -781,7 +781,7 @@ int raptor_check_ordinal(const unsigned char *name);
#endif
/* raptor_nfc_icu.c */
-int raptor_nfc_icu_check (const unsigned char* string, size_t len, int *error);
+int raptor_nfc_icu_check (const unsigned char* string, size_t len);
/* raptor_namespace.c */
@@ -967,7 +967,7 @@ extern const raptor_unichar raptor_unicode_max_codepoint;
int raptor_unicode_is_namestartchar(raptor_unichar c);
int raptor_unicode_is_namechar(raptor_unichar c);
-int raptor_unicode_check_utf8_nfc_string(const unsigned char *input, size_t length, int* error);
+int raptor_unicode_check_utf8_nfc_string(const unsigned char *input, size_t length);
/* raptor_www*.c */
#ifdef RAPTOR_WWW_LIBXML
diff --git a/src/raptor_nfc_icu.c b/src/raptor_nfc_icu.c
index 9d17982a..03d2459f 100644
--- a/src/raptor_nfc_icu.c
+++ b/src/raptor_nfc_icu.c
@@ -40,6 +40,7 @@
#include "raptor2.h"
#include "raptor_internal.h"
+#include <unicode/ustring.h>
#if ICU_UC_MAJOR_VERSION >= 56
#include <unicode/unorm2.h>
#else
@@ -47,58 +48,65 @@
#endif
-/**
+/*
* raptor_nfc_icu_check:
* @input: UTF-8 string
* @length: length of string
- * @errorp: pointer to store offset of character in error (or NULL)
+ * @error: pointer to error flag (or NULL)
*
- * Unicode Normal Form C (NFC) check function.
+ * INTERNAL - Unicode Normal Form C (NFC) check function via ICU
*
- * If errorp is not NULL, it is set to the offset of the character
- * in error in the buffer, or <0 if there is no error.
+ * If errorp is not NULL, it is set to non-0 on error
*
- * Return value: Non 0 if the string is NFC
+ * Return value: <0 on error, 0 if is not NFC, >0 if is NFC
**/
int
-raptor_nfc_icu_check(const unsigned char* string, size_t len, int *error)
+raptor_nfc_icu_check(const unsigned char* string, size_t len)
{
- /* unorm_quickCheck was deprecated in ICU UC V56 */
-
#if ICU_UC_MAJOR_VERSION >= 56
/* norm2 is be a singleton - do not attempt to free it */
const UNormalizer2 *norm2;
+#endif
UErrorCode error_code = U_ZERO_ERROR;
UNormalizationCheckResult res;
+ UChar *dest; /* UTF-16 */
+ int32_t dest_capacity = len << 1;
+ int32_t dest_length;
+ int rc = 0;
+ /* ICU functions take a UTF-16 string so convert */
+ dest = RAPTOR_MALLOC(UChar*, dest_capacity + 1);
+ if(!dest)
+ goto error;
+
+ (void)u_strFromUTF8(dest, dest_capacity, &dest_length,
+ (const char *)string, (int32_t)len, &error_code);
+ if(!U_SUCCESS(error_code))
+ goto error;
+
+ /* unorm_quickCheck was deprecated in ICU UC V56 */
+#if ICU_UC_MAJOR_VERSION >= 56
norm2 = unorm2_getNFCInstance(&error_code);
- if(!U_SUCCESS(error_code)) {
- if(error)
- *error = 1;
- return 0;
- }
-
- res = unorm2_quickCheck(norm2,(const UChar *)string, (int32_t)len,
- &error_code);
- if(!U_SUCCESS(error_code)) {
- if(error)
- *error = 1;
- return 0;
- }
-
- return (res == UNORM_YES);
+ if(!U_SUCCESS(error_code))
+ goto error;
+
+ res = unorm2_quickCheck(norm2, dest, dest_length, &error_code);
#else
- UNormalizationCheckResult res;
- UErrorCode error_code = U_ZERO_ERROR;
-
- res = unorm_quickCheck((const UChar *)string, (int32_t)len,
- UNORM_NFC, &error_code);
- if(!U_SUCCESS(error_code)) {
- if(error)
- *error = 1;
- return 0;
- }
-
- return (res == UNORM_YES);
+ res = unorm_quickCheck(dest, dest_length, UNORM_NFC, &error_code);
#endif
+ if(!U_SUCCESS(error_code))
+ goto error;
+
+ /* success */
+ rc = (res == UNORM_YES);
+ goto cleanup;
+
+error:
+ rc = -1;
+
+cleanup:
+ if(dest)
+ RAPTOR_FREE(UChar*, dest);
+
+ return rc;
}
diff --git a/src/raptor_rdfxml.c b/src/raptor_rdfxml.c
index ce9f3931..2a742d0a 100644
--- a/src/raptor_rdfxml.c
+++ b/src/raptor_rdfxml.c
@@ -1396,8 +1396,7 @@ raptor_rdfxml_process_property_attributes(raptor_parser *rdf_parser,
}
- if(!raptor_unicode_check_utf8_nfc_string(value, strlen((const char*)value),
- NULL)) {
+ if(!raptor_unicode_check_utf8_nfc_string(value, strlen((const char*)value))) {
raptor_log_level l;
raptor_rdfxml_update_document_locator(rdf_parser);
@@ -1505,7 +1504,7 @@ raptor_rdfxml_process_property_attributes(raptor_parser *rdf_parser,
}
if(object_is_literal &&
- !raptor_unicode_check_utf8_nfc_string(value, value_len, NULL)) {
+ !raptor_unicode_check_utf8_nfc_string(value, value_len)) {
raptor_log_level l;
raptor_rdfxml_update_document_locator(rdf_parser);
@@ -2780,8 +2779,7 @@ raptor_rdfxml_end_element_grammar(raptor_parser *rdf_parser,
if(!literal_datatype && literal &&
!raptor_unicode_check_utf8_nfc_string(literal,
- xml_element->content_cdata_length,
- NULL)) {
+ xml_element->content_cdata_length)) {
raptor_log_level l;
raptor_rdfxml_update_document_locator(rdf_parser);
@@ -2836,7 +2834,7 @@ raptor_rdfxml_end_element_grammar(raptor_parser *rdf_parser,
length = xml_element->content_cdata_length;
}
- if(!raptor_unicode_check_utf8_nfc_string(buffer, length, NULL)) {
+ if(!raptor_unicode_check_utf8_nfc_string(buffer, length)) {
raptor_log_level l;
raptor_rdfxml_update_document_locator(rdf_parser);
diff --git a/src/raptor_unicode.c b/src/raptor_unicode.c
index 3a0a1f57..c50d2538 100644
--- a/src/raptor_unicode.c
+++ b/src/raptor_unicode.c
@@ -786,14 +786,14 @@ raptor_unicode_is_extender(long c)
*
* INTERNAL - Check if a Unicode UTF-8 encoded string is in Unicode Normal Form C.
*
- * Return value: Non 0 if the string is in NFC (or an error)
+ * Return value: <0 on error, 0 if not NFC, >0 if is NFC
**/
int
-raptor_unicode_check_utf8_nfc_string(const unsigned char *input, size_t length,
- int *error)
+raptor_unicode_check_utf8_nfc_string(const unsigned char *input, size_t length)
{
unsigned int i;
int plain = 1;
+ int rc;
for(i = 0; i < length; i++)
if(input[i] > 0x7f) {
@@ -805,12 +805,11 @@ raptor_unicode_check_utf8_nfc_string(const unsigned char *input, size_t length,
return 1;
#ifdef RAPTOR_NFC_ICU
- return raptor_nfc_icu_check(input, length, error);
+ rc = raptor_nfc_icu_check(input, length);
#else
- if(error)
- *error = 1;
- return 1;
+ rc = 1;
#endif
+ return rc;
}
diff --git a/tests/rdfxml/Makefile.am b/tests/rdfxml/Makefile.am
index cbb8acc8..a9fe3e7e 100644
--- a/tests/rdfxml/Makefile.am
+++ b/tests/rdfxml/Makefile.am
@@ -451,6 +451,8 @@ check-rdfxmla: build-rdfdiff build-rapper $(check_rdfxmla_deps)
fi; \
if test $$status1 = 0 -a $$status2 = 0; then \
$(RECHO) "ok"; \
+ elif test $$status1 = 2 -a $$status2 = 0; then \
+ $(RECHO) "ok with warnings"; grep Warning $$name.err; \
else \
$(RECHO) "FAILED"; \
echo $(RAPPER) -q -o rdfxml-abbrev $(srcdir)/$$test $$baseuri '>' $$name-rdfxmla.rdf; \