From 3cca62a33da68143b687c9e486eefc7c7cbb4586 Mon Sep 17 00:00:00 2001 From: Dave Beckett Date: Mon, 6 Jun 2022 18:19:49 -0700 Subject: Fix internal ICU string NFC check to convert to UTF-16 first (raptor_nfc_icu_check, raptor_unicode_check_utf8_nfc_string): Changed signature of these internal functions to not have error_p which wasn't even consistently used. Instead return -1 on failure which is also "falsey" in C. (raptor_nfc_icu_check): Do a UTF-8 (raptor) to UTF-16 conversion before trying to do an NFC normalization check. Update callers of above internal functions to remove error_p argument which was unused in all callers. Update rdfxmla tests to allow tests to throw warnings. --- src/raptor_internal.h | 4 +-- src/raptor_nfc_icu.c | 80 ++++++++++++++++++++++++++---------------------- src/raptor_rdfxml.c | 10 +++--- src/raptor_unicode.c | 13 ++++---- tests/rdfxml/Makefile.am | 2 ++ 5 files changed, 58 insertions(+), 51 deletions(-) diff --git a/src/raptor_internal.h b/src/raptor_internal.h index be8698dc..e6f98e94 100644 --- a/src/raptor_internal.h +++ b/src/raptor_internal.h @@ -781,7 +781,7 @@ int raptor_check_ordinal(const unsigned char *name); #endif /* raptor_nfc_icu.c */ -int raptor_nfc_icu_check (const unsigned char* string, size_t len, int *error); +int raptor_nfc_icu_check (const unsigned char* string, size_t len); /* raptor_namespace.c */ @@ -967,7 +967,7 @@ extern const raptor_unichar raptor_unicode_max_codepoint; int raptor_unicode_is_namestartchar(raptor_unichar c); int raptor_unicode_is_namechar(raptor_unichar c); -int raptor_unicode_check_utf8_nfc_string(const unsigned char *input, size_t length, int* error); +int raptor_unicode_check_utf8_nfc_string(const unsigned char *input, size_t length); /* raptor_www*.c */ #ifdef RAPTOR_WWW_LIBXML diff --git a/src/raptor_nfc_icu.c b/src/raptor_nfc_icu.c index 9d17982a..03d2459f 100644 --- a/src/raptor_nfc_icu.c +++ b/src/raptor_nfc_icu.c @@ -40,6 +40,7 @@ #include "raptor2.h" #include "raptor_internal.h" 
+#include #if ICU_UC_MAJOR_VERSION >= 56 #include #else @@ -47,58 +48,65 @@ #endif -/** +/* * raptor_nfc_icu_check: * @input: UTF-8 string * @length: length of string - * @errorp: pointer to store offset of character in error (or NULL) + * @error: pointer to error flag (or NULL) * - * Unicode Normal Form C (NFC) check function. + * INTERNAL - Unicode Normal Form C (NFC) check function via ICU * - * If errorp is not NULL, it is set to the offset of the character - * in error in the buffer, or <0 if there is no error. + * If errorp is not NULL, it is set to non-0 on error * - * Return value: Non 0 if the string is NFC + * Return value: <0 on error, 0 if is not NFC, >0 if is NFC **/ int -raptor_nfc_icu_check(const unsigned char* string, size_t len, int *error) +raptor_nfc_icu_check(const unsigned char* string, size_t len) { - /* unorm_quickCheck was deprecated in ICU UC V56 */ - #if ICU_UC_MAJOR_VERSION >= 56 /* norm2 is be a singleton - do not attempt to free it */ const UNormalizer2 *norm2; +#endif UErrorCode error_code = U_ZERO_ERROR; UNormalizationCheckResult res; + UChar *dest; /* UTF-16 */ + int32_t dest_capacity = len << 1; + int32_t dest_length; + int rc = 0; + /* ICU functions take a UTF-16 string so convert */ + dest = RAPTOR_MALLOC(UChar*, dest_capacity + 1); + if(!dest) + goto error; + + (void)u_strFromUTF8(dest, dest_capacity, &dest_length, + (const char *)string, (int32_t)len, &error_code); + if(!U_SUCCESS(error_code)) + goto error; + + /* unorm_quickCheck was deprecated in ICU UC V56 */ +#if ICU_UC_MAJOR_VERSION >= 56 norm2 = unorm2_getNFCInstance(&error_code); - if(!U_SUCCESS(error_code)) { - if(error) - *error = 1; - return 0; - } - - res = unorm2_quickCheck(norm2,(const UChar *)string, (int32_t)len, - &error_code); - if(!U_SUCCESS(error_code)) { - if(error) - *error = 1; - return 0; - } - - return (res == UNORM_YES); + if(!U_SUCCESS(error_code)) + goto error; + + res = unorm2_quickCheck(norm2, dest, dest_length, &error_code); #else - 
UNormalizationCheckResult res; - UErrorCode error_code = U_ZERO_ERROR; - - res = unorm_quickCheck((const UChar *)string, (int32_t)len, - UNORM_NFC, &error_code); - if(!U_SUCCESS(error_code)) { - if(error) - *error = 1; - return 0; - } - - return (res == UNORM_YES); + res = unorm_quickCheck(dest, dest_length, UNORM_NFC, &error_code); #endif + if(!U_SUCCESS(error_code)) + goto error; + + /* success */ + rc = (res == UNORM_YES); + goto cleanup; + +error: + rc = -1; + +cleanup: + if(dest) + RAPTOR_FREE(UChar*, dest); + + return rc; } diff --git a/src/raptor_rdfxml.c b/src/raptor_rdfxml.c index ce9f3931..2a742d0a 100644 --- a/src/raptor_rdfxml.c +++ b/src/raptor_rdfxml.c @@ -1396,8 +1396,7 @@ raptor_rdfxml_process_property_attributes(raptor_parser *rdf_parser, } - if(!raptor_unicode_check_utf8_nfc_string(value, strlen((const char*)value), - NULL)) { + if(!raptor_unicode_check_utf8_nfc_string(value, strlen((const char*)value))) { raptor_log_level l; raptor_rdfxml_update_document_locator(rdf_parser); @@ -1505,7 +1504,7 @@ raptor_rdfxml_process_property_attributes(raptor_parser *rdf_parser, } if(object_is_literal && - !raptor_unicode_check_utf8_nfc_string(value, value_len, NULL)) { + !raptor_unicode_check_utf8_nfc_string(value, value_len)) { raptor_log_level l; raptor_rdfxml_update_document_locator(rdf_parser); @@ -2780,8 +2779,7 @@ raptor_rdfxml_end_element_grammar(raptor_parser *rdf_parser, if(!literal_datatype && literal && !raptor_unicode_check_utf8_nfc_string(literal, - xml_element->content_cdata_length, - NULL)) { + xml_element->content_cdata_length)) { raptor_log_level l; raptor_rdfxml_update_document_locator(rdf_parser); @@ -2836,7 +2834,7 @@ raptor_rdfxml_end_element_grammar(raptor_parser *rdf_parser, length = xml_element->content_cdata_length; } - if(!raptor_unicode_check_utf8_nfc_string(buffer, length, NULL)) { + if(!raptor_unicode_check_utf8_nfc_string(buffer, length)) { raptor_log_level l; raptor_rdfxml_update_document_locator(rdf_parser); diff --git 
a/src/raptor_unicode.c b/src/raptor_unicode.c index 3a0a1f57..c50d2538 100644 --- a/src/raptor_unicode.c +++ b/src/raptor_unicode.c @@ -786,14 +786,14 @@ raptor_unicode_is_extender(long c) * * INTERNAL - Check if a Unicode UTF-8 encoded string is in Unicode Normal Form C. * - * Return value: Non 0 if the string is in NFC (or an error) + * Return value: <0 on error, 0 if not NFC, >0 if is NFC **/ int -raptor_unicode_check_utf8_nfc_string(const unsigned char *input, size_t length, - int *error) +raptor_unicode_check_utf8_nfc_string(const unsigned char *input, size_t length) { unsigned int i; int plain = 1; + int rc; for(i = 0; i < length; i++) if(input[i] > 0x7f) { @@ -805,12 +805,11 @@ raptor_unicode_check_utf8_nfc_string(const unsigned char *input, size_t length, return 1; #ifdef RAPTOR_NFC_ICU - return raptor_nfc_icu_check(input, length, error); + rc = raptor_nfc_icu_check(input, length); #else - if(error) - *error = 1; - return 1; + rc = 1; #endif + return rc; } diff --git a/tests/rdfxml/Makefile.am b/tests/rdfxml/Makefile.am index cbb8acc8..a9fe3e7e 100644 --- a/tests/rdfxml/Makefile.am +++ b/tests/rdfxml/Makefile.am @@ -451,6 +451,8 @@ check-rdfxmla: build-rdfdiff build-rapper $(check_rdfxmla_deps) fi; \ if test $$status1 = 0 -a $$status2 = 0; then \ $(RECHO) "ok"; \ + elif test $$status1 = 2 -a $$status2 = 0; then \ + $(RECHO) "ok with warnings"; grep Warning $$name.err; \ else \ $(RECHO) "FAILED"; \ echo $(RAPPER) -q -o rdfxml-abbrev $(srcdir)/$$test $$baseuri '>' $$name-rdfxmla.rdf; \ -- cgit v1.2.1