From ffb95239aacf86d8dc622a438bdaacfac4a66efc Mon Sep 17 00:00:00 2001 From: Rob Browning Date: Sun, 12 Mar 2023 14:26:10 -0500 Subject: scm_i_utf8_string_hash: compute u8 chars not bytes Noticed while investigating a migration to utf-8 strings. After making changes that routed non-ascii symbol hashing through this function, encoding-iso88597.test began intermittently failing because it would traverse trailing garbage when u8_strnlen reported 8 chars instead of 4. Change the scm_i_str2symbol and scm_i_str2uninterned_symbol internal hash type to unsigned long to explicitly match the scm_i_string_hash result type. * libguile/hash.c (scm_i_utf8_string_hash): Call u8_mbsnlen not u8_strnlen. * libguile/symbols.c (scm_i_str2symbol, scm_i_str2uninterned_symbol): Use unsigned long for scm_i_string_hash result. * test-suite/standalone/.gitignore: Add test-hashing. * test-suite/standalone/Makefile.am: Add test-hashing. * test-suite/standalone/test-hashing.c: Add. --- NEWS | 12 +++++++ libguile/hash.c | 2 +- libguile/symbols.c | 4 +-- test-suite/standalone/.gitignore | 1 + test-suite/standalone/Makefile.am | 7 ++++ test-suite/standalone/test-hashing.c | 63 ++++++++++++++++++++++++++++++++++++ 6 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 test-suite/standalone/test-hashing.c diff --git a/NEWS b/NEWS index a0009406f..a55cb583b 100644 --- a/NEWS +++ b/NEWS @@ -21,6 +21,18 @@ definitely unused---this is notably the case for modules that are only used at macro-expansion time, such as (srfi srfi-26). In those cases, the compiler reports it as "possibly unused". +* Bug fixes + +* Hashing of UTF-8 symbols with non-ASCII characters avoids corruption + +This issue could cause `scm_from_utf8_symbol' and +`scm_from_utf8_symboln` to incorrectly conclude that the symbol hadn't +already been interned, and then create a new one, which of course +wouldn't be `eq?' to the other(s). The incorrect hash was the result of +a buffer overrun, and so might vary. This problem affected a number of +other operations, given the internal use of those functions. +() + Changes in 3.0.9 (since 3.0.8) diff --git a/libguile/hash.c b/libguile/hash.c index c192ac2e5..5abdfe397 100644 --- a/libguile/hash.c +++ b/libguile/hash.c @@ -185,7 +185,7 @@ scm_i_utf8_string_hash (const char *str, size_t len) /* Invalid UTF-8; punt. */ return scm_i_string_hash (scm_from_utf8_stringn (str, len)); - length = u8_strnlen (ustr, len); + length = u8_mbsnlen (ustr, len); /* Set up the internal state. */ a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + 47; diff --git a/libguile/symbols.c b/libguile/symbols.c index 02be7c1c4..086abf585 100644 --- a/libguile/symbols.c +++ b/libguile/symbols.c @@ -239,7 +239,7 @@ static SCM scm_i_str2symbol (SCM str) { SCM symbol; - size_t raw_hash = scm_i_string_hash (str); + unsigned long raw_hash = scm_i_string_hash (str); symbol = lookup_interned_symbol (str, raw_hash); if (scm_is_true (symbol)) @@ -261,7 +261,7 @@ scm_i_str2symbol (SCM str) static SCM scm_i_str2uninterned_symbol (SCM str) { - size_t raw_hash = scm_i_string_hash (str); + unsigned long raw_hash = scm_i_string_hash (str); return scm_i_make_symbol (str, SCM_I_F_SYMBOL_UNINTERNED, raw_hash); } diff --git a/test-suite/standalone/.gitignore b/test-suite/standalone/.gitignore index 794146e60..f38f7fbe2 100644 --- a/test-suite/standalone/.gitignore +++ b/test-suite/standalone/.gitignore @@ -1,5 +1,6 @@ /test-conversion /test-gh +/test-hashing /test-list /test-num2integral /test-round diff --git a/test-suite/standalone/Makefile.am b/test-suite/standalone/Makefile.am index 547241afa..17bb59a18 100644 --- a/test-suite/standalone/Makefile.am +++ b/test-suite/standalone/Makefile.am @@ -167,6 +167,13 @@ test_conversion_LDADD = $(LIBGUILE_LDADD) $(top_builddir)/lib/libgnu.la check_PROGRAMS += test-conversion TESTS += test-conversion +# test-hashing +test_hashing_SOURCES = test-hashing.c +test_hashing_CFLAGS = ${test_cflags} +test_hashing_LDADD = $(LIBGUILE_LDADD) $(top_builddir)/lib/libgnu.la +check_PROGRAMS += test-hashing +TESTS += test-hashing + # test-loose-ends test_loose_ends_SOURCES = test-loose-ends.c test_loose_ends_CFLAGS = ${test_cflags} diff --git a/test-suite/standalone/test-hashing.c b/test-suite/standalone/test-hashing.c new file mode 100644 index 000000000..5982a0fdb --- /dev/null +++ b/test-suite/standalone/test-hashing.c @@ -0,0 +1,63 @@ +/* Copyright 2022-2023 + Free Software Foundation, Inc. + + This file is part of Guile. + + Guile is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Guile is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with Guile. If not, see + . */ + +#if HAVE_CONFIG_H +# include +#endif + +#include + +#include + +static void +test_hashing () +{ + // Make sure a utf-8 symbol has the expected hash. In addition to + // catching algorithmic regressions, this would have caught a + // long-standing buffer overflow. + + // Περί + char about_u8[] = {0xce, 0xa0, 0xce, 0xb5, 0xcf, 0x81, 0xce, 0xaf, 0}; + SCM sym = scm_from_utf8_symbol (about_u8); + + // Value determined by calling wide_string_hash on {0x3A0, 0x3B5, + // 0x3C1, 0x3AF} via a temporary test program. + const unsigned long expect = 4029223418961680680; + const unsigned long actual = scm_to_ulong (scm_symbol_hash (sym)); + + if (actual != expect) + { + fprintf (stderr, "fail: unexpected utf-8 symbol hash (%lu != %lu)\n", + actual, expect); + exit (EXIT_FAILURE); + } +} + +static void +tests (void *data, int argc, char **argv) +{ + test_hashing (); +} + +int +main (int argc, char *argv[]) +{ + scm_boot_guile (argc, argv, tests, NULL); + return 0; +} -- cgit v1.2.1