summaryrefslogtreecommitdiff
path: root/libguile/strings.c
diff options
context:
space:
mode:
authorAndy Wingo <wingo@pobox.com>2017-02-16 12:57:46 +0100
committerAndy Wingo <wingo@pobox.com>2017-02-16 13:11:29 +0100
commitd0934df1f2f0e5d3fa9a1a1f15e6f2dec1d15698 (patch)
tree4f15abdb93948be9df6510b8dd9cb658a56fb3ae /libguile/strings.c
parentc38b9625c88f4d1760068926273b6d89ffbd4527 (diff)
downloadguile-d0934df1f2f0e5d3fa9a1a1f15e6f2dec1d15698.tar.gz
Stringbufs immutable by default
* libguile/snarf.h (SCM_IMMUTABLE_STRINGBUF): Remove shared flag. Stringbufs are immutable by default. * libguile/strings.c: Rewrite blurb. Change to have stringbufs be immutable by default and mutable only when marked as such. Going mutable means making a private copy. (STRINGBUF_MUTABLE, STRINGBUF_F_MUTABLE): New definitions. (SET_STRINGBUF_SHARED): Remove. (scm_i_print_stringbuf): Simplify to just alias the stringbuf as-is. (substring_with_immutable_stringbuf): New helper. (scm_i_substring, scm_i_substring_read_only, scm_i_substring_copy): use new helper. (scm_i_string_ensure_mutable_x): New helper. (scm_i_substring_shared): Use scm_i_string_ensure_mutable_x. (stringbuf_write_mutex): Remove; yaaaaaaaay. (scm_i_string_start_writing): Use scm_i_string_ensure_mutable_x. No more mutex. (scm_i_string_stop_writing): Now a no-op. (scm_i_make_symbol): Use substring/copy. (scm_sys_string_dump, scm_sys_symbol_dump): Update. * libguile/strings.h (SCM_I_STRINGBUF_F_SHARED): Remove. (SCM_I_STRINGBUF_F_MUTABLE): Add. * module/system/vm/assembler.scm (link-data): Don't add shared flag any more. Existing compiled flags are harmless tho. * test-suite/tests/strings.test ("string internals"): Update.
Diffstat (limited to 'libguile/strings.c')
-rw-r--r--libguile/strings.c325
1 files changed, 127 insertions, 198 deletions
diff --git a/libguile/strings.c b/libguile/strings.c
index a153d29ab..e460a938d 100644
--- a/libguile/strings.c
+++ b/libguile/strings.c
@@ -54,40 +54,34 @@ SCM_SYMBOL (sym_UTF_8, "UTF-8");
SCM_SYMBOL (sym_ISO_8859_1, "ISO-8859-1");
SCM_SYMBOL (sym_error, "error");
-/* Stringbufs
- *
- * XXX - keeping an accurate refcount during GC seems to be quite
- * tricky, so we just keep score of whether a stringbuf might be
- * shared, not whether it definitely is.
- *
- * The scheme I (mvo) tried to keep an accurate reference count would
- * recount all strings that point to a stringbuf during the mark-phase
- * of the GC. This was done since one cannot access the stringbuf of
- * a string when that string is freed (in order to decrease the
- * reference count). The memory of the stringbuf might have been
- * reused already for something completely different.
- *
- * This recounted worked for a small number of threads beating on
- * cow-strings, but it failed randomly with more than 10 threads, say.
- * I couldn't figure out what went wrong, so I used the conservative
- * approach implemented below.
- *
- * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
- * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
- * strings.
- */
+/* A stringbuf is a linear buffer of characters. Every string has a
+ stringbuf. Strings may reference just a slice of a stringbuf; that's
+ often the case for strings made by the "substring" function.
+
+ Stringbufs may hold either 8-bit characters or 32-bit characters. In
+ either case the characters are Unicode codepoints. "Narrow"
+ stringbufs thus have the ISO-8859-1 (Latin-1) encoding, and "wide"
+ stringbufs have the UTF-32 (UCS-4) encoding.
+
+ By default, stringbufs are immutable. This enables an O(1)
+ "substring" operation with no synchronization. A string-set! will
+ first ensure that the string's stringbuf is mutable, copying the
+ stringbuf if necessary. This is therefore a copy-on-write
+ representation. However, taking a substring of a mutable stringbuf
+ is an O(n) operation as it has to create a new immutable stringbuf.
+ There are also mutation-sharing substrings as well. */
/* The size in words of the stringbuf header (type tag + size). */
#define STRINGBUF_HEADER_SIZE 2U
#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
-#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
+#define STRINGBUF_F_MUTABLE SCM_I_STRINGBUF_F_MUTABLE
#define STRINGBUF_TAG scm_tc7_stringbuf
-#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
+#define STRINGBUF_MUTABLE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_MUTABLE)
#define STRINGBUF_CONTENTS(buf) ((void *) \
SCM_CELL_OBJECT_LOC (buf, \
@@ -97,16 +91,6 @@ SCM_SYMBOL (sym_error, "error");
#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
-#define SET_STRINGBUF_SHARED(buf) \
- do \
- { \
- /* Don't modify BUF if it's already marked as shared since it might be \
- a read-only, statically allocated stringbuf. */ \
- if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
- SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
- } \
- while (0)
-
#ifdef SCM_STRING_LENGTH_HISTOGRAM
static size_t lenhist[1001];
#endif
@@ -231,8 +215,6 @@ narrow_stringbuf (SCM buf)
return new_buf;
}
-scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
-
/* Copy-on-write strings.
*/
@@ -267,15 +249,8 @@ scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
void
scm_i_print_stringbuf (SCM exp, SCM port, scm_print_state *pstate)
{
- SCM str;
-
- scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
- SET_STRINGBUF_SHARED (exp);
- scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
-
- str = scm_double_cell (RO_STRING_TAG, SCM_UNPACK(exp),
- 0, STRINGBUF_LENGTH (exp));
-
+ SCM str = scm_double_cell (STRING_TAG, SCM_UNPACK(exp), 0,
+ STRINGBUF_LENGTH (exp));
scm_puts ("#<stringbuf ", port);
scm_iprin1 (str, port, pstate);
scm_puts (">", port);
@@ -289,7 +264,6 @@ static void
init_null_stringbuf (void)
{
null_stringbuf = make_stringbuf (0);
- SET_STRINGBUF_SHARED (null_stringbuf);
}
/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
@@ -359,77 +333,110 @@ get_str_buf_start (SCM *str, SCM *buf, size_t *start)
*buf = STRING_STRINGBUF (*str);
}
-SCM
-scm_i_substring (SCM str, size_t start, size_t end)
+static SCM
+substring_with_immutable_stringbuf (SCM str, size_t start, size_t end,
+ int force_copy_p, int read_only_p)
{
- if (start == end)
- return scm_i_make_string (0, NULL, 0);
+ SCM buf;
+ size_t str_start, len;
+ scm_t_bits tag = read_only_p ? RO_STRING_TAG : STRING_TAG;
+
+ get_str_buf_start (&str, &buf, &str_start);
+ len = end - start;
+ start += str_start;
+
+ if (len == 0)
+ return scm_i_make_string (0, NULL, read_only_p);
+ else if (!force_copy_p && SCM_LIKELY (!STRINGBUF_MUTABLE (buf)))
+ return scm_double_cell (tag, SCM_UNPACK (buf), start, len);
else
{
- SCM buf;
- size_t str_start;
- get_str_buf_start (&str, &buf, &str_start);
- scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
- SET_STRINGBUF_SHARED (buf);
- scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
- return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
- (scm_t_bits)str_start + start,
- (scm_t_bits) end - start);
+ SCM new_buf, new_str;
+
+ if (STRINGBUF_WIDE (buf))
+ {
+ new_buf = make_wide_stringbuf (len);
+ u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
+ (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + start), len);
+ new_str = scm_double_cell (tag, SCM_UNPACK (new_buf), 0, len);
+ scm_i_try_narrow_string (new_str);
+ }
+ else
+ {
+ new_buf = make_stringbuf (len);
+ memcpy (STRINGBUF_CHARS (new_buf),
+ STRINGBUF_CHARS (buf) + start, len);
+ new_str = scm_double_cell (tag, SCM_UNPACK (new_buf), 0, len);
+ }
+
+ return new_str;
}
}
SCM
+scm_i_substring (SCM str, size_t start, size_t end)
+{
+ return substring_with_immutable_stringbuf (str, start, end, 0, 0);
+}
+
+SCM
scm_i_substring_read_only (SCM str, size_t start, size_t end)
{
- if (start == end)
- return scm_i_make_string (0, NULL, 1);
- else
- {
- SCM buf;
- size_t str_start;
- get_str_buf_start (&str, &buf, &str_start);
- scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
- SET_STRINGBUF_SHARED (buf);
- scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
- return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
- (scm_t_bits)str_start + start,
- (scm_t_bits) end - start);
- }
+ return substring_with_immutable_stringbuf (str, start, end, 0, 1);
}
SCM
scm_i_substring_copy (SCM str, size_t start, size_t end)
{
- if (start == end)
- return scm_i_make_string (0, NULL, 0);
- else
+ return substring_with_immutable_stringbuf (str, start, end, 1, 0);
+}
+
+static void
+scm_i_string_ensure_mutable_x (SCM str)
+{
+ SCM buf;
+
+ if (IS_SH_STRING (str))
{
- size_t len = end - start;
- SCM buf, my_buf, substr;
- size_t str_start;
- int wide = 0;
- get_str_buf_start (&str, &buf, &str_start);
- if (scm_i_is_narrow_string (str))
- {
- my_buf = make_stringbuf (len);
- memcpy (STRINGBUF_CHARS (my_buf),
- STRINGBUF_CHARS (buf) + str_start + start, len);
- }
- else
- {
- my_buf = make_wide_stringbuf (len);
- u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
- (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
- + start), len);
- wide = 1;
- }
- scm_remember_upto_here_1 (buf);
- substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
- (scm_t_bits) 0, (scm_t_bits) len);
- if (wide)
- scm_i_try_narrow_string (substr);
- return substr;
+ /* Shared-mutation strings always have mutable stringbufs. */
+ buf = STRING_STRINGBUF (SH_STRING_STRING (str));
+ if (!STRINGBUF_MUTABLE (buf))
+ abort ();
+ return;
}
+
+ if (IS_RO_STRING (str))
+ scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (str));
+
+ buf = STRING_STRINGBUF (str);
+
+ if (STRINGBUF_MUTABLE (buf))
+ return;
+
+ /* Otherwise copy and mark the fresh stringbuf as mutable. Note that
+ we copy the whole stringbuf so that the start/len offsets from the
+ original string keep working, so that concurrent accessors on this
+ string don't see things in an inconsistent state. */
+ {
+ SCM new_buf;
+ size_t len = STRINGBUF_LENGTH (buf);
+
+ if (STRINGBUF_WIDE (buf))
+ {
+ new_buf = make_wide_stringbuf (len);
+ u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
+ (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
+ }
+ else
+ {
+ new_buf = make_stringbuf (len);
+ memcpy (STRINGBUF_CHARS (new_buf), STRINGBUF_CHARS (buf), len);
+ }
+
+ SCM_SET_CELL_WORD_0 (new_buf,
+ SCM_CELL_WORD_0 (new_buf) | STRINGBUF_F_MUTABLE);
+ SET_STRING_STRINGBUF (str, new_buf);
+ }
}
SCM
@@ -439,6 +446,8 @@ scm_i_substring_shared (SCM str, size_t start, size_t end)
return str;
else if (start == end)
return scm_i_make_string (0, NULL, 0);
+ else if (IS_RO_STRING (str))
+ return scm_i_substring_read_only (str, start, end);
else
{
size_t len = end - start;
@@ -447,6 +456,9 @@ scm_i_substring_shared (SCM str, size_t start, size_t end)
start += STRING_START (str);
str = SH_STRING_STRING (str);
}
+
+ scm_i_string_ensure_mutable_x (str);
+
return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
(scm_t_bits)start, (scm_t_bits) len);
}
@@ -568,60 +580,13 @@ scm_i_string_wide_chars (SCM str)
scm_list_1 (str));
}
-/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
+/* If the buffer in ORIG_STR is immutable, copy ORIG_STR's characters to
a new string buffer, so that it can be modified without modifying
- other strings. Also, lock the string mutex. Later, one must call
- scm_i_string_stop_writing to unlock the mutex. */
+ other strings. */
SCM
scm_i_string_start_writing (SCM orig_str)
{
- SCM buf, str = orig_str;
- size_t start;
-
- get_str_buf_start (&str, &buf, &start);
- if (IS_RO_STRING (str))
- scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
-
- scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
- if (STRINGBUF_SHARED (buf))
- {
- /* Clone the stringbuf. */
- size_t len = STRING_LENGTH (str);
- SCM new_buf;
-
- scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
-
- if (scm_i_is_narrow_string (str))
- {
- new_buf = make_stringbuf (len);
- memcpy (STRINGBUF_CHARS (new_buf),
- STRINGBUF_CHARS (buf) + STRING_START (str), len);
-
- }
- else
- {
- new_buf = make_wide_stringbuf (len);
- u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
- (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
- + STRING_START (str)), len);
- }
-
- SET_STRING_STRINGBUF (str, new_buf);
- start -= STRING_START (str);
-
- /* FIXME: The following operations are not atomic, so other threads
- looking at STR may see an inconsistent state. Nevertheless it can't
- hurt much since (i) accessing STR while it is being mutated can't
- yield a crash, and (ii) concurrent accesses to STR should be
- protected by a mutex at the application level. The latter may not
- apply when STR != ORIG_STR, though. */
- SET_STRING_START (str, 0);
- SET_STRING_STRINGBUF (str, new_buf);
-
- buf = new_buf;
-
- scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
- }
+ scm_i_string_ensure_mutable_x (orig_str);
return orig_str;
}
@@ -661,7 +626,6 @@ scm_i_string_writable_wide_chars (SCM str)
void
scm_i_string_stop_writing (void)
{
- scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
}
/* Return the Xth character of STR as a UCS-4 codepoint. */
@@ -768,42 +732,10 @@ scm_i_make_symbol (SCM name, scm_t_bits flags,
unsigned long hash, SCM props)
{
SCM buf;
- size_t start = STRING_START (name);
size_t length = STRING_LENGTH (name);
- if (IS_SH_STRING (name))
- {
- name = SH_STRING_STRING (name);
- start += STRING_START (name);
- }
+ name = scm_i_substring_copy (name, 0, length);
buf = STRING_STRINGBUF (name);
-
- if (start == 0 && length == STRINGBUF_LENGTH (buf))
- {
- /* reuse buf. */
- scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
- SET_STRINGBUF_SHARED (buf);
- scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
- }
- else
- {
- /* make new buf. */
- if (scm_i_is_narrow_string (name))
- {
- SCM new_buf = make_stringbuf (length);
- memcpy (STRINGBUF_CHARS (new_buf),
- STRINGBUF_CHARS (buf) + start, length);
- buf = new_buf;
- }
- else
- {
- SCM new_buf = make_wide_stringbuf (length);
- u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
- (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
- length);
- buf = new_buf;
- }
- }
return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
(scm_t_bits) hash, SCM_UNPACK (props));
}
@@ -882,9 +814,6 @@ SCM
scm_i_symbol_substring (SCM sym, size_t start, size_t end)
{
SCM buf = SYMBOL_STRINGBUF (sym);
- scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
- SET_STRINGBUF_SHARED (buf);
- scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
(scm_t_bits)start, (scm_t_bits) end - start);
}
@@ -921,8 +850,8 @@ SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
"A new string containing this string's stringbuf's characters\n"
"@item stringbuf-length\n"
"The number of characters in this stringbuf\n"
- "@item stringbuf-shared\n"
- "@code{#t} if this stringbuf is shared\n"
+ "@item stringbuf-mutable\n"
+ "@code{#t} if this stringbuf is mutable\n"
"@item stringbuf-wide\n"
"@code{#t} if this stringbuf's characters are stored in a\n"
"32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
@@ -984,11 +913,11 @@ SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
}
e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
scm_from_size_t (STRINGBUF_LENGTH (buf)));
- if (STRINGBUF_SHARED (buf))
- e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
+ if (STRINGBUF_MUTABLE (buf))
+ e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-mutable"),
SCM_BOOL_T);
else
- e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
+ e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-mutable"),
SCM_BOOL_F);
if (STRINGBUF_WIDE (buf))
e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
@@ -1015,8 +944,8 @@ SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
"A new string containing this symbols's stringbuf's characters\n"
"@item stringbuf-length\n"
"The number of characters in this stringbuf\n"
- "@item stringbuf-shared\n"
- "@code{#t} if this stringbuf is shared\n"
+ "@item stringbuf-mutable\n"
+ "@code{#t} if this stringbuf is mutable\n"
"@item stringbuf-wide\n"
"@code{#t} if this stringbuf's characters are stored in a\n"
"32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
@@ -1057,11 +986,11 @@ SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
}
e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
scm_from_size_t (STRINGBUF_LENGTH (buf)));
- if (STRINGBUF_SHARED (buf))
- e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
+ if (STRINGBUF_MUTABLE (buf))
+ e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-mutable"),
SCM_BOOL_T);
else
- e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
+ e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-mutable"),
SCM_BOOL_F);
if (STRINGBUF_WIDE (buf))
e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),