summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2020-01-03 22:16:42 -0700
committerKarl Williamson <khw@cpan.org>2020-02-19 22:09:48 -0700
commitd2c9cb5392e8c58c3bb1935fc3c098737224567c (patch)
tree647a4398f465b3b0751440c59747aea104c989ac
parent4829f32decd128e6a122bd8ce35fe944bd87f104 (diff)
downloadperl-d2c9cb5392e8c58c3bb1935fc3c098737224567c.tar.gz
POSIX::mblen() Make thread-safe; allow shift state control
This commit changes the behavior so that it takes a scalar parameter instead of a char *, and thus might not be forceable into a valid PV. When not a PV, the shift state is reinitialized, like calling mblen with a NULL first parameter. Previously the shift state was always reinitialized with every call, which meant this could not work on locales with shift states. This commit also changes to use mbrlen() on threaded perls transparently (mostly), when available, to achieve thread-safe operation. It is not completely transparent because mbrlen (under the very rare stateful locales) returns a different value when it's resetting the shift state. It also may set errno differently upon errors, and no effort is made to hide that difference. Also mbrlen on some platforms can handle partial characters. [perl #133928] showed that someone was having trouble with shift states.
-rw-r--r--embedvar.h1
-rw-r--r--ext/POSIX/POSIX.xs60
-rw-r--r--ext/POSIX/lib/POSIX.pod30
-rw-r--r--ext/POSIX/t/mb.t23
-rw-r--r--intrpvar.h4
-rw-r--r--locale.c6
-rw-r--r--perl.h14
-rw-r--r--pod/perldelta.pod17
-rw-r--r--sv.c4
-rw-r--r--t/porting/known_pod_issues.dat1
10 files changed, 127 insertions, 33 deletions
diff --git a/embedvar.h b/embedvar.h
index 63a741edb6..3970f5a42b 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -203,6 +203,7 @@
#define PL_markstack_ptr (vTHX->Imarkstack_ptr)
#define PL_max_intro_pending (vTHX->Imax_intro_pending)
#define PL_maxsysfd (vTHX->Imaxsysfd)
+#define PL_mbrlen_ps (vTHX->Imbrlen_ps)
#define PL_memory_debug_header (vTHX->Imemory_debug_header)
#define PL_mess_sv (vTHX->Imess_sv)
#define PL_min_intro_pending (vTHX->Imin_intro_pending)
diff --git a/ext/POSIX/POSIX.xs b/ext/POSIX/POSIX.xs
index c8d6e8e3f9..d8b2605e89 100644
--- a/ext/POSIX/POSIX.xs
+++ b/ext/POSIX/POSIX.xs
@@ -1542,11 +1542,9 @@ END_EXTERN_C
#define waitpid(a,b,c) not_here("waitpid")
#endif
-#ifndef HAS_MBLEN
-#ifndef mblen
+#if ! defined(HAS_MBLEN) && ! defined(HAS_MBRLEN)
#define mblen(a,b) not_here("mblen")
#endif
-#endif
#ifndef HAS_MBTOWC
#define mbtowc(pwc, s, n) not_here("mbtowc")
#endif
@@ -3342,30 +3340,54 @@ write(fd, buffer, nbytes)
void
abort()
-#ifdef I_WCHAR
-# include <wchar.h>
+#if defined(HAS_MBRLEN) && (defined(USE_ITHREADS) || ! defined(HAS_MBLEN))
+# define USE_MBRLEN
+#else
+# undef USE_MBRLEN
#endif
int
mblen(s, n)
- char * s
+ SV * s
size_t n
- PREINIT:
-#if defined(USE_ITHREADS) && defined(HAS_MBRLEN)
- mbstate_t ps;
-#endif
CODE:
-#if defined(USE_ITHREADS) && defined(HAS_MBRLEN)
- memset(&ps, 0, sizeof(ps)); /* Initialize state */
- RETVAL = mbrlen(s, n, &ps); /* Prefer reentrant version */
-#else
- /* This might prevent some races, but locales can be switched out
- * without locking, so this isn't a cure all */
- LOCALE_LOCK;
+ errno = 0;
- RETVAL = mblen(s, n);
- LOCALE_UNLOCK;
+ SvGETMAGIC(s);
+ if (! SvOK(s)) {
+#ifdef USE_MBRLEN
+ /* Initialize the shift state in PL_mbrlen_ps. The Standard says
+ * that should be all zeros. */
+ memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
+ RETVAL = 0;
+#else
+ LOCALE_LOCK;
+ RETVAL = mblen(NULL, 0);
+ LOCALE_UNLOCK;
#endif
+ }
+ else { /* Not resetting state */
+ SV * byte_s = sv_2mortal(newSVsv_nomg(s));
+ if (! sv_utf8_downgrade_nomg(byte_s, TRUE)) {
+ SETERRNO(EINVAL, LIB_INVARG);
+ RETVAL = -1;
+ }
+ else {
+ size_t len;
+ char * string = SvPV(byte_s, len);
+#ifdef USE_MBRLEN
+ RETVAL = (SSize_t) mbrlen(string, len, &PL_mbrlen_ps);
+ if (RETVAL < 0) RETVAL = -1; /* Use mblen() ret code for
+ transparency */
+#else
+ /* Locking prevents races, but locales can be switched out
+ * without locking, so this isn't a cure all */
+ LOCALE_LOCK;
+ RETVAL = mblen(string, len);
+ LOCALE_UNLOCK;
+#endif
+ }
+ }
OUTPUT:
RETVAL
diff --git a/ext/POSIX/lib/POSIX.pod b/ext/POSIX/lib/POSIX.pod
index 78eb4f18e5..9c36c798e4 100644
--- a/ext/POSIX/lib/POSIX.pod
+++ b/ext/POSIX/lib/POSIX.pod
@@ -1069,13 +1069,29 @@ Not implemented. C<malloc()> is C-specific. Perl does memory management transp
=item C<mblen>
-This is identical to the C function C<mblen()>.
-
-Core Perl does not have any support for the wide and multibyte
-characters of the C standards, except under UTF-8 locales, so this might
-be a rather useless function.
-
-However, Perl supports Unicode, see L<perluniintro>.
+This is the same as the C function C<mblen()> on unthreaded perls. On
+threaded perls, it transparently (almost) substitutes the more
+thread-safe L<C<mbrlen>(3)>, if available, instead of C<mblen>.
+
+Core Perl does not have any support for wide and multibyte locales,
+except Unicode UTF-8 locales. This function, in conjunction with
+L</mbtowc> and L</wctomb> may be used to roll your own decoding/encoding
+of other types of multi-byte locales.
+
+Use C<undef> as the first parameter to this function to get the effect
+of passing NULL as the first parameter to C<mblen>. This resets any
+shift state to its initial value. The return value is undefined if
+C<mbrlen> was substituted, so you should never rely on it.
+
+When the first parameter is a scalar containing a value that either is a
+PV string or can be forced into one, the return value is the number of
+bytes occupied by the first character of that string; or 0 if that first
+character is the wide NUL character; or negative if there is an error.
+This is based on the locale that currently underlies the program,
+regardless of whether or not the function is called from Perl code that
+is within the scope of S<C<use locale>>. Perl makes no attempt at
+hiding from your code any differences in the C<errno> setting between
+C<mblen> and C<mbrlen>. It does set C<errno> to 0 before calling them.
See L</mblen>.
diff --git a/ext/POSIX/t/mb.t b/ext/POSIX/t/mb.t
index 053693e611..629f4776de 100644
--- a/ext/POSIX/t/mb.t
+++ b/ext/POSIX/t/mb.t
@@ -19,20 +19,20 @@ BEGIN {
require 'test.pl';
}
-plan tests => 4;
+plan tests => 5;
use POSIX qw();
SKIP: {
- skip("mblen() not present", 4) unless $Config{d_mblen};
+ skip("mblen() not present", 6) unless $Config{d_mblen};
is(&POSIX::mblen("a", &POSIX::MB_CUR_MAX), 1, 'mblen() basically works');
- skip("LC_CTYPE locale support not available", 3)
+ skip("LC_CTYPE locale support not available", 4)
unless locales_enabled('LC_CTYPE');
my $utf8_locale = find_utf8_ctype_locale();
- skip("no utf8 locale available", 3) unless $utf8_locale;
+ skip("no utf8 locale available", 4) unless $utf8_locale;
local $ENV{LC_CTYPE} = $utf8_locale;
local $ENV{LC_ALL};
@@ -44,17 +44,26 @@ SKIP: {
SKIP: {
my ($major, $minor, $rest) = $Config{osvers} =~ / (\d+) \. (\d+) .* /x;
- skip("mblen() broken (at least for c.utf8) on early HP-UX", 2)
+ skip("mblen() broken (at least for c.utf8) on early HP-UX", 3)
if $Config{osname} eq 'hpux'
&& $major < 11 || ($major == 11 && $minor < 31);
+
fresh_perl_is(
- 'use POSIX; print &POSIX::mblen("'
+ 'use POSIX; &POSIX::mblen(undef,0); print &POSIX::mblen("'
. I8_to_native("\x{c3}\x{28}")
. '", 2)',
-1, {}, 'mblen() recognizes invalid multibyte characters');
fresh_perl_is(
- 'use POSIX; print &POSIX::mblen("\N{GREEK SMALL LETTER SIGMA}", 2)',
+ 'use POSIX; &POSIX::mblen(undef,0);
+ my $sigma = "\N{GREEK SMALL LETTER SIGMA}";
+ utf8::encode($sigma);
+ print &POSIX::mblen($sigma, 2)',
2, {}, 'mblen() works on UTF-8 characters');
+
+ fresh_perl_is(
+ 'use POSIX; &POSIX::mblen(undef,0);
+ my $wide; print &POSIX::mblen("\N{GREEK SMALL LETTER SIGMA}", 1);',
+ -1, {}, 'mblen() returns -1 when input length is too short');
}
}
diff --git a/intrpvar.h b/intrpvar.h
index ff238ab9ff..39bc99d898 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -938,6 +938,10 @@ PERLVARI(I, InBitmap, SV *, NULL)
PERLVAR(I, CCC_non0_non230, SV *)
PERLVAR(I, Private_Use, SV *)
+#ifdef HAS_MBRLEN
+PERLVAR(I, mbrlen_ps, mbstate_t)
+#endif
+
/* If you are adding a U8 or U16, check to see if there are 'Space' comments
* above on where there are gaps which currently will be structure padding. */
diff --git a/locale.c b/locale.c
index 482a533f86..787474b9a7 100644
--- a/locale.c
+++ b/locale.c
@@ -3461,6 +3461,12 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
# endif
# endif /* DEBUGGING */
+ /* Initialize the per-thread mbrFOO() state variable. See POSIX.xs for
+ * why this particular incantation is used. */
+#ifdef HAS_MBRLEN
+ memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
+#endif
+
/* Initialize the cache of the program's UTF-8ness for the always known
* locales C and POSIX */
my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
diff --git a/perl.h b/perl.h
index 65009e117d..8813a51bcf 100644
--- a/perl.h
+++ b/perl.h
@@ -803,6 +803,20 @@ out of them.
#include <sys/types.h>
+/* EVC 4 SDK headers includes a bad definition of MB_CUR_MAX in stdlib.h
+ which is included from stdarg.h. Bad definition not present in SD 2008
+ SDK headers. wince.h is not yet included, so we cant fix this from there
+ since by then MB_CUR_MAX will be defined from stdlib.h.
+ cewchar.h includes a correct definition of MB_CUR_MAX and it is copied here
+ since cewchar.h can't be included this early */
+#if defined(UNDER_CE) && (_MSC_VER < 1300)
+# define MB_CUR_MAX 1uL
+#endif
+
+# ifdef I_WCHAR
+# include <wchar.h>
+# endif
+
# include <stdarg.h>
#ifdef I_STDINT
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index f76af50567..c6cad67e71 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -54,6 +54,23 @@ patterns using the above syntaxes, as an alternative to C<\N{...}>.
A comparison of the two methods is given in
L<perlunicode/Comparison of \N{...} and \p{name=...}>.
+=head2 The C<POSIX::mblen()> function now works on shift state locales
+and is thread-safe on C99 and above compilers
+when executed on a platform that has locale thread-safety.
+
+This function is always executed under the current C language locale.
+(See L<perllocale>.) Most locales are stateless, but a few, notably the
+very rarely encountered ISO 2022, maintain a state between calls to this
+function. Previously the state was cleared on every call to this
+function, but now the state is not reset unless the first parameter is
+C<undef>.
+
+On threaded perls, the C99 function L<mbrlen(3)>,
+when available, is substituted for plain
+C<mblen>.
+This makes this function thread-safe when executing on a locale
+thread-safe platform.
+
=head1 Security
XXX Any security-related notices go here. In particular, any security
diff --git a/sv.c b/sv.c
index f464065e90..3c533b08d2 100644
--- a/sv.c
+++ b/sv.c
@@ -15688,6 +15688,10 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
# endif
#endif /* !USE_LOCALE_NUMERIC */
+#ifdef HAS_MBRLEN
+ PL_mbrlen_ps = proto_perl->Imbrlen_ps;
+#endif
+
PL_langinfo_buf = NULL;
PL_langinfo_bufsize = 0;
diff --git a/t/porting/known_pod_issues.dat b/t/porting/known_pod_issues.dat
index ad18b675a8..195040af6c 100644
--- a/t/porting/known_pod_issues.dat
+++ b/t/porting/known_pod_issues.dat
@@ -200,6 +200,7 @@ Math::BigInt::Pari
Math::Random::MT::Perl
Math::Random::Secure
Math::TrulyRandom
+mbrlen(3)
md5sum(1)
Method::Signatures
mmap(2)