Add my_strftime8()

This is like plain my_strftime(), but additionally returns an indication of the UTF-8ness of the returned string
author: Karl Williamson <khw@cpan.org> 2021-02-18 16:08:19 -0700
committer: Karl Williamson <khw@cpan.org> 2022-08-22 06:52:08 -0600
commit: 7b12ce912aaddd1bd2f0dfc4cef735d45600d0fa (patch)
tree: e9cc3ecc92cb91fcc4b2432c24d6b0247dd9d5b4
parent: 692d08c5148904dec25634369d9e81ede7ff18ee (diff)
download: perl-7b12ce912aaddd1bd2f0dfc4cef735d45600d0fa.tar.gz
7 files changed, 66 insertions, 18 deletions
diff --git a/embed.fnc b/embed.fnc
index ac55565e05..a13820c68e 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1449,7 +1449,12 @@ Cpd	|PerlIO*|my_popen_list	|NN const char* mode|int n|NN SV ** args
 Apd	|void	|my_setenv	|NULLOK const char* nam|NULLOK const char* val
 m	|I32	|my_stat
 pX	|I32	|my_stat_flags	|NULLOK const U32 flags
-Adfp	|char *	|my_strftime	|NN const char *fmt|int sec|int min|int hour|int mday|int mon|int year|int wday|int yday|int isdst
+Adfp	|char *	|my_strftime	|NN const char *fmt|int sec|int min|int hour \
+				|int mday|int mon|int year|int wday|int yday \
+				|int isdst
+Adfp	|char *	|my_strftime8	|NN const char *fmt|int sec|int min|int hour \
+				|int mday|int mon|int year|int wday|int yday \
+				|int isdst|NULLOK utf8ness_t * utf8ness
 : Used in pp_ctl.c
 p	|void	|my_unexec
 ApR	|OP*	|newANONLIST	|NULLOK OP* o
diff --git a/embed.h b/embed.h
index ae37387b29..46b743a86c 100644
--- a/embed.h
+++ b/embed.h
@@ -327,6 +327,7 @@
 #define my_setenv(a,b)		Perl_my_setenv(aTHX_ a,b)
 #define my_socketpair		Perl_my_socketpair
 #define my_strftime(a,b,c,d,e,f,g,h,i,j)	Perl_my_strftime(aTHX_ a,b,c,d,e,f,g,h,i,j)
+#define my_strftime8(a,b,c,d,e,f,g,h,i,j,k)	Perl_my_strftime8(aTHX_ a,b,c,d,e,f,g,h,i,j,k)
 #define my_strtod		Perl_my_strtod
 #define newANONATTRSUB(a,b,c,d)	Perl_newANONATTRSUB(aTHX_ a,b,c,d)
 #define newANONHASH(a)		Perl_newANONHASH(aTHX_ a)
diff --git a/ext/POSIX/POSIX.xs b/ext/POSIX/POSIX.xs
index 062fa309b4..69250a8408 100644
--- a/ext/POSIX/POSIX.xs
+++ b/ext/POSIX/POSIX.xs
@@ -3577,29 +3577,19 @@ strftime(fmt, sec, min, hour, mday, mon, year, wday = -1, yday = -1, isdst = -1)
 	{
 	    char *buf;
             SV *sv;
+            utf8ness_t is_utf8;
 
             /* allowing user-supplied (rather than literal) formats
              * is normally frowned upon as a potential security risk;
              * but this is part of the API so we have to allow it */
             GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
-	    buf = my_strftime(SvPV_nolen(fmt), sec, min, hour, mday, mon, year, wday, yday, isdst);
+	    buf = my_strftime8(SvPV_nolen(fmt), sec, min, hour, mday, mon, year, wday, yday, isdst, &is_utf8);
             GCC_DIAG_RESTORE_STMT;
             sv = sv_newmortal();
 	    if (buf) {
                 STRLEN len = strlen(buf);
 		sv_usepvn_flags(sv, buf, len, SV_HAS_TRAILING_NUL);
-		if (       SvUTF8(fmt)
-                    || (   is_utf8_non_invariant_string((U8*) buf, len)
-#ifdef USE_LOCALE_TIME
-                        && _is_cur_LC_category_utf8(LC_TIME)
-#else   /* If can't check directly, at least can see if script is consistent,
-           under UTF-8, which gives us an extra measure of confidence. */
-
-                        && isSCRIPT_RUN((const U8 *) buf,
-                                        (const U8 *) buf + len,
-                                        TRUE) /* Means assume UTF-8 */
-#endif
-                )) {
+		if (SvUTF8(fmt) || is_utf8 == UTF8NESS_YES) {
 		    SvUTF8_on(sv);
 		}
             }
diff --git a/locale.c b/locale.c
index 41f9ef1bdb..05f26e3814 100644
--- a/locale.c
+++ b/locale.c
@@ -4240,6 +4240,39 @@ S_my_langinfo_i(pTHX_
 
 #endif      /* USE_LOCALE */
 
+char *
+Perl_my_strftime8(pTHX_ const char *fmt, int sec, int min, int hour, int mday,
+                         int mon, int year, int wday, int yday, int isdst,
+                         utf8ness_t * utf8ness)
+{   /* Documented in util.c */
+    char * retval = my_strftime(fmt, sec, min, hour, mday, mon, year, wday,
+                                yday, isdst);
+
+    PERL_ARGS_ASSERT_MY_STRFTIME8;
+
+    if (utf8ness) {
+
+#ifdef USE_LOCALE_TIME
+        *utf8ness = get_locale_string_utf8ness_i(NULL, LC_TIME_INDEX_,
+                                                 retval, LOCALE_UTF8NESS_UNKNOWN);
+#else
+        *utf8ness = UTF8NESS_IMMATERIAL;
+#endif
+
+    }
+
+    DEBUG_Lv(PerlIO_printf(Perl_debug_log, "fmt=%s, retval=%s", fmt,
+                 ((is_utf8_string((U8 *) retval, 0))
+                  ? retval
+                  :_byte_dump_string((U8 *) retval, strlen(retval), 0)));
+             if (utf8ness) PerlIO_printf(Perl_debug_log, "; utf8ness=%d",
+                                                         (int) *utf8ness);
+             PerlIO_printf(Perl_debug_log, "\n");
+            );
+
+    return retval;
+}
+
 /*
  * Initialize locale awareness.
  */
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index ceadb9f66e..82db679dd9 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -352,6 +352,13 @@ A new API function L<perlapi/C<Perl_localeconv>> is added.  This is the
 same as L<C<POSIX::localeconv>|POSIX/localeconv> (returning a hash of
 the C<localeconv()>> fields), but directly callable from XS code.
 
+=item *
+
+A new API function L<perlapi/C<my_strftime8>> is added.  This is the
+same as plain L<perlapi/C<my_strftime>>, but with an extra parameter
+that allows the caller to simply and reliably know if the returned
+string is UTF-8.
+
 =back
 
 =head1 Selected Bug Fixes
diff --git a/proto.h b/proto.h
index b0560d7e77..be82a7f480 100644
--- a/proto.h
+++ b/proto.h
@@ -2554,6 +2554,11 @@ PERL_CALLCONV char *	Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, i
 #define PERL_ARGS_ASSERT_MY_STRFTIME	\
 	assert(fmt)
 
+PERL_CALLCONV char *	Perl_my_strftime8(pTHX_ const char *fmt, int sec, int min, int hour, int mday, int mon, int year, int wday, int yday, int isdst, utf8ness_t * utf8ness)
+			__attribute__format__(__strftime__,pTHX_1,0);
+#define PERL_ARGS_ASSERT_MY_STRFTIME8	\
+	assert(fmt)
+
 PERL_CALLCONV NV	Perl_my_strtod(const char * const s, char ** e)
 			__attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_MY_STRTOD	\
diff --git a/util.c b/util.c
index 231daf06da..930ae8ea60 100644
--- a/util.c
+++ b/util.c
@@ -4189,16 +4189,23 @@ Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, int hour, int mday, in
 
 /*
 =for apidoc_section $time
-=for apidoc my_strftime
+=for apidoc      my_strftime
+=for apidoc_item my_strftime8
+
 strftime(), but with a different API so that the return value is a pointer
 to the formatted result (which MUST be arranged to be FREED BY THE
-CALLER).  This allows this function to increase the buffer size as needed,
+CALLER).  This allows these functions to increase the buffer size as needed,
 so that the caller doesn't have to worry about that.
 
-Note that yday and wday effectively are ignored by this function, as
+C<my_strftime8> is the same as plain C<my_strftime>, but has an extra
+parameter, a pointer to a variable declared as L</C<utf8ness_t>>.
+Upon return, its variable will be set to indicate how the resultant string
+should be treated with regards to its UTF-8ness.
+
+Note that yday and wday effectively are ignored by these functions, as
 mini_mktime() overwrites them
 
-Also note that this is always executed in the underlying locale of the program,
+Also note that they are always executed in the underlying locale of the program,
 giving localized results.
 
 =cut
author	Karl Williamson <khw@cpan.org>	2021-02-18 16:08:19 -0700
committer	Karl Williamson <khw@cpan.org>	2022-08-22 06:52:08 -0600
commit	7b12ce912aaddd1bd2f0dfc4cef735d45600d0fa (patch)
tree	e9cc3ecc92cb91fcc4b2432c24d6b0247dd9d5b4
parent	692d08c5148904dec25634369d9e81ede7ff18ee (diff)
download	perl-7b12ce912aaddd1bd2f0dfc4cef735d45600d0fa.tar.gz