Add utf8ness_t typedef

This will be used in future commits
author: Karl Williamson <khw@cpan.org> 2022-08-21 09:57:34 -0600
committer: Karl Williamson <khw@cpan.org> 2022-08-22 06:52:08 -0600
commit: 1cdf84d21594fdf7969027ab387c8dc03c48f45f (patch)
tree: 6bf32f6a678f799cadc4cb02440a954b9f3b345b /utf8.h
parent: a0eeb33971b86f17825611fd35b697d0a285f5ab (diff)
download: perl-1cdf84d21594fdf7969027ab387c8dc03c48f45f.tar.gz
1 files changed, 77 insertions, 0 deletions
diff --git a/utf8.h b/utf8.h
index c3ff2dd93f..e0b08ad6fb 100644
--- a/utf8.h
+++ b/utf8.h
@@ -32,6 +32,83 @@
 #ifndef PERL_UTF8_H_      /* Guard against recursive inclusion */
 #define PERL_UTF8_H_ 1
 
+/*
+=for apidoc Ay||utf8ness_t
+
+This typedef is used by several core functions that return PV strings, to
+indicate the UTF-8ness of those strings.
+
+(If you write a new function, you probably should instead return the PV in an
+SV with the UTF-8 flag of the SV properly set, rather than use this mechanism.)
+
+The possible values this can be are:
+
+=over
+
+=item C<UTF8NESS_YES>
+
+This means the string definitely should be treated as a sequence of
+UTF-8-encoded characters.
+
+Most code that needs to handle this typedef should be of the form:
+
+ if (utf8ness_flag == UTF8NESS_YES) {
+     treat as utf8;  // like turning on an SV UTF-8 flag
+ }
+
+=item C<UTF8NESS_NO>
+
+This means the string definitely should be treated as a sequence of bytes, not
+encoded as UTF-8.
+
+=item C<UTF8NESS_IMMATERIAL>
+
+This means it is equally valid to treat the string as bytes, or as UTF-8
+characters; use whichever way you want.  This happens when the string consists
+entirely of characters which have the same representation whether encoded in
+UTF-8 or not.
+
+=item C<UTF8NESS_UNKNOWN>
+
+This means it is unknown how the string should be treated.  No core function
+will ever return this value to a non-core caller.  Instead, it is used by the
+caller to initialize a variable to a non-legal value.  A typical call will look like:
+
+ utf8ness_t string_is_utf8 = UTF8NESS_UNKNOWN;
+ const char * string = foo(arg1, arg2, ..., &string_is_utf8);
+ if (string_is_utf8 == UTF8NESS_YES) {
+    do something for UTF-8;
+ }
+
+=back
+
+The following relationships hold between the enum values:
+
+=over
+
+=item S<C<0 E<lt>= I<enum value> E<lt>= UTF8NESS_IMMATERIAL>>
+
+the string may be treated in code as non-UTF8
+
+=item S<C<UTF8NESS_IMMATERIAL E<lt>= I<enum value>>>
+
+the string may be treated in code as encoded in UTF-8
+
+=back
+
+=cut
+*/
+
+typedef enum {
+    UTF8NESS_NO               =  0,  /* Definitely not UTF-8 */
+    UTF8NESS_IMMATERIAL       =  1,  /* Same representation in UTF-8 as not,
+                                        so the UTF8ness doesn't matter */
+    UTF8NESS_YES              =  2,  /* Definitely is UTF-8, wideness
+                                        unspecified */
+    UTF8NESS_UNKNOWN          = -1,  /* Undetermined so far; -1 fits in an int
+                                        and sorts below every legal value */
+} utf8ness_t;
+
 /* Use UTF-8 as the default script encoding?
  * Turning this on will break scripts having non-UTF-8 binary
  * data (such as Latin-1) in string literals. */
author	Karl Williamson <khw@cpan.org>	2022-08-21 09:57:34 -0600
committer	Karl Williamson <khw@cpan.org>	2022-08-22 06:52:08 -0600
commit	1cdf84d21594fdf7969027ab387c8dc03c48f45f (patch)
tree	6bf32f6a678f799cadc4cb02440a954b9f3b345b /utf8.h
parent	a0eeb33971b86f17825611fd35b697d0a285f5ab (diff)
download	perl-1cdf84d21594fdf7969027ab387c8dc03c48f45f.tar.gz