summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2022-08-21 09:57:34 -0600
committer: Karl Williamson <khw@cpan.org> 2022-08-22 06:52:08 -0600
commit: 1cdf84d21594fdf7969027ab387c8dc03c48f45f (patch)
tree: 6bf32f6a678f799cadc4cb02440a954b9f3b345b /utf8.h
parent: a0eeb33971b86f17825611fd35b697d0a285f5ab (diff)
download: perl-1cdf84d21594fdf7969027ab387c8dc03c48f45f.tar.gz
Add utf8ness_t typedef
This will be used in future commits
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h77
1 files changed, 77 insertions, 0 deletions
diff --git a/utf8.h b/utf8.h
index c3ff2dd93f..e0b08ad6fb 100644
--- a/utf8.h
+++ b/utf8.h
@@ -32,6 +32,83 @@
#ifndef PERL_UTF8_H_ /* Guard against recursive inclusion */
#define PERL_UTF8_H_ 1
+/*
+=for apidoc Ay||utf8ness_t
+
+This typedef is used by several core functions that return PV strings, to
+indicate the UTF-8ness of those strings.
+
+(If you write a new function, you probably should instead return the PV in an
+SV with the UTF-8 flag of the SV properly set, rather than use this mechanism.)
+
+The possible values this can be are:
+
+=over
+
+=item C<UTF8NESS_YES>
+
+This means the string definitely should be treated as a sequence of
+UTF-8-encoded characters.
+
+Most code that needs to handle this typedef should be of the form:
+
+ if (utf8ness_flag == UTF8NESS_YES) {
+ treat as utf8; // like turning on an SV UTF-8 flag
+ }
+
+=item C<UTF8NESS_NO>
+
+This means the string definitely should be treated as a sequence of bytes, not
+encoded as UTF-8.
+
+=item C<UTF8NESS_IMMATERIAL>
+
+This means it is equally valid to treat the string as bytes, or as UTF-8
+characters; use whichever way you want. This happens when the string consists
+entirely of characters which have the same representation whether encoded in
+UTF-8 or not.
+
+=item C<UTF8NESS_UNKNOWN>
+
+This means it is unknown how the string should be treated. No core function
+will ever return this value to a non-core caller. Instead, it is used by the
+caller to initialize a variable to a non-legal value. A typical call will look like:
+
+ utf8ness_t string_is_utf8 = UTF8NESS_UNKNOWN;
+ const char * string = foo(arg1, arg2, ..., &string_is_utf8);
+ if (string_is_utf8 == UTF8NESS_YES) {
+ do something for UTF-8;
+ }
+
+=back
+
+The following relationships hold between the enum values:
+
+=over
+
+=item S<C<0 E<lt>= I<enum value> E<lt>= UTF8NESS_IMMATERIAL>>
+
+the string may be treated in code as non-UTF8
+
+=item S<C<UTF8NESS_IMMATERIAL E<lt>= I<enum value>>>
+
+the string may be treated in code as encoded in UTF-8
+
+=back
+
+=cut
+*/
+
+typedef enum {
+    UTF8NESS_NO         =  0, /* Definitely not UTF-8 */
+    UTF8NESS_IMMATERIAL =  1, /* Representation is the same in UTF-8 as
+                                 not, so the UTF8ness doesn't actually
+                                 matter */
+    UTF8NESS_YES        =  2, /* Definitely is UTF-8, wideness unspecified */
+    UTF8NESS_UNKNOWN    = -1, /* Undetermined so far.  Negative, and an int as
+                                 ISO C requires, so ordered tests exclude it */
+} utf8ness_t;
+
/* Use UTF-8 as the default script encoding?
* Turning this on will break scripts having non-UTF-8 binary
* data (such as Latin-1) in string literals. */