diff options
author | Karl Williamson <khw@cpan.org> | 2022-08-21 09:57:34 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2022-08-22 06:52:08 -0600 |
commit | 1cdf84d21594fdf7969027ab387c8dc03c48f45f (patch) | |
tree | 6bf32f6a678f799cadc4cb02440a954b9f3b345b /utf8.h | |
parent | a0eeb33971b86f17825611fd35b697d0a285f5ab (diff) | |
download | perl-1cdf84d21594fdf7969027ab387c8dc03c48f45f.tar.gz |
Add utf8ness_t typedef
This will be used in future commits
Diffstat (limited to 'utf8.h')
-rw-r--r-- | utf8.h | 77 |
1 files changed, 77 insertions, 0 deletions
@@ -32,6 +32,83 @@ #ifndef PERL_UTF8_H_ /* Guard against recursive inclusion */ #define PERL_UTF8_H_ 1 +/* +=for apidoc Ay||utf8ness_t + +This typedef is used by several core functions that return PV strings, to +indicate the UTF-8ness of those strings. + +(If you write a new function, you probably should instead return the PV in an +SV with the UTF-8 flag of the SV properly set, rather than use this mechanism.) + +The possible values this can be are: + +=over + +=item C<UTF8NESS_YES> + +This means the string definitely should be treated as a sequence of +UTF-8-encoded characters. + +Most code that needs to handle this typedef should be of the form: + + if (utf8ness_flag == UTF8NESS_YES) { + treat as utf8; // like turning on an SV UTF-8 flag + } + +=item C<UTF8NESS_NO> + +This means the string definitely should be treated as a sequence of bytes, not +encoded as UTF-8. + +=item C<UTF8NESS_IMMATERIAL> + +This means it is equally valid to treat the string as bytes, or as UTF-8 +characters; use whichever way you want. This happens when the string consists +entirely of characters which have the same representation whether encoded in +UTF-8 or not. + +=item C<UTF8NESS_UNKNOWN> + +This means it is unknown how the string should be treated. No core function +will ever return this value to a non-core caller. Instead, it is used by the +caller to initialize a variable to a non-legal value. A typical call will look like: + + utf8ness_t string_is_utf8 = UTF8NESS_UNKNOWN + const char * string = foo(arg1, arg2, ..., &string_is_utf8); + if (string_is_utf8 == UTF8NESS_YES) { + do something for UTF-8; + } + +=back + +The following relationships hold between the enum values: + +=over + +=item S<C<0 E<lt>= I<enum value> E<lt>= UTF8NESS_IMMATERIAL>> + +the string may be treated in code as non-UTF8 + +=item S<C<UTF8NESS_IMMATERIAL E<lt>= <I<enum value>>> + +the string may be treated in code as encoded in UTF-8 + +=back + +=cut +*/ + +typedef enum { + UTF8NESS_NO = 0, /* Definitely not UTF-8 */ + UTF8NESS_IMMATERIAL = 1, /* Representation is the same in UTF-8 as + not, so the UTF8ness doesn't actually + matter */ + UTF8NESS_YES = 2, /* Defintely is UTF-8, wideness + unspecified */ + UTF8NESS_UNKNOWN = (STRLEN) -1, /* Undetermined so far */ +} utf8ness_t; + /* Use UTF-8 as the default script encoding? * Turning this on will break scripts having non-UTF-8 binary * data (such as Latin-1) in string literals. */ |