diff options
author | Karl Williamson <khw@cpan.org> | 2021-03-27 13:20:09 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2022-06-12 09:50:10 -0600 |
commit | 33bdb9d365b93e59fc6d3c78402989659bb7ad37 (patch) | |
tree | a84352a27d220a90f197d021ceefbec825ee1164 /handy.h | |
parent | 8fd8ea433fb12e11f83365f8deb205f454abf46c (diff) | |
download | perl-33bdb9d365b93e59fc6d3c78402989659bb7ad37.tar.gz |
handy.h: Add layer for char classification/case change
This layer currently expands to just the layer below it, but that will
be changed in a future commit.
Diffstat (limited to 'handy.h')
-rw-r--r-- | handy.h | 140 |
1 files changed, 100 insertions, 40 deletions
@@ -1896,29 +1896,43 @@ END_EXTERN_C generic_isCC_A_(c, classnum) #endif -/* Use the libc versions for these if available. */ +/* Below are the definitions for the locale-sensitive character classification + * macros whose input domain is a byte, and the locale isn't UTF-8. These are + * as close as possible to the bare versions on the platform and still yield + * POSIX Standard-compliant results. + * + * There is currently only one place these definitions should be used, in + * certain function calls like Perl_iswordchar_() in inline.h. + * + * Most likely you want to use the macros a ways below with names like + * isALPHA_LC(). Rarely, you may want isU8_ALPHA_LC(), somewhat below. + * + * The first two aren't in C89, so the fallback is to use the non-locale + * sensitive versions; these are the same for all platforms */ #if defined(HAS_ISASCII) -# define isU8_ASCII_LC(c) isascii((U8) (c)) +# define is_porcelain_ASCII(c) isascii((U8) (c)) #else -# define isU8_ASCII_LC(c) isASCII(c) +# define is_porcelain_ASCII(c) isASCII(c) #endif #if defined(HAS_ISBLANK) -# define isU8_BLANK_LC(c) isblank((U8) (c)) +# define is_porcelain_BLANK(c) isblank((U8) (c)) #else -# define isU8_BLANK_LC(c) isBLANK(c) +# define is_porcelain_BLANK(c) isBLANK(c) #endif /* The next few are the same in all platforms. 
*/ -#define isU8_CNTRL_LC(c) iscntrl((U8) (c)) -#define isU8_IDFIRST_LC(c) (UNLIKELY((c) == '_') || isU8_ALPHA_LC(c)) -#define isU8_SPACE_LC(c) isspace((U8) (c)) -#define isU8_WORDCHAR_LC(c) (UNLIKELY((c) == '_') || isU8_ALPHANUMERIC_LC(c)) +#define is_porcelain_CNTRL(c) iscntrl((U8) (c)) +#define is_porcelain_IDFIRST(c) (UNLIKELY((c) == '_') || is_porcelain_ALPHA(c)) +#define is_porcelain_SPACE(c) isspace((U8) (c)) +#define is_porcelain_WORDCHAR(c) (UNLIKELY((c) == '_') || is_porcelain_ALPHANUMERIC(c)) -/* The base-leve case changing macros are also the same in all platforms */ -#define toU8_LOWER_LC(c) tolower((U8) (c)) -#define toU8_UPPER_LC(c) toupper((U8) (c)) -#define toU8_FOLD_LC(c) toU8_LOWER_LC(c) +/* The base-level case changing macros are also the same in all platforms */ +#define to_porcelain_LOWER(c) tolower((U8) (c)) +#define to_porcelain_UPPER(c) toupper((U8) (c)) +#define to_porcelain_FOLD(c) to_porcelain_LOWER(c) + +#ifdef WIN32 /* The Windows functions don't bother to follow the POSIX standard, which for * example says that something can't both be a printable and a control. But @@ -1930,47 +1944,93 @@ END_EXTERN_C * ispunct(), and things that are \W, like ispunct(), aren't controls. Not * all possible weirdnesses are checked for, just ones that were detected on * actual Microsoft code pages */ -#ifdef WIN32 -# define isU8_ALPHA_LC(c) (isalpha((U8) (c)) && ! isU8_PUNCT_LC(c)) -# define isU8_ALPHANUMERIC_LC(c) (isalnum((U8) (c)) && ! isU8_PUNCT_LC(c)) -# define isU8_CASED_LC(c) ((isupper((U8) (c)) || islower((U8) (c))) \ - && ! isU8_PUNCT_LC(c)) -# define isU8_DIGIT_LC(c) (isdigit((U8) (c)) && ! isU8_PUNCT_LC(c)) -# define isU8_GRAPH_LC(c) (isgraph((U8) (c)) && ! isU8_CNTRL_LC(c)) -# define isU8_LOWER_LC(c) (islower((U8) (c)) && ! isU8_PUNCT_LC(c)) -# define isU8_PRINT_LC(c) (isprint((U8) (c)) && ! isU8_CNTRL_LC(c)) -# define isU8_PUNCT_LC(c) (ispunct((U8) (c)) && ! isU8_CNTRL_LC(c)) -# define isU8_UPPER_LC(c) (isupper((U8) (c)) && ! 
isU8_PUNCT_LC(c)) -# define isU8_XDIGIT_LC(c) (isxdigit((U8)(c)) && ! isU8_PUNCT_LC(c)) +# define is_porcelain_ALPHA(c) \ + (isalpha((U8) (c)) && ! is_porcelain_PUNCT(c)) +# define is_porcelain_ALPHANUMERIC(c) \ + (isalnum((U8) (c)) && ! is_porcelain_PUNCT(c)) +# define is_porcelain_CASED(c) \ + ((isupper((U8) (c)) || islower((U8) (c))) && ! is_porcelain_PUNCT(c)) +# define is_porcelain_DIGIT(c) \ + (isdigit((U8) (c)) && ! is_porcelain_PUNCT(c)) +# define is_porcelain_GRAPH(c) \ + (isgraph((U8) (c)) && ! is_porcelain_CNTRL(c)) +# define is_porcelain_LOWER(c) \ + (islower((U8) (c)) && ! is_porcelain_PUNCT(c)) +# define is_porcelain_PRINT(c) \ + (isprint((U8) (c)) && ! is_porcelain_CNTRL(c)) +# define is_porcelain_PUNCT(c) \ + (ispunct((U8) (c)) && ! is_porcelain_CNTRL(c)) +# define is_porcelain_UPPER(c) \ + (isupper((U8) (c)) && ! is_porcelain_PUNCT(c)) +# define is_porcelain_XDIGIT(c) \ + (isxdigit((U8) (c)) && ! is_porcelain_PUNCT(c)) #else -/* For all other platforms, as far as we know, the isfoo() functions work - * sanely enough */ - -# define isU8_ALPHA_LC(c) isalpha((U8) (c)) -# define isU8_ALPHANUMERIC_LC(c) isalnum((U8) (c)) -# define isU8_CASED_LC(c) (islower((U8) (c)) || isupper((U8) (c))) -# define isU8_DIGIT_LC(c) isdigit((U8) (c)) +/* For all other platforms, as far as we know, isdigit(), etc. work sanely + * enough */ +# define is_porcelain_ALPHA(c) isalpha((U8) (c)) +# define is_porcelain_ALPHANUMERIC(c) isalnum((U8) (c)) +# define is_porcelain_CASED(c) (islower((U8) (c)) || isupper((U8) (c))) +# define is_porcelain_DIGIT(c) isdigit((U8) (c)) /* ... But it seems that IBM products treat NBSP as both a space and a * graphic; these are the two platforms that we have active test beds for. */ # if defined(OS390) || defined(_AIX) -# define isU8_GRAPH_LC(c) (isgraph((U8) (c)) && ! isspace((U8) (c))) +# define is_porcelain_GRAPH(c) (isgraph((U8) (c)) && ! 
isspace((U8) (c))) # else -# define isU8_GRAPH_LC(c) isgraph((U8) (c)) +# define is_porcelain_GRAPH(c) isgraph((U8) (c)) # endif -# define isU8_LOWER_LC(c) islower((U8) (c)) -# define isU8_PRINT_LC(c) isprint((U8) (c)) -# define isU8_PUNCT_LC(c) ispunct((U8) (c)) -# define isU8_UPPER_LC(c) isupper((U8) (c)) -# define isU8_XDIGIT_LC(c) isxdigit((U8) (c)) +# define is_porcelain_LOWER(c) islower((U8) (c)) +# define is_porcelain_PRINT(c) isprint((U8) (c)) +# define is_porcelain_PUNCT(c) ispunct((U8) (c)) +# define is_porcelain_UPPER(c) isupper((U8) (c)) +# define is_porcelain_XDIGIT(c) isxdigit((U8) (c)) #endif +/* Below is the next level up, which currently expands to nothing more + * than the previous layer. These are the macros to use if you really need + * something whose input domain is a byte, and the locale isn't UTF-8; that is, + * where you normally would have to use things like bare isalnum(). + * + * But most likely you should instead use the layer defined further below which + * has names like isALPHA_LC. They deal with larger-than-byte inputs, and + * UTF-8 locales. + * + * (Note, proper general operation of the bare libc functions requires you to + * cast to U8. These do that for you automatically.) 
*/ + +# define WRAP_U8_LC_(c, classnum, porcelain) porcelain(c) + +#define isU8_ALPHANUMERIC_LC(c) \ + WRAP_U8_LC_((c), CC_ALPHANUMERIC_, is_porcelain_ALPHANUMERIC) +#define isU8_ALPHA_LC(c) WRAP_U8_LC_((c), CC_ALPHA_, is_porcelain_ALPHA) +#define isU8_ASCII_LC(c) WRAP_U8_LC_((c), CC_ASCII_, is_porcelain_ASCII) +#define isU8_BLANK_LC(c) WRAP_U8_LC_((c), CC_BLANK_, is_porcelain_BLANK) +#define isU8_CASED_LC(c) WRAP_U8_LC_((c), CC_CASED_, is_porcelain_CASED) +#define isU8_CNTRL_LC(c) WRAP_U8_LC_((c), CC_CNTRL_, is_porcelain_CNTRL) +#define isU8_DIGIT_LC(c) WRAP_U8_LC_((c), CC_DIGIT_, is_porcelain_DIGIT) +#define isU8_GRAPH_LC(c) WRAP_U8_LC_((c), CC_GRAPH_, is_porcelain_GRAPH) +#define isU8_IDFIRST_LC(c) WRAP_U8_LC_((c), CC_IDFIRST_, is_porcelain_IDFIRST) +#define isU8_LOWER_LC(c) WRAP_U8_LC_((c), CC_LOWER_, is_porcelain_LOWER) +#define isU8_PRINT_LC(c) WRAP_U8_LC_((c), CC_PRINT_, is_porcelain_PRINT) +#define isU8_PUNCT_LC(c) WRAP_U8_LC_((c), CC_PUNCT_, is_porcelain_PUNCT) +#define isU8_SPACE_LC(c) WRAP_U8_LC_((c), CC_SPACE_, is_porcelain_SPACE) +#define isU8_UPPER_LC(c) WRAP_U8_LC_((c), CC_UPPER_, is_porcelain_UPPER) +#define isU8_WORDCHAR_LC(c) WRAP_U8_LC_((c), CC_WORDCHAR_, is_porcelain_WORDCHAR) +#define isU8_XDIGIT_LC(c) WRAP_U8_LC_((c), CC_XDIGIT_, is_porcelain_XDIGIT) + +#define toU8_LOWER_LC(c) WRAP_U8_LC_((c), CC_TOLOWER_, to_porcelain_LOWER) +#define toU8_UPPER_LC(c) WRAP_U8_LC_((c), CC_TOUPPER_, to_porcelain_UPPER) +#define toU8_FOLD_LC(c) toU8_LOWER_LC(c) + /* The definitions below use the ones above to create versions in which the * input domain isn't restricted to bytes (though always returning false if the * input doesn't fit in a byte), and to behave properly should the locale be - * UTF-8 */ + * UTF-8. These are the documented ones, suitable for general use (though + * toUPPER_LC and toFOLD_LC aren't documented because they need special + * handling to deal with SHARP S expanding to two characters). 
*/ + #define isASCII_LC(c) (FITS_IN_8_BITS(c) && isU8_ASCII_LC(c)) #define isALPHA_LC(c) generic_LC_(c, CC_ALPHA_, isU8_ALPHA_LC) #define isALPHANUMERIC_LC(c) \ |