summaryrefslogtreecommitdiff
path: root/handy.h
diff options
context:
space:
mode:
Diffstat (limited to 'handy.h')
-rw-r--r--handy.h136
1 files changed, 99 insertions, 37 deletions
diff --git a/handy.h b/handy.h
index 1eda7e1ec1..8da5c0afbc 100644
--- a/handy.h
+++ b/handy.h
@@ -438,36 +438,66 @@ C<strncmp>).
/*
=head1 Character classes
-The functions in this section operate using the character set of the platform
-Perl is running on, and are unaffected by locale. For ASCII platforms, they
-will all return false for characters outside the ASCII range. For EBCDIC
-platforms, they use the code page of the platform. The code pages that Perl
-knows about all have 8-bit characters, so most of these functions will return
-true for more characters than on ASCII platforms.
+There are three variants for all the functions in this section. The base ones
+operate using the character set of the platform Perl is running on. The ones
+with an C<_A> suffix operate on the ASCII character set, and the ones with an
+C<_L1> suffix operate on the full Latin1 character set. All are unaffected by
+locale.
+
+For ASCII platforms, the base function with no suffix and the one with the
+C<_A> suffix are identical. The function with the C<_L1> suffix imposes the
+Latin-1 character set onto the platform. That is, the code points that are
+ASCII are unaffected, since ASCII is a subset of Latin-1. But the non-ASCII
+code points are treated as if they are Latin-1 characters. For example,
+C<isSPACE_L1()> will return true when called with the code point 0xA0, which is
+the Latin-1 NO-BREAK SPACE.
+
+For EBCDIC platforms, the base function with no suffix and the one with the
+C<_L1> suffix should be identical, since, as of this writing, the EBCDIC code
+pages that Perl knows about all are equivalent to Latin-1. The function that
+ends in an C<_A> suffix will not return true unless the specified character also
+has an ASCII equivalent.
=for apidoc Am|bool|isALPHA|char ch
Returns a boolean indicating whether the specified character is an
alphabetic character in the platform's native character set.
+See the L<top of this section|/Character classes> for an explanation of variants
+C<isALPHA_A> and C<isALPHA_L1>.
+
+=for apidoc Am|bool|isASCII|char ch
+Returns a boolean indicating whether the specified character is one of the 128
+characters in the ASCII character set. On non-ASCII platforms, it returns true
+if and only if this character corresponds to an ASCII character. Variants
+C<isASCII_A()> and C<isASCII_L1()> are identical to C<isASCII()>.
=for apidoc Am|bool|isDIGIT|char ch
Returns a boolean indicating whether the specified character is a
digit in the platform's native character set.
+Variants C<isDIGIT_A> and C<isDIGIT_L1> are identical to C<isDIGIT>.
=for apidoc Am|bool|isLOWER|char ch
Returns a boolean indicating whether the specified character is a
lowercase character in the platform's native character set.
+See the L<top of this section|/Character classes> for an explanation of variants
+C<isLOWER_A> and C<isLOWER_L1>.
=for apidoc Am|bool|isOCTAL|char ch
Returns a boolean indicating whether the specified character is an
octal digit, [0-7] in the platform's native character set.
+Variants C<isOCTAL_A> and C<isOCTAL_L1> are identical to C<isOCTAL>.
=for apidoc Am|bool|isSPACE|char ch
Returns a boolean indicating whether the specified character is a
-whitespace character in the platform's native character set.
+whitespace character in the platform's native character set. This is the same
+as what C<\s> matches in a regular expression.
+See the L<top of this section|/Character classes> for an explanation of variants
+C<isSPACE_A> and C<isSPACE_L1>.
=for apidoc Am|bool|isUPPER|char ch
Returns a boolean indicating whether the specified character is an
uppercase character in the platform's native character set.
+See the L<top of this section|/Character classes> for an explanation of variants
+C<isUPPER_A> and C<isUPPER_L1>.
=for apidoc Am|bool|isWORDCHAR|char ch
Returns a boolean indicating whether the specified character is a
@@ -476,6 +506,13 @@ same as what C<\w> matches in a regular expression.
C<isALNUM()> is a synonym provided for backward compatibility. Note that it
does not have the standard C language meaning of alphanumeric, since it matches
an underscore and the standard meaning does not.
+See the L<top of this section|/Character classes> for an explanation of variants
+C<isWORDCHAR_A> and C<isWORDCHAR_L1>.
+
+=for apidoc Am|bool|isXDIGIT|char ch
+Returns a boolean indicating whether the specified character is a hexadecimal
+digit, [0-9A-Fa-f]. Variants C<isXDIGIT_A()> and C<isXDIGIT_L1()> are
+identical to C<isXDIGIT()>.
=head1 Character case changing
@@ -489,11 +526,7 @@ character set, if possible; otherwise returns the input character itself.
=cut
-NOTE: Since some of these are macros, there is no check in those that the
-parameter is a char or U8. This means that if called with a larger width
-parameter, casts can silently truncate and yield wrong results.
-
-Also note that these macros are repeated in Devel::PPPort, so should also be
+Note that these macros are repeated in Devel::PPPort, so should also be
patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
*/
@@ -526,7 +559,9 @@ patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
#define isCNTRL_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_CNTRL_A))
#define isDIGIT_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_DIGIT_A))
#define isGRAPH_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_GRAPH_A))
+#define isIDFIRST_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_IDFIRST_A))
#define isLOWER_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_LOWER_A))
+#define isOCTAL_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_OCTAL_A))
#define isPRINT_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PRINT_A))
#define isPSXSPC_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PSXSPC_A))
#define isPUNCT_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PUNCT_A))
@@ -536,39 +571,59 @@ patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
#define isXDIGIT_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_XDIGIT_A))
/* Latin1 definitions */
-/* ALPHAU includes Unicode semantics for latin1 characters. It has an extra
- * >= AA test to speed up ASCII-only tests at the expense of the others */
-/* XXX decide whether to document the ALPHAU, ALNUMU and isSPACE_L1 functions.
- * Most of these should be implemented as table lookup for speed */
-#define isALPHAU(c) (isALPHA_A(c) || (NATIVE_TO_UNI((U8) c) >= 0xAA \
- && ((NATIVE_TO_UNI((U8) c) >= 0xC0 \
- && NATIVE_TO_UNI((U8) c) != 0xD7 && NATIVE_TO_UNI((U8) c) != 0xF7) \
- || NATIVE_TO_UNI((U8) c) == 0xAA \
- || NATIVE_TO_UNI((U8) c) == 0xB5 \
- || NATIVE_TO_UNI((U8) c) == 0xBA)))
-#define isSPACE_L1(c) (isSPACE(c) \
- || (NATIVE_TO_UNI(c) == 0x85 || NATIVE_TO_UNI(c) == 0xA0))
-#define isWORDCHAR_L1(c) (isDIGIT(c) || isALPHAU(c) || (c) == '_')
-
-/* Same macro in non-EBCDIC and EBCDIC. Called macros may evaluate
- * differently between the two */
+#ifdef H_PERL
+/* Latin-1 character class tests done by table lookup. Each macro yields a
+ * cBOOL() result that is true only when c passes FITS_IN_8_BITS() and the
+ * PL_charclass[] entry indexed by (U8) NATIVE_TO_UNI(c) has the macro's
+ * corresponding _CC_* bit set. */
+# define isALNUMC_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_ALNUMC_L1))
+# define isALPHA_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_ALPHA_L1))
+# define isBLANK_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_BLANK_L1))
+/* continuation character for legal NAME in \N{NAME} */
+# define isCHARNAME_CONT(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_CHARNAME_CONT))
+# define isCNTRL_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_CNTRL_L1))
+# define isGRAPH_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_GRAPH_L1))
+# define isIDFIRST_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_IDFIRST_L1))
+# define isLOWER_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_LOWER_L1))
+# define isPRINT_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PRINT_L1))
+# define isPSXSPC_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PSXSPC_L1))
+# define isPUNCT_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PUNCT_L1))
+# define isSPACE_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_SPACE_L1))
+# define isUPPER_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_UPPER_L1))
+# define isWORDCHAR_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_WORDCHAR_L1))
+#else /* No access to perl.h. Only a few provided here, just in case needed
+ * for backwards compatibility */
+ /* isALPHA_L1 (formerly isALPHAU) applies Unicode semantics to the Latin-1
+ * characters. The extra >= 0xAA test speeds up ASCII-only input at the
+ * expense of the other characters. The argument is parenthesized inside each
+ * cast so that an expression argument is converted as a whole. */
+# define isALPHA_L1(c) (isALPHA(c) || (NATIVE_TO_UNI((U8) (c)) >= 0xAA \
+ && ((NATIVE_TO_UNI((U8) (c)) >= 0xC0 \
+ && NATIVE_TO_UNI((U8) (c)) != 0xD7 && NATIVE_TO_UNI((U8) (c)) != 0xF7) \
+ || NATIVE_TO_UNI((U8) (c)) == 0xAA \
+ || NATIVE_TO_UNI((U8) (c)) == 0xB5 \
+ || NATIVE_TO_UNI((U8) (c)) == 0xBA)))
+ /* Continuation character for a legal NAME in \N{NAME}.
+ * NOTE(review): isALNUM_L1 is not defined in the visible part of this file;
+ * confirm it is available when perl.h is not included. */
+# define isCHARNAME_CONT(c) (isALNUM_L1(c) || (c) == ' ' || (c) == '-' || (c) == '(' || (c) == ')' || (c) == ':' || NATIVE_TO_UNI((U8) (c)) == 0xA0)
+#endif
+
+/* Macros for backwards compatibility and for completeness when the ASCII and
+ * Latin1 values are identical */
#define isALNUM(c) isWORDCHAR(c)
#define isALNUMU(c) isWORDCHAR_L1(c)
-/* continuation character for legal NAME in \N{NAME} */
-#define isCHARNAME_CONT(c) (isWORDCHAR_L1(c) || (c) == ' ' || (c) == '-' || (c) == '(' || (c) == ')' || (c) == ':' || NATIVE_TO_UNI((U8) c) == 0xA0)
-#define isIDFIRST(c) (isALPHA(c) || (c) == '_')
-#define isOCTAL_A(c) ((c) >= '0' && (c) <= '7')
-#define isOCTAL(c) isOCTAL_A(c)
-#define isWORDCHAR(c) (isALPHA(c) || isDIGIT(c) || (c) == '_')
-
+#define isALPHAU(c) isALPHA_L1(c)
+#define isDIGIT_L1(c) isDIGIT_A(c)
+#define isOCTAL(c) isOCTAL_A(c)
+#define isOCTAL_L1(c) isOCTAL_A(c)
+#define isXDIGIT_L1(c) isXDIGIT_A(c)
+
+/* Macros that differ between EBCDIC and ASCII. Where C89 defines a function,
+ * that is used in the EBCDIC form, because in EBCDIC we do not do locales:
+ * therefore can use native functions. For those where C89 doesn't define a
+ * function, use our function, assuming that the EBCDIC code page is isomorphic
+ * with Latin1, which the three currently recognized by Perl are. Some libc's
+ * have an isblank(), but it's not guaranteed. */
#ifdef EBCDIC
- /* In EBCDIC we do not do locales: therefore can use native functions */
# define isALNUMC(c) isalnum(c)
# define isALPHA(c) isalpha(c)
# define isBLANK(c) ((c) == ' ' || (c) == '\t' || NATIVE_TO_UNI(c) == 0xA0)
# define isCNTRL(c) iscntrl(c)
# define isDIGIT(c) isdigit(c)
# define isGRAPH(c) isgraph(c)
+# define isIDFIRST(c) (isALPHA(c) || (c) == '_')
# define isLOWER(c) islower(c)
# define isPRINT(c) isprint(c)
# define isPSXSPC(c) isspace(c)
@@ -576,6 +631,7 @@ patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
# define isSPACE(c) (isPSXSPC(c) && (c) != '\v')
# define isUPPER(c) isupper(c)
# define isXDIGIT(c) isxdigit(c)
+# define isWORDCHAR(c) (isalnum(c) || (c) == '_')
# define toLOWER(c) tolower(c)
# define toUPPER(c) toupper(c)
#else /* Not EBCDIC: ASCII-only matching */
@@ -585,15 +641,21 @@ patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
# define isCNTRL(c) isCNTRL_A(c)
# define isDIGIT(c) isDIGIT_A(c)
# define isGRAPH(c) isGRAPH_A(c)
+# define isIDFIRST(c) isIDFIRST_A(c)
# define isLOWER(c) isLOWER_A(c)
# define isPRINT(c) isPRINT_A(c)
# define isPSXSPC(c) isPSXSPC_A(c)
# define isPUNCT(c) isPUNCT_A(c)
# define isSPACE(c) isSPACE_A(c)
# define isUPPER(c) isUPPER_A(c)
+# define isWORDCHAR(c) isWORDCHAR_A(c)
# define isXDIGIT(c) isXDIGIT_A(c)
- /* ASCII casing. */
+ /* ASCII casing. These could also be written as
+ #define toLOWER(c) (isASCII(c) ? toLOWER_LATIN1(c) : (c))
+ #define toUPPER(c) (isASCII(c) ? toUPPER_LATIN1_MOD(c) : (c))
+ which uses table lookup and mask instead of subtraction. (This would
+ work because the _MOD does not apply in the ASCII range) */
# define toLOWER(c) (isUPPER(c) ? (c) + ('a' - 'A') : (c))
# define toUPPER(c) (isLOWER(c) ? (c) - ('a' - 'A') : (c))
#endif