handy.h: Add isFOO_L1() macros, using table lookup

This patch adds *_L1() macros for character class lookup, using table lookup for O(1) performance. These force a Latin-1 interpretation on ASCII platforms. There were a couple of existing macros that had the suffix U for Unicode semantics. I thought that those names might be confusing, so I settled on L1 as the least bad name. The older names are kept as synonyms for backward compatibility. The problem with those names is that these are actually macros, not functions, and hence can be called with any int, including any Unicode code point. The U suffix might be mistaken for indicating they are more general purpose, whereas they are really only valid for the latin1 subset of Unicode (including the EBCDIC isomorphs). When called with something outside the latin1 range, they will return false. This patch necessitated rearranging a few things in the file. I added documentation for several more macros, and intend to document the rest. (This commit was modified from its original form by Steffen.)
author: Karl Williamson <public@khwilliamson.com> 2010-09-23 23:14:58 -0600
committer: Steffen Mueller <smueller@cpan.org> 2010-09-25 11:15:32 +0200
commit: 8a58bdcf4d2660cff0818d3e1176af18a113c88d (patch)
tree: 1c4c9f48b4ddd0c922830b012cf9d693ece6a152 /handy.h
parent: 8eea39dda497d40e67ca52cc97f1fe9318b032c2 (diff)
download: perl-8a58bdcf4d2660cff0818d3e1176af18a113c88d.tar.gz
1 files changed, 99 insertions, 37 deletions
diff --git a/handy.h b/handy.h
index 1eda7e1ec1..8da5c0afbc 100644
--- a/handy.h
+++ b/handy.h
@@ -438,36 +438,66 @@ C<strncmp>).
 /*
 
 =head1 Character classes
-The functions in this section operate using the character set of the platform
-Perl is running on, and are unaffected by locale.  For ASCII platforms, they
-will all return false for characters outside the ASCII range.  For EBCDIC
-platforms, they use the code page of the platform.  The code pages that Perl
-knows about all have 8-bit characters, so most of these functions will return
-true for more characters than on ASCII platforms.
+There are three variants for all the functions in this section.  The base ones
+operate using the character set of the platform Perl is running on.  The ones
+with an C<_A> suffix operate on the ASCII character set, and the ones with an
+C<_L1> suffix operate on the full Latin1 character set.  All are unaffected by
+locale.
+
+For ASCII platforms, the base function with no suffix and the one with the
+C<_A> suffix are identical.  The function with the C<_L1> suffix imposes the
+Latin-1 character set onto the platform.  That is, the code points that are
+ASCII are unaffected, since ASCII is a subset of Latin-1.  But the non-ASCII
+code points are treated as if they are Latin-1 characters.  For example,
+C<isSPACE_L1()> will return true when called with the code point 0xA0, which is
+the Latin-1 NO-BREAK SPACE.
+
+For EBCDIC platforms, the base function with no suffix and the one with the
+C<_L1> suffix should be identical, since, as of this writing, the EBCDIC code
+pages that Perl knows about are all equivalent to Latin-1.  The function that
+ends in an C<_A> suffix will not return true unless the specified character also
+has an ASCII equivalent.
 
 =for apidoc Am|bool|isALPHA|char ch
 Returns a boolean indicating whether the specified character is an
 alphabetic character in the platform's native character set.
+See the L<top of this section|/Character classes> for an explanation of variants
+C<isALPHA_A> and C<isALPHA_L1>.
+
+=for apidoc Am|bool|isASCII|char ch
+Returns a boolean indicating whether the specified character is one of the 128
+characters in the ASCII character set.  On non-ASCII platforms, it returns
+true only if this character corresponds to an ASCII character.  Variants
+C<isASCII_A()> and C<isASCII_L1()> are identical to C<isASCII()>.
 
 =for apidoc Am|bool|isDIGIT|char ch
 Returns a boolean indicating whether the specified character is a
 digit in the platform's native character set.
+Variants C<isDIGIT_A> and C<isDIGIT_L1> are identical to C<isDIGIT>.
 
 =for apidoc Am|bool|isLOWER|char ch
 Returns a boolean indicating whether the specified character is a
 lowercase character in the platform's native character set.
+See the L<top of this section|/Character classes> for an explanation of variants
+C<isLOWER_A> and C<isLOWER_L1>.
 
 =for apidoc Am|bool|isOCTAL|char ch
 Returns a boolean indicating whether the specified character is an
 octal digit, [0-7] in the platform's native character set.
+Variants C<isOCTAL_A> and C<isOCTAL_L1> are identical to C<isOCTAL>.
 
 =for apidoc Am|bool|isSPACE|char ch
 Returns a boolean indicating whether the specified character is a
-whitespace character in the platform's native character set.
+whitespace character in the platform's native character set.  This is the same
+as what C<\s> matches in a regular expression.
+See the L<top of this section|/Character classes> for an explanation of variants
+C<isSPACE_A> and C<isSPACE_L1>.
 
 =for apidoc Am|bool|isUPPER|char ch
 Returns a boolean indicating whether the specified character is an
 uppercase character in the platform's native character set.
+See the L<top of this section|/Character classes> for an explanation of variants
+C<isUPPER_A> and C<isUPPER_L1>.
 
 =for apidoc Am|bool|isWORDCHAR|char ch
 Returns a boolean indicating whether the specified character is a
@@ -476,6 +506,13 @@ same as what C<\w> matches in a regular expression.
 C<isALNUM()> is a synonym provided for backward compatibility.  Note that it
 does not have the standard C language meaning of alphanumeric, since it matches
 an underscore and the standard meaning does not.
+See the L<top of this section|/Character classes> for an explanation of variants
+C<isWORDCHAR_A> and C<isWORDCHAR_L1>.
+
+=for apidoc Am|bool|isXDIGIT|char ch
+Returns a boolean indicating whether the specified character is a hexadecimal
+digit, [0-9A-Fa-f].  Variants C<isXDIGIT_A()> and C<isXDIGIT_L1()> are
+identical to C<isXDIGIT()>.
 
 =head1 Character case changing
 
@@ -489,11 +526,7 @@ character set, if possible; otherwise returns the input character itself.
 
 =cut
 
-NOTE:  Since some of these are macros, there is no check in those that the
-parameter is a char or U8.  This means that if called with a larger width
-parameter, casts can silently truncate and yield wrong results.
-
-Also note that these macros are repeated in Devel::PPPort, so should also be
+Note that these macros are repeated in Devel::PPPort, so should also be
 patched there.  The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
 
 */
@@ -526,7 +559,9 @@ patched there.  The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
 #define isCNTRL_A(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_CNTRL_A))
 #define isDIGIT_A(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_DIGIT_A))
 #define isGRAPH_A(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_GRAPH_A))
+#define isIDFIRST_A(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_IDFIRST_A))
 #define isLOWER_A(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_LOWER_A))
+#define isOCTAL_A(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_OCTAL_A))
 #define isPRINT_A(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PRINT_A))
 #define isPSXSPC_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PSXSPC_A))
 #define isPUNCT_A(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PUNCT_A))
@@ -536,39 +571,59 @@ patched there.  The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
 #define isXDIGIT_A(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_XDIGIT_A))
 
 /* Latin1 definitions */
-/* ALPHAU includes Unicode semantics for latin1 characters.  It has an extra
- * >= AA test to speed up ASCII-only tests at the expense of the others */
-/* XXX decide whether to document the ALPHAU, ALNUMU and isSPACE_L1 functions.
- * Most of these should be implemented as table lookup for speed */
-#define isALPHAU(c)	(isALPHA_A(c) || (NATIVE_TO_UNI((U8) c) >= 0xAA \
-     && ((NATIVE_TO_UNI((U8) c) >= 0xC0 \
-	    && NATIVE_TO_UNI((U8) c) != 0xD7 && NATIVE_TO_UNI((U8) c) != 0xF7) \
-	|| NATIVE_TO_UNI((U8) c) == 0xAA \
-	|| NATIVE_TO_UNI((U8) c) == 0xB5 \
-	|| NATIVE_TO_UNI((U8) c) == 0xBA)))
-#define isSPACE_L1(c) (isSPACE(c) \
-		    || (NATIVE_TO_UNI(c) == 0x85 || NATIVE_TO_UNI(c) == 0xA0))
-#define isWORDCHAR_L1(c) (isDIGIT(c) || isALPHAU(c) || (c) == '_')
-
-/* Same macro in non-EBCDIC and EBCDIC.  Called macros may evaluate
- * differently between the two */
+#ifdef H_PERL
+#   define isALNUMC_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_ALNUMC_L1))
+#   define isALPHA_L1(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_ALPHA_L1))
+#   define isBLANK_L1(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_BLANK_L1))
+/*  continuation character for legal NAME in \N{NAME} */
+#   define isCHARNAME_CONT(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_CHARNAME_CONT))
+#   define isCNTRL_L1(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_CNTRL_L1))
+#   define isGRAPH_L1(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_GRAPH_L1))
+#   define isIDFIRST_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_IDFIRST_L1))
+#   define isLOWER_L1(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_LOWER_L1))
+#   define isPRINT_L1(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PRINT_L1))
+#   define isPSXSPC_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PSXSPC_L1))
+#   define isPUNCT_L1(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PUNCT_L1))
+#   define isSPACE_L1(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_SPACE_L1))
+#   define isUPPER_L1(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_UPPER_L1))
+#   define isWORDCHAR_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_WORDCHAR_L1))
+#else /* No access to perl.h.  Only a few provided here, just in case needed
+       * for backwards compatibility */
+    /* isALPHA_L1 (aka isALPHAU) uses Unicode semantics for latin1 chars.  It has
+     * an extra >= AA test to speed up ASCII-only tests at the expense of others */
+#   define isALPHA_L1(c) (isALPHA(c) || (NATIVE_TO_UNI((U8) c) >= 0xAA \
+	&& ((NATIVE_TO_UNI((U8) c) >= 0xC0 \
+		&& NATIVE_TO_UNI((U8) c) != 0xD7 && NATIVE_TO_UNI((U8) c) != 0xF7) \
+	    || NATIVE_TO_UNI((U8) c) == 0xAA \
+	    || NATIVE_TO_UNI((U8) c) == 0xB5 \
+	    || NATIVE_TO_UNI((U8) c) == 0xBA)))
+#   define isCHARNAME_CONT(c) (isALNUM_L1(c) || (c) == ' ' || (c) == '-' || (c) == '(' || (c) == ')' || (c) == ':' || NATIVE_TO_UNI((U8) c) == 0xA0)
+#endif
+
+/* Macros for backwards compatibility and for completeness when the ASCII and
+ * Latin1 values are identical */
 #define isALNUM(c)      isWORDCHAR(c)
 #define isALNUMU(c)     isWORDCHAR_L1(c)
-/* continuation character for legal NAME in \N{NAME} */
-#define isCHARNAME_CONT(c) (isWORDCHAR_L1(c) || (c) == ' ' || (c) == '-' || (c) == '(' || (c) == ')' || (c) == ':' || NATIVE_TO_UNI((U8) c) == 0xA0)
-#define isIDFIRST(c)	(isALPHA(c) || (c) == '_')
-#define isOCTAL_A(c)	((c) >= '0' && (c) <= '7')
-#define isOCTAL(c)	isOCTAL_A(c)
-#define isWORDCHAR(c)   (isALPHA(c) || isDIGIT(c) || (c) == '_')
-
+#define isALPHAU(c)     isALPHA_L1(c)
+#define isDIGIT_L1(c)   isDIGIT_A(c)
+#define isOCTAL(c)      isOCTAL_A(c)
+#define isOCTAL_L1(c)   isOCTAL_A(c)
+#define isXDIGIT_L1(c)  isXDIGIT_A(c)
+
+/* Macros that differ between EBCDIC and ASCII.  Where C89 defines a function,
+ * that is used in the EBCDIC form, because in EBCDIC we do not do locales:
+ * therefore can use native functions.  For those where C89 doesn't define a
+ * function, use our function, assuming that the EBCDIC code page is isomorphic
+ * with Latin1, which the three currently recognized by Perl are.  Some libc's
+ * have an isblank(), but it's not guaranteed. */
 #ifdef EBCDIC
-    /* In EBCDIC we do not do locales: therefore can use native functions */
 #   define isALNUMC(c)	isalnum(c)
 #   define isALPHA(c)	isalpha(c)
 #   define isBLANK(c)	((c) == ' ' || (c) == '\t' || NATIVE_TO_UNI(c) == 0xA0)
 #   define isCNTRL(c)	iscntrl(c)
 #   define isDIGIT(c)	isdigit(c)
 #   define isGRAPH(c)	isgraph(c)
+#   define isIDFIRST(c) (isALPHA(c) || (c) == '_')
 #   define isLOWER(c)	islower(c)
 #   define isPRINT(c)	isprint(c)
 #   define isPSXSPC(c)	isspace(c)
@@ -576,6 +631,7 @@ patched there.  The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
 #   define isSPACE(c)   (isPSXSPC(c) && (c) != '\v')
 #   define isUPPER(c)	isupper(c)
 #   define isXDIGIT(c)	isxdigit(c)
+#   define isWORDCHAR(c) (isalnum(c) || (c) == '_')
 #   define toLOWER(c)	tolower(c)
 #   define toUPPER(c)	toupper(c)
 #else /* Not EBCDIC: ASCII-only matching */
@@ -585,15 +641,21 @@ patched there.  The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
 #   define isCNTRL(c)   isCNTRL_A(c)
 #   define isDIGIT(c)   isDIGIT_A(c)
 #   define isGRAPH(c)   isGRAPH_A(c)
+#   define isIDFIRST(c) isIDFIRST_A(c)
 #   define isLOWER(c)   isLOWER_A(c)
 #   define isPRINT(c)   isPRINT_A(c)
 #   define isPSXSPC(c)	isPSXSPC_A(c)
 #   define isPUNCT(c)   isPUNCT_A(c)
 #   define isSPACE(c)   isSPACE_A(c)
 #   define isUPPER(c)   isUPPER_A(c)
+#   define isWORDCHAR(c) isWORDCHAR_A(c)
 #   define isXDIGIT(c)  isXDIGIT_A(c)
 
-    /* ASCII casing. */
+    /* ASCII casing.  These could also be written as
+	#define toLOWER(c) (isASCII(c) ? toLOWER_LATIN1(c) : (c))
+	#define toUPPER(c) (isASCII(c) ? toUPPER_LATIN1_MOD(c) : (c))
+       which uses table lookup and mask instead of subtraction.  (This would
+       work because the _MOD does not apply in the ASCII range) */
 #   define toLOWER(c)	(isUPPER(c) ? (c) + ('a' - 'A') : (c))
 #   define toUPPER(c)	(isLOWER(c) ? (c) - ('a' - 'A') : (c))
 #endif
author	Karl Williamson <public@khwilliamson.com>	2010-09-23 23:14:58 -0600
committer	Steffen Mueller <smueller@cpan.org>	2010-09-25 11:15:32 +0200
commit	8a58bdcf4d2660cff0818d3e1176af18a113c88d (patch)
tree	1c4c9f48b4ddd0c922830b012cf9d693ece6a152 /handy.h
parent	8eea39dda497d40e67ca52cc97f1fe9318b032c2 (diff)
download	perl-8a58bdcf4d2660cff0818d3e1176af18a113c88d.tar.gz