Add and use macros for case-insensitive comparison

This adds to handy.h isALPHA_FOLD_EQ(c1,c2) which efficiently tests if c1 and c2 are the same character, case-insensitively. For example isALPHA_FOLD_EQ(c, 's') returns true if and only if <c> is 's' or 'S'. isALPHA_FOLD_NE() is also added by this commit. At least one of c1 and c2 must be known to be in [A-Za-z] or this macro doesn't work properly. (There is an assert for this in the macro in DEBUGGING builds). That is why the name includes "ALPHA", so you won't forget when using it. This functionality has been in regcomp.c for a while, under a different name. I had thought that the only reason to make it more generally available was potential speed gain, but recent gcc versions optimize to the same code, so I thought there wasn't any point to doing so. But I now think that using this makes things easier to read (and certainly shorter to type in). Once you grok what this macro does, it simplifies what you have to keep in your mind when reading logical expressions with multiple operands. That something can be either upper or lower case can be a distraction to understanding the larger point of the expression.
author: Karl Williamson <khw@cpan.org> 2014-08-21 17:29:10 -0600
committer: Karl Williamson <khw@cpan.org> 2014-08-22 12:14:59 -0600
commit: 305b86516461e93877909338ac3642c6ac09b651 (patch)
tree: d5d7b3c47a3af1537ffb549ee3eacf937bcd9571
parent: b51533f3c738c0d34d686dc15720c781f1043802 (diff)
download: perl-305b86516461e93877909338ac3642c6ac09b651.tar.gz
11 files changed, 71 insertions, 72 deletions
diff --git a/dump.c b/dump.c
index 5ee69101e1..75f0fb4b09 100644
--- a/dump.c
+++ b/dump.c
@@ -2139,7 +2139,7 @@ Perl_do_sv_dump(pTHX_ I32 level, PerlIO *file, SV *sv, I32 nest, I32 maxnest, bo
 	    Perl_dump_indent(aTHX_ level, file, "  TARGLEN = %"IVdf"\n", (IV)LvTARGLEN(sv));
 	    Perl_dump_indent(aTHX_ level, file, "  TARG = 0x%"UVxf"\n", PTR2UV(LvTARG(sv)));
 	    Perl_dump_indent(aTHX_ level, file, "  FLAGS = %"IVdf"\n", (IV)LvFLAGS(sv));
-	    if (LvTYPE(sv) != 't' && LvTYPE(sv) != 'T')
+	    if (isALPHA_FOLD_NE(LvTYPE(sv), 't'))
 		do_sv_dump(level+1, file, LvTARG(sv), nest+1, maxnest,
 		    dumpops, pvlim);
 	}
diff --git a/handy.h b/handy.h
index c5c4d4b100..d4c15a54af 100644
--- a/handy.h
+++ b/handy.h
@@ -1706,6 +1706,24 @@ typedef U32 line_t;
  * both ASCII and EBCDIC the last 3 bits of the octal digits range from 0-7. */
 #define OCTAL_VALUE(c) (__ASSERT_(isOCTAL(c)) (7 & (c)))
 
+/* Efficiently returns a boolean as to if two native characters are equivalent
+ * case-insenstively.  At least one of the characters must be one of [A-Za-z];
+ * the ALPHA in the name is to remind you of that.  This is asserted() in
+ * DEBUGGING builds.  Because [A-Za-z] are invariant under UTF-8, this macro
+ * works (on valid input) for both non- and UTF-8-encoded bytes.
+ *
+ * When one of the inputs is a compile-time constant and gets folded by the
+ * compiler, this reduces to an AND and a TEST.  On both EBCDIC and ASCII
+ * machines, 'A' and 'a' differ by a single bit; the same with the upper and
+ * lower case of all other ASCII-range alphabetics.  On ASCII platforms, they
+ * are 32 apart; on EBCDIC, they are 64.  This uses an exclusive 'or' to find
+ * that bit and then inverts it to form a mask, with just a single 0, in the
+ * bit position where the upper- and lowercase differ. */
+#define isALPHA_FOLD_EQ(c1, c2)                                         \
+                      (__ASSERT_(isALPHA_A(c1) || isALPHA_A(c2))        \
+                      ((c1) & ~('A' ^ 'a')) ==  ((c2) & ~('A' ^ 'a')))
+#define isALPHA_FOLD_NE(c1, c2) (! isALPHA_FOLD_EQ((c1), (c2)))
+
 /*
 =head1 Memory Management
 
diff --git a/locale.c b/locale.c
index 8f77885a6f..2e68b237e7 100644
--- a/locale.c
+++ b/locale.c
@@ -1440,8 +1440,8 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
         while ((name += strcspn(name, "Uu") + 1)
                                             <= save_input_locale + final_pos - 2)
         {
-            if (toFOLD(*(name)) != 't'
-                || toFOLD(*(name + 1)) != 'f')
+            if (!isALPHA_FOLD_NE(*name, 't')
+                || isALPHA_FOLD_NE(*(name + 1), 'f'))
             {
                 continue;
             }
diff --git a/numeric.c b/numeric.c
index b137ad97d0..ce8bbdd3da 100644
--- a/numeric.c
+++ b/numeric.c
@@ -153,11 +153,11 @@ Perl_grok_bin(pTHX_ const char *start, STRLEN *len_p, I32 *flags, NV *result)
            for compatibility silently suffer "b" and "0b" as valid binary
            numbers. */
         if (len >= 1) {
-            if (s[0] == 'b' || s[0] == 'B') {
+            if (isALPHA_FOLD_EQ(s[0], 'b')) {
                 s++;
                 len--;
             }
-            else if (len >= 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B')) {
+            else if (len >= 2 && s[0] == '0' && (isALPHA_FOLD_EQ(s[1], 'b'))) {
                 s+=2;
                 len-=2;
             }
@@ -274,11 +274,11 @@ Perl_grok_hex(pTHX_ const char *start, STRLEN *len_p, I32 *flags, NV *result)
            for compatibility silently suffer "x" and "0x" as valid hex numbers.
         */
         if (len >= 1) {
-            if (s[0] == 'x' || s[0] == 'X') {
+            if (isALPHA_FOLD_EQ(s[0], 'x')) {
                 s++;
                 len--;
             }
-            else if (len >= 2 && s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
+            else if (len >= 2 && s[0] == '0' && (isALPHA_FOLD_EQ(s[1], 'x'))) {
                 s+=2;
                 len-=2;
             }
@@ -588,9 +588,9 @@ Perl_grok_number(pTHX_ const char *pv, STRLEN len, UV *valuep)
 
 /* Peek ahead to see whether this could be Inf/NaN/qNaN/snan/1.#INF */
 #define PEEK_INFNAN(d) \
-    (*s == 'I' || *s == 'i' || *s == 'N' || *s == 'n') || \
-     ((*s == 'Q' || *s == 'q' || *s == 'S' || *s == 's') && \
-      (s[1] == 'N' || s[1] == 'n')) || \
+    (isALPHA_FOLD_EQ(*s, 'I') || isALPHA_FOLD_EQ(*s, 'N')) || \
+     ((isALPHA_FOLD_EQ(*s, 'Q') || isALPHA_FOLD_EQ(*s, 'S')) && \
+      isALPHA_FOLD_EQ(s[1], 'N')) || \
     (*s == '1' && ((s[1] == '.' && s[2] == '#') || s[1] == '#'))
 
 /*
@@ -637,24 +637,24 @@ Perl_grok_infnan(const char** sp, const char* send)
             return 0;
     }
 
-    if (*s == 'I' || *s == 'i') {
+    if (isALPHA_FOLD_EQ(*s, 'I')) {
         /* INF or IND (1.#IND is indeterminate, a certain type of NAN) */
-        s++; if (s == send || (*s != 'N' && *s != 'n')) return 0;
+        s++; if (s == send || isALPHA_FOLD_NE(*s, 'N')) return 0;
         s++; if (s == send) return 0;
-        if (*s == 'F' || *s == 'f') {
+        if (isALPHA_FOLD_EQ(*s, 'F')) {
             s++;
-            if (s < send && (*s == 'I' || *s == 'i')) {
-                s++; if (s == send || (*s != 'N' && *s != 'n')) return 0;
-                s++; if (s == send || (*s != 'I' && *s != 'i')) return 0;
-                s++; if (s == send || (*s != 'T' && *s != 't')) return 0;
+            if (s < send && (isALPHA_FOLD_EQ(*s, 'I'))) {
+                s++; if (s == send || isALPHA_FOLD_NE(*s, 'N')) return 0;
+                s++; if (s == send || isALPHA_FOLD_NE(*s, 'I')) return 0;
+                s++; if (s == send || isALPHA_FOLD_NE(*s, 'T')) return 0;
                 /* XXX maybe also grok "infinite"? */
-                s++; if (s == send || (*s != 'Y' && *s != 'y')) return 0;
+                s++; if (s == send || isALPHA_FOLD_NE(*s, 'Y')) return 0;
                 s++;
             } else if (*s)
                 return 0;
             flags |= IS_NUMBER_INFINITY | IS_NUMBER_NOT_INT;
         }
-        else if (*s == 'D' || *s == 'd') {
+        else if (isALPHA_FOLD_EQ(*s, 'D')) {
             s++;
             flags |= IS_NUMBER_NAN | IS_NUMBER_NOT_INT;
         } else
@@ -665,15 +665,15 @@ Perl_grok_infnan(const char** sp, const char* send)
     }
     else {
         /* NAN */
-        if (*s == 'S' || *s == 's' || *s == 'Q' || *s == 'q') {
+        if (isALPHA_FOLD_EQ(*s, 'S') || isALPHA_FOLD_EQ(*s, 'Q')) {
             /* snan, qNaN */
             /* XXX do something with the snan/qnan difference */
             s++; if (s == send) return 0;
         }
 
-        if (*s == 'N' || *s == 'n') {
-            s++; if (s == send || (*s != 'A' && *s != 'a')) return 0;
-            s++; if (s == send || (*s != 'N' && *s != 'n')) return 0;
+        if (isALPHA_FOLD_EQ(*s, 'N')) {
+            s++; if (s == send || isALPHA_FOLD_NE(*s, 'A')) return 0;
+            s++; if (s == send || isALPHA_FOLD_NE(*s, 'N')) return 0;
             s++;
 
             flags |= IS_NUMBER_NAN | IS_NUMBER_NOT_INT;
@@ -863,7 +863,7 @@ Perl_grok_number_flags(pTHX_ const char *pv, STRLEN len, UV *valuep, U32 flags)
     numtype |= IS_NUMBER_NAN | IS_NUMBER_NOT_INT;
   } else if (s < send) {
     /* we can have an optional exponent part */
-    if (*s == 'e' || *s == 'E') {
+    if (isALPHA_FOLD_EQ(*s, 'e')) {
       s++;
       if (s < send && (*s == '-' || *s == '+'))
         s++;
@@ -1268,7 +1268,7 @@ Perl_my_atof2(pTHX_ const char* orig, NV* value)
 	result[1] = S_mulexp10(result[1], exp_acc[1]) + (NV)accumulator[1];
     }
 
-    if (seen_digit && (*s == 'e' || *s == 'E')) {
+    if (seen_digit && (isALPHA_FOLD_EQ(*s, 'e'))) {
 	bool expnegative = 0;
 
 	++s;
diff --git a/op.c b/op.c
index f785c552ed..e9885176d4 100644
--- a/op.c
+++ b/op.c
@@ -3775,7 +3775,7 @@ S_fold_constants(pTHX_ OP *o)
 	    {
 		const char *s = SvPVX_const(sv);
 		while (s < SvEND(sv)) {
-		    if (*s == 'p' || *s == 'P') goto nope;
+		    if (isALPHA_FOLD_EQ(*s, 'p')) goto nope;
 		    s++;
 		}
 	    }
diff --git a/perl.c b/perl.c
index e84f1d53ae..3de3acf141 100644
--- a/perl.c
+++ b/perl.c
@@ -3906,7 +3906,7 @@ Perl_doing_taint(int argc, char *argv[], char *envp[])
      * if -T are the first chars together; otherwise one gets
      *  "Too late" message. */
     if ( argc > 1 && argv[1][0] == '-'
-         && (argv[1][1] == 't' || argv[1][1] == 'T') )
+         && isALPHA_FOLD_EQ(argv[1][1], 't'))
 	return 1;
     return 0;
 }
diff --git a/pp.c b/pp.c
index 5218f7bccd..67bf36bc32 100644
--- a/pp.c
+++ b/pp.c
@@ -2919,11 +2919,11 @@ PP(pp_oct)
         tmps++, len--;
     if (*tmps == '0')
         tmps++, len--;
-    if (*tmps == 'x' || *tmps == 'X') {
+    if (isALPHA_FOLD_EQ(*tmps, 'x')) {
     hex:
         result_uv = grok_hex (tmps, &len, &flags, &result_nv);
     }
-    else if (*tmps == 'b' || *tmps == 'B')
+    else if (isALPHA_FOLD_EQ(*tmps, 'b'))
         result_uv = grok_bin (tmps, &len, &flags, &result_nv);
     else
         result_uv = grok_oct (tmps, &len, &flags, &result_nv);
diff --git a/regcomp.c b/regcomp.c
index d3635cc8d6..4b82880952 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -377,24 +377,6 @@ typedef struct scan_data_t {
     regnode_ssc *start_class;
 } scan_data_t;
 
-/* The below is perhaps overboard, but this allows us to save a test at the
- * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
- * and 'a' differ by a single bit; the same with the upper and lower case of
- * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
- * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
- * then inverts it to form a mask, with just a single 0, in the bit position
- * where the upper- and lowercase differ.  XXX There are about 40 other
- * instances in the Perl core where this micro-optimization could be used.
- * Should decide if maintenance cost is worse, before changing those
- *
- * Returns a boolean as to whether or not 'v' is either a lowercase or
- * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
- * compile-time constant, the generated code is better than some optimizing
- * compilers figure out, amounting to a mask and test.  The results are
- * meaningless if 'c' is not one of [A-Za-z] */
-#define isARG2_lower_or_UPPER_ARG1(c, v) \
-                              (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
-
 /*
  * Forward declarations for pregcomp()'s friends.
  */
@@ -3518,8 +3500,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
                 }
 
                 if (len == 2
-                    && isARG2_lower_or_UPPER_ARG1('s', *s)
-                    && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
+                    && isALPHA_FOLD_EQ(*s, 's')
+                    && isALPHA_FOLD_EQ(*(s+1), 's'))
                 {
 
                     /* EXACTF nodes need to know that the minimum length
@@ -12129,9 +12111,8 @@ tryagain:
                             && (PL_fold[ender] != PL_fold_latin1[ender]
                                 || ender == LATIN_SMALL_LETTER_SHARP_S
                                 || (len > 0
-                                   && isARG2_lower_or_UPPER_ARG1('s', ender)
-                                   && isARG2_lower_or_UPPER_ARG1('s',
-                                                                 *(s-1)))))
+                                   && isALPHA_FOLD_EQ(ender, 's')
+                                   && isALPHA_FOLD_EQ(*(s-1), 's'))))
                         {
                             maybe_exactfu = FALSE;
                         }
@@ -12315,7 +12296,7 @@ tryagain:
                      * as if it turns into an EXACTFU, it could later get
                      * joined with another 's' that would then wrongly match
                      * the sharp s */
-                    if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
+                    if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
                     {
                         maybe_exactfu = FALSE;
                     }
diff --git a/sv.c b/sv.c
index 5b602952ef..017ab87178 100644
--- a/sv.c
+++ b/sv.c
@@ -8655,7 +8655,7 @@ Perl_sv_inc_nomg(pTHX_ SV *const sv)
 	     * arranged in order (although not consecutively) and that only
 	     * [A-Za-z] are accepted by isALPHA in the C locale.
 	     */
-	    if (*d != 'z' && *d != 'Z') {
+	    if (isALPHA_FOLD_NE(*d, 'z')) {
 		do { ++*d; } while (!isALPHA(*d));
 		return;
 	    }
@@ -9743,7 +9743,7 @@ Perl_sv_reftype(pTHX_ const SV *const sv, const int ob)
 	case SVt_PVLV:		return (char *)  (SvROK(sv) ? "REF"
 				/* tied lvalues should appear to be
 				 * scalars for backwards compatibility */
-				: (LvTYPE(sv) == 't' || LvTYPE(sv) == 'T')
+				: (isALPHA_FOLD_EQ(LvTYPE(sv), 't'))
 				    ? "SCALAR" : "LVALUE");
 	case SVt_PVAV:		return "ARRAY";
 	case SVt_PVHV:		return "HASH";
@@ -11739,12 +11739,12 @@ Perl_sv_vcatpvfn_flags(pTHX_ SV *const sv, const char *const pat, const STRLEN p
              * nan/inf/-inf, so let's avoid calling that on those
              * three values. nv * 0 will be NaN for NaN, +Inf and -Inf,
              * and 0 for anything else. */
-	    if (c != 'e' && c != 'E' && (nv * 0) == 0) {
+	    if (isALPHA_FOLD_NE(c, 'e') && (nv * 0) == 0) {
                 i = PERL_INT_MIN;
                 (void)Perl_frexp(nv, &i);
                 if (i == PERL_INT_MIN)
                     Perl_die(aTHX_ "panic: frexp");
-                hexfp = (c == 'a' || c == 'A');
+                hexfp = isALPHA_FOLD_EQ(c, 'a');
                 if (UNLIKELY(hexfp)) {
                     /* Hexadecimal floating point: this size
                      * computation probably overshoots, but that is
diff --git a/toke.c b/toke.c
index d2e9eee735..dee6f42ff1 100644
--- a/toke.c
+++ b/toke.c
@@ -4788,7 +4788,7 @@ Perl_yylex(pTHX)
 		     * line contains "Perl" rather than "perl" */
 		    if (!d) {
 			for (d = ipathend-4; d >= ipath; --d) {
-			    if ((*d == 'p' || *d == 'P')
+			    if (isALPHA_FOLD_EQ(*d, 'p')
 				&& !ibcmp(d, "perl", 4))
 			    {
 				break;
@@ -4870,7 +4870,7 @@ Perl_yylex(pTHX)
 				    != PL_unicode)
 				    baduni = TRUE;
 			    }
-			    if (baduni || *d1 == 'M' || *d1 == 'm') {
+			    if (baduni || isALPHA_FOLD_EQ(*d1, 'M')) {
 				const char * const m = d1;
 				while (*d1 && !isSPACE(*d1))
 				    d1++;
@@ -9868,17 +9868,17 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
 	    const char *base, *Base, *max;
 
 	    /* check for hex */
-	    if (s[1] == 'x' || s[1] == 'X') {
+	    if (isALPHA_FOLD_EQ(s[1], 'x')) {
 		shift = 4;
 		s += 2;
 		just_zero = FALSE;
-	    } else if (s[1] == 'b' || s[1] == 'B') {
+	    } else if (isALPHA_FOLD_EQ(s[1], 'b')) {
 		shift = 1;
 		s += 2;
 		just_zero = FALSE;
 	    }
 	    /* check for a decimal in disguise */
-	    else if (s[1] == '.' || s[1] == 'e' || s[1] == 'E')
+	    else if (s[1] == '.' || isALPHA_FOLD_EQ(s[1], 'e'))
 		goto decimal;
 	    /* so it must be octal */
 	    else {
@@ -9982,8 +9982,8 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
                      * to avoid matching ".." */
 #define HEXFP_PEEK(s) \
 	(((s[0] == '.') && \
-	  (isXDIGIT(s[1]) || s[1] == 'p' || s[1] == 'P')) \
-	 || s[0] == 'p' || s[0] == 'P')
+	  (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) \
+	 || isALPHA_FOLD_EQ(s[0], 'p'))
                     if (UNLIKELY(HEXFP_PEEK(s))) {
                         goto out;
                     }
@@ -10044,7 +10044,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
                         total_bits--;
                 }
 
-                if (total_bits > 0 && (*h == 'p' || *h == 'P')) {
+                if (total_bits > 0 && (isALPHA_FOLD_EQ(*h, 'p'))) {
                     bool negexp = FALSE;
                     h++;
                     if (*h == '+')
@@ -10203,18 +10203,19 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
 	}
 
 	/* read exponent part, if present */
-	if (((*s == 'e' || *s == 'E') ||
-             UNLIKELY(hexfp && (*s == 'p' || *s == 'P'))) &&
-            strchr("+-0123456789_", s[1])) {
+	if ((isALPHA_FOLD_EQ(*s, 'e')
+              || UNLIKELY(hexfp && isALPHA_FOLD_EQ(*s, 'p')))
+            && strchr("+-0123456789_", s[1]))
+        {
             floatit = TRUE;
 
 	    /* regardless of whether user said 3E5 or 3e5, use lower 'e',
                ditto for p (hexfloats) */
-            if ((*s == 'e' || *s == 'E')) {
+            if ((isALPHA_FOLD_EQ(*s, 'e'))) {
 		/* At least some Mach atof()s don't grok 'E' */
                 *d++ = 'e';
             }
-            else if (UNLIKELY(hexfp && (*s == 'p' || *s == 'P'))) {
+            else if (UNLIKELY(hexfp && (isALPHA_FOLD_EQ(*s, 'p')))) {
                 *d++ = 'p';
             }
 
diff --git a/utf8.h b/utf8.h
index 613389cc49..d3b55ee694 100644
--- a/utf8.h
+++ b/utf8.h
@@ -577,8 +577,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 	 (ANYOF_NONBITMAP(node)) && \
 	 (ANYOF_FLAGS(node) & ANYOF_LOC_NONBITMAP_FOLD) && \
 	 ((end) > (input) + 1) && \
-	 toFOLD((input)[0]) == 's' && \
-	 toFOLD((input)[1]) == 's')
+	 isALPHA_FOLD_EQ((input)[0], 's'))
 
 #define SHARP_S_SKIP 2
author	Karl Williamson <khw@cpan.org>	2014-08-21 17:29:10 -0600
committer	Karl Williamson <khw@cpan.org>	2014-08-22 12:14:59 -0600
commit	305b86516461e93877909338ac3642c6ac09b651 (patch)
tree	d5d7b3c47a3af1537ffb549ee3eacf937bcd9571
parent	b51533f3c738c0d34d686dc15720c781f1043802 (diff)
download	perl-305b86516461e93877909338ac3642c6ac09b651.tar.gz