summaryrefslogtreecommitdiff
path: root/handy.h
diff options
context:
space:
mode:
authorFather Chrysostomos <sprout@cpan.org>2010-11-14 06:46:27 -0800
committerFather Chrysostomos <sprout@cpan.org>2010-11-14 06:47:29 -0800
commitd7425188c3bef8a77425c103db57cf8cde99f5a0 (patch)
tree1a649e3245dda59642b418932fae58062cc9a02b /handy.h
parentb5d9a95357621a0a9d375ff6a83672c7f150655e (diff)
downloadperl-d7425188c3bef8a77425c103db57cf8cde99f5a0.tar.gz
[perl #74022] Parser hangs on some Unicode characters
This changes the definition of isIDFIRST_utf8 to avoid any characters that would put the parser in a loop. isIDFIRST_utf8 is used all over the place in toke.c. Almost every instance is followed by a call to S_scan_word. S_scan_word is only called when it is known that there is a word to scan. What was happening was that isIDFIRST_utf8 would accept a character, but S_scan_word in toke.c would then reject it, as it was using is_utf8_alnum, resulting in an infinite number of zero-length identifiers. Another possible solution was to change S_scan_word to use isIDFIRST_utf8 or similar, but that has back-compatibility problems, as it stops q·foo· from being a string and makes it an identifier instead.
Diffstat (limited to 'handy.h')
-rw-r--r--handy.h14
1 files changed, 10 insertions, 4 deletions
diff --git a/handy.h b/handy.h
index d966bfe0b6..7f086886c3 100644
--- a/handy.h
+++ b/handy.h
@@ -849,10 +849,16 @@ patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
#define isBLANK_LC_uni(c) isBLANK(c) /* could be wrong */
#define isALNUM_utf8(p) is_utf8_alnum(p)
-/* The ID_Start of Unicode is quite limiting: it assumes a L-class
- * character (meaning that you cannot have, say, a CJK character).
- * Instead, let's allow ID_Continue but not digits. */
-#define isIDFIRST_utf8(p) (is_utf8_idcont(p) && !is_utf8_digit(p))
+/* The ID_Start of Unicode was originally quite limiting: it assumed an
+ * L-class character (meaning that you could not have, say, a CJK charac-
+ * ter). So, instead, perl has for a long time allowed ID_Continue but
+ * not digits.
+ * We still preserve that for backward compatibility. But we also make sure
+ * that it is alphanumeric, so S_scan_word in toke.c will not hang. See
+ * http://rt.perl.org/rt3/Ticket/Display.html?id=74022
+ * for more detail than you ever wanted to know about. */
+#define isIDFIRST_utf8(p) \
+ (is_utf8_idcont(p) && !is_utf8_digit(p) && is_utf8_alnum(p))
#define isALPHA_utf8(p) is_utf8_alpha(p)
#define isSPACE_utf8(p) is_utf8_space(p)
#define isDIGIT_utf8(p) is_utf8_digit(p)