summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/unicore/mktables2
-rw-r--r--lib/utf8_heavy.pl3
-rw-r--r--regcomp.c118
-rw-r--r--regcomp.h2
-rw-r--r--utf8.c1
5 files changed, 70 insertions, 56 deletions
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 7b7af71396..03f122b6f5 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -15633,7 +15633,7 @@ sub make_property_test_script() {
# others except DAge.txt (as data in an extracted file can be over-ridden by
# the non-extracted). Some other files depend on data derived from an earlier
# file, like UnicodeData requires data from Jamo, and the case changing and
-# folding requires data from Unicode. Mostly, it safest to order by first
+# folding requires data from Unicode. Mostly, it is safest to order by first
# version releases in (except the Jamo). DAge.txt is read before the
# extracted ones because of the rarely used feature $compare_versions. In the
# unlikely event that there were ever an extracted file that contained the Age
diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl
index 84a81676f0..95758f7287 100644
--- a/lib/utf8_heavy.pl
+++ b/lib/utf8_heavy.pl
@@ -105,9 +105,8 @@ sub _loose_name ($) {
if ($type)
{
-
# Verify that this isn't a recursive call for this property.
- # Can't use croak, as it may try to recurse here itself.
+ # Can't use croak, as it may try to recurse to here itself.
my $class_type = $class . "::$type";
if (grep { $_ eq $class_type } @recursed) {
CORE::die "panic: Infinite recursion in SWASHNEW for '$type'\n";
diff --git a/regcomp.c b/regcomp.c
index 4669c27a0b..6e27f1ef97 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -5840,8 +5840,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
* The 1th element is the first element beyond that not in the list. In other
* words, the first range is
* invlist[0]..(invlist[1]-1)
- * The other ranges follow. Thus every element that is divisible by two marks
- * the beginning of a range that is in the list, and every element not
+ * The other ranges follow. Thus every element whose index is divisible by two
+ * marks the beginning of a range that is in the list, and every element not
* divisible by two marks the beginning of a range not in the list. A single
* element inversion list that contains the single code point N generally
* consists of two elements
@@ -5922,7 +5922,8 @@ S_invlist_array(pTHX_ SV* const invlist)
PERL_ARGS_ASSERT_INVLIST_ARRAY;
- /* Must not be empty */
+ /* Must not be empty. If these fail, you probably didn't check for <len>
+ * being non-zero before trying to get the array */
assert(*get_invlist_len_addr(invlist));
assert(*get_invlist_zero_addr(invlist) == 0
|| *get_invlist_zero_addr(invlist) == 1);
@@ -5948,7 +5949,8 @@ S_get_invlist_len_addr(pTHX_ SV* invlist)
PERL_STATIC_INLINE UV
S_invlist_len(pTHX_ SV* const invlist)
{
- /* Returns the current number of elements in the inversion list's array */
+ /* Returns the current number of elements stored in the inversion list's
+ * array */
PERL_ARGS_ASSERT_INVLIST_LEN;
@@ -6059,7 +6061,6 @@ S_invlist_trim(pTHX_ SV* const invlist)
/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
* etc */
-
#define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
#define PREV_ELEMENT_IN_INVLIST_SET(i) (! ELEMENT_IN_INVLIST_SET(i))
@@ -6105,7 +6106,7 @@ Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV
}
else {
/* But if the end is the maximum representable on the machine,
- * just let the range that this would extend have no end */
+ * just let the range that this would extend to have no end */
invlist_set_len(invlist, len - 1);
}
return;
@@ -6145,7 +6146,7 @@ void
Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
{
/* Take the union of two inversion lists and point 'output' to it. If
- * 'result' on input points to one of the two lists, the reference count to
+ * 'output' on input points to one of the two lists, the reference count to
* that list will be decremented.
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
@@ -6191,8 +6192,7 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
}
else if (output != &b) {
*output = invlist_clone(b);
- }
- /* else *output already = b; */
+ } /* else *output already = b; */
return;
}
else if ((len_b = invlist_len(b)) == 0) {
@@ -6636,8 +6636,8 @@ S_invlist_clone(pTHX_ SV* const invlist)
void
Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
{
- /* Point result to an inversion list which consists of all elements in 'a'
- * that aren't also in 'b' */
+ /* Point <result> to an inversion list which consists of all elements in
+ * <a> that aren't also in <b> */
PERL_ARGS_ASSERT__INVLIST_SUBTRACT;
@@ -6687,6 +6687,13 @@ S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
STATIC bool
S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
{
+ /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
+ * This call sets in <*start> and <*end>, the next range in <invlist>.
+ * Returns <TRUE> if successful and the next call will return the next
+ * range; <FALSE> if it was already at the end of the list. If the latter,
+ * <*start> and <*end> are unchanged, and the next call to this function
+ * will start over at the beginning of the list */
+
UV* pos = get_invlist_iter_addr(invlist);
UV len = invlist_len(invlist);
UV *array;
@@ -10469,10 +10476,10 @@ parseit:
}
}
- /* Only the characters in this class that participate in folds need
- * be checked. Get the intersection of this class and all the
- * possible characters that are foldable. This can quickly narrow
- * down a large class */
+ /* Only the characters in this class that participate in folds need be
+ * checked. Get the intersection of this class and all the possible
+ * characters that are foldable. This can quickly narrow down a large
+ * class */
_invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
/* Now look at the foldable characters in this class individually */
@@ -10491,23 +10498,22 @@ parseit:
if (foldlen > (STRLEN)UNISKIP(f)) {
- /* Any multicharacter foldings (disallowed in
- * lookbehind patterns) require the following
- * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
- * E folds into "pq" and F folds into "rst", all other
- * characters fold to single characters. We save away
- * these multicharacter foldings, to be later saved as
- * part of the additional "s" data. */
+ /* Any multicharacter foldings (disallowed in lookbehind
+ * patterns) require the following transform: [ABCDEF] ->
+ * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
+ * folds into "rst", all other characters fold to single
+ * characters. We save away these multicharacter foldings,
+ * to be later saved as part of the additional "s" data. */
if (! RExC_in_lookbehind) {
U8* loc = foldbuf;
U8* e = foldbuf + foldlen;
- /* If any of the folded characters of this are in
- * the Latin1 range, tell the regex engine that
- * this can match a non-utf8 target string. The
- * only multi-byte fold whose source is in the
- * Latin1 range (U+00DF) applies only when the
- * target string is utf8, or under unicode rules */
+ /* If any of the folded characters of this are in the
+ * Latin1 range, tell the regex engine that this can
+ * match a non-utf8 target string. The only multi-byte
+ * fold whose source is in the Latin1 range (U+00DF)
+ * applies only when the target string is utf8, or
+ * under unicode rules */
if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
while (loc < e) {
@@ -10520,8 +10526,8 @@ parseit:
if (UTF8_IS_INVARIANT(*loc)
|| UTF8_IS_DOWNGRADEABLE_START(*loc))
{
- /* Can't mix above and below 256 under
- * LOC */
+ /* Can't mix above and below 256 under LOC
+ */
if (LOC) {
goto end_multi_fold;
}
@@ -10551,13 +10557,13 @@ parseit:
}
else {
/* Single character fold. Add everything in its fold
- * closure to the list that this node should match */
+ * closure to the list that this node should match */
SV** listp;
- /* The fold closures data structure is a hash with the
- * keys being every character that is folded to, like
- * 'k', and the values each an array of everything that
- * folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
+ /* The fold closures data structure is a hash with the keys
+ * being every character that is folded to, like 'k', and
+ * the values each an array of everything that folds to its
+ * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
if ((listp = hv_fetch(PL_utf8_foldclosures,
(char *) foldbuf, foldlen, FALSE)))
{
@@ -10571,9 +10577,9 @@ parseit:
}
c = SvUV(*c_p);
- /* /aa doesn't allow folds between ASCII and
- * non-; /l doesn't allow them between above
- * and below 256 */
+ /* /aa doesn't allow folds between ASCII and non-;
+ * /l doesn't allow them between above and below
+ * 256 */
if ((MORE_ASCII_RESTRICTED
&& (isASCII(c) != isASCII(j)))
|| (LOC && ((c < 256) != (j < 256))))
@@ -10587,9 +10593,9 @@ parseit:
(U8) c,
&l1_fold_invlist, &unicode_alternate);
}
- /* It may be that the code point is already
- * in this range or already in the bitmap,
- * in which case we need do nothing */
+ /* It may be that the code point is already in
+ * this range or already in the bitmap, in
+ * which case we need do nothing */
else if ((c < start || c > end)
&& (c > 255
|| ! ANYOF_BITMAP_TEST(ret, c)))
@@ -10616,21 +10622,25 @@ parseit:
}
/* Here, we have calculated what code points should be in the character
- * class. Now we can see about various optimizations. Fold calculation
- * needs to take place before inversion. Otherwise /[^k]/i would invert to
- * include K, which under /i would match k. */
+ * class.
+ *
+ * Now we can see about various optimizations. Fold calculation (which we
+ * did above) needs to take place before inversion. Otherwise /[^k]/i
+ * would invert to include K, which under /i would match k, which it
+ * shouldn't. */
/* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
- * set the FOLD flag yet, so this this does optimize those. It doesn't
+ * set the FOLD flag yet, so this does optimize those. It doesn't
* optimize locale. Doing so perhaps could be done as long as there is
* nothing like \w in it; some thought also would have to be given to the
* interaction with above 0x100 chars */
- if (! LOC
- && (ANYOF_FLAGS(ret) & ANYOF_INVERT)
+ if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
+ && ! LOC
&& ! unicode_alternate
/* In case of /d, there are some things that should match only when
* not in the bitmap, i.e., they require UTF8 to match. These are
- * listed in nonbitmap. */
+ * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
+ * case, they don't require UTF8, so can invert here */
&& (! nonbitmap
|| ! DEPENDS_SEMANTICS
|| (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
@@ -10657,6 +10667,8 @@ parseit:
ANYOF_BITMAP_SET(ret, value);
}
}
+
+ /* And do the removal */
_invlist_subtract(nonbitmap, remove_list, &nonbitmap);
SvREFCNT_dec(remove_list);
}
@@ -10794,11 +10806,11 @@ parseit:
/* The 0th element stores the character class description
* in its textual form: used later (regexec.c:Perl_regclass_swash())
* to initialize the appropriate swash (which gets stored in
- * the 1st element), and also useful for dumping the regnode.
- * The 2nd element stores the multicharacter foldings,
+ * element [1]), and also useful for dumping the regnode.
+ * Element [2] stores the multicharacter foldings,
* used later (regexec.c:S_reginclass()). */
av_store(av, 0, listsv);
- av_store(av, 1, NULL);
+ av_store(av, 1, NULL); /* Placeholder for generated swash */
/* Store any computed multi-char folds only if we are allowing
* them */
@@ -11610,14 +11622,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
sv_catpvs(sv, "{outside bitmap}");
if (ANYOF_NONBITMAP(o)) {
- SV *lv;
+ SV *lv; /* Set if there is something outside the bit map */
SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
if (lv) {
if (sw) {
U8 s[UTF8_MAXBYTES_CASE+1];
- for (i = 0; i <= 256; i++) { /* just the first 256 */
+ for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
uvchr_to_utf8(s, i);
if (i < 256 && swash_fetch(sw, s, TRUE)) {
diff --git a/regcomp.h b/regcomp.h
index 81c8a5ddd7..0540c637c0 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -311,6 +311,8 @@ struct regnode_charclass_class {
* are done to share them, as described below. If necessary, the ANYOF_LOCALE
* and ANYOF_CLASS bits could be shared with a space penalty for locale nodes,
* but this isn't quite so easy, as the optimizer also uses ANYOF_CLASS.
+ * Another option would be to push them into new nodes. E.g. there could be an
+ * ANYOF_LOCALE node that would be in place of the flag of the same name.
* Once the planned change to compile all the above-latin1 code points is done,
* then the UNICODE_ALL bit can be freed up, with a small performance penalty.
* If flags need to be added that are applicable to the synthetic start class
diff --git a/utf8.c b/utf8.c
index 23308a368d..61c6c23e03 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2586,6 +2586,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
PERL_ARGS_ASSERT_SWASH_FETCH;
+ /* Convert to utf8 if not already */
if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);