summaryrefslogtreecommitdiff
path: root/embed.h
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2016-02-09 11:50:04 -0700
committer: Karl Williamson <khw@cpan.org> 2016-02-09 23:30:54 -0700
commit: 46d34d0e1e7de87f74f8b2df4b32f291baf21dbb (patch)
tree: d1c55a71af5488e197ca31951e49f480adb1c325 /embed.h
parent: d8fd4ea0c782a6d356681b28eb35e215d74e4ccd (diff)
download: perl-46d34d0e1e7de87f74f8b2df4b32f291baf21dbb.tar.gz
PATCH: [perl #8904] Revamp [:posix:] parsing
A problem with bracketed character classes, such as qr/[foo]/, is that they have very little structure, so almost anything is legal, and typos silently compile into something unintended. One kind of possible component is the POSIX character class. There are 14 of them, and they have a very restricted structure, which is easy to get slightly wrong, so that instead of the intended POSIX class being compiled, something else is silently created. This commit causes the regex compiler to look for slightly misspelled POSIX character classes and to raise a warning when one is found. It does not change the results of the compilation. To do this, it introduces fuzzy parsing into the regex compiler, using the Damerau-Levenshtein algorithm to find out how many single-character edits it would take to transform the input into one of the 14 classes. If the input is 1 or 2 edits off, the compiler considers it to have been intended to be that class and raises the warning. If more edits would be needed, it remains silent. This is a heuristic: someone could make enough typos that the compiler concludes no class was intended when one actually was. Conversely, it could raise a warning when no class was intended, though warnings only happen when the input very closely resembles one of the 14 legal POSIX classes. The algorithm can be tweaked if experience indicates it should be. But the bottom line is that many more cases of unintended results will now be warned about. Things like having blanks in the construct and having the '^' before the colon are recognized as being intended POSIX classes (given that the actual names are close to one of the 14), and raise warnings. Again, this commit does not change what gets compiled. This found a bug in autodoc.pl, which was fixed a few commits ago. The [. .] and [= =] POSIX constructs cause perl to croak that they are unimplemented. This commit improves the parsing of these two, and fixes some false positives. 
See http://nntp.perl.org/group/perl.perl5.porters/230975 The new code combines two functions in regcomp.c into one new one.
Diffstat (limited to 'embed.h')
-rw-r--r-- embed.h 5
1 files changed, 2 insertions, 3 deletions
diff --git a/embed.h b/embed.h
index 82b7ced268..ab70dbb06b 100644
--- a/embed.h
+++ b/embed.h
@@ -996,11 +996,11 @@
#define cntrl_to_mnemonic S_cntrl_to_mnemonic
#define compute_EXACTish S_compute_EXACTish
#define construct_ahocorasick_from_trie(a,b,c) S_construct_ahocorasick_from_trie(aTHX_ a,b,c)
-#define could_it_be_a_POSIX_class S_could_it_be_a_POSIX_class
#define edit_distance S_edit_distance
#define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b)
#define get_invlist_iter_addr S_get_invlist_iter_addr
#define grok_bslash_N(a,b,c,d,e,f) S_grok_bslash_N(aTHX_ a,b,c,d,e,f)
+#define handle_possible_posix(a,b,c,d) S_handle_possible_posix(aTHX_ a,b,c,d)
#define handle_regex_sets(a,b,c,d,e) S_handle_regex_sets(aTHX_ a,b,c,d,e)
#define invlist_clone(a) S_invlist_clone(aTHX_ a)
#define invlist_extend(a,b) S_invlist_extend(aTHX_ a,b)
@@ -1025,12 +1025,11 @@
#define reganode(a,b,c) S_reganode(aTHX_ a,b,c)
#define regatom(a,b,c) S_regatom(aTHX_ a,b,c)
#define regbranch(a,b,c,d) S_regbranch(aTHX_ a,b,c,d)
-#define regclass(a,b,c,d,e,f,g,h,i) S_regclass(aTHX_ a,b,c,d,e,f,g,h,i)
+#define regclass(a,b,c,d,e,f,g,h,i,j) S_regclass(aTHX_ a,b,c,d,e,f,g,h,i,j)
#define regex_set_precedence S_regex_set_precedence
#define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d)
#define regnode_guts(a,b,c,d) S_regnode_guts(aTHX_ a,b,c,d)
#define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c)
-#define regpposixcc(a,b,c) S_regpposixcc(aTHX_ a,b,c)
#define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d)
#define scan_commit(a,b,c,d) S_scan_commit(aTHX_ a,b,c,d)
#define set_ANYOF_arg(a,b,c,d,e,f,g) S_set_ANYOF_arg(aTHX_ a,b,c,d,e,f,g)