summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2016-02-09 10:40:38 -0700
committerKarl Williamson <khw@cpan.org>2016-02-09 23:30:53 -0700
commitc6871b765d934773fb82823602c1e03e073c38ac (patch)
treeff9962440fdd86916e5ebe1953801dbfe6f372cf
parent54e70d91fbed63597a6d5ea1113abcb56b8a22c0 (diff)
downloadperl-c6871b765d934773fb82823602c1e03e073c38ac.tar.gz
regcomp.c: Add code to compute edit distance (Damerau–Levenshtein)
This will be used in a future commit. This code is taken from CPAN Text::Levenshtein::Damerau::XS with the author's knowledge. There have been white-space changes to make it conform better to perl's core coding standards, and declaration changes to make it more portable, such as using UV instead of 'unsigned int', and PERL_STATIC_INLINE instead of a less portable form, but the logic is unchanged. One variable was changed to signed from unsigned to avoid a warning message from some compilers. The author and I will decide later about keeping the cpan module and this code in sync. It changes very rarely.
-rw-r--r--embed.fnc5
-rw-r--r--embed.h1
-rw-r--r--proto.h5
-rw-r--r--regcomp.c145
4 files changed, 156 insertions, 0 deletions
diff --git a/embed.fnc b/embed.fnc
index 97ecfa3140..020f432e88 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2244,6 +2244,11 @@ Es |I32 |make_trie |NN RExC_state_t *pRExC_state \
Es |regnode *|construct_ahocorasick_from_trie|NN RExC_state_t *pRExC_state \
|NN regnode *source|U32 depth
EnPs |const char *|cntrl_to_mnemonic|const U8 c
+EnPs |int |edit_distance |NN const UV *src \
+ |NN const UV *tgt \
+ |const STRLEN x \
+ |const STRLEN y \
+ |const SSize_t maxDistance
# ifdef DEBUGGING
Es |void |regdump_intflags|NULLOK const char *lead| const U32 flags
Es |void |regdump_extflags|NULLOK const char *lead| const U32 flags
diff --git a/embed.h b/embed.h
index 24b61bff4d..82b7ced268 100644
--- a/embed.h
+++ b/embed.h
@@ -997,6 +997,7 @@
#define compute_EXACTish S_compute_EXACTish
#define construct_ahocorasick_from_trie(a,b,c) S_construct_ahocorasick_from_trie(aTHX_ a,b,c)
#define could_it_be_a_POSIX_class S_could_it_be_a_POSIX_class
+#define edit_distance S_edit_distance
#define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b)
#define get_invlist_iter_addr S_get_invlist_iter_addr
#define grok_bslash_N(a,b,c,d,e,f) S_grok_bslash_N(aTHX_ a,b,c,d,e,f)
diff --git a/proto.h b/proto.h
index 512a264287..6bb11c860b 100644
--- a/proto.h
+++ b/proto.h
@@ -4737,6 +4737,11 @@ STATIC regnode * S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_sta
STATIC bool S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state);
#define PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS \
assert(pRExC_state)
+STATIC int S_edit_distance(const UV *src, const UV *tgt, const STRLEN x, const STRLEN y, const SSize_t maxDistance)
+ __attribute__pure__;
+#define PERL_ARGS_ASSERT_EDIT_DISTANCE \
+ assert(src); assert(tgt)
+
STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass* const node);
#define PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC \
assert(pRExC_state); assert(node)
diff --git a/regcomp.c b/regcomp.c
index d6a3b5b363..d6e4545867 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -997,6 +997,151 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \
PerlIO_printf(Perl_debug_log,"\n"); \
});
+/* =========================================================
+ * BEGIN edit_distance stuff.
+ *
+ * This calculates how many single character changes of any type are needed to
+ * transform a string into another one. It is taken from version 3.1 of
+ *
+ * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
+ */
+
+/* Our unsorted dictionary linked list. */
+/* Note we use UVs, not chars. */
+
+struct dictionary{
+ UV key;
+ UV value;
+ struct dictionary* next;
+};
+typedef struct dictionary item;
+
+
+PERL_STATIC_INLINE item*
+push(UV key,item* curr)
+{
+ item* head;
+ Newxz(head, 1, item);
+ head->key = key;
+ head->value = 0;
+ head->next = curr;
+ return head;
+}
+
+
+PERL_STATIC_INLINE item*
+find(item* head, UV key)
+{
+ item* iterator = head;
+ while (iterator){
+ if (iterator->key == key){
+ return iterator;
+ }
+ iterator = iterator->next;
+ }
+
+ return NULL;
+}
+
+PERL_STATIC_INLINE item*
+uniquePush(item* head,UV key)
+{
+ item* iterator = head;
+
+ while (iterator){
+ if (iterator->key == key) {
+ return head;
+ }
+ iterator = iterator->next;
+ }
+
+ return push(key,head);
+}
+
+PERL_STATIC_INLINE void
+dict_free(item* head)
+{
+ item* iterator = head;
+
+ while (iterator) {
+ item* temp = iterator;
+ iterator = iterator->next;
+ Safefree(temp);
+ }
+
+ head = NULL;
+}
+
+/* End of Dictionary Stuff */
+
+/* All calculations/work are done here */
+STATIC int
+S_edit_distance(const UV* src,
+ const UV* tgt,
+ const STRLEN x, /* length of src[] */
+ const STRLEN y, /* length of tgt[] */
+ const SSize_t maxDistance
+)
+{
+ item *head = NULL;
+ UV swapCount,swapScore,targetCharCount,i,j;
+ UV *scores;
+ UV score_ceil = x + y;
+
+ PERL_ARGS_ASSERT_EDIT_DISTANCE;
+
+ /* intialize matrix start values */
+ Newxz(scores, ( (x + 2) * (y + 2)), UV);
+ scores[0] = score_ceil;
+ scores[1 * (y + 2) + 0] = score_ceil;
+ scores[0 * (y + 2) + 1] = score_ceil;
+ scores[1 * (y + 2) + 1] = 0;
+ head = uniquePush(uniquePush(head,src[0]),tgt[0]);
+
+ /* work loops */
+ /* i = src index */
+ /* j = tgt index */
+ for (i=1;i<=x;i++) {
+ if (i < x)
+ head = uniquePush(head,src[i]);
+ scores[(i+1) * (y + 2) + 1] = i;
+ scores[(i+1) * (y + 2) + 0] = score_ceil;
+ swapCount = 0;
+
+ for (j=1;j<=y;j++) {
+ if (i == 1) {
+ if(j < y)
+ head = uniquePush(head,tgt[j]);
+ scores[1 * (y + 2) + (j + 1)] = j;
+ scores[0 * (y + 2) + (j + 1)] = score_ceil;
+ }
+
+ targetCharCount = find(head,tgt[j-1])->value;
+ swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
+
+ if (src[i-1] != tgt[j-1]){
+ scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
+ }
+ else {
+ swapCount = j;
+ scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
+ }
+ }
+
+ find(head,src[i-1])->value = i;
+ }
+
+ {
+ IV score = scores[(x+1) * (y + 2) + (y + 1)];
+ dict_free(head);
+ Safefree(scores);
+ return (maxDistance != 0 && maxDistance < score)?(-1):score;
+ }
+}
+
+/* END of edit_distance() stuff
+ * ========================================================= */
+
/* is c a control character for which we have a mnemonic? */
#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)