diff options
author | Karl Williamson <khw@cpan.org> | 2016-02-09 10:40:38 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-02-09 23:30:53 -0700 |
commit | c6871b765d934773fb82823602c1e03e073c38ac (patch) | |
tree | ff9962440fdd86916e5ebe1953801dbfe6f372cf | |
parent | 54e70d91fbed63597a6d5ea1113abcb56b8a22c0 (diff) | |
download | perl-c6871b765d934773fb82823602c1e03e073c38ac.tar.gz |
regcomp.c: Add code to compute edit distance (Damerau–Levenshtein)
This will be used in a future commit.
This code is taken from CPAN Text::Levenshtein::Damerau::XS with the
author's knowledge. There have been white-space changes to make it
conform better to perl's core coding standards, and declaration changes
to make it more portable, such as using UV instead of 'unsigned int',
and PERL_STATIC_INLINE instead of a less portable form, but the logic is
unchanged. One variable was changed to signed from unsigned to avoid a
warning message from some compilers.
The author and I will decide later about keeping the cpan module and
this code in sync. It changes very rarely.
-rw-r--r-- | embed.fnc | 5 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | proto.h | 5 | ||||
-rw-r--r-- | regcomp.c | 145 |
4 files changed, 156 insertions, 0 deletions
@@ -2244,6 +2244,11 @@ Es |I32 |make_trie |NN RExC_state_t *pRExC_state \ Es |regnode *|construct_ahocorasick_from_trie|NN RExC_state_t *pRExC_state \ |NN regnode *source|U32 depth EnPs |const char *|cntrl_to_mnemonic|const U8 c +EnPs |int |edit_distance |NN const UV *src \ + |NN const UV *tgt \ + |const STRLEN x \ + |const STRLEN y \ + |const SSize_t maxDistance # ifdef DEBUGGING Es |void |regdump_intflags|NULLOK const char *lead| const U32 flags Es |void |regdump_extflags|NULLOK const char *lead| const U32 flags @@ -997,6 +997,7 @@ #define compute_EXACTish S_compute_EXACTish #define construct_ahocorasick_from_trie(a,b,c) S_construct_ahocorasick_from_trie(aTHX_ a,b,c) #define could_it_be_a_POSIX_class S_could_it_be_a_POSIX_class +#define edit_distance S_edit_distance #define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b) #define get_invlist_iter_addr S_get_invlist_iter_addr #define grok_bslash_N(a,b,c,d,e,f) S_grok_bslash_N(aTHX_ a,b,c,d,e,f) @@ -4737,6 +4737,11 @@ STATIC regnode * S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_sta STATIC bool S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state); #define PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS \ assert(pRExC_state) +STATIC int S_edit_distance(const UV *src, const UV *tgt, const STRLEN x, const STRLEN y, const SSize_t maxDistance) + __attribute__pure__; +#define PERL_ARGS_ASSERT_EDIT_DISTANCE \ + assert(src); assert(tgt) + STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass* const node); #define PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC \ assert(pRExC_state); assert(node) @@ -997,6 +997,151 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \ PerlIO_printf(Perl_debug_log,"\n"); \ }); +/* ========================================================= + * BEGIN edit_distance stuff. + * + * This calculates how many single character changes of any type are needed to + * transform a string into another one. It is taken from version 3.1 of + * + * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS + */ + +/* Our unsorted dictionary linked list. */ +/* Note we use UVs, not chars. */ + +struct dictionary{ + UV key; + UV value; + struct dictionary* next; +}; +typedef struct dictionary item; + + +PERL_STATIC_INLINE item* +push(UV key,item* curr) +{ + item* head; + Newxz(head, 1, item); + head->key = key; + head->value = 0; + head->next = curr; + return head; +} + + +PERL_STATIC_INLINE item* +find(item* head, UV key) +{ + item* iterator = head; + while (iterator){ + if (iterator->key == key){ + return iterator; + } + iterator = iterator->next; + } + + return NULL; +} + +PERL_STATIC_INLINE item* +uniquePush(item* head,UV key) +{ + item* iterator = head; + + while (iterator){ + if (iterator->key == key) { + return head; + } + iterator = iterator->next; + } + + return push(key,head); +} + +PERL_STATIC_INLINE void +dict_free(item* head) +{ + item* iterator = head; + + while (iterator) { + item* temp = iterator; + iterator = iterator->next; + Safefree(temp); + } + + head = NULL; +} + +/* End of Dictionary Stuff */ + +/* All calculations/work are done here */ +STATIC int +S_edit_distance(const UV* src, + const UV* tgt, + const STRLEN x, /* length of src[] */ + const STRLEN y, /* length of tgt[] */ + const SSize_t maxDistance +) +{ + item *head = NULL; + UV swapCount,swapScore,targetCharCount,i,j; + UV *scores; + UV score_ceil = x + y; + + PERL_ARGS_ASSERT_EDIT_DISTANCE; + + /* intialize matrix start values */ + Newxz(scores, ( (x + 2) * (y + 2)), UV); + scores[0] = score_ceil; + scores[1 * (y + 2) + 0] = score_ceil; + scores[0 * (y + 2) + 1] = score_ceil; + scores[1 * (y + 2) + 1] = 0; + head = uniquePush(uniquePush(head,src[0]),tgt[0]); + + /* work loops */ + /* i = src index */ + /* j = tgt index */ + for (i=1;i<=x;i++) { + if (i < x) + head = uniquePush(head,src[i]); + scores[(i+1) * (y + 2) + 1] = i; + scores[(i+1) * (y + 2) + 0] = score_ceil; + swapCount = 0; + + for (j=1;j<=y;j++) { + if (i == 1) { + if(j < y) + head = uniquePush(head,tgt[j]); + scores[1 * (y + 2) + (j + 1)] = j; + scores[0 * (y + 2) + (j + 1)] = score_ceil; + } + + targetCharCount = find(head,tgt[j-1])->value; + swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount; + + if (src[i-1] != tgt[j-1]){ + scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1)); + } + else { + swapCount = j; + scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore); + } + } + + find(head,src[i-1])->value = i; + } + + { + IV score = scores[(x+1) * (y + 2) + (y + 1)]; + dict_free(head); + Safefree(scores); + return (maxDistance != 0 && maxDistance < score)?(-1):score; + } +} + +/* END of edit_distance() stuff + * ========================================================= */ + /* is c a control character for which we have a mnemonic? */ #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) |