diff options
author | Hartmut Holzgraefe <hholzgra@php.net> | 2000-12-13 23:26:19 +0000 |
---|---|---|
committer | Hartmut Holzgraefe <hholzgra@php.net> | 2000-12-13 23:26:19 +0000 |
commit | 63e6b0b5bf8049ab2d80863cc132ea667f7bbf76 (patch) | |
tree | 0faa2cdca8e850290fe3f5cf3483090a46a0d9a0 | |
parent | be895bcb96157b5a36d1183d3b9567ede57a45f8 (diff) | |
download | php-git-63e6b0b5bf8049ab2d80863cc132ea667f7bbf76.tar.gz |
levenshtein() fixed, regression tests added (bug id #6562 and #7368)
# fallback to unoptimized version for 4.0.4 release
-rw-r--r-- | ext/standard/levenshtein.c | 198 | ||||
-rw-r--r-- | ext/standard/tests/general_functions/003.phpt | 54 |
2 files changed, 97 insertions, 155 deletions
diff --git a/ext/standard/levenshtein.c b/ext/standard/levenshtein.c index d157cf7fab..080c86a327 100644 --- a/ext/standard/levenshtein.c +++ b/ext/standard/levenshtein.c @@ -23,162 +23,47 @@ #include <ctype.h> #include "php_string.h" -/* faster, but obfuscated, all operations have a cost of 1 */ -static int fastest_levdist(const char *s1, const char *s2) +/* reference implementation, only optimized for memory usage, not speed */ +static int reference_levdist(const char *s1, int l1, + const char *s2, int l2, + int cost_ins, int cost_rep, int cost_del ) { - register char *p1,*p2; - register int i,j,n; - int l1=0,l2=0; - char r[512]; - const char *tmp; - - /* skip equal start sequence, if any */ - while(*s1==*s2) { - if(!*s1) break; - s1++; s2++; - } - - /* if we already used up one string, then - the result is the length of the other */ - if(*s1=='\0') return strlen(s2); - if(*s2=='\0') return strlen(s1); - - /* length count */ - while(*s1++) l1++; - while(*s2++) l2++; - - /* cut of equal tail sequence, if any */ - while(*--s1 == *--s2) { - l1--; l2--; - } - - - /* reset pointers, adjust length */ - s1-=l1++; - s2-=l2++; + int *p1,*p2,*tmp; + int i1,i2,c0,c1,c2; + if(l1==0) return l2*cost_ins; + if(l2==0) return l1*cost_del; + + p1=malloc(l2*sizeof(int)); + p2=malloc(l2*sizeof(int)); + + p1[0]=(s1[0]==s2[0])?0:cost_rep; + + for(i2=1;i2<l2;i2++) + p1[i2]=i2*cost_ins; + + for(i1=1;i1<l1;i1++) + { + p2[0]=i1*cost_del; + for(i2=1;i2<l2;i2++) + { + c0=p1[i2-1]+((s1[i1]==s2[i2])?0:cost_rep); + c1=p1[i2]+cost_del; if(c1<c0) c0=c1; + c2=p2[i2-1]+cost_ins; if(c2<c0) c0=c2; + p2[i2]=c0; + } + tmp=p1; p1=p2; p2=tmp; + } - /* possible dist to great? */ - if(abs(l1-l2)>=255) return -1; + c0=p1[l2-1]; - /* swap if l2 longer than l1 */ - if(l1<l2) { - tmp=s1; s1=s2; s2=tmp; - l1 ^= l2; l2 ^= l1; l1 ^= l2; - } + free(p1); + free(p2); - - /* fill initial row */ - n=1; - for(i=0,p1=r;i<l1;i++,*p1++=n++,p1++) {/*empty*/} - - /* calc. rowwise */ - for(j=1;j<l2;j++) { - /* init pointers and col#0 */ - p1 = r + !(j&1); - p2 = r + (j&1); - n=*p1+1; - *p2++=n;p2++; - s2++; - - /* foreach column */ - for(i=1;i<l1;i++) { - if(*p1<n) n=*p1+(*(s1+i)!=*(s2)); /* replace cheaper than delete? */ - p1++; - if(*++p1<n) n=*p1+1; /* insert cheaper then replace ? */ - *p2++=n++; /* update field and cost for next col's delete */ - p2++; - } - } - - /* return result */ - return n-1; + return c0; } -static int weighted_levdist( const char *s1 - , const char *s2 - , const int cost_ins - , const int cost_rep - , const int cost_del - ) -{ - register int *p1,*p2; - register int i,j,n,c; - int l1=0,l2=0; - int r[512]; - const char *tmp; - - /* skip equal start sequence, if any */ - while(*s1==*s2) { - if(!*s1) break; - s1++; s2++; - } - - /* if we already used up one string, then - the result is the length of the other */ - if(*s1=='\0') return strlen(s2); - if(*s2=='\0') return strlen(s1); - - /* length count */ - while(*s1++) l1++; - while(*s2++) l2++; - - /* cut of equal tail sequence, if any */ - while(*--s1 == *--s2) { - l1--; l2--; - } - - - /* reset pointers, adjust length */ - s1-=l1++; - s2-=l2++; - - - /* possible dist to great? */ - if(abs(l1-l2)>=255) return -1; - - /* swap if l2 longer than l1 */ - if(l1<l2) { - tmp=s1; s1=s2; s2=tmp; - l1 ^= l2; l2 ^= l1; l1 ^= l2; - } - - if((l1==1)&&(l2==1)) { - n= cost_del+cost_ins; - return n<cost_rep?n:cost_rep; - } - - /* fill initial row */ - n=cost_ins; - for(i=0,p1=r;i<l1;i++,*p1++=n,p1++) {n+=cost_ins;} - - /* calc. rowwise */ - for(j=1;j<l2;j++) { - /* init pointers and col#0 */ - p1 = r + !(j&1); - p2 = r + (j&1); - n=*p1+cost_del; - *p2++=n;p2++; - s2++; - - /* foreach column */ - for(i=1;i<l1;i++) { - c = *p1; if(*(s1+i)!=*(s2)) c+=cost_rep; - if(c<n) n=c; /* replace cheaper than delete? */ - p1++; - c= *++p1+cost_ins; - if(c<n) n=c; /* insert cheaper then replace ? */ - *p2++=n; /* update field and cost for next col's delete */ - n+=cost_del; /* update field and cost for next col's delete */ - p2++; - } - } - - /* return result */ - return n-=cost_del; -} - static int custom_levdist(char *str1,char *str2,char *callback_name) { php_error(E_WARNING,"the general Levenshtein support is not there yet"); @@ -202,8 +87,11 @@ PHP_FUNCTION(levenshtein) } convert_to_string_ex(str1); convert_to_string_ex(str2); - - distance = fastest_levdist((*str1)->value.str.val, (*str2)->value.str.val); + + distance = reference_levdist((*str1)->value.str.val,(*str1)->value.str.len, + (*str2)->value.str.val,(*str2)->value.str.len, + 1,1,1); + break; case 5: /* more general version: calc cost by ins/rep/del weights */ @@ -216,11 +104,11 @@ PHP_FUNCTION(levenshtein) convert_to_long_ex(cost_rep); convert_to_long_ex(cost_del); - distance = weighted_levdist((*str1)->value.str.val - , (*str2)->value.str.val - , (*cost_ins)->value.lval - , (*cost_rep)->value.lval - , (*cost_del)->value.lval + distance = reference_levdist((*str1)->value.str.val,(*str1)->value.str.len, + (*str2)->value.str.val,(*str2)->value.str.len, + (*cost_ins)->value.lval, + (*cost_rep)->value.lval, + (*cost_del)->value.lval ); break; diff --git a/ext/standard/tests/general_functions/003.phpt b/ext/standard/tests/general_functions/003.phpt new file mode 100644 index 0000000000..22cbd4eb4e --- /dev/null +++ b/ext/standard/tests/general_functions/003.phpt @@ -0,0 +1,54 @@ +--TEST-- +levenshtein() function test +--POST-- +--GET-- +--FILE-- +<?php + +function test_me($title,$expect,$text1,$text2,$cost1=0,$cost2=0,$cost3=0) { + + if($cost1==0) + $result=levenshtein($text1,$text2); + else + $result=levenshtein($text1,$text2,$cost1,$cost2,$cost3); + + if($result==$expect) return 0; + + echo "$title: result is $result instead of $expect "; + echo "for '$text1'/'$text2' "; + if($cost1) echo "($cost1:$cost2:$cost3)"; + echo "\n"; + + return 1; +} + +$n=0; + +$n += test_me("equal" , 0, "12345", "12345"); +$n += test_me("1st empty" , 3, "", "xzy"); +$n += test_me("2nd empty" , 3, "xzy", ""); +$n += test_me("both empty" , 0, "", ""); +$n += test_me("1 char" , 1, "1", "2"); +$n += test_me("2 char swap", 2, "12", "21"); + +$n += test_me("inexpensive delete", 2, "2121", "11", 2, 1, 1); +$n += test_me("expensive delete" , 10, "2121", "11", 2, 1, 5); +$n += test_me("inexpensive insert", 2, "11", "2121", 1, 1, 1); +$n += test_me("expensive insert" , 10, "11", "2121", 5, 1, 1); + +$n += test_me("expensive replace" , 3, "111", "121", 2, 3, 2); +$n += test_me("very expensive replace", 4, "111", "121", 2, 9, 2); + +$n += test_me("bug #7368", 2, "13458", "12345"); +$n += test_me("bug #7368", 2, "1345", "1234"); + +$n += test_me("bug #6562", 1, "debugg", "debug"); +$n += test_me("bug #6562", 1, "ddebug", "debug"); +$n += test_me("bug #6562", 2, "debbbug", "debug"); +$n += test_me("bug #6562", 1, "debugging", "debuging"); + +echo ($n==0)?"all passed\n":"$n failed\n"; + +?> +--EXPECT-- +all passed |