diff options
Diffstat (limited to 'ext/standard/levenshtein.c')
-rw-r--r-- | ext/standard/levenshtein.c | 162 |
1 files changed, 147 insertions, 15 deletions
diff --git a/ext/standard/levenshtein.c b/ext/standard/levenshtein.c index ccb767b6d8..31699be296 100644 --- a/ext/standard/levenshtein.c +++ b/ext/standard/levenshtein.c @@ -23,7 +23,8 @@ #include <ctype.h> #include "php_string.h" -static int calc_levdist(const char *s1, const char *s2) /* faster, but obfuscated */ +/* faster, but obfuscated, all operations have a cost of 1 */ +static int fastest_levdist(const char *s1, const char *s2) { register char *p1,*p2; register int i,j,n; @@ -68,7 +69,7 @@ static int calc_levdist(const char *s1, const char *s2) /* faster, but obfuscate /* fill initial row */ - n=(*s1!=*s2); + n=1; for(i=0,p1=r;i<l1;i++,*p1++=n++,p1++) {/*empty*/} /* calc. rowwise */ @@ -84,7 +85,7 @@ static int calc_levdist(const char *s1, const char *s2) /* faster, but obfuscate for(i=1;i<l1;i++) { if(*p1<n) n=*p1+(*(s1+i)!=*(s2)); /* replace cheaper than delete? */ p1++; - if(*++p1<n) n=*p1+1; /* insert cheaper then insert ? */ + if(*++p1<n) n=*p1+1; /* insert cheaper then replace ? */ *p2++=n++; /* update field and cost for next col's delete */ p2++; } @@ -95,26 +96,157 @@ static int calc_levdist(const char *s1, const char *s2) /* faster, but obfuscate } +static int weighted_levdist( const char *s1 + , const char *s2 + , const int cost_ins + , const int cost_rep + , const int cost_del + ) +{ + register int *p1,*p2; + register int i,j,n,c; + int l1=0,l2=0; + int r[512]; + const char *tmp; + + /* skip equal start sequence, if any */ + while(*s1==*s2) { + if(!*s1) break; + s1++; s2++; + } + + /* if we already used up one string, then + the result is the length of the other */ + if(*s1=='\0') return strlen(s2); + if(*s2=='\0') return strlen(s1); + + /* length count */ + while(*s1++) l1++; + while(*s2++) l2++; + + /* cut of equal tail sequence, if any */ + while(*--s1 == *--s2) { + l1--; l2--; + } + + + /* reset pointers, adjust length */ + s1-=l1++; + s2-=l2++; + + + /* possible dist to great? */ + if(abs(l1-l2)>=255) return -1; + + /* swap if l2 longer than l1 */ + if(l1<l2) { + tmp=s1; s1=s2; s2=tmp; + l1 ^= l2; l2 ^= l1; l1 ^= l2; + } + + if((l1==1)&&(l2==1)) { + n= cost_del+cost_ins; + return n<cost_rep?n:cost_rep; + } + + /* fill initial row */ + n=cost_ins; + for(i=0,p1=r;i<l1;i++,*p1++=n,p1++) {n+=cost_ins;} + + /* calc. rowwise */ + for(j=1;j<l2;j++) { + /* init pointers and col#0 */ + p1 = r + !(j&1); + p2 = r + (j&1); + n=*p1+cost_del; + *p2++=n;p2++; + s2++; + + /* foreach column */ + for(i=1;i<l1;i++) { + c = *p1; if(*(s1+i)!=*(s2)) c+=cost_rep; + if(c<n) n=c; /* replace cheaper than delete? */ + p1++; + c= *++p1+cost_ins; + if(c<n) n=c; /* insert cheaper then replace ? */ + *p2++=n; /* update field and cost for next col's delete */ + n+=cost_del; /* update field and cost for next col's delete */ + p2++; + } + } + + /* return result */ + return n-=cost_del; +} + +int custom_levdist(char *str1,char *str2,char *callback_name) +{ + php_error(E_WARNING,"the general Levenshtein support is not there yet"); + /* not there yet */ + + return -1; +} + + /* {{{ proto int levenshtein(string str1, string str2) Calculate Levenshtein distance between two strings */ PHP_FUNCTION(levenshtein) { - zval **str1, **str2; - int l; + zval **str1, **str2, **cost_ins, **cost_rep, **cost_del,**callback_name; + int distance=-1; - if (ZEND_NUM_ARGS() != 2 || zend_get_parameters_ex(2, &str1, &str2) == FAILURE) { + switch(ZEND_NUM_ARGS()) { + case 2: /* just two string: use maximum performance version */ + if (zend_get_parameters_ex(2, &str1, &str2) == FAILURE) { + WRONG_PARAM_COUNT; + } + convert_to_string_ex(str1); + convert_to_string_ex(str2); + + distance = fastest_levdist((*str1)->value.str.val, (*str2)->value.str.val); + break; + + case 5: /* more general version: calc cost by ins/rep/del weights */ + if (zend_get_parameters_ex(5, &str1, &str2, &cost_ins, &cost_rep, &cost_del) == FAILURE) { + WRONG_PARAM_COUNT; + } + convert_to_string_ex(str1); + convert_to_string_ex(str2); + convert_to_long_ex(cost_ins); + convert_to_long_ex(cost_rep); + convert_to_long_ex(cost_del); + + distance = weighted_levdist((*str1)->value.str.val + , (*str2)->value.str.val + , (*cost_ins)->value.lval + , (*cost_rep)->value.lval + , (*cost_del)->value.lval + ); + + break; + + case 3: /* most general version: calc cost by user-supplied function */ + if (zend_get_parameters_ex(3, &str1, &str2, &callback_name) == FAILURE) { + WRONG_PARAM_COUNT; + } + convert_to_string_ex(str1); + convert_to_string_ex(str2); + convert_to_string_ex(callback_name); + + distance = custom_levdist((*str1)->value.str.val + , (*str2)->value.str.val + , (*callback_name)->value.str.lval + ); + break; + + default: WRONG_PARAM_COUNT; - } - convert_to_string_ex(str1); - convert_to_string_ex(str2); - - l = calc_levdist((*str1)->value.str.val, (*str2)->value.str.val); + } - if(l<0) { + if(distance<0) { php_error(E_WARNING,"levenshtein(): argument string(s) too long"); } - - RETURN_LONG(l); + + RETURN_LONG(distance); } /* }}} */ - |