diff options
author | Hartmut Holzgraefe <hholzgra@php.net> | 2000-12-14 16:16:04 +0000 |
---|---|---|
committer | Hartmut Holzgraefe <hholzgra@php.net> | 2000-12-14 16:16:04 +0000 |
commit | e9c5c6ea7a872d5d8ead7f2f11cc54bb7caae252 (patch) | |
tree | 12b17065ed347847652a57a8fe1411a1b728558b | |
parent | 0493e78c8d1e9d6d4207d8846558a320083024dd (diff) | |
download | php-git-e9c5c6ea7a872d5d8ead7f2f11cc54bb7caae252.tar.gz |
revert my last patch
-rw-r--r-- | ext/standard/levenshtein.c | 310 | ||||
-rw-r--r-- | ext/standard/tests/general_functions/003.phpt | 54 |
2 files changed, 211 insertions, 153 deletions
diff --git a/ext/standard/levenshtein.c b/ext/standard/levenshtein.c index 6887946da2..d157cf7fab 100644 --- a/ext/standard/levenshtein.c +++ b/ext/standard/levenshtein.c @@ -23,53 +23,168 @@ #include <ctype.h> #include "php_string.h" -/* reference implementation, only optimized for memory usage, not speed */ -static int reference_levdist(const char *s1, int l1, - const char *s2, int l2, - int cost_ins, int cost_rep, int cost_del ) +/* faster, but obfuscated, all operations have a cost of 1 */ +static int fastest_levdist(const char *s1, const char *s2) { - int *p1,*p2,*tmp; - int i1,i2,c0,c1,c2; - - if(l1==0) return l2*cost_ins; - if(l2==0) return l1*cost_del; - - p1=malloc(l2*sizeof(int)); - p2=malloc(l2*sizeof(int)); - - p1[0]=(s1[0]==s2[0])?0:cost_rep; - - for(i2=1;i2<l2;i2++) - p1[i2]=i2*cost_ins; - - for(i1=1;i1<l1;i1++) - { - p2[0]=i1*cost_del; - for(i2=1;i2<l2;i2++) - { - c0=p1[i2-1]+((s1[i1]==s2[i2])?0:cost_rep); - c1=p1[i2]+cost_del; if(c1<c0) c0=c1; - c2=p2[i2-1]+cost_ins; if(c2<c0) c0=c2; - p2[i2]=c0; - } - tmp=p1; p1=p2; p2=tmp; - } - - c0=p1[l2-1]; - - free(p1); - free(p2); - - return c0; + register char *p1,*p2; + register int i,j,n; + int l1=0,l2=0; + char r[512]; + const char *tmp; + + /* skip equal start sequence, if any */ + while(*s1==*s2) { + if(!*s1) break; + s1++; s2++; + } + + /* if we already used up one string, then + the result is the length of the other */ + if(*s1=='\0') return strlen(s2); + if(*s2=='\0') return strlen(s1); + + /* length count */ + while(*s1++) l1++; + while(*s2++) l2++; + + /* cut of equal tail sequence, if any */ + while(*--s1 == *--s2) { + l1--; l2--; + } + + + /* reset pointers, adjust length */ + s1-=l1++; + s2-=l2++; + + + /* possible dist to great? */ + if(abs(l1-l2)>=255) return -1; + + /* swap if l2 longer than l1 */ + if(l1<l2) { + tmp=s1; s1=s2; s2=tmp; + l1 ^= l2; l2 ^= l1; l1 ^= l2; + } + + + /* fill initial row */ + n=1; + for(i=0,p1=r;i<l1;i++,*p1++=n++,p1++) {/*empty*/} + + /* calc. rowwise */ + for(j=1;j<l2;j++) { + /* init pointers and col#0 */ + p1 = r + !(j&1); + p2 = r + (j&1); + n=*p1+1; + *p2++=n;p2++; + s2++; + + /* foreach column */ + for(i=1;i<l1;i++) { + if(*p1<n) n=*p1+(*(s1+i)!=*(s2)); /* replace cheaper than delete? */ + p1++; + if(*++p1<n) n=*p1+1; /* insert cheaper then replace ? */ + *p2++=n++; /* update field and cost for next col's delete */ + p2++; + } + } + + /* return result */ + return n-1; } +static int weighted_levdist( const char *s1 + , const char *s2 + , const int cost_ins + , const int cost_rep + , const int cost_del + ) +{ + register int *p1,*p2; + register int i,j,n,c; + int l1=0,l2=0; + int r[512]; + const char *tmp; + + /* skip equal start sequence, if any */ + while(*s1==*s2) { + if(!*s1) break; + s1++; s2++; + } + + /* if we already used up one string, then + the result is the length of the other */ + if(*s1=='\0') return strlen(s2); + if(*s2=='\0') return strlen(s1); + + /* length count */ + while(*s1++) l1++; + while(*s2++) l2++; + + /* cut of equal tail sequence, if any */ + while(*--s1 == *--s2) { + l1--; l2--; + } + + + /* reset pointers, adjust length */ + s1-=l1++; + s2-=l2++; + + + /* possible dist to great? */ + if(abs(l1-l2)>=255) return -1; + + /* swap if l2 longer than l1 */ + if(l1<l2) { + tmp=s1; s1=s2; s2=tmp; + l1 ^= l2; l2 ^= l1; l1 ^= l2; + } + + if((l1==1)&&(l2==1)) { + n= cost_del+cost_ins; + return n<cost_rep?n:cost_rep; + } + + /* fill initial row */ + n=cost_ins; + for(i=0,p1=r;i<l1;i++,*p1++=n,p1++) {n+=cost_ins;} + + /* calc. rowwise */ + for(j=1;j<l2;j++) { + /* init pointers and col#0 */ + p1 = r + !(j&1); + p2 = r + (j&1); + n=*p1+cost_del; + *p2++=n;p2++; + s2++; + + /* foreach column */ + for(i=1;i<l1;i++) { + c = *p1; if(*(s1+i)!=*(s2)) c+=cost_rep; + if(c<n) n=c; /* replace cheaper than delete? */ + p1++; + c= *++p1+cost_ins; + if(c<n) n=c; /* insert cheaper then replace ? */ + *p2++=n; /* update field and cost for next col's delete */ + n+=cost_del; /* update field and cost for next col's delete */ + p2++; + } + } + + /* return result */ + return n-=cost_del; +} + static int custom_levdist(char *str1,char *str2,char *callback_name) { - php_error(E_WARNING,"the general Levenshtein support is not there yet"); - /* not there yet */ + php_error(E_WARNING,"the general Levenshtein support is not there yet"); + /* not there yet */ - return -1; + return -1; } @@ -77,64 +192,61 @@ static int custom_levdist(char *str1,char *str2,char *callback_name) Calculate Levenshtein distance between two strings */ PHP_FUNCTION(levenshtein) { - zval **str1, **str2, **cost_ins, **cost_rep, **cost_del,**callback_name; - int distance=-1; - - switch(ZEND_NUM_ARGS()) { - case 2: /* just two string: use maximum performance version */ - if (zend_get_parameters_ex(2, &str1, &str2) == FAILURE) { - WRONG_PARAM_COUNT; - } - convert_to_string_ex(str1); - convert_to_string_ex(str2); - - distance = reference_levdist((*str1)->value.str.val,(*str1)->value.str.len, - (*str2)->value.str.val,(*str2)->value.str.len, - 1,1,1); - - break; - - case 5: /* more general version: calc cost by ins/rep/del weights */ - if (zend_get_parameters_ex(5, &str1, &str2, &cost_ins, &cost_rep, &cost_del) == FAILURE) { - WRONG_PARAM_COUNT; - } - convert_to_string_ex(str1); - convert_to_string_ex(str2); - convert_to_long_ex(cost_ins); - convert_to_long_ex(cost_rep); - convert_to_long_ex(cost_del); - - distance = reference_levdist((*str1)->value.str.val,(*str1)->value.str.len, - (*str2)->value.str.val,(*str2)->value.str.len, - (*cost_ins)->value.lval, - (*cost_rep)->value.lval, - (*cost_del)->value.lval - ); - - break; - - case 3: /* most general version: calc cost by user-supplied function */ - if (zend_get_parameters_ex(3, &str1, &str2, &callback_name) == FAILURE) { - WRONG_PARAM_COUNT; - } - convert_to_string_ex(str1); - convert_to_string_ex(str2); - convert_to_string_ex(callback_name); - - distance = custom_levdist((*str1)->value.str.val - , (*str2)->value.str.val - , (*callback_name)->value.str.val - ); - break; - - default: - WRONG_PARAM_COUNT; - } - - if(distance<0) { - php_error(E_WARNING,"levenshtein(): argument string(s) too long"); - } - - RETURN_LONG(distance); + zval **str1, **str2, **cost_ins, **cost_rep, **cost_del,**callback_name; + int distance=-1; + + switch(ZEND_NUM_ARGS()) { + case 2: /* just two string: use maximum performance version */ + if (zend_get_parameters_ex(2, &str1, &str2) == FAILURE) { + WRONG_PARAM_COUNT; + } + convert_to_string_ex(str1); + convert_to_string_ex(str2); + + distance = fastest_levdist((*str1)->value.str.val, (*str2)->value.str.val); + break; + + case 5: /* more general version: calc cost by ins/rep/del weights */ + if (zend_get_parameters_ex(5, &str1, &str2, &cost_ins, &cost_rep, &cost_del) == FAILURE) { + WRONG_PARAM_COUNT; + } + convert_to_string_ex(str1); + convert_to_string_ex(str2); + convert_to_long_ex(cost_ins); + convert_to_long_ex(cost_rep); + convert_to_long_ex(cost_del); + + distance = weighted_levdist((*str1)->value.str.val + , (*str2)->value.str.val + , (*cost_ins)->value.lval + , (*cost_rep)->value.lval + , (*cost_del)->value.lval + ); + + break; + + case 3: /* most general version: calc cost by user-supplied function */ + if (zend_get_parameters_ex(3, &str1, &str2, &callback_name) == FAILURE) { + WRONG_PARAM_COUNT; + } + convert_to_string_ex(str1); + convert_to_string_ex(str2); + convert_to_string_ex(callback_name); + + distance = custom_levdist((*str1)->value.str.val + , (*str2)->value.str.val + , (*callback_name)->value.str.val + ); + break; + + default: + WRONG_PARAM_COUNT; + } + + if(distance<0) { + php_error(E_WARNING,"levenshtein(): argument string(s) too long"); + } + + RETURN_LONG(distance); } /* }}} */ diff --git a/ext/standard/tests/general_functions/003.phpt b/ext/standard/tests/general_functions/003.phpt deleted file mode 100644 index 22cbd4eb4e..0000000000 --- a/ext/standard/tests/general_functions/003.phpt +++ /dev/null @@ -1,54 +0,0 @@ ---TEST-- -levenshtein() function test ---POST-- ---GET-- ---FILE-- -<?php - -function test_me($title,$expect,$text1,$text2,$cost1=0,$cost2=0,$cost3=0) { - - if($cost1==0) - $result=levenshtein($text1,$text2); - else - $result=levenshtein($text1,$text2,$cost1,$cost2,$cost3); - - if($result==$expect) return 0; - - echo "$title: result is $result instead of $expect "; - echo "for '$text1'/'$text2' "; - if($cost1) echo "($cost1:$cost2:$cost3)"; - echo "\n"; - - return 1; -} - -$n=0; - -$n += test_me("equal" , 0, "12345", "12345"); -$n += test_me("1st empty" , 3, "", "xzy"); -$n += test_me("2nd empty" , 3, "xzy", ""); -$n += test_me("both empty" , 0, "", ""); -$n += test_me("1 char" , 1, "1", "2"); -$n += test_me("2 char swap", 2, "12", "21"); - -$n += test_me("inexpensive delete", 2, "2121", "11", 2, 1, 1); -$n += test_me("expensive delete" , 10, "2121", "11", 2, 1, 5); -$n += test_me("inexpensive insert", 2, "11", "2121", 1, 1, 1); -$n += test_me("expensive insert" , 10, "11", "2121", 5, 1, 1); - -$n += test_me("expensive replace" , 3, "111", "121", 2, 3, 2); -$n += test_me("very expensive replace", 4, "111", "121", 2, 9, 2); - -$n += test_me("bug #7368", 2, "13458", "12345"); -$n += test_me("bug #7368", 2, "1345", "1234"); - -$n += test_me("bug #6562", 1, "debugg", "debug"); -$n += test_me("bug #6562", 1, "ddebug", "debug"); -$n += test_me("bug #6562", 2, "debbbug", "debug"); -$n += test_me("bug #6562", 1, "debugging", "debuging"); - -echo ($n==0)?"all passed\n":"$n failed\n"; - -?> ---EXPECT-- -all passed |