summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHartmut Holzgraefe <hholzgra@php.net>2000-12-14 16:16:04 +0000
committerHartmut Holzgraefe <hholzgra@php.net>2000-12-14 16:16:04 +0000
commite9c5c6ea7a872d5d8ead7f2f11cc54bb7caae252 (patch)
tree12b17065ed347847652a57a8fe1411a1b728558b
parent0493e78c8d1e9d6d4207d8846558a320083024dd (diff)
downloadphp-git-e9c5c6ea7a872d5d8ead7f2f11cc54bb7caae252.tar.gz
revert my last patch
-rw-r--r--ext/standard/levenshtein.c310
-rw-r--r--ext/standard/tests/general_functions/003.phpt54
2 files changed, 211 insertions, 153 deletions
diff --git a/ext/standard/levenshtein.c b/ext/standard/levenshtein.c
index 6887946da2..d157cf7fab 100644
--- a/ext/standard/levenshtein.c
+++ b/ext/standard/levenshtein.c
@@ -23,53 +23,168 @@
#include <ctype.h>
#include "php_string.h"
-/* reference implementation, only optimized for memory usage, not speed */
-static int reference_levdist(const char *s1, int l1,
- const char *s2, int l2,
- int cost_ins, int cost_rep, int cost_del )
+/* faster, but obfuscated, all operations have a cost of 1 */
+static int fastest_levdist(const char *s1, const char *s2)
{
- int *p1,*p2,*tmp;
- int i1,i2,c0,c1,c2;
-
- if(l1==0) return l2*cost_ins;
- if(l2==0) return l1*cost_del;
-
- p1=malloc(l2*sizeof(int));
- p2=malloc(l2*sizeof(int));
-
- p1[0]=(s1[0]==s2[0])?0:cost_rep;
-
- for(i2=1;i2<l2;i2++)
- p1[i2]=i2*cost_ins;
-
- for(i1=1;i1<l1;i1++)
- {
- p2[0]=i1*cost_del;
- for(i2=1;i2<l2;i2++)
- {
- c0=p1[i2-1]+((s1[i1]==s2[i2])?0:cost_rep);
- c1=p1[i2]+cost_del; if(c1<c0) c0=c1;
- c2=p2[i2-1]+cost_ins; if(c2<c0) c0=c2;
- p2[i2]=c0;
- }
- tmp=p1; p1=p2; p2=tmp;
- }
-
- c0=p1[l2-1];
-
- free(p1);
- free(p2);
-
- return c0;
+ register char *p1,*p2;
+ register int i,j,n;
+ int l1=0,l2=0;
+ char r[512];
+ const char *tmp;
+
+ /* skip equal start sequence, if any */
+ while(*s1==*s2) {
+ if(!*s1) break;
+ s1++; s2++;
+ }
+
+ /* if we already used up one string, then
+ the result is the length of the other */
+ if(*s1=='\0') return strlen(s2);
+ if(*s2=='\0') return strlen(s1);
+
+ /* length count */
+ while(*s1++) l1++;
+ while(*s2++) l2++;
+
+ /* cut of equal tail sequence, if any */
+ while(*--s1 == *--s2) {
+ l1--; l2--;
+ }
+
+
+ /* reset pointers, adjust length */
+ s1-=l1++;
+ s2-=l2++;
+
+
+ /* possible dist to great? */
+ if(abs(l1-l2)>=255) return -1;
+
+ /* swap if l2 longer than l1 */
+ if(l1<l2) {
+ tmp=s1; s1=s2; s2=tmp;
+ l1 ^= l2; l2 ^= l1; l1 ^= l2;
+ }
+
+
+ /* fill initial row */
+ n=1;
+ for(i=0,p1=r;i<l1;i++,*p1++=n++,p1++) {/*empty*/}
+
+ /* calc. rowwise */
+ for(j=1;j<l2;j++) {
+ /* init pointers and col#0 */
+ p1 = r + !(j&1);
+ p2 = r + (j&1);
+ n=*p1+1;
+ *p2++=n;p2++;
+ s2++;
+
+ /* foreach column */
+ for(i=1;i<l1;i++) {
+ if(*p1<n) n=*p1+(*(s1+i)!=*(s2)); /* replace cheaper than delete? */
+ p1++;
+ if(*++p1<n) n=*p1+1; /* insert cheaper then replace ? */
+ *p2++=n++; /* update field and cost for next col's delete */
+ p2++;
+ }
+ }
+
+ /* return result */
+ return n-1;
}
+static int weighted_levdist( const char *s1
+ , const char *s2
+ , const int cost_ins
+ , const int cost_rep
+ , const int cost_del
+ )
+{
+ register int *p1,*p2;
+ register int i,j,n,c;
+ int l1=0,l2=0;
+ int r[512];
+ const char *tmp;
+
+ /* skip equal start sequence, if any */
+ while(*s1==*s2) {
+ if(!*s1) break;
+ s1++; s2++;
+ }
+
+ /* if we already used up one string, then
+ the result is the length of the other */
+ if(*s1=='\0') return strlen(s2);
+ if(*s2=='\0') return strlen(s1);
+
+ /* length count */
+ while(*s1++) l1++;
+ while(*s2++) l2++;
+
+ /* cut of equal tail sequence, if any */
+ while(*--s1 == *--s2) {
+ l1--; l2--;
+ }
+
+
+ /* reset pointers, adjust length */
+ s1-=l1++;
+ s2-=l2++;
+
+
+ /* possible dist to great? */
+ if(abs(l1-l2)>=255) return -1;
+
+ /* swap if l2 longer than l1 */
+ if(l1<l2) {
+ tmp=s1; s1=s2; s2=tmp;
+ l1 ^= l2; l2 ^= l1; l1 ^= l2;
+ }
+
+ if((l1==1)&&(l2==1)) {
+ n= cost_del+cost_ins;
+ return n<cost_rep?n:cost_rep;
+ }
+
+ /* fill initial row */
+ n=cost_ins;
+ for(i=0,p1=r;i<l1;i++,*p1++=n,p1++) {n+=cost_ins;}
+
+ /* calc. rowwise */
+ for(j=1;j<l2;j++) {
+ /* init pointers and col#0 */
+ p1 = r + !(j&1);
+ p2 = r + (j&1);
+ n=*p1+cost_del;
+ *p2++=n;p2++;
+ s2++;
+
+ /* foreach column */
+ for(i=1;i<l1;i++) {
+ c = *p1; if(*(s1+i)!=*(s2)) c+=cost_rep;
+ if(c<n) n=c; /* replace cheaper than delete? */
+ p1++;
+ c= *++p1+cost_ins;
+ if(c<n) n=c; /* insert cheaper then replace ? */
+ *p2++=n; /* update field and cost for next col's delete */
+ n+=cost_del; /* update field and cost for next col's delete */
+ p2++;
+ }
+ }
+
+ /* return result */
+ return n-=cost_del;
+}
+
static int custom_levdist(char *str1,char *str2,char *callback_name)
{
- php_error(E_WARNING,"the general Levenshtein support is not there yet");
- /* not there yet */
+ php_error(E_WARNING,"the general Levenshtein support is not there yet");
+ /* not there yet */
- return -1;
+ return -1;
}
@@ -77,64 +192,61 @@ static int custom_levdist(char *str1,char *str2,char *callback_name)
Calculate Levenshtein distance between two strings */
PHP_FUNCTION(levenshtein)
{
- zval **str1, **str2, **cost_ins, **cost_rep, **cost_del,**callback_name;
- int distance=-1;
-
- switch(ZEND_NUM_ARGS()) {
- case 2: /* just two string: use maximum performance version */
- if (zend_get_parameters_ex(2, &str1, &str2) == FAILURE) {
- WRONG_PARAM_COUNT;
- }
- convert_to_string_ex(str1);
- convert_to_string_ex(str2);
-
- distance = reference_levdist((*str1)->value.str.val,(*str1)->value.str.len,
- (*str2)->value.str.val,(*str2)->value.str.len,
- 1,1,1);
-
- break;
-
- case 5: /* more general version: calc cost by ins/rep/del weights */
- if (zend_get_parameters_ex(5, &str1, &str2, &cost_ins, &cost_rep, &cost_del) == FAILURE) {
- WRONG_PARAM_COUNT;
- }
- convert_to_string_ex(str1);
- convert_to_string_ex(str2);
- convert_to_long_ex(cost_ins);
- convert_to_long_ex(cost_rep);
- convert_to_long_ex(cost_del);
-
- distance = reference_levdist((*str1)->value.str.val,(*str1)->value.str.len,
- (*str2)->value.str.val,(*str2)->value.str.len,
- (*cost_ins)->value.lval,
- (*cost_rep)->value.lval,
- (*cost_del)->value.lval
- );
-
- break;
-
- case 3: /* most general version: calc cost by user-supplied function */
- if (zend_get_parameters_ex(3, &str1, &str2, &callback_name) == FAILURE) {
- WRONG_PARAM_COUNT;
- }
- convert_to_string_ex(str1);
- convert_to_string_ex(str2);
- convert_to_string_ex(callback_name);
-
- distance = custom_levdist((*str1)->value.str.val
- , (*str2)->value.str.val
- , (*callback_name)->value.str.val
- );
- break;
-
- default:
- WRONG_PARAM_COUNT;
- }
-
- if(distance<0) {
- php_error(E_WARNING,"levenshtein(): argument string(s) too long");
- }
-
- RETURN_LONG(distance);
+ zval **str1, **str2, **cost_ins, **cost_rep, **cost_del,**callback_name;
+ int distance=-1;
+
+ switch(ZEND_NUM_ARGS()) {
+ case 2: /* just two string: use maximum performance version */
+ if (zend_get_parameters_ex(2, &str1, &str2) == FAILURE) {
+ WRONG_PARAM_COUNT;
+ }
+ convert_to_string_ex(str1);
+ convert_to_string_ex(str2);
+
+ distance = fastest_levdist((*str1)->value.str.val, (*str2)->value.str.val);
+ break;
+
+ case 5: /* more general version: calc cost by ins/rep/del weights */
+ if (zend_get_parameters_ex(5, &str1, &str2, &cost_ins, &cost_rep, &cost_del) == FAILURE) {
+ WRONG_PARAM_COUNT;
+ }
+ convert_to_string_ex(str1);
+ convert_to_string_ex(str2);
+ convert_to_long_ex(cost_ins);
+ convert_to_long_ex(cost_rep);
+ convert_to_long_ex(cost_del);
+
+ distance = weighted_levdist((*str1)->value.str.val
+ , (*str2)->value.str.val
+ , (*cost_ins)->value.lval
+ , (*cost_rep)->value.lval
+ , (*cost_del)->value.lval
+ );
+
+ break;
+
+ case 3: /* most general version: calc cost by user-supplied function */
+ if (zend_get_parameters_ex(3, &str1, &str2, &callback_name) == FAILURE) {
+ WRONG_PARAM_COUNT;
+ }
+ convert_to_string_ex(str1);
+ convert_to_string_ex(str2);
+ convert_to_string_ex(callback_name);
+
+ distance = custom_levdist((*str1)->value.str.val
+ , (*str2)->value.str.val
+ , (*callback_name)->value.str.val
+ );
+ break;
+
+ default:
+ WRONG_PARAM_COUNT;
+ }
+
+ if(distance<0) {
+ php_error(E_WARNING,"levenshtein(): argument string(s) too long");
+ }
+
+ RETURN_LONG(distance);
}
/* }}} */
diff --git a/ext/standard/tests/general_functions/003.phpt b/ext/standard/tests/general_functions/003.phpt
deleted file mode 100644
index 22cbd4eb4e..0000000000
--- a/ext/standard/tests/general_functions/003.phpt
+++ /dev/null
@@ -1,54 +0,0 @@
---TEST--
-levenshtein() function test
---POST--
---GET--
---FILE--
-<?php
-
-function test_me($title,$expect,$text1,$text2,$cost1=0,$cost2=0,$cost3=0) {
-
- if($cost1==0)
- $result=levenshtein($text1,$text2);
- else
- $result=levenshtein($text1,$text2,$cost1,$cost2,$cost3);
-
- if($result==$expect) return 0;
-
- echo "$title: result is $result instead of $expect ";
- echo "for '$text1'/'$text2' ";
- if($cost1) echo "($cost1:$cost2:$cost3)";
- echo "\n";
-
- return 1;
-}
-
-$n=0;
-
-$n += test_me("equal" , 0, "12345", "12345");
-$n += test_me("1st empty" , 3, "", "xzy");
-$n += test_me("2nd empty" , 3, "xzy", "");
-$n += test_me("both empty" , 0, "", "");
-$n += test_me("1 char" , 1, "1", "2");
-$n += test_me("2 char swap", 2, "12", "21");
-
-$n += test_me("inexpensive delete", 2, "2121", "11", 2, 1, 1);
-$n += test_me("expensive delete" , 10, "2121", "11", 2, 1, 5);
-$n += test_me("inexpensive insert", 2, "11", "2121", 1, 1, 1);
-$n += test_me("expensive insert" , 10, "11", "2121", 5, 1, 1);
-
-$n += test_me("expensive replace" , 3, "111", "121", 2, 3, 2);
-$n += test_me("very expensive replace", 4, "111", "121", 2, 9, 2);
-
-$n += test_me("bug #7368", 2, "13458", "12345");
-$n += test_me("bug #7368", 2, "1345", "1234");
-
-$n += test_me("bug #6562", 1, "debugg", "debug");
-$n += test_me("bug #6562", 1, "ddebug", "debug");
-$n += test_me("bug #6562", 2, "debbbug", "debug");
-$n += test_me("bug #6562", 1, "debugging", "debuging");
-
-echo ($n==0)?"all passed\n":"$n failed\n";
-
-?>
---EXPECT--
-all passed