diff options
Diffstat (limited to 'src/assistant/3rdparty/clucene/src/CLucene/search/FuzzyQuery.cpp')
-rw-r--r-- | src/assistant/3rdparty/clucene/src/CLucene/search/FuzzyQuery.cpp | 357 |
1 files changed, 357 insertions, 0 deletions
diff --git a/src/assistant/3rdparty/clucene/src/CLucene/search/FuzzyQuery.cpp b/src/assistant/3rdparty/clucene/src/CLucene/search/FuzzyQuery.cpp new file mode 100644 index 000000000..e95d48da3 --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/search/FuzzyQuery.cpp @@ -0,0 +1,357 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "FuzzyQuery.h" + +CL_NS_USE(index) +CL_NS_USE(util) +CL_NS_DEF(search) + + /** + * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of + * length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity > + * <code>minSimilarity</code>. + * + * @param reader Delivers terms. + * @param term Pattern term. + * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f. + * @param prefixLength Length of required common prefix. Default value is 0. + * @throws IOException + */ + FuzzyTermEnum::FuzzyTermEnum(const IndexReader* reader, Term* term, qreal minSimilarity, size_t prefixLength): + distance(0), + _endEnum(false), + prefix(LUCENE_BLANK_STRING), + prefixLength(0), + minimumSimilarity(minSimilarity) + { + //Func - Constructor + //Pre - reader contains a valid reference to an IndexReader + // term != NULL + //Post - The instance has been created + + CND_PRECONDITION(term != NULL,"term is NULL"); + + scale_factor = 1.0f / (1.0f - minimumSimilarity); + searchTerm = _CL_POINTER(term); + + text = STRDUP_TtoT(term->text()); + textLen = term->textLength(); + + + //Initialize e to NULL + e = NULL; + eWidth = 0; + eHeight = 0; + + if(prefixLength > 0 && prefixLength < textLen){ + this->prefixLength = prefixLength; + + prefix = _CL_NEWARRAY(TCHAR,prefixLength+1); + _tcsncpy(prefix,text,prefixLength); + prefix[prefixLength]='\0'; + + textLen = prefixLength; + text[textLen]='\0'; + } + + + //Set the enumeration + Term* trm = _CLNEW Term(term, prefix); + setEnum(reader->terms(trm)); + _CLDECDELETE(trm); + } + + FuzzyTermEnum::~FuzzyTermEnum(){ + //Func - Destructor + //Pre - true + //Post - FuzzyTermEnum has been destroyed + + //Close the enumeration + close(); + } + + bool FuzzyTermEnum::endEnum() { + //Func - Returns the fact if the current term in the enumeration has reached the end + //Pre - true + //Post - The boolean value of endEnum has been returned + + return _endEnum; + } + + void FuzzyTermEnum::close(){ + //Func - Close the enumeration + //Pre - true + //Post - The enumeration has been closed + + FilteredTermEnum::close(); + + //Finalize the searchTerm + _CLDECDELETE(searchTerm); + //Destroy e + _CLDELETE_ARRAY(e); + + _CLDELETE_CARRAY(text); + + if ( prefix != LUCENE_BLANK_STRING ) + _CLDELETE_CARRAY(prefix); + } + + bool FuzzyTermEnum::termCompare(Term* term) { + //Func - Compares term with the searchTerm using the Levenshtein distance. + //Pre - term is NULL or term points to a Term + //Post - if pre(term) is NULL then false is returned otherwise + // if the distance of the current term in the enumeration is bigger than the FUZZY_THRESHOLD + // then true is returned + + if (term == NULL){ + return false; //Note that endEnum is not set to true! + } + + const TCHAR* termText = term->text(); + size_t termTextLen = term->textLength(); + + //Check if the field name of searchTerm of term match + //(we can use == because fields are interned) + if ( searchTerm->field() == term->field() && + (prefixLength==0 || _tcsncmp(termText,prefix,prefixLength)==0 )) { + + const TCHAR* target = termText+prefixLength; + size_t targetLen = termTextLen-prefixLength; + + //Calculate the Levenshtein distance + int32_t dist = editDistance(text, target, textLen, targetLen); + distance = 1 - ((qreal)dist / (qreal)min(textLen, targetLen)); + return (distance > minimumSimilarity); + } + _endEnum = true; + return false; + } + + qreal FuzzyTermEnum::difference() { + //Func - Returns the difference between the distance and the fuzzy threshold + // multiplied by the scale factor + //Pre - true + //Post - The difference is returned + + return (qreal)((distance - minimumSimilarity) * scale_factor ); + } + + + /** Finds and returns the smallest of three integers + precondition: Must define int32_t __t for temporary storage and result + */ + #define min3(a, b, c) __t = (a < b) ? a : b; __t = (__t < c) ? __t : c; + + int32_t FuzzyTermEnum::editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) { + //Func - Calculates the Levenshtein distance also known as edit distance is a measure of similiarity + // between two strings where the distance is measured as the number of character + // deletions, insertions or substitutions required to transform one string to + // the other string. + //Pre - s != NULL and contains the source string + // t != NULL and contains the target string + // n >= 0 and contains the length of the source string + // m >= 0 and containts the length of th target string + //Post - The distance has been returned + + CND_PRECONDITION(s != NULL, "s is NULL"); + CND_PRECONDITION(t != NULL, "t is NULL"); + CND_PRECONDITION(n >= 0," n is a negative number"); + CND_PRECONDITION(n >= 0," n is a negative number"); + + int32_t i; // iterates through s + int32_t j; // iterates through t + TCHAR s_i; // ith character of s + + if (n == 0) + return m; + if (m == 0) + return n; + + //Check if the array must be reallocated because it is too small or does not exist + if (e == NULL || eWidth <= n || eHeight <= m) { + //Delete e if possible + _CLDELETE_ARRAY(e); + //resize e + eWidth = max(eWidth, n+1); + eHeight = max(eHeight, m+1); + e = _CL_NEWARRAY(int32_t,eWidth*eHeight); + } + + CND_CONDITION(e != NULL,"e is NULL"); + + // init matrix e + for (i = 0; i <= n; i++){ + e[i + (0*eWidth)] = i; + } + for (j = 0; j <= m; j++){ + e[0 + (j*eWidth)] = j; + } + + int32_t __t; //temporary variable for min3 + + // start computing edit distance + for (i = 1; i <= n; i++) { + s_i = s[i - 1]; + for (j = 1; j <= m; j++) { + if (s_i != t[j-1]){ + min3(e[i + (j*eWidth) - 1], e[i + ((j-1)*eWidth)], e[i + ((j-1)*eWidth)-1]); + e[i + (j*eWidth)] = __t+1; + }else{ + min3(e[i + (j*eWidth) -1]+1, e[i + ((j-1)*eWidth)]+1, e[i + ((j-1)*eWidth)-1]); + e[i + (j*eWidth)] = __t; + } + } + } + + // we got the result! + return e[n + ((m)*eWidth)]; + } + + + /** + * Create a new FuzzyQuery that will match terms with a similarity + * of at least <code>minimumSimilarity</code> to <code>term</code>. + * If a <code>prefixLength</code> > 0 is specified, a common prefix + * of that length is also required. + * + * @param term the term to search for + * @param minimumSimilarity a value between 0 and 1 to set the required similarity + * between the query term and the matching terms. For example, for a + * <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length + * as the query term is considered similar to the query term if the edit distance + * between both terms is less than <code>length(term)*0.5</code> + * @param prefixLength length of common (non-fuzzy) prefix + * @throws IllegalArgumentException if minimumSimilarity is > 1 or < 0 + * or if prefixLength < 0 or > <code>term.text().length()</code>. + */ + FuzzyQuery::FuzzyQuery(Term* term, qreal minimumSimilarity, size_t prefixLength): + MultiTermQuery(term) + { + //Func - Constructor + //Pre - term != NULL + //Post - The instance has been created + + CND_PRECONDITION(term != NULL,"term is NULL"); + + if (minimumSimilarity > 1.0f) + _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity > 1"); + else if (minimumSimilarity < 0.0f) + _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity < 0"); + + this->minimumSimilarity = minimumSimilarity; + + if(prefixLength >= term->textLength()) + _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()"); + this->prefixLength = prefixLength; + + } + + + qreal FuzzyQuery::defaultMinSimilarity = 0.5f; + + FuzzyQuery::~FuzzyQuery(){ + //Func - Destructor + //Pre - true + //Post - Instance has been destroyed + } + + TCHAR* FuzzyQuery::toString(const TCHAR* field) const{ + //Func - Returns the query string + //Pre - field != NULL + //Post - The query string has been returned + + CND_PRECONDITION(field != NULL,"field is NULL"); + + StringBuffer buffer; + const TCHAR* b = MultiTermQuery::toString(field); + + buffer.append ( b ); + _CLDELETE_CARRAY(b); + buffer.append( _T("~") ); + + buffer.appendFloat(minimumSimilarity,1); + + return buffer.toString(); + } + + const TCHAR* FuzzyQuery::getQueryName() const{ + //Func - Returns the name of the query + //Pre - true + //post - The string FuzzyQuery has been returned + + return getClassName(); + } + const TCHAR* FuzzyQuery::getClassName(){ + //Func - Returns the name of the query + //Pre - true + //post - The string FuzzyQuery has been returned + + return _T("FuzzyQuery"); + } + + + /** + * Returns the minimum similarity that is required for this query to match. + * @return float value between 0.0 and 1.0 + */ + qreal FuzzyQuery::getMinSimilarity() const { + return minimumSimilarity; + } + + FuzzyQuery::FuzzyQuery(const FuzzyQuery& clone): + MultiTermQuery(clone) + { + this->minimumSimilarity = clone.getMinSimilarity(); + this->prefixLength = clone.getPrefixLength(); + + //if(prefixLength < 0) + // _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength < 0"); + //else + if(prefixLength >= clone.getTerm()->textLength()) + _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()"); + + } + + Query* FuzzyQuery::clone() const{ + return _CLNEW FuzzyQuery(*this); + } + size_t FuzzyQuery::hashCode() const{ + //todo: we should give the query a seeding value... but + //need to do it for all hascode functions + size_t val = Similarity::floatToByte(getBoost()) ^ getTerm()->hashCode(); + val ^= Similarity::floatToByte(this->getMinSimilarity()); + val ^= this->getPrefixLength(); + return val; + } + bool FuzzyQuery::equals(Query* other) const{ + if (!(other->instanceOf(FuzzyQuery::getClassName()))) + return false; + + FuzzyQuery* fq = (FuzzyQuery*)other; + return (this->getBoost() == fq->getBoost()) + && this->getMinSimilarity() == fq->getMinSimilarity() + && this->getPrefixLength() == fq->getPrefixLength() + && getTerm()->equals(fq->getTerm()); + } + + /** + * Returns the prefix length, i.e. the number of characters at the start + * of a term that must be identical (not fuzzy) to the query term if the query + * is to match that term. + */ + size_t FuzzyQuery::getPrefixLength() const { + return prefixLength; + } + + FilteredTermEnum* FuzzyQuery::getEnum(IndexReader* reader){ + Term* term = getTerm(false); + FuzzyTermEnum* ret = _CLNEW FuzzyTermEnum(reader, term, minimumSimilarity, prefixLength); + return ret; + } + +CL_NS_END |