summaryrefslogtreecommitdiff
path: root/3rdparty/clucene/src/CLucene/search/FuzzyQuery.cpp
blob: e95d48da385fe0d97a567fa119f92a1a3d4ec2e3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "FuzzyQuery.h"

CL_NS_USE(index)
CL_NS_USE(util)
CL_NS_DEF(search)

	/**
     * Constructor for an enumeration of all terms from <code>reader</code> that share a prefix of
     * length <code>prefixLength</code> with <code>term</code> and whose fuzzy similarity to it is &gt;
     * <code>minSimilarity</code>.
     *
     * The text of <code>term</code> is split in two parts: the first <code>prefixLength</code>
     * characters are stored in <code>prefix</code> and have to match exactly, the remaining characters
     * are stored in <code>text</code> and are the part compared by edit distance in termCompare().
     * A <code>prefixLength</code> of 0, or one that is not smaller than the length of the term text,
     * is ignored: no prefix is used and the whole term text is compared.
     *
     * @param reader Delivers terms.
     * @param term Pattern term; must not be NULL.
     * @param minSimilarity Minimum required similarity for terms from the reader. Note that
     *        scale_factor is computed as 1 / (1 - minSimilarity), so a value of exactly 1 divides by zero.
     * @param prefixLength Length of required common prefix.
     * @throws IOException (per the original documentation; presumably raised by reader->terms() - confirm)
     */
	 FuzzyTermEnum::FuzzyTermEnum(const IndexReader* reader, Term* term, qreal minSimilarity, size_t prefixLength): 
        distance(0),
        _endEnum(false),
		prefix(LUCENE_BLANK_STRING),
		prefixLength(0),
		minimumSimilarity(minSimilarity)
	{
	//Func - Constructor
	//Pre  - reader contains a valid reference to an IndexReader
	//       term != NULL
	//Post - The instance has been created

		CND_PRECONDITION(term != NULL,"term is NULL");
		
		scale_factor = 1.0f / (1.0f - minimumSimilarity);
		searchTerm = _CL_POINTER(term);
		
		//The edit distance matrix is allocated lazily by editDistance()
		e          = NULL;
		eWidth     = 0;
		eHeight    = 0;
		
		const TCHAR* fullText = term->text();
		const size_t fullTextLen = term->textLength();
		
		//Only a prefix that is strictly shorter than the term text is honoured;
		//otherwise this->prefixLength stays 0 and prefix stays LUCENE_BLANK_STRING
		if(prefixLength > 0 && prefixLength < fullTextLen){
			this->prefixLength = prefixLength;
		
			prefix = _CL_NEWARRAY(TCHAR,prefixLength+1);
			_tcsncpy(prefix,fullText,prefixLength);
			prefix[prefixLength]='\0';
		}
		
		//text holds the part of the search term that follows the prefix, because
		//termCompare() compares it with the part of each candidate term that follows
		//the same prefix. It is a separate copy so that close() can delete it.
		const TCHAR* remainder = fullText + this->prefixLength;
		text = STRDUP_TtoT(remainder);
		textLen = fullTextLen - this->prefixLength;
		
		//Position the enumeration at the first term of the field that is >= prefix
		Term* trm = _CLNEW Term(term, prefix);
		setEnum(reader->terms(trm));
		_CLDECDELETE(trm);
  }

  FuzzyTermEnum::~FuzzyTermEnum(){
  //Func - Destructor
  //Pre  - true
  //Post - The enumeration has been closed and the instance destroyed

	  //All clean-up work is done by close()
	  this->close();
  }

  bool FuzzyTermEnum::endEnum() {
  //Func - Returns the fact if the current term in the enumeration has reached the end
  //Pre  - true
  //Post - The boolean value of endEnum has been returned

      return _endEnum;
  }

  void FuzzyTermEnum::close(){
  //Func - Closes the enumeration and releases everything this instance allocated
  //Pre  - true
  //Post - The base enumeration has been closed, the reference to searchTerm has been
  //       given up and the matrix e and the strings text and prefix have been deleted

      //Let the base class close first
      FilteredTermEnum::close();

      //Give up the reference taken on the search term in the constructor
      _CLDECDELETE(searchTerm);

      //Release the edit distance matrix and the copy of the search text
      _CLDELETE_ARRAY(e);
      _CLDELETE_CARRAY(text);

      //prefix still pointing at LUCENE_BLANK_STRING means no prefix was ever allocated
      if ( prefix != LUCENE_BLANK_STRING ) _CLDELETE_CARRAY(prefix);
  }

  bool FuzzyTermEnum::termCompare(Term* term) {
  //Func - Decides whether term is similar enough to the search term, based on the
  //       Levenshtein distance between the stored text and the part of term that
  //       follows the prefix.
  //Pre  - term is NULL or term points to a Term
  //Post - if term is NULL, false is returned and _endEnum is left untouched.
  //       if term belongs to another field or does not start with prefix, _endEnum is
  //       set to true and false is returned.
  //       otherwise distance has been updated and true is returned exactly when
  //       distance > minimumSimilarity

	  if (term == NULL)
		  return false;  //Note that endEnum is not set to true!

	  const TCHAR* candidate = term->text();
	  const size_t candidateLen = term->textLength();

	  //Field names can be compared with == because fields are interned
	  const bool inRange = searchTerm->field() == term->field() &&
		  	(prefixLength==0 || _tcsncmp(candidate,prefix,prefixLength)==0 );

	  if (!inRange) {
		  //This term is outside the field/prefix range: stop enumerating
		  _endEnum = true;
		  return false;
	  }

	  //Only the part after the common prefix takes part in the fuzzy comparison
	  const TCHAR* target = candidate+prefixLength;
	  const size_t targetLen = candidateLen-prefixLength;

	  //Turn the Levenshtein distance into a similarity relative to the shorter string.
	  //Note: when either length is 0 the divisor below is 0.
	  const int32_t dist = editDistance(text, target, textLen, targetLen);
	  distance = 1 - ((qreal)dist / (qreal)min(textLen, targetLen));
	  return (distance > minimumSimilarity);
  }

  qreal FuzzyTermEnum::difference() {
  //Func - Returns by how much the last computed distance exceeds minimumSimilarity,
  //       multiplied by scale_factor
  //Pre  - true
  //Post - (distance - minimumSimilarity) * scale_factor has been returned as qreal

     return (qreal)( scale_factor * (distance - minimumSimilarity) );
  }
  
  
	/** Finds the smallest of three integers and stores it in __t.
		Precondition: an int32_t variable named __t must be declared in the scope
		where the macro is used; it is the temporary storage and receives the result.
		Note: the macro expands to two statements, so it must not be used as the
		unbraced body of an if/else/loop. Its arguments are not parenthesized and are
		evaluated more than once, so they must be simple, side-effect free expressions.
	*/
	#define min3(a, b, c) __t = (a < b) ? a : b; __t = (__t < c) ? __t : c;

  int32_t FuzzyTermEnum::editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) {
  //Func - Calculates the Levenshtein distance, also known as edit distance: a measure of
  //       similarity between two strings, expressed as the number of character deletions,
  //       insertions or substitutions required to transform one string into the other.
  //       The matrix e is a member that is reused between calls and only reallocated
  //       when it is too small for the current pair of strings.
  //Pre  - s != NULL and contains the source string
  //       t != NULL and contains the target string
  //       n >= 0 and contains the length of the source string
  //       m >= 0 and contains the length of the target string
  //Post - The distance has been returned

      CND_PRECONDITION(s != NULL, "s is NULL");
      CND_PRECONDITION(t != NULL, "t is NULL");
      CND_PRECONDITION(n >= 0," n is a negative number");
      CND_PRECONDITION(m >= 0," m is a negative number");

      //An empty string is transformed into the other one by inserting every character
      if (n == 0)
          return m;
      if (m == 0)
          return n;

      //Check if the array must be reallocated because it is too small or does not exist.
      //The dimensions never shrink, so the matrix grows to the largest pair seen so far.
      if (e == NULL || eWidth <= n || eHeight <= m) {
          _CLDELETE_ARRAY(e);
          eWidth  = max(eWidth, n+1);
          eHeight = max(eHeight, m+1);
          e = _CL_NEWARRAY(int32_t,eWidth*eHeight);
      }

      CND_CONDITION(e != NULL,"e is NULL");

      int32_t i;     // iterates through s
      int32_t j;     // iterates through t

      //Cell (i,j) is stored at e[i + j*eWidth] and holds the distance between the
      //first i characters of s and the first j characters of t.
      //Border cells: the distance to an empty string is the length of the other one.
      for (i = 0; i <= n; i++)
          e[i + (0*eWidth)] = i;
      for (j = 0; j <= m; j++)
          e[0 + (j*eWidth)] = j;

      for (i = 1; i <= n; i++) {
          const TCHAR s_i = s[i - 1]; // ith character of s
          for (j = 1; j <= m; j++) {
              //Substitution: costs nothing when both characters are equal
              int32_t best = e[i + ((j-1)*eWidth) - 1];
              if (s_i != t[j-1])
                  best += 1;

              //One extra edit on top of the neighbour with one character less of s ...
              const int32_t viaShorterS = e[i + (j*eWidth) - 1] + 1;
              //... or on top of the neighbour with one character less of t
              const int32_t viaShorterT = e[i + ((j-1)*eWidth)] + 1;

              if (viaShorterS < best)
                  best = viaShorterS;
              if (viaShorterT < best)
                  best = viaShorterT;

              e[i + (j*eWidth)] = best;
          }
      }

      //The cell for the complete strings holds the result
      return e[n + ((m)*eWidth)];
  }


  /**
   * Creates a FuzzyQuery that matches terms whose similarity to
   * <code>term</code> is at least <code>minimumSimilarity</code>.
   * When <code>prefixLength</code> is &gt; 0, matching terms must in addition
   * start with the first <code>prefixLength</code> characters of <code>term</code>.
   * 
   * @param term the term to search for
   * @param minimumSimilarity a value between 0 and 1 to set the required similarity
   *  between the query term and the matching terms. For example, for a
   *  <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
   *  as the query term is considered similar to the query term if the edit distance
   *  between both terms is less than <code>length(term)*0.5</code>
   * @param prefixLength length of common (non-fuzzy) prefix
   * @throws IllegalArgumentException (CL_ERR_IllegalArgument) if minimumSimilarity
   *  is &gt; 1 or &lt; 0, or if prefixLength is &gt;= the length of the term text
   */
  FuzzyQuery::FuzzyQuery(Term* term, qreal minimumSimilarity, size_t prefixLength):
	MultiTermQuery(term)
  {
  //Func - Constructor
  //Pre  - term != NULL
  //Post - The instance has been created, or an exception has been thrown because
  //       one of the arguments is out of range

	  CND_PRECONDITION(term != NULL,"term is NULL");

	  //Reject a similarity outside of [0, 1]
	  if (minimumSimilarity > 1.0f) {
		  _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity > 1");
	  }
	  if (minimumSimilarity < 0.0f) {
		  _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity < 0");
	  }

	  //The prefix has to leave at least one character for the fuzzy comparison
	  if (prefixLength >= term->textLength()) {
		  _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
	  }

	  this->minimumSimilarity = minimumSimilarity;
	  this->prefixLength = prefixLength;
    }
  
  
    //Definition of the static member FuzzyQuery::defaultMinSimilarity, initialised to 0.5
    qreal FuzzyQuery::defaultMinSimilarity = 0.5f;

    FuzzyQuery::~FuzzyQuery(){
    //Func - Destructor
    //Pre  - true
    //Post - The instance has been destroyed; this class releases nothing itself here
    }

    TCHAR* FuzzyQuery::toString(const TCHAR* field) const{
    //Func - Builds the query string: the string produced by MultiTermQuery::toString
    //       followed by "~" and the minimum similarity with one decimal
    //Pre  - field != NULL
    //Post - The query string built by the StringBuffer has been returned

        CND_PRECONDITION(field != NULL,"field is NULL");

        //The base class hands back a string that has to be deleted here
        const TCHAR* base = MultiTermQuery::toString(field);

        StringBuffer out;
        out.append( base );
        _CLDELETE_CARRAY(base);

        //Fuzzy marker and required similarity
        out.append( _T("~") );
        out.appendFloat(minimumSimilarity,1);

        return out.toString();
    }

  const TCHAR* FuzzyQuery::getQueryName() const{
  //Func - Returns the name of the query
  //Pre  - true
  //Post - The value of getClassName() has been returned

     return FuzzyQuery::getClassName();
  }
  const TCHAR* FuzzyQuery::getClassName(){
  //Func - Returns the name of this query class
  //Pre  - true
  //Post - The string literal "FuzzyQuery" has been returned

     const TCHAR* className = _T("FuzzyQuery");
     return className;
  }


  /**
   * Returns the minimum similarity a term must have to match this query.
   * @return the stored value; the constructor only accepts values from 0.0 to 1.0
   */
  qreal FuzzyQuery::getMinSimilarity() const {
    return this->minimumSimilarity;
  }

  FuzzyQuery::FuzzyQuery(const FuzzyQuery& clone):
		MultiTermQuery(clone)
	{
	//Func - Copy constructor
	//Pre  - true
	//Post - minimumSimilarity and prefixLength have been copied from clone, or
	//       CL_ERR_IllegalArgument has been thrown because the copied prefix length
	//       is not smaller than the length of the term text of clone

		minimumSimilarity = clone.getMinSimilarity();
		prefixLength = clone.getPrefixLength();

		//A check for prefixLength < 0 is not needed: prefixLength is a size_t
		const bool prefixTooLong = ( prefixLength >= clone.getTerm()->textLength() );
		if (prefixTooLong) {
			_CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
		}
	}

  Query* FuzzyQuery::clone() const{
		//Func - Returns a newly allocated copy of this query, made by the copy constructor
		Query* copy = _CLNEW FuzzyQuery(*this);
		return copy;
	}
	size_t FuzzyQuery::hashCode() const{
		//Func - Combines boost, term, minimum similarity and prefix length into one
		//       hash value by xor-ing them together
		//todo: we should give the query a seeding value... but
		//need to do it for all hashCode functions
		//NOTE(review): getEnum() calls getTerm(false) while getTerm() is called here
		//without an argument - confirm that this does not hand out a reference that
		//would have to be released.
		size_t hash = Similarity::floatToByte(getBoost()) ^ getTerm()->hashCode();
		hash = hash ^ Similarity::floatToByte(this->getMinSimilarity());
		hash = hash ^ this->getPrefixLength();
		return hash;
	}
	bool FuzzyQuery::equals(Query* other) const{
		//Func - Returns true when other is a FuzzyQuery with the same boost, minimum
		//       similarity, prefix length and term as this query

		//Anything that is not a FuzzyQuery can never be equal
		const bool isFuzzyQuery = other->instanceOf(FuzzyQuery::getClassName());
		if (!isFuzzyQuery)
			return false;

		FuzzyQuery* that = (FuzzyQuery*)other;

		//Compare the cheap scalar settings first, the term last
		if (this->getBoost() != that->getBoost())
			return false;
		if (this->getMinSimilarity() != that->getMinSimilarity())
			return false;
		if (this->getPrefixLength() != that->getPrefixLength())
			return false;

		return getTerm()->equals(that->getTerm());
	}
    
  /**
   * Returns the prefix length: the number of leading characters of a term
   * that have to be identical (not fuzzy) to those of the query term for
   * the term to match this query.
   */
  size_t FuzzyQuery::getPrefixLength() const {
    return this->prefixLength;
  }

  FilteredTermEnum* FuzzyQuery::getEnum(IndexReader* reader){
	  //Func - Creates the enumeration of the terms of reader that match this query.
	  //       The returned FuzzyTermEnum is allocated here with _CLNEW.
	  return _CLNEW FuzzyTermEnum(reader, getTerm(false), minimumSimilarity, prefixLength);
  }

CL_NS_END