diff options
author | Jamie McCracken <jamiemcc@src.gnome.org> | 2007-01-22 02:09:57 +0000 |
---|---|---|
committer | Jamie McCracken <jamiemcc@src.gnome.org> | 2007-01-22 02:09:57 +0000 |
commit | ab6c896a9b10978226b7a8c65f83ca214a557649 (patch) | |
tree | 5c52ab0957b49bc24a609bbbe2adaa8832a73697 | |
parent | e40fc06fc89a42f95e6aa4ad3033f35a1bd529e7 (diff) | |
download | tracker-ab6c896a9b10978226b7a8c65f83ca214a557649.tar.gz |
purge unwanted files
svn path=/trunk/; revision=412
-rw-r--r-- | src/trackerd/tracker-stemmer-english.c | 358 | ||||
-rw-r--r-- | src/trackerd/tracker-stemmer-english.h | 25 | ||||
-rw-r--r-- | src/trackerd/trackerd.h | 26 |
3 files changed, 0 insertions, 409 deletions
diff --git a/src/trackerd/tracker-stemmer-english.c b/src/trackerd/tracker-stemmer-english.c deleted file mode 100644 index cc1e06603..000000000 --- a/src/trackerd/tracker-stemmer-english.c +++ /dev/null @@ -1,358 +0,0 @@ - -/* This is the Porter stemming algorithm, coded up as thread-safe ANSI C - by the author. - - It may be regarded as canonical, in that it follows the algorithm - presented in - - Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, - no. 3, pp 130-137, - - only differing from it at the points marked --DEPARTURE-- below. - - See also http://www.tartarus.org/~martin/PorterStemmer - - The algorithm as described in the paper could be exactly replicated - by adjusting the points of DEPARTURE, but this is barely necessary, - because (a) the points of DEPARTURE are definitely improvements, and - (b) no encoding of the Porter stemmer I have seen is anything like - as exact as this version, even with the points of DEPARTURE! - - You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which - 'stem' takes a list of inputs and sends the stemmed equivalent to - stdout. - - The algorithm as encoded here is particularly fast. - - Release 2 (the more old-fashioned, non-thread-safe version may be - regarded as release 1.) -*/ - -#include <stdlib.h> /* for malloc, free */ -#include <string.h> /* for memcmp, memmove */ -#include <glib.h> - -#include "tracker-stemmer-english.h" - -typedef struct { - char *b; /* buffer for word to be stemmed */ - int k; /* offset to the end of the string */ - int j; /* a general offset into the string */ -} stemmer; - - -/* cons(z, i) is TRUE <=> b[i] is a consonant. ('b' means 'z->b', but here - and below we drop 'z->' in comments.) -*/ - -static int cons(stemmer * z, int i) -{ switch (z->b[i]) - { case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE; - case 'y': return (i == 0) ? 
TRUE : !cons(z, i - 1); - default: return TRUE; - } -} - -/* m(z) measures the number of consonant sequences between 0 and j. if c is - a consonant sequence and v a vowel sequence, and <..> indicates arbitrary - presence, - - <c><v> gives 0 - <c>vc<v> gives 1 - <c>vcvc<v> gives 2 - <c>vcvcvc<v> gives 3 - .... -*/ - -static int m(stemmer * z) -{ int n, i, j; - - n = 0; - i = 0; - j = z->j; - while(TRUE) - { if (i > j) return n; - if (! cons(z, i)) break; i++; - } - i++; - while(TRUE) - { while(TRUE) - { if (i > j) return n; - if (cons(z, i)) break; - i++; - } - i++; - n++; - while(TRUE) - { if (i > j) return n; - if (! cons(z, i)) break; - i++; - } - i++; - } -} - -/* vowelinstem(z) is TRUE <=> 0,...j contains a vowel */ - -static int vowelinstem(stemmer * z) -{ - int j, i; - j = z->j; - for (i = 0; i <= j; i++) if (! cons(z, i)) return TRUE; - return FALSE; -} - -/* doublec(z, j) is TRUE <=> j,(j-1) contain a double consonant. */ - -static int doublec(stemmer * z, int j) -{ - char * b; - b = z->b; - if (j < 1) return FALSE; - if (b[j] != b[j - 1]) return FALSE; - return cons(z, j); -} - -/* cvc(z, i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant - and also if the second c is not w,x or y. this is used when trying to - restore an e at the end of a short word. e.g. - - cav(e), lov(e), hop(e), crim(e), but - snow, box, tray. - -*/ - -static int cvc(stemmer * z, int i) -{ if (i < 2 || !cons(z, i) || cons(z, i - 1) || !cons(z, i - 2)) return FALSE; - { int ch; - ch = z->b[i]; - if (ch == 'w' || ch == 'x' || ch == 'y') return FALSE; - } - return TRUE; -} - -/* ends(z, s) is TRUE <=> 0,...k ends with the string s. 
*/ - -static int ends(stemmer * z, char * s) -{ int length; - char * b; - int k; - length = s[0]; - b = z->b; - k = z->k; - if (s[length] != b[k]) return FALSE; /* tiny speed-up */ - if (length > k + 1) return FALSE; - if (memcmp(b + k - length + 1, s + 1, length) != 0) return FALSE; - z->j = k-length; - return TRUE; -} - -/* setto(z, s) sets (j+1),...k to the characters in the string s, readjusting - k. */ - -static void setto(stemmer * z, char * s) -{ int length; - int j; - length = s[0]; - j = z->j; - memmove(z->b + j + 1, s + 1, length); - z->k = j+length; -} - -/* r(z, s) is used further down. */ - -static void r(stemmer * z, char * s) { if (m(z) > 0) setto(z, s); } - -/* step1ab(z) gets rid of plurals and -ed or -ing. e.g. - - caresses -> caress - ponies -> poni - ties -> ti - caress -> caress - cats -> cat - - feed -> feed - agreed -> agree - disabled -> disable - - matting -> mat - mating -> mate - meeting -> meet - milling -> mill - messing -> mess - - meetings -> meet - -*/ - -static void step1ab(stemmer * z) -{ - char * b; - b = z->b; - - if (b[z->k] == 's') - { if (ends(z, "\04" "sses")) z->k -= 2; else - if (ends(z, "\03" "ies")) setto(z, "\01" "i"); else - if (b[z->k - 1] != 's') {z->k--; } - } - if (ends(z, "\03" "eed")) { if (m(z) > 0) z->k--; } else - if ((ends(z, "\02" "ed") || ends(z, "\03" "ing")) && vowelinstem(z)) - { z->k = z->j; - if (ends(z, "\02" "at")) setto(z, "\03" "ate"); else - if (ends(z, "\02" "bl")) setto(z, "\03" "ble"); else - if (ends(z, "\02" "iz")) setto(z, "\03" "ize"); else - if (doublec(z, z->k)) - { z->k--; - { int ch; - ch = b[z->k]; - if (ch == 'l' || ch == 's' || ch == 'z') z->k++; - } - } - else if (m(z) == 1 && cvc(z, z->k)) setto(z, "\01" "e"); - } -} - -/* step1c(z) turns terminal y to i when there is another vowel in the stem. */ - -static void step1c(stemmer * z) -{ - if (ends(z, "\01" "y") && vowelinstem(z)) z->b[z->k] = 'i'; -} - - -/* step2(z) maps double suffices to single ones. 
so -ization ( = -ize plus - -ation) maps to -ize etc. note that the string before the suffix must give - m(z) > 0. */ - -static void step2(stemmer * z) { switch (z->b[z->k-1]) -{ - case 'a': if (ends(z, "\07" "ational")) { r(z, "\03" "ate"); break; } - if (ends(z, "\06" "tional")) { r(z, "\04" "tion"); break; } - break; - case 'c': if (ends(z, "\04" "enci")) { r(z, "\04" "ence"); break; } - if (ends(z, "\04" "anci")) { r(z, "\04" "ance"); break; } - break; - case 'e': if (ends(z, "\04" "izer")) { r(z, "\03" "ize"); break; } - break; - case 'l': if (ends(z, "\03" "bli")) { r(z, "\03" "ble"); break; } /*-DEPARTURE-*/ - - /* To match the published algorithm, replace this line with - case 'l': if (ends(z, "\04" "abli")) { r(z, "\04" "able"); break; } */ - - if (ends(z, "\04" "alli")) { r(z, "\02" "al"); break; } - if (ends(z, "\05" "entli")) { r(z, "\03" "ent"); break; } - if (ends(z, "\03" "eli")) { r(z, "\01" "e"); break; } - if (ends(z, "\05" "ousli")) { r(z, "\03" "ous"); break; } - break; - case 'o': if (ends(z, "\07" "ization")) { r(z, "\03" "ize"); break; } - if (ends(z, "\05" "ation")) { r(z, "\03" "ate"); break; } - if (ends(z, "\04" "ator")) { r(z, "\03" "ate"); break; } - break; - case 's': if (ends(z, "\05" "alism")) { r(z, "\02" "al"); break; } - if (ends(z, "\07" "iveness")) { r(z, "\03" "ive"); break; } - if (ends(z, "\07" "fulness")) { r(z, "\03" "ful"); break; } - if (ends(z, "\07" "ousness")) { r(z, "\03" "ous"); break; } - break; - case 't': if (ends(z, "\05" "aliti")) { r(z, "\02" "al"); break; } - if (ends(z, "\05" "iviti")) { r(z, "\03" "ive"); break; } - if (ends(z, "\06" "biliti")) { r(z, "\03" "ble"); break; } - break; - case 'g': if (ends(z, "\04" "logi")) { r(z, "\03" "log"); break; } /*-DEPARTURE-*/ - - /* To match the published algorithm, delete this line */ - -} } - -/* step3(z) deals with -ic-, -full, -ness etc. similar strategy to step2. 
*/ - -static void step3(stemmer * z) { switch (z->b[z->k]) -{ - case 'e': if (ends(z, "\05" "icate")) { r(z, "\02" "ic"); break; } - if (ends(z, "\05" "ative")) { r(z, "\00" ""); break; } - if (ends(z, "\05" "alize")) { r(z, "\02" "al"); break; } - break; - case 'i': if (ends(z, "\05" "iciti")) { r(z, "\02" "ic"); break; } - break; - case 'l': if (ends(z, "\04" "ical")) { r(z, "\02" "ic"); break; } - if (ends(z, "\03" "ful")) { r(z, "\00" ""); break; } - break; - case 's': if (ends(z, "\04" "ness")) { r(z, "\00" ""); break; } - break; -} } - -/* step4(z) takes off -ant, -ence etc., in context <c>vcvc<v>. */ - -static void step4(stemmer * z) -{ switch (z->b[z->k-1]) - { case 'a': if (ends(z, "\02" "al")) break; return; - case 'c': if (ends(z, "\04" "ance")) break; - if (ends(z, "\04" "ence")) break; return; - case 'e': if (ends(z, "\02" "er")) break; return; - case 'i': if (ends(z, "\02" "ic")) break; return; - case 'l': if (ends(z, "\04" "able")) break; - if (ends(z, "\04" "ible")) break; return; - case 'n': if (ends(z, "\03" "ant")) break; - if (ends(z, "\05" "ement")) break; - if (ends(z, "\04" "ment")) break; - if (ends(z, "\03" "ent")) break; return; - case 'o': if (ends(z, "\03" "ion") && (z->b[z->j] == 's' || z->b[z->j] == 't')) break; - if (ends(z, "\02" "ou")) break; return; - /* takes care of -ous */ - case 's': if (ends(z, "\03" "ism")) break; return; - case 't': if (ends(z, "\03" "ate")) break; - if (ends(z, "\03" "iti")) break; return; - case 'u': if (ends(z, "\03" "ous")) break; return; - case 'v': if (ends(z, "\03" "ive")) break; return; - case 'z': if (ends(z, "\03" "ize")) break; return; - default: return; - } - if (m(z) > 1) z->k = z->j; -} - -/* step5(z) removes a final -e if m(z) > 1, and changes -ll to -l if - m(z) > 1. 
*/ - -static void step5(stemmer * z) -{ - char * b; - b = z->b; - z->j = z->k; - if (b[z->k] == 'e') - { int a; - a = m(z); - if (a > 1 || (a == 1 && !cvc(z, z->k - 1))) z->k--; - } - if (b[z->k] == 'l' && doublec(z, z->k) && m(z) > 1) z->k--; -} - -/* In stem(z, b, k), b is a char pointer, and the string to be stemmed is - from b[0] to b[k] inclusive. Possibly b[k+1] == '\0', but it is not - important. The stemmer adjusts the characters b[0] ... b[k] and returns - the new end-point of the string, k'. Stemming never increases word - length, so 0 <= k' <= k. -*/ - -char * tracker_stem_eng (char * b, int k) -{ - - stemmer Stemmer; - char *result; - - if (k <= 1) return g_strdup(b); /*-DEPARTURE-*/ - - Stemmer.b = g_strdup (b); - Stemmer.k = k; /* copy the parameters into z */ - Stemmer.j = 0; - - /* With this line, strings of length 1 or 2 don't go through the - stemming process, although no mention is made of this in the - published algorithm. Remove the line to match the published - algorithm. */ - - step1ab(&Stemmer); step1c(&Stemmer); step2(&Stemmer); step3(&Stemmer); step4(&Stemmer); step5(&Stemmer); - - result = g_strndup (Stemmer.b, Stemmer.k+1); - g_free (Stemmer.b); - - return result; -} diff --git a/src/trackerd/tracker-stemmer-english.h b/src/trackerd/tracker-stemmer-english.h deleted file mode 100644 index 2c7de82b3..000000000 --- a/src/trackerd/tracker-stemmer-english.h +++ /dev/null @@ -1,25 +0,0 @@ -/* Tracker - * Copyright (C) 2005, Mr Jamie McCracken (jamiemcc@gnome.org) - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - * Boston, MA 02110-1301, USA. - */ - -#ifndef _TRACKER_STEMMER_H_ -#define _TRACKER_STEMMER_H_ - -char * tracker_stem_eng (char * b, int k); - -#endif diff --git a/src/trackerd/trackerd.h b/src/trackerd/trackerd.h deleted file mode 100644 index 078464141..000000000 --- a/src/trackerd/trackerd.h +++ /dev/null @@ -1,26 +0,0 @@ -/* Tracker - * Copyright (C) 2005, Mr Jamie McCracken - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - * Boston, MA 02110-1301, USA. - */ - -#include <glib.h> -#include "tracker-db.h" -#include "tracker-global.h" - - - - |