summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jamie McCracken <jamiemcc@src.gnome.org> 2007-01-22 02:09:57 +0000
committer Jamie McCracken <jamiemcc@src.gnome.org> 2007-01-22 02:09:57 +0000
commit ab6c896a9b10978226b7a8c65f83ca214a557649 (patch)
tree 5c52ab0957b49bc24a609bbbe2adaa8832a73697
parent e40fc06fc89a42f95e6aa4ad3033f35a1bd529e7 (diff)
download tracker-ab6c896a9b10978226b7a8c65f83ca214a557649.tar.gz
purge unwanted files
svn path=/trunk/; revision=412
-rw-r--r-- src/trackerd/tracker-stemmer-english.c 358
-rw-r--r-- src/trackerd/tracker-stemmer-english.h 25
-rw-r--r-- src/trackerd/trackerd.h 26
3 files changed, 0 insertions, 409 deletions
diff --git a/src/trackerd/tracker-stemmer-english.c b/src/trackerd/tracker-stemmer-english.c
deleted file mode 100644
index cc1e06603..000000000
--- a/src/trackerd/tracker-stemmer-english.c
+++ /dev/null
@@ -1,358 +0,0 @@
-
-/* This is the Porter stemming algorithm, coded up as thread-safe ANSI C
- by the author.
-
- It may be be regarded as cononical, in that it follows the algorithm
- presented in
-
- Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
- no. 3, pp 130-137,
-
- only differing from it at the points maked --DEPARTURE-- below.
-
- See also http://www.tartarus.org/~martin/PorterStemmer
-
- The algorithm as described in the paper could be exactly replicated
- by adjusting the points of DEPARTURE, but this is barely necessary,
- because (a) the points of DEPARTURE are definitely improvements, and
- (b) no encoding of the Porter stemmer I have seen is anything like
- as exact as this version, even with the points of DEPARTURE!
-
- You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
- 'stem' takes a list of inputs and sends the stemmed equivalent to
- stdout.
-
- The algorithm as encoded here is particularly fast.
-
- Release 2 (the more old-fashioned, non-thread-safe version may be
- regarded as release 1.)
-*/
-
-#include <stdlib.h> /* for malloc, free */
-#include <string.h> /* for memcmp, memmove */
-#include <glib.h>
-
-#include "tracker-stemmer-english.h"
-
-typedef struct {
- char *b; /* buffer for word to be stemmed */
- int k; /* offset to the end of the string */
- int j; /* a general offset into the string */
-} stemmer;
-
-
-/* cons(z, i) is TRUE <=> b[i] is a consonant. ('b' means 'z->b', but here
- and below we drop 'z->' in comments.
-*/
-
-static int cons(stemmer * z, int i)
-{ switch (z->b[i])
- { case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE;
- case 'y': return (i == 0) ? TRUE : !cons(z, i - 1);
- default: return TRUE;
- }
-}
-
-/* m(z) measures the number of consonant sequences between 0 and j. if c is
- a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
- presence,
-
- <c><v> gives 0
- <c>vc<v> gives 1
- <c>vcvc<v> gives 2
- <c>vcvcvc<v> gives 3
- ....
-*/
-
-static int m(stemmer * z)
-{ int n, i, j;
-
- n = 0;
- i = 0;
- j = z->j;
- while(TRUE)
- { if (i > j) return n;
- if (! cons(z, i)) break; i++;
- }
- i++;
- while(TRUE)
- { while(TRUE)
- { if (i > j) return n;
- if (cons(z, i)) break;
- i++;
- }
- i++;
- n++;
- while(TRUE)
- { if (i > j) return n;
- if (! cons(z, i)) break;
- i++;
- }
- i++;
- }
-}
-
-/* vowelinstem(z) is TRUE <=> 0,...j contains a vowel */
-
-static int vowelinstem(stemmer * z)
-{
- int j, i;
- j = z->j;
- for (i = 0; i <= j; i++) if (! cons(z, i)) return TRUE;
- return FALSE;
-}
-
-/* doublec(z, j) is TRUE <=> j,(j-1) contain a double consonant. */
-
-static int doublec(stemmer * z, int j)
-{
- char * b;
- b = z->b;
- if (j < 1) return FALSE;
- if (b[j] != b[j - 1]) return FALSE;
- return cons(z, j);
-}
-
-/* cvc(z, i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
- and also if the second c is not w,x or y. this is used when trying to
- restore an e at the end of a short word. e.g.
-
- cav(e), lov(e), hop(e), crim(e), but
- snow, box, tray.
-
-*/
-
-static int cvc(stemmer * z, int i)
-{ if (i < 2 || !cons(z, i) || cons(z, i - 1) || !cons(z, i - 2)) return FALSE;
- { int ch;
- ch = z->b[i];
- if (ch == 'w' || ch == 'x' || ch == 'y') return FALSE;
- }
- return TRUE;
-}
-
-/* ends(z, s) is TRUE <=> 0,...k ends with the string s. */
-
-static int ends(stemmer * z, char * s)
-{ int length;
- char * b;
- int k;
- length = s[0];
- b = z->b;
- k = z->k;
- if (s[length] != b[k]) return FALSE; /* tiny speed-up */
- if (length > k + 1) return FALSE;
- if (memcmp(b + k - length + 1, s + 1, length) != 0) return FALSE;
- z->j = k-length;
- return TRUE;
-}
-
-/* setto(z, s) sets (j+1),...k to the characters in the string s, readjusting
- k. */
-
-static void setto(stemmer * z, char * s)
-{ int length;
- int j;
- length = s[0];
- j = z->j;
- memmove(z->b + j + 1, s + 1, length);
- z->k = j+length;
-}
-
-/* r(z, s) is used further down. */
-
-static void r(stemmer * z, char * s) { if (m(z) > 0) setto(z, s); }
-
-/* step1ab(z) gets rid of plurals and -ed or -ing. e.g.
-
- caresses -> caress
- ponies -> poni
- ties -> ti
- caress -> caress
- cats -> cat
-
- feed -> feed
- agreed -> agree
- disabled -> disable
-
- matting -> mat
- mating -> mate
- meeting -> meet
- milling -> mill
- messing -> mess
-
- meetings -> meet
-
-*/
-
-static void step1ab(stemmer * z)
-{
- char * b;
- b = z->b;
-
- if (b[z->k] == 's')
- { if (ends(z, "\04" "sses")) z->k -= 2; else
- if (ends(z, "\03" "ies")) setto(z, "\01" "i"); else
- if (b[z->k - 1] != 's') {z->k--; }
- }
- if (ends(z, "\03" "eed")) { if (m(z) > 0) z->k--; } else
- if ((ends(z, "\02" "ed") || ends(z, "\03" "ing")) && vowelinstem(z))
- { z->k = z->j;
- if (ends(z, "\02" "at")) setto(z, "\03" "ate"); else
- if (ends(z, "\02" "bl")) setto(z, "\03" "ble"); else
- if (ends(z, "\02" "iz")) setto(z, "\03" "ize"); else
- if (doublec(z, z->k))
- { z->k--;
- { int ch;
- ch = b[z->k];
- if (ch == 'l' || ch == 's' || ch == 'z') z->k++;
- }
- }
- else if (m(z) == 1 && cvc(z, z->k)) setto(z, "\01" "e");
- }
-}
-
-/* step1c(z) turns terminal y to i when there is another vowel in the stem. */
-
-static void step1c(stemmer * z)
-{
- if (ends(z, "\01" "y") && vowelinstem(z)) z->b[z->k] = 'i';
-}
-
-
-/* step2(z) maps double suffices to single ones. so -ization ( = -ize plus
- -ation) maps to -ize etc. note that the string before the suffix must give
- m(z) > 0. */
-
-static void step2(stemmer * z) { switch (z->b[z->k-1])
-{
- case 'a': if (ends(z, "\07" "ational")) { r(z, "\03" "ate"); break; }
- if (ends(z, "\06" "tional")) { r(z, "\04" "tion"); break; }
- break;
- case 'c': if (ends(z, "\04" "enci")) { r(z, "\04" "ence"); break; }
- if (ends(z, "\04" "anci")) { r(z, "\04" "ance"); break; }
- break;
- case 'e': if (ends(z, "\04" "izer")) { r(z, "\03" "ize"); break; }
- break;
- case 'l': if (ends(z, "\03" "bli")) { r(z, "\03" "ble"); break; } /*-DEPARTURE-*/
-
- /* To match the published algorithm, replace this line with
- case 'l': if (ends(z, "\04" "abli")) { r(z, "\04" "able"); break; } */
-
- if (ends(z, "\04" "alli")) { r(z, "\02" "al"); break; }
- if (ends(z, "\05" "entli")) { r(z, "\03" "ent"); break; }
- if (ends(z, "\03" "eli")) { r(z, "\01" "e"); break; }
- if (ends(z, "\05" "ousli")) { r(z, "\03" "ous"); break; }
- break;
- case 'o': if (ends(z, "\07" "ization")) { r(z, "\03" "ize"); break; }
- if (ends(z, "\05" "ation")) { r(z, "\03" "ate"); break; }
- if (ends(z, "\04" "ator")) { r(z, "\03" "ate"); break; }
- break;
- case 's': if (ends(z, "\05" "alism")) { r(z, "\02" "al"); break; }
- if (ends(z, "\07" "iveness")) { r(z, "\03" "ive"); break; }
- if (ends(z, "\07" "fulness")) { r(z, "\03" "ful"); break; }
- if (ends(z, "\07" "ousness")) { r(z, "\03" "ous"); break; }
- break;
- case 't': if (ends(z, "\05" "aliti")) { r(z, "\02" "al"); break; }
- if (ends(z, "\05" "iviti")) { r(z, "\03" "ive"); break; }
- if (ends(z, "\06" "biliti")) { r(z, "\03" "ble"); break; }
- break;
- case 'g': if (ends(z, "\04" "logi")) { r(z, "\03" "log"); break; } /*-DEPARTURE-*/
-
- /* To match the published algorithm, delete this line */
-
-} }
-
-/* step3(z) deals with -ic-, -full, -ness etc. similar strategy to step2. */
-
-static void step3(stemmer * z) { switch (z->b[z->k])
-{
- case 'e': if (ends(z, "\05" "icate")) { r(z, "\02" "ic"); break; }
- if (ends(z, "\05" "ative")) { r(z, "\00" ""); break; }
- if (ends(z, "\05" "alize")) { r(z, "\02" "al"); break; }
- break;
- case 'i': if (ends(z, "\05" "iciti")) { r(z, "\02" "ic"); break; }
- break;
- case 'l': if (ends(z, "\04" "ical")) { r(z, "\02" "ic"); break; }
- if (ends(z, "\03" "ful")) { r(z, "\00" ""); break; }
- break;
- case 's': if (ends(z, "\04" "ness")) { r(z, "\00" ""); break; }
- break;
-} }
-
-/* step4(z) takes off -ant, -ence etc., in context <c>vcvc<v>. */
-
-static void step4(stemmer * z)
-{ switch (z->b[z->k-1])
- { case 'a': if (ends(z, "\02" "al")) break; return;
- case 'c': if (ends(z, "\04" "ance")) break;
- if (ends(z, "\04" "ence")) break; return;
- case 'e': if (ends(z, "\02" "er")) break; return;
- case 'i': if (ends(z, "\02" "ic")) break; return;
- case 'l': if (ends(z, "\04" "able")) break;
- if (ends(z, "\04" "ible")) break; return;
- case 'n': if (ends(z, "\03" "ant")) break;
- if (ends(z, "\05" "ement")) break;
- if (ends(z, "\04" "ment")) break;
- if (ends(z, "\03" "ent")) break; return;
- case 'o': if (ends(z, "\03" "ion") && (z->b[z->j] == 's' || z->b[z->j] == 't')) break;
- if (ends(z, "\02" "ou")) break; return;
- /* takes care of -ous */
- case 's': if (ends(z, "\03" "ism")) break; return;
- case 't': if (ends(z, "\03" "ate")) break;
- if (ends(z, "\03" "iti")) break; return;
- case 'u': if (ends(z, "\03" "ous")) break; return;
- case 'v': if (ends(z, "\03" "ive")) break; return;
- case 'z': if (ends(z, "\03" "ize")) break; return;
- default: return;
- }
- if (m(z) > 1) z->k = z->j;
-}
-
-/* step5(z) removes a final -e if m(z) > 1, and changes -ll to -l if
- m(z) > 1. */
-
-static void step5(stemmer * z)
-{
- char * b;
- b = z->b;
- z->j = z->k;
- if (b[z->k] == 'e')
- { int a;
- a = m(z);
- if (a > 1 || (a == 1 && !cvc(z, z->k - 1))) z->k--;
- }
- if (b[z->k] == 'l' && doublec(z, z->k) && m(z) > 1) z->k--;
-}
-
-/* In stem(z, b, k), b is a char pointer, and the string to be stemmed is
- from b[0] to b[k] inclusive. Possibly b[k+1] == '\0', but it is not
- important. The stemmer adjusts the characters b[0] ... b[k] and returns
- the new end-point of the string, k'. Stemming never increases word
- length, so 0 <= k' <= k.
-*/
-
-char * tracker_stem_eng (char * b, int k)
-{
-
- stemmer Stemmer;
- char *result;
-
- if (k <= 1) return g_strdup(b); /*-DEPARTURE-*/
-
- Stemmer.b = g_strdup (b);
- Stemmer.k = k; /* copy the parameters into z */
- Stemmer.j = 0;
-
- /* With this line, strings of length 1 or 2 don't go through the
- stemming process, although no mention is made of this in the
- published algorithm. Remove the line to match the published
- algorithm. */
-
- step1ab(&Stemmer); step1c(&Stemmer); step2(&Stemmer); step3(&Stemmer); step4(&Stemmer); step5(&Stemmer);
-
- result = g_strndup (Stemmer.b, Stemmer.k+1);
- g_free (Stemmer.b);
-
- return result;
-}
diff --git a/src/trackerd/tracker-stemmer-english.h b/src/trackerd/tracker-stemmer-english.h
deleted file mode 100644
index 2c7de82b3..000000000
--- a/src/trackerd/tracker-stemmer-english.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Tracker
- * Copyright (C) 2005, Mr Jamie McCracken (jamiemcc@gnome.org)
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this library; if not, write to the
- * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
- * Boston, MA 02110-1301, USA.
- */
-
-#ifndef _TRACKER_STEMMER_H_
-#define _TRACKER_STEMMER_H_
-
-char * tracker_stem_eng (char * b, int k);
-
-#endif
diff --git a/src/trackerd/trackerd.h b/src/trackerd/trackerd.h
deleted file mode 100644
index 078464141..000000000
--- a/src/trackerd/trackerd.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Tracker
- * Copyright (C) 2005, Mr Jamie McCracken
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this library; if not, write to the
- * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
- * Boston, MA 02110-1301, USA.
- */
-
-#include <glib.h>
-#include "tracker-db.h"
-#include "tracker-global.h"
-
-
-
-