diff options
author | Jamie McCracken <jamiemcc@src.gnome.org> | 2006-09-13 20:03:56 +0000 |
---|---|---|
committer | Jamie McCracken <jamiemcc@src.gnome.org> | 2006-09-13 20:03:56 +0000 |
commit | b0b81281f1bdc7b50bcf61399c06728feca0099b (patch) | |
tree | b27ba7895620ba5ea57f70f6e7c31940afd43ef9 | |
parent | 73da61ebdd6b5e4e0c7c0f021f70394f84896481 (diff) | |
download | tracker-b0b81281f1bdc7b50bcf61399c06728feca0099b.tar.gz |
updates
-rw-r--r-- | ChangeLog | 16 | ||||
-rw-r--r-- | configure.in | 2 | ||||
-rw-r--r-- | src/Makefile.am | 10 | ||||
-rw-r--r-- | src/trackerd/Makefile.am | 4 | ||||
-rw-r--r-- | src/trackerd/tracker-stemmer-english.c | 358 | ||||
-rw-r--r-- | src/trackerd/tracker-stemmer-english.h | 25 |
6 files changed, 410 insertions, 5 deletions
@@ -1,3 +1,19 @@ +2006-09-13 Jamie McCracken <jamiemcc@gnome.org> + + * Tidied up tracker-GUI + * Inline libsexy widget + * extended parser to support line by line indexing + * added more functionality to new indexer + + +2006-09-13 Edward Duffy <eduffy@gmail.com> + + * Enhanced tracker GUI + +2006-09-13 Jaime Frutos Morales <acidborg@gmail.com> + + * Patch to add skeleton for tracker GUI + 2006-09-11 Samuel Cormier-Iijima <ciyoshi@gmail.com> diff --git a/configure.in b/configure.in index ccb23f2e3..86b02e3f4 100644 --- a/configure.in +++ b/configure.in @@ -19,7 +19,7 @@ AM_PROG_LIBTOOL # Checks for header files. AC_HEADER_STDC -AC_CHECK_HEADERS([fcntl.h sitdlib.h string.h sys/time.h unistd.h magic.h]) +AC_CHECK_HEADERS([fcntl.h sitdlib.h string.h sys/time.h unistd.h]) # Check for glib 2.0 PKG_CHECK_MODULES(GLIB2, [ glib-2.0 >= 2.4.0 ] gthread-2.0 >= 2.4.0) diff --git a/src/Makefile.am b/src/Makefile.am index 7ccf73606..23c45230f 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -5,5 +5,11 @@ else extractor_dir = endif -SUBDIRS = text-filters trackerd libtracker $(extractor_dir) tracker-extract -DIST_SUBDIRS = text-filters trackerd libtracker $(extractor_dir) tracker-extract +if HAVE_GTK +tracker_gui_dir = tracker-gui +else +tracker_gui_dir = +endif + +SUBDIRS = text-filters trackerd libtracker $(extractor_dir) tracker-extract $(tracker_gui_dir) +DIST_SUBDIRS = text-filters trackerd libtracker $(extractor_dir) tracker-extract $(tracker_gui_dir) diff --git a/src/trackerd/Makefile.am b/src/trackerd/Makefile.am index f51824ca1..df45b96bd 100644 --- a/src/trackerd/Makefile.am +++ b/src/trackerd/Makefile.am @@ -98,8 +98,8 @@ trackerd_LDADD = $(GLIB2_LIBS) \ $(PANGO_LIBS) \ $(QDBM_LIBS) \ $(SQLITE3_LIBS) \ - -lstdc++ \ - -lmagic + -lstdc++ + tracker_convert_file_SOURCES = tracker-convert-file.c tracker-parser.c tracker-stemmer.c diff --git a/src/trackerd/tracker-stemmer-english.c b/src/trackerd/tracker-stemmer-english.c new file mode 100644 index 000000000..cc1e06603 --- /dev/null +++ b/src/trackerd/tracker-stemmer-english.c @@ -0,0 +1,358 @@ + +/* This is the Porter stemming algorithm, coded up as thread-safe ANSI C + by the author. + + It may be be regarded as cononical, in that it follows the algorithm + presented in + + Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, + no. 3, pp 130-137, + + only differing from it at the points maked --DEPARTURE-- below. + + See also http://www.tartarus.org/~martin/PorterStemmer + + The algorithm as described in the paper could be exactly replicated + by adjusting the points of DEPARTURE, but this is barely necessary, + because (a) the points of DEPARTURE are definitely improvements, and + (b) no encoding of the Porter stemmer I have seen is anything like + as exact as this version, even with the points of DEPARTURE! + + You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which + 'stem' takes a list of inputs and sends the stemmed equivalent to + stdout. + + The algorithm as encoded here is particularly fast. + + Release 2 (the more old-fashioned, non-thread-safe version may be + regarded as release 1.) +*/ + +#include <stdlib.h> /* for malloc, free */ +#include <string.h> /* for memcmp, memmove */ +#include <glib.h> + +#include "tracker-stemmer-english.h" + +typedef struct { + char *b; /* buffer for word to be stemmed */ + int k; /* offset to the end of the string */ + int j; /* a general offset into the string */ +} stemmer; + + +/* cons(z, i) is TRUE <=> b[i] is a consonant. ('b' means 'z->b', but here + and below we drop 'z->' in comments. +*/ + +static int cons(stemmer * z, int i) +{ switch (z->b[i]) + { case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE; + case 'y': return (i == 0) ? TRUE : !cons(z, i - 1); + default: return TRUE; + } +} + +/* m(z) measures the number of consonant sequences between 0 and j. if c is + a consonant sequence and v a vowel sequence, and <..> indicates arbitrary + presence, + + <c><v> gives 0 + <c>vc<v> gives 1 + <c>vcvc<v> gives 2 + <c>vcvcvc<v> gives 3 + .... +*/ + +static int m(stemmer * z) +{ int n, i, j; + + n = 0; + i = 0; + j = z->j; + while(TRUE) + { if (i > j) return n; + if (! cons(z, i)) break; i++; + } + i++; + while(TRUE) + { while(TRUE) + { if (i > j) return n; + if (cons(z, i)) break; + i++; + } + i++; + n++; + while(TRUE) + { if (i > j) return n; + if (! cons(z, i)) break; + i++; + } + i++; + } +} + +/* vowelinstem(z) is TRUE <=> 0,...j contains a vowel */ + +static int vowelinstem(stemmer * z) +{ + int j, i; + j = z->j; + for (i = 0; i <= j; i++) if (! cons(z, i)) return TRUE; + return FALSE; +} + +/* doublec(z, j) is TRUE <=> j,(j-1) contain a double consonant. */ + +static int doublec(stemmer * z, int j) +{ + char * b; + b = z->b; + if (j < 1) return FALSE; + if (b[j] != b[j - 1]) return FALSE; + return cons(z, j); +} + +/* cvc(z, i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant + and also if the second c is not w,x or y. this is used when trying to + restore an e at the end of a short word. e.g. + + cav(e), lov(e), hop(e), crim(e), but + snow, box, tray. + +*/ + +static int cvc(stemmer * z, int i) +{ if (i < 2 || !cons(z, i) || cons(z, i - 1) || !cons(z, i - 2)) return FALSE; + { int ch; + ch = z->b[i]; + if (ch == 'w' || ch == 'x' || ch == 'y') return FALSE; + } + return TRUE; +} + +/* ends(z, s) is TRUE <=> 0,...k ends with the string s. */ + +static int ends(stemmer * z, char * s) +{ int length; + char * b; + int k; + length = s[0]; + b = z->b; + k = z->k; + if (s[length] != b[k]) return FALSE; /* tiny speed-up */ + if (length > k + 1) return FALSE; + if (memcmp(b + k - length + 1, s + 1, length) != 0) return FALSE; + z->j = k-length; + return TRUE; +} + +/* setto(z, s) sets (j+1),...k to the characters in the string s, readjusting + k. */ + +static void setto(stemmer * z, char * s) +{ int length; + int j; + length = s[0]; + j = z->j; + memmove(z->b + j + 1, s + 1, length); + z->k = j+length; +} + +/* r(z, s) is used further down. */ + +static void r(stemmer * z, char * s) { if (m(z) > 0) setto(z, s); } + +/* step1ab(z) gets rid of plurals and -ed or -ing. e.g. + + caresses -> caress + ponies -> poni + ties -> ti + caress -> caress + cats -> cat + + feed -> feed + agreed -> agree + disabled -> disable + + matting -> mat + mating -> mate + meeting -> meet + milling -> mill + messing -> mess + + meetings -> meet + +*/ + +static void step1ab(stemmer * z) +{ + char * b; + b = z->b; + + if (b[z->k] == 's') + { if (ends(z, "\04" "sses")) z->k -= 2; else + if (ends(z, "\03" "ies")) setto(z, "\01" "i"); else + if (b[z->k - 1] != 's') {z->k--; } + } + if (ends(z, "\03" "eed")) { if (m(z) > 0) z->k--; } else + if ((ends(z, "\02" "ed") || ends(z, "\03" "ing")) && vowelinstem(z)) + { z->k = z->j; + if (ends(z, "\02" "at")) setto(z, "\03" "ate"); else + if (ends(z, "\02" "bl")) setto(z, "\03" "ble"); else + if (ends(z, "\02" "iz")) setto(z, "\03" "ize"); else + if (doublec(z, z->k)) + { z->k--; + { int ch; + ch = b[z->k]; + if (ch == 'l' || ch == 's' || ch == 'z') z->k++; + } + } + else if (m(z) == 1 && cvc(z, z->k)) setto(z, "\01" "e"); + } +} + +/* step1c(z) turns terminal y to i when there is another vowel in the stem. */ + +static void step1c(stemmer * z) +{ + if (ends(z, "\01" "y") && vowelinstem(z)) z->b[z->k] = 'i'; +} + + +/* step2(z) maps double suffices to single ones. so -ization ( = -ize plus + -ation) maps to -ize etc. note that the string before the suffix must give + m(z) > 0. */ + +static void step2(stemmer * z) { switch (z->b[z->k-1]) +{ + case 'a': if (ends(z, "\07" "ational")) { r(z, "\03" "ate"); break; } + if (ends(z, "\06" "tional")) { r(z, "\04" "tion"); break; } + break; + case 'c': if (ends(z, "\04" "enci")) { r(z, "\04" "ence"); break; } + if (ends(z, "\04" "anci")) { r(z, "\04" "ance"); break; } + break; + case 'e': if (ends(z, "\04" "izer")) { r(z, "\03" "ize"); break; } + break; + case 'l': if (ends(z, "\03" "bli")) { r(z, "\03" "ble"); break; } /*-DEPARTURE-*/ + + /* To match the published algorithm, replace this line with + case 'l': if (ends(z, "\04" "abli")) { r(z, "\04" "able"); break; } */ + + if (ends(z, "\04" "alli")) { r(z, "\02" "al"); break; } + if (ends(z, "\05" "entli")) { r(z, "\03" "ent"); break; } + if (ends(z, "\03" "eli")) { r(z, "\01" "e"); break; } + if (ends(z, "\05" "ousli")) { r(z, "\03" "ous"); break; } + break; + case 'o': if (ends(z, "\07" "ization")) { r(z, "\03" "ize"); break; } + if (ends(z, "\05" "ation")) { r(z, "\03" "ate"); break; } + if (ends(z, "\04" "ator")) { r(z, "\03" "ate"); break; } + break; + case 's': if (ends(z, "\05" "alism")) { r(z, "\02" "al"); break; } + if (ends(z, "\07" "iveness")) { r(z, "\03" "ive"); break; } + if (ends(z, "\07" "fulness")) { r(z, "\03" "ful"); break; } + if (ends(z, "\07" "ousness")) { r(z, "\03" "ous"); break; } + break; + case 't': if (ends(z, "\05" "aliti")) { r(z, "\02" "al"); break; } + if (ends(z, "\05" "iviti")) { r(z, "\03" "ive"); break; } + if (ends(z, "\06" "biliti")) { r(z, "\03" "ble"); break; } + break; + case 'g': if (ends(z, "\04" "logi")) { r(z, "\03" "log"); break; } /*-DEPARTURE-*/ + + /* To match the published algorithm, delete this line */ + +} } + +/* step3(z) deals with -ic-, -full, -ness etc. similar strategy to step2. */ + +static void step3(stemmer * z) { switch (z->b[z->k]) +{ + case 'e': if (ends(z, "\05" "icate")) { r(z, "\02" "ic"); break; } + if (ends(z, "\05" "ative")) { r(z, "\00" ""); break; } + if (ends(z, "\05" "alize")) { r(z, "\02" "al"); break; } + break; + case 'i': if (ends(z, "\05" "iciti")) { r(z, "\02" "ic"); break; } + break; + case 'l': if (ends(z, "\04" "ical")) { r(z, "\02" "ic"); break; } + if (ends(z, "\03" "ful")) { r(z, "\00" ""); break; } + break; + case 's': if (ends(z, "\04" "ness")) { r(z, "\00" ""); break; } + break; +} } + +/* step4(z) takes off -ant, -ence etc., in context <c>vcvc<v>. */ + +static void step4(stemmer * z) +{ switch (z->b[z->k-1]) + { case 'a': if (ends(z, "\02" "al")) break; return; + case 'c': if (ends(z, "\04" "ance")) break; + if (ends(z, "\04" "ence")) break; return; + case 'e': if (ends(z, "\02" "er")) break; return; + case 'i': if (ends(z, "\02" "ic")) break; return; + case 'l': if (ends(z, "\04" "able")) break; + if (ends(z, "\04" "ible")) break; return; + case 'n': if (ends(z, "\03" "ant")) break; + if (ends(z, "\05" "ement")) break; + if (ends(z, "\04" "ment")) break; + if (ends(z, "\03" "ent")) break; return; + case 'o': if (ends(z, "\03" "ion") && (z->b[z->j] == 's' || z->b[z->j] == 't')) break; + if (ends(z, "\02" "ou")) break; return; + /* takes care of -ous */ + case 's': if (ends(z, "\03" "ism")) break; return; + case 't': if (ends(z, "\03" "ate")) break; + if (ends(z, "\03" "iti")) break; return; + case 'u': if (ends(z, "\03" "ous")) break; return; + case 'v': if (ends(z, "\03" "ive")) break; return; + case 'z': if (ends(z, "\03" "ize")) break; return; + default: return; + } + if (m(z) > 1) z->k = z->j; +} + +/* step5(z) removes a final -e if m(z) > 1, and changes -ll to -l if + m(z) > 1. */ + +static void step5(stemmer * z) +{ + char * b; + b = z->b; + z->j = z->k; + if (b[z->k] == 'e') + { int a; + a = m(z); + if (a > 1 || (a == 1 && !cvc(z, z->k - 1))) z->k--; + } + if (b[z->k] == 'l' && doublec(z, z->k) && m(z) > 1) z->k--; +} + +/* In stem(z, b, k), b is a char pointer, and the string to be stemmed is + from b[0] to b[k] inclusive. Possibly b[k+1] == '\0', but it is not + important. The stemmer adjusts the characters b[0] ... b[k] and returns + the new end-point of the string, k'. Stemming never increases word + length, so 0 <= k' <= k. +*/ + +char * tracker_stem_eng (char * b, int k) +{ + + stemmer Stemmer; + char *result; + + if (k <= 1) return g_strdup(b); /*-DEPARTURE-*/ + + Stemmer.b = g_strdup (b); + Stemmer.k = k; /* copy the parameters into z */ + Stemmer.j = 0; + + /* With this line, strings of length 1 or 2 don't go through the + stemming process, although no mention is made of this in the + published algorithm. Remove the line to match the published + algorithm. */ + + step1ab(&Stemmer); step1c(&Stemmer); step2(&Stemmer); step3(&Stemmer); step4(&Stemmer); step5(&Stemmer); + + result = g_strndup (Stemmer.b, Stemmer.k+1); + g_free (Stemmer.b); + + return result; +} diff --git a/src/trackerd/tracker-stemmer-english.h b/src/trackerd/tracker-stemmer-english.h new file mode 100644 index 000000000..786163b63 --- /dev/null +++ b/src/trackerd/tracker-stemmer-english.h @@ -0,0 +1,25 @@ +/* Tracker + * Copyright (C) 2005, Mr Jamie McCracken (jamiemcc@gnome.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#ifndef _TRACKER_STEMMER_H_ +#define _TRACKER_STEMMER_H_ + +char * tracker_stem_eng (char * b, int k); + +#endif |