summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJamie McCracken <jamiemcc@src.gnome.org>2006-09-13 20:03:56 +0000
committerJamie McCracken <jamiemcc@src.gnome.org>2006-09-13 20:03:56 +0000
commitb0b81281f1bdc7b50bcf61399c06728feca0099b (patch)
treeb27ba7895620ba5ea57f70f6e7c31940afd43ef9
parent73da61ebdd6b5e4e0c7c0f021f70394f84896481 (diff)
downloadtracker-b0b81281f1bdc7b50bcf61399c06728feca0099b.tar.gz
updates
-rw-r--r--ChangeLog16
-rw-r--r--configure.in2
-rw-r--r--src/Makefile.am10
-rw-r--r--src/trackerd/Makefile.am4
-rw-r--r--src/trackerd/tracker-stemmer-english.c358
-rw-r--r--src/trackerd/tracker-stemmer-english.h25
6 files changed, 410 insertions, 5 deletions
diff --git a/ChangeLog b/ChangeLog
index 5584c3f3f..3bda0ded7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2006-09-13 Jamie McCracken <jamiemcc@gnome.org>
+
+ * Tidied up tracker-GUI
+ * Inline libsexy widget
+ * extended parser to support line by line indexing
+ * added more functionality to new indexer
+
+
+2006-09-13 Edward Duffy <eduffy@gmail.com>
+
+ * Enhanced tracker GUI
+
+2006-09-13 Jaime Frutos Morales <acidborg@gmail.com>
+
+ * Patch to add skeleton for tracker GUI
+
2006-09-11 Samuel Cormier-Iijima <ciyoshi@gmail.com>
diff --git a/configure.in b/configure.in
index ccb23f2e3..86b02e3f4 100644
--- a/configure.in
+++ b/configure.in
@@ -19,7 +19,7 @@ AM_PROG_LIBTOOL
# Checks for header files.
AC_HEADER_STDC
-AC_CHECK_HEADERS([fcntl.h sitdlib.h string.h sys/time.h unistd.h magic.h])
+AC_CHECK_HEADERS([fcntl.h sitdlib.h string.h sys/time.h unistd.h])
# Check for glib 2.0
PKG_CHECK_MODULES(GLIB2, [ glib-2.0 >= 2.4.0 ] gthread-2.0 >= 2.4.0)
diff --git a/src/Makefile.am b/src/Makefile.am
index 7ccf73606..23c45230f 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -5,5 +5,11 @@ else
extractor_dir =
endif
-SUBDIRS = text-filters trackerd libtracker $(extractor_dir) tracker-extract
-DIST_SUBDIRS = text-filters trackerd libtracker $(extractor_dir) tracker-extract
+if HAVE_GTK
+tracker_gui_dir = tracker-gui
+else
+tracker_gui_dir =
+endif
+
+SUBDIRS = text-filters trackerd libtracker $(extractor_dir) tracker-extract $(tracker_gui_dir)
+DIST_SUBDIRS = text-filters trackerd libtracker $(extractor_dir) tracker-extract $(tracker_gui_dir)
diff --git a/src/trackerd/Makefile.am b/src/trackerd/Makefile.am
index f51824ca1..df45b96bd 100644
--- a/src/trackerd/Makefile.am
+++ b/src/trackerd/Makefile.am
@@ -98,8 +98,8 @@ trackerd_LDADD = $(GLIB2_LIBS) \
$(PANGO_LIBS) \
$(QDBM_LIBS) \
$(SQLITE3_LIBS) \
- -lstdc++ \
- -lmagic
+ -lstdc++
+
tracker_convert_file_SOURCES = tracker-convert-file.c tracker-parser.c tracker-stemmer.c
diff --git a/src/trackerd/tracker-stemmer-english.c b/src/trackerd/tracker-stemmer-english.c
new file mode 100644
index 000000000..cc1e06603
--- /dev/null
+++ b/src/trackerd/tracker-stemmer-english.c
@@ -0,0 +1,358 @@
+
+/* This is the Porter stemming algorithm, coded up as thread-safe ANSI C
+ by the author.
+
+ It may be regarded as canonical, in that it follows the algorithm
+ presented in
+
+ Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+ no. 3, pp 130-137,
+
+ only differing from it at the points marked --DEPARTURE-- below.
+
+ See also http://www.tartarus.org/~martin/PorterStemmer
+
+ The algorithm as described in the paper could be exactly replicated
+ by adjusting the points of DEPARTURE, but this is barely necessary,
+ because (a) the points of DEPARTURE are definitely improvements, and
+ (b) no encoding of the Porter stemmer I have seen is anything like
+ as exact as this version, even with the points of DEPARTURE!
+
+ You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
+ 'stem' takes a list of inputs and sends the stemmed equivalent to
+ stdout.
+
+ The algorithm as encoded here is particularly fast.
+
+ Release 2 (the more old-fashioned, non-thread-safe version may be
+ regarded as release 1.)
+*/
+
+#include <stdlib.h> /* for malloc, free */
+#include <string.h> /* for memcmp, memmove */
+#include <glib.h>
+
+#include "tracker-stemmer-english.h"
+
+/* Working state shared by all stemming steps for a single word. */
+typedef struct {
+ char *b; /* buffer for word to be stemmed */
+ int k; /* offset to the end of the string (index of its last character) */
+ int j; /* a general offset into the string; ends() sets it to k - suffix length */
+} stemmer;
+
+
+/* cons(z, i) is TRUE <=> b[i] is a consonant. ('b' means 'z->b'; here and
+   below the 'z->' is dropped in comments.)
+
+   a, e, i, o and u are never consonants. 'y' is a consonant at position 0;
+   anywhere else it is a consonant exactly when the character before it is
+   not one. Every other character counts as a consonant.
+*/
+
+static int cons(stemmer * z, int i)
+{
+   char ch = z->b[i];
+
+   if (ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u')
+      return FALSE;
+   if (ch != 'y')
+      return TRUE;
+
+   /* 'y': classification depends on the preceding character, if any */
+   if (i == 0)
+      return TRUE;
+   return !cons(z, i - 1);
+}
+
+/* m(z) measures the number of consonant sequences between 0 and j. If c is
+ a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
+ presence,
+
+ <c><v> gives 0
+ <c>vc<v> gives 1
+ <c>vcvc<v> gives 2
+ <c>vcvcvc<v> gives 3
+ ....
+
+ Only z->b and z->j are read; nothing in z is modified. Every scan returns
+ the current count as soon as the index passes j.
+*/
+
+static int m(stemmer * z)
+{ int n, i, j;
+
+ n = 0;
+ i = 0;
+ j = z->j;
+ /* skip the optional leading consonant run <c>; note that the i++ on the
+ line below is a separate statement, not part of the if */
+ while(TRUE)
+ { if (i > j) return n;
+ if (! cons(z, i)) break; i++;
+ }
+ i++;
+ /* each pass of the outer loop consumes one "vc" pair and counts it */
+ while(TRUE)
+ { while(TRUE)
+ { if (i > j) return n;
+ if (cons(z, i)) break;
+ i++;
+ }
+ i++;
+ /* a vowel run has just been followed by a consonant: one more vc */
+ n++;
+ while(TRUE)
+ { if (i > j) return n;
+ if (! cons(z, i)) break;
+ i++;
+ }
+ i++;
+ }
+}
+
+/* vowelinstem(z) is TRUE <=> at least one of b[0],...,b[j] is not a
+   consonant according to cons(). */
+
+static int vowelinstem(stemmer * z)
+{
+   int pos = 0;
+   int last = z->j;
+
+   while (pos <= last) {
+      if (!cons(z, pos))
+         return TRUE;
+      pos++;
+   }
+   return FALSE;
+}
+
+/* doublec(z, j) is TRUE <=> b[j-1] and b[j] are the same character and that
+   character is a consonant. Always FALSE for j < 1. */
+
+static int doublec(stemmer * z, int j)
+{
+   if (j < 1 || z->b[j] != z->b[j - 1])
+      return FALSE;
+
+   return cons(z, j);
+}
+
+/* cvc(z, i) is TRUE <=> b[i-2], b[i-1], b[i] have the form
+   consonant - vowel - consonant and the final consonant is not w, x or y.
+   It is used when deciding whether to restore an e at the end of a short
+   word, e.g.
+
+      cav(e), lov(e), hop(e), crim(e), but
+      snow, box, tray.
+
+   Always FALSE for i < 2.
+*/
+
+static int cvc(stemmer * z, int i)
+{
+   if (i < 2)
+      return FALSE;
+   if (!cons(z, i))
+      return FALSE;
+   if (cons(z, i - 1))
+      return FALSE;
+   if (!cons(z, i - 2))
+      return FALSE;
+
+   switch (z->b[i]) {
+   case 'w':
+   case 'x':
+   case 'y':
+      return FALSE;
+   default:
+      return TRUE;
+   }
+}
+
+/* ends(z, s) is TRUE <=> b[0],...,b[k] ends with the suffix encoded in s.
+
+   s is length-prefixed: s[0] holds the suffix length and the suffix
+   characters follow from s[1]. On a match, j is set to k - length (the
+   index just before the suffix); on failure j is left untouched.
+*/
+
+static int ends(stemmer * z, char * s)
+{
+   int len = s[0];
+   char *word = z->b;
+   int last = z->k;
+
+   /* cheap rejection: compare the final characters first */
+   if (word[last] != s[len])
+      return FALSE;
+   /* suffix longer than the word itself */
+   if (len > last + 1)
+      return FALSE;
+   if (memcmp(word + last - len + 1, s + 1, len) != 0)
+      return FALSE;
+
+   z->j = last - len;
+   return TRUE;
+}
+
+/* setto(z, s) overwrites b[j+1],... with the characters of the
+   length-prefixed string s (s[0] is the length) and moves k to the last
+   character written, i.e. k = j + length. */
+
+static void setto(stemmer * z, char * s)
+{
+   int len = s[0];
+   int start = z->j + 1;
+
+   memmove(z->b + start, s + 1, len);
+   z->k = z->j + len;
+}
+
+/* r(z, s) replaces the current suffix with s via setto(), but only when the
+   stem before it has measure m(z) > 0. Used by step2 and step3. */
+
+static void r(stemmer * z, char * s)
+{
+   if (m(z) <= 0)
+      return;
+
+   setto(z, s);
+}
+
+/* step1ab(z) gets rid of plurals and -ed or -ing. e.g.
+
+ caresses -> caress
+ ponies -> poni
+ ties -> ti
+ caress -> caress
+ cats -> cat
+
+ feed -> feed
+ agreed -> agree
+ disabled -> disable
+
+ matting -> mat
+ mating -> mate
+ meeting -> meet
+ milling -> mill
+ messing -> mess
+
+ meetings -> meet
+
+ Suffix literals are length-prefixed: the leading octal escape is the
+ number of characters that follow (see ends() and setto()).
+
+*/
+
+static void step1ab(stemmer * z)
+{
+ char * b;
+ b = z->b;
+
+ /* plurals: -sses -> -ss, -ies -> -i, and a lone final s (not -ss) is dropped */
+ if (b[z->k] == 's')
+ { if (ends(z, "\04" "sses")) z->k -= 2; else
+ if (ends(z, "\03" "ies")) setto(z, "\01" "i"); else
+ if (b[z->k - 1] != 's') {z->k--; }
+ }
+ /* -eed loses its final d only when the stem before it has m > 0 */
+ if (ends(z, "\03" "eed")) { if (m(z) > 0) z->k--; } else
+ /* -ed / -ing are removed only when the stem before them contains a vowel */
+ if ((ends(z, "\02" "ed") || ends(z, "\03" "ing")) && vowelinstem(z))
+ { z->k = z->j;
+ /* tidy up the shortened word: restore an e after -at, -bl, -iz */
+ if (ends(z, "\02" "at")) setto(z, "\03" "ate"); else
+ if (ends(z, "\02" "bl")) setto(z, "\03" "ble"); else
+ if (ends(z, "\02" "iz")) setto(z, "\03" "ize"); else
+ /* a doubled final consonant is reduced to one, except for ll, ss, zz */
+ if (doublec(z, z->k))
+ { z->k--;
+ { int ch;
+ ch = b[z->k];
+ if (ch == 'l' || ch == 's' || ch == 'z') z->k++;
+ }
+ }
+ /* short stem of the form cvc with m == 1: append an e */
+ else if (m(z) == 1 && cvc(z, z->k)) setto(z, "\01" "e");
+ }
+}
+
+/* step1c(z) rewrites a terminal y as i, provided the part of the word
+   before that y contains a vowel. */
+
+static void step1c(stemmer * z)
+{
+   if (!ends(z, "\01" "y"))
+      return;
+   if (!vowelinstem(z))
+      return;
+
+   z->b[z->k] = 'i';
+}
+
+
+/* step2(z) maps double suffices to single ones. so -ization ( = -ize plus
+ -ation) maps to -ize etc. note that the string before the suffix must give
+ m(z) > 0 (enforced by r()).
+
+ The switch dispatches on the penultimate character b[k-1] so that only the
+ suffixes that could possibly match are tested. Each suffix literal is
+ length-prefixed: the leading octal escape is the character count.
+
+ NOTE(review): b[k-1] is read unconditionally, so k must be >= 1 on entry;
+ with k == 0 this indexes before the start of the buffer.
+*/
+
+static void step2(stemmer * z) { switch (z->b[z->k-1])
+{
+ case 'a': if (ends(z, "\07" "ational")) { r(z, "\03" "ate"); break; }
+ if (ends(z, "\06" "tional")) { r(z, "\04" "tion"); break; }
+ break;
+ case 'c': if (ends(z, "\04" "enci")) { r(z, "\04" "ence"); break; }
+ if (ends(z, "\04" "anci")) { r(z, "\04" "ance"); break; }
+ break;
+ case 'e': if (ends(z, "\04" "izer")) { r(z, "\03" "ize"); break; }
+ break;
+ case 'l': if (ends(z, "\03" "bli")) { r(z, "\03" "ble"); break; } /*-DEPARTURE-*/
+
+ /* To match the published algorithm, replace this line with
+ case 'l': if (ends(z, "\04" "abli")) { r(z, "\04" "able"); break; } */
+
+ if (ends(z, "\04" "alli")) { r(z, "\02" "al"); break; }
+ if (ends(z, "\05" "entli")) { r(z, "\03" "ent"); break; }
+ if (ends(z, "\03" "eli")) { r(z, "\01" "e"); break; }
+ if (ends(z, "\05" "ousli")) { r(z, "\03" "ous"); break; }
+ break;
+ case 'o': if (ends(z, "\07" "ization")) { r(z, "\03" "ize"); break; }
+ if (ends(z, "\05" "ation")) { r(z, "\03" "ate"); break; }
+ if (ends(z, "\04" "ator")) { r(z, "\03" "ate"); break; }
+ break;
+ case 's': if (ends(z, "\05" "alism")) { r(z, "\02" "al"); break; }
+ if (ends(z, "\07" "iveness")) { r(z, "\03" "ive"); break; }
+ if (ends(z, "\07" "fulness")) { r(z, "\03" "ful"); break; }
+ if (ends(z, "\07" "ousness")) { r(z, "\03" "ous"); break; }
+ break;
+ case 't': if (ends(z, "\05" "aliti")) { r(z, "\02" "al"); break; }
+ if (ends(z, "\05" "iviti")) { r(z, "\03" "ive"); break; }
+ if (ends(z, "\06" "biliti")) { r(z, "\03" "ble"); break; }
+ break;
+ case 'g': if (ends(z, "\04" "logi")) { r(z, "\03" "log"); break; } /*-DEPARTURE-*/
+
+ /* To match the published algorithm, delete this line */
+
+} }
+
+/* step3(z) deals with -ic-, -full, -ness etc., using the same strategy as
+   step2: dispatch on a character near the end of the word (here the last
+   one, b[k]), then try the candidate suffixes in order. The first suffix
+   that matches is replaced through r(), i.e. only when m(z) > 0. An empty
+   replacement ("\00" "") simply removes the suffix. */
+
+static void step3(stemmer * z)
+{
+   switch (z->b[z->k]) {
+   case 'e':
+      if (ends(z, "\05" "icate"))
+         r(z, "\02" "ic");
+      else if (ends(z, "\05" "ative"))
+         r(z, "\00" "");
+      else if (ends(z, "\05" "alize"))
+         r(z, "\02" "al");
+      break;
+   case 'i':
+      if (ends(z, "\05" "iciti"))
+         r(z, "\02" "ic");
+      break;
+   case 'l':
+      if (ends(z, "\04" "ical"))
+         r(z, "\02" "ic");
+      else if (ends(z, "\03" "ful"))
+         r(z, "\00" "");
+      break;
+   case 's':
+      if (ends(z, "\04" "ness"))
+         r(z, "\00" "");
+      break;
+   }
+}
+
+/* step4(z) takes off -ant, -ence etc., in context <c>vcvc<v>.
+
+   The switch dispatches on the penultimate character b[k-1], so k must be
+   >= 1 on entry. Inside the switch, `break` means "a suffix matched and j
+   now marks the end of the candidate stem", while `return` means "nothing
+   matched, leave the word alone". A matched suffix is removed only when the
+   remaining stem has m(z) > 1.
+*/
+
+static void step4(stemmer * z)
+{ switch (z->b[z->k-1])
+ { case 'a': if (ends(z, "\02" "al")) break; return;
+ case 'c': if (ends(z, "\04" "ance")) break;
+ if (ends(z, "\04" "ence")) break; return;
+ case 'e': if (ends(z, "\02" "er")) break; return;
+ case 'i': if (ends(z, "\02" "ic")) break; return;
+ case 'l': if (ends(z, "\04" "able")) break;
+ if (ends(z, "\04" "ible")) break; return;
+ case 'n': if (ends(z, "\03" "ant")) break;
+ if (ends(z, "\05" "ement")) break;
+ if (ends(z, "\04" "ment")) break;
+ if (ends(z, "\03" "ent")) break; return;
+ /* -ion is removed only after s or t. When the whole word is "ion",
+ ends() leaves j == -1, so j must be checked before b[j] is read. */
+ case 'o': if (ends(z, "\03" "ion") && z->j >= 0 && (z->b[z->j] == 's' || z->b[z->j] == 't')) break;
+ if (ends(z, "\02" "ou")) break; return;
+ /* takes care of -ous */
+ case 's': if (ends(z, "\03" "ism")) break; return;
+ case 't': if (ends(z, "\03" "ate")) break;
+ if (ends(z, "\03" "iti")) break; return;
+ case 'u': if (ends(z, "\03" "ous")) break; return;
+ case 'v': if (ends(z, "\03" "ive")) break; return;
+ case 'z': if (ends(z, "\03" "ize")) break; return;
+ default: return;
+ }
+ if (m(z) > 1) z->k = z->j;
+}
+
+/* step5(z) removes a final -e when m(z) > 1, or when m(z) == 1 and the
+   characters before the e do not form consonant-vowel-consonant (see
+   cvc()). It then changes a final -ll to -l when m(z) > 1. The measure is
+   always taken over the word as it stood on entry, because j is set to k
+   once at the top and not updated afterwards. */
+
+static void step5(stemmer * z)
+{
+   z->j = z->k;
+
+   if (z->b[z->k] == 'e') {
+      int measure = m(z);
+
+      if (measure > 1)
+         z->k--;
+      else if (measure == 1 && !cvc(z, z->k - 1))
+         z->k--;
+   }
+
+   if (z->b[z->k] == 'l' && doublec(z, z->k) && m(z) > 1)
+      z->k--;
+}
+
+/* tracker_stem_eng(b, k) stems the English word held in b[0] ... b[k]
+   inclusive and returns the result as a newly allocated, NUL-terminated
+   string which the caller must release with g_free().
+
+   b itself is never modified: the steps work on a private copy of its
+   first k + 1 characters, so b[k+1] need not be '\0'. Stemming never
+   increases word length, so the result has at most k + 1 characters.
+
+   Returns NULL when b is NULL.
+*/
+
+char * tracker_stem_eng (char * b, int k)
+{
+
+ stemmer Stemmer;
+ char *result;
+
+ if (b == NULL) return NULL;
+
+ /* With this line, strings of length 1 or 2 don't go through the
+ stemming process, although no mention is made of this in the
+ published algorithm. Remove the line to match the published
+ algorithm. */
+
+ if (k <= 1) return g_strdup(b); /*-DEPARTURE-*/
+
+ /* copy exactly the k + 1 characters being stemmed; g_strndup adds the
+ terminating NUL, so b does not have to be NUL-terminated at k + 1 */
+ Stemmer.b = g_strndup (b, k + 1);
+ Stemmer.k = k; /* copy the parameters into z */
+ Stemmer.j = 0;
+
+ step1ab(&Stemmer);
+
+ /* step1ab can shrink the word to a single character (e.g. "ies" -> "i",
+ leaving k == 0). step2 and step4 read b[k-1], so the remaining steps
+ must only run while k > 0. */
+ if (Stemmer.k > 0)
+ { step1c(&Stemmer); step2(&Stemmer); step3(&Stemmer); step4(&Stemmer); step5(&Stemmer);
+ }
+
+ result = g_strndup (Stemmer.b, Stemmer.k+1);
+ g_free (Stemmer.b);
+
+ return result;
+}
diff --git a/src/trackerd/tracker-stemmer-english.h b/src/trackerd/tracker-stemmer-english.h
new file mode 100644
index 000000000..786163b63
--- /dev/null
+++ b/src/trackerd/tracker-stemmer-english.h
@@ -0,0 +1,25 @@
+/* Tracker
+ * Copyright (C) 2005, Mr Jamie McCracken (jamiemcc@gnome.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/* Include guard. The name is specific to this header and avoids the
+ * leading-underscore-plus-uppercase form, which C reserves for the
+ * implementation. */
+#ifndef TRACKER_STEMMER_ENGLISH_H
+#define TRACKER_STEMMER_ENGLISH_H
+
+/* Stems the English word held in b[0] ... b[k] inclusive (k is the index of
+ * the last character, not the length) using the Porter algorithm.
+ *
+ * Returns a newly allocated string; the caller owns it and must release it
+ * with g_free(). */
+char * tracker_stem_eng (char * b, int k);
+
+#endif /* TRACKER_STEMMER_ENGLISH_H */