summaryrefslogtreecommitdiff
path: root/contrib/fulltextindex/fti.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/fulltextindex/fti.c')
-rw-r--r--contrib/fulltextindex/fti.c381
1 files changed, 381 insertions, 0 deletions
diff --git a/contrib/fulltextindex/fti.c b/contrib/fulltextindex/fti.c
new file mode 100644
index 0000000000..1131b6078b
--- /dev/null
+++ b/contrib/fulltextindex/fti.c
@@ -0,0 +1,381 @@
+#include "executor/spi.h"
+#include "commands/trigger.h"
+#include "c.h" /* endof() macro */
+#include <ctype.h> /* tolower */
+#include <stdio.h> /* debugging */
+
+/*
+ * Trigger function takes 2 arguments:
+ 1. relation in which to store the substrings
+ 2. field to extract substrings from
+
+ The relation in which to insert *must* have the following layout:
+
+ string varchar(#)
+ id oid
+
+ Example:
+
+create function fti() returns opaque as
+'/home/boekhold/src/postgresql-6.2/contrib/fti/fti.so' language 'c';
+
+create table title_fti (string varchar(25), id oid);
+create index title_fti_idx on title_fti (string);
+
+create trigger title_fti_trigger after update or insert or delete on product
+for each row execute procedure fti(title_fti, title);
+ ^^^^^^^^^
+ where to store index in
+ ^^^^^
+ which column to index
+
+Of course, don't forget to create an index on title_fti, column string, else
+you won't notice much speedup :)
+
+After populating 'product', try something like:
+
+select p.* from product p, title_fti f1, title_fti f2 where
+ f1.string='slippery' and f2.string='wet' and f1.id=f2.id and p.oid=f1.id;
+*/
+
+/*
+ march 4 1998 Changed breakup() to return less substrings. Only breakup
+ in word parts which are in turn shortened from the start
+ of the word (ie. word, ord, rd)
+ Did allocation of substring buffer outside of breakup()
+ oct. 5 1997, fixed a bug in string breakup (where there is more than one
+ nonalpha character between words).
+
+ oct 4-5 1997 implemented the thing, at least the basic functionality
+ of it all....
+*/
+
+/* IMPROVEMENTS:
+
+ save a plan for deletes
+ create a function that will make the index *after* we have populated
+ the main table (probably first delete all contents to be sure there's
+ nothing in it, then re-populate the fti-table)
+
+ can we do something with operator overloading or a separate function
+ that can build the final query automagically?
+ */
+
+/* Trigger entry point; takes its context from CurrentTriggerData. */
+HeapTuple fti(void);
+
+/* Copies the next indexable substring of 'string' into 'substring'. */
+char *breakup(char *string, char *substring);
+
+/* Tells whether 'text' is listed in StopWords. */
+bool is_stopword(char *text);
+
+/*
+ * Set to true by fti() before it starts calling breakup() on a value;
+ * breakup() sets it back to false after positioning itself on that value.
+ */
+bool new_tuple = false;
+
+
+/*
+ * List of words to skip in indexing.
+ *
+ * THIS LIST MUST BE IN SORTED ORDER, A BINARY SEARCH IS USED!!!!
+ *
+ * Every entry needs its own comma: adjacent string literals are
+ * concatenated by the compiler, which would silently merge two words
+ * into a single (wrong) entry.
+ */
+char *StopWords[] = {
+    "no",
+    "the",
+    "yes",
+};
+
+/* stuff for caching query-plans, stolen from contrib/spi/\*.c */
+typedef struct
+{
+    char   *ident;      /* key the cache entry is looked up by */
+    int     nplans;     /* number of plans stored in splan; 0 = none yet */
+    void  **splan;      /* saved plan pointers */
+} EPlan;
+
+/* One cache (array + element count) for INSERT plans, one for DELETE plans. */
+static EPlan *InsertPlans = NULL;
+static int nInsertPlans = 0;
+static EPlan *DeletePlans = NULL;
+static int nDeletePlans = 0;
+
+static EPlan *find_plan(char *ident, EPlan **eplan, int *nplans);
+
+/***********************************************************************/
+/*
+ * fti - row-level AFTER trigger that keeps the substring table in sync.
+ *
+ * Trigger arguments (exactly two):
+ *   args[0]  name of the table that stores the substrings
+ *   args[1]  name of the column of the triggered relation to index
+ *
+ * DELETE: removes all rows of the substring table whose id equals the
+ *         oid of the affected tuple.
+ * INSERT: lower-cases the column value, splits it with breakup() and
+ *         inserts one (string, id) row per substring returned.
+ * UPDATE: does the DELETE step followed by the INSERT step, using the
+ *         new tuple.
+ *
+ * Prepared plans are cached per (table, column) pair through find_plan().
+ *
+ * Returns the tuple the trigger fired for (the new tuple on UPDATE).
+ * All failures are reported through elog(ERROR).
+ */
+HeapTuple
+fti()
+{
+    Trigger    *trigger;            /* to get trigger arguments */
+    int         nargs;              /* # of arguments */
+    char      **args;               /* arguments */
+    char       *relname;            /* triggered relation name */
+    Relation    rel;                /* triggered relation */
+    char       *indexname;          /* name of table for substrings */
+    HeapTuple   rettuple = NULL;
+    TupleDesc   tupdesc;            /* tuple description */
+    bool        isinsert = false;
+    bool        isdelete = false;
+    int         ret;
+    int         len;                /* result of snprintf() */
+    char        query[8192];
+    Oid         oid;
+
+    if (!CurrentTriggerData)
+        elog(ERROR, "Full Text Indexing: triggers are not initialized");
+    if (TRIGGER_FIRED_FOR_STATEMENT(CurrentTriggerData->tg_event))
+        elog(ERROR, "Full Text Indexing: can't process STATEMENT events");
+    if (TRIGGER_FIRED_BEFORE(CurrentTriggerData->tg_event))
+        elog(ERROR, "Full Text Indexing: must be fired AFTER event");
+
+    /* an UPDATE is handled as a delete followed by an insert */
+    if (TRIGGER_FIRED_BY_INSERT(CurrentTriggerData->tg_event))
+        isinsert = true;
+    if (TRIGGER_FIRED_BY_UPDATE(CurrentTriggerData->tg_event))
+    {
+        isdelete = true;
+        isinsert = true;
+    }
+    if (TRIGGER_FIRED_BY_DELETE(CurrentTriggerData->tg_event))
+        isdelete = true;
+
+    trigger = CurrentTriggerData->tg_trigger;
+    rel = CurrentTriggerData->tg_relation;
+    relname = SPI_getrelname(rel);
+    rettuple = CurrentTriggerData->tg_trigtuple;
+    if (isdelete && isinsert)       /* is an UPDATE */
+        rettuple = CurrentTriggerData->tg_newtuple;
+
+    CurrentTriggerData = NULL;      /* invalidate 'normal' calls to this
+                                     * function */
+
+    if ((ret = SPI_connect()) < 0)
+        elog(ERROR, "Full Text Indexing: SPI_connect failed, returned %d\n", ret);
+
+    nargs = trigger->tgnargs;
+    if (nargs != 2)
+        elog(ERROR, "Full Text Indexing: trigger can only have 2 arguments");
+
+    args = trigger->tgargs;
+    indexname = args[0];
+    tupdesc = rel->rd_att;          /* what the tuple looks like (?) */
+
+    /* get oid of current tuple, needed by all, so place here */
+    oid = rettuple->t_oid;
+    if (!OidIsValid(oid))
+        elog(ERROR, "Full Text Indexing: oid of current tuple is NULL");
+
+    if (isdelete)
+    {
+        void       *pplan;
+        Oid        *argtypes;
+        Datum       values[1];
+        EPlan      *plan;
+
+        /*
+         * Cache key for this (table, column) pair.  The trigger arguments
+         * are arbitrary strings, so every write into 'query' is bounded.
+         */
+        len = snprintf(query, sizeof(query), "D%s$%s", args[0], args[1]);
+        if (len < 0 || len >= (int) sizeof(query))
+            elog(ERROR, "Full Text Indexing: trigger arguments are too long");
+
+        plan = find_plan(query, &DeletePlans, &nDeletePlans);
+        if (plan->nplans <= 0)
+        {
+            /* no plan yet: prepare and save one */
+            argtypes = (Oid *) palloc(sizeof(Oid));
+            argtypes[0] = OIDOID;
+
+            /* NOTE: the table name is pasted into the query unquoted */
+            len = snprintf(query, sizeof(query),
+                           "DELETE FROM %s WHERE id = $1", indexname);
+            if (len < 0 || len >= (int) sizeof(query))
+                elog(ERROR, "Full Text Indexing: trigger arguments are too long");
+
+            pplan = SPI_prepare(query, 1, argtypes);
+            if (!pplan)
+                elog(ERROR, "Full Text Indexing: SPI_prepare returned NULL "
+                     "in delete");
+            pplan = SPI_saveplan(pplan);
+            if (pplan == NULL)
+                elog(ERROR, "Full Text Indexing: SPI_saveplan returned NULL "
+                     "in delete");
+
+            plan->splan = (void **) malloc(sizeof(void *));
+            if (plan->splan == NULL)
+                elog(ERROR, "Full Text Indexing: out of memory");
+            *(plan->splan) = pplan;
+            plan->nplans = 1;
+        }
+
+        values[0] = oid;
+
+        ret = SPI_execp(*(plan->splan), values, NULL, 0);
+        if (ret != SPI_OK_DELETE)
+            elog(ERROR, "Full Text Indexing: error executing plan in delete");
+    }
+
+    if (isinsert)
+    {
+        char       *substring;
+        char       *column;
+        void       *pplan;
+        Oid        *argtypes;
+        Datum       values[2];
+        int         colnum;
+        struct varlena *data;
+        EPlan      *plan;
+
+        len = snprintf(query, sizeof(query), "I%s$%s", args[0], args[1]);
+        if (len < 0 || len >= (int) sizeof(query))
+            elog(ERROR, "Full Text Indexing: trigger arguments are too long");
+
+        plan = find_plan(query, &InsertPlans, &nInsertPlans);
+
+        /* no plan yet, so allocate mem for argtypes */
+        if (plan->nplans <= 0)
+        {
+            argtypes = (Oid *) palloc(2 * sizeof(Oid));
+
+            argtypes[0] = VARCHAROID;   /* create table t_name
+                                         * (string varchar, */
+            argtypes[1] = OIDOID;       /* id oid); */
+
+            /* prepare plan to gain speed */
+            /* NOTE: the table name is pasted into the query unquoted */
+            len = snprintf(query, sizeof(query),
+                           "INSERT INTO %s (string, id) VALUES ($1, $2)",
+                           indexname);
+            if (len < 0 || len >= (int) sizeof(query))
+                elog(ERROR, "Full Text Indexing: trigger arguments are too long");
+
+            pplan = SPI_prepare(query, 2, argtypes);
+            if (!pplan)
+                elog(ERROR, "Full Text Indexing: SPI_prepare returned NULL "
+                     "in insert");
+
+            pplan = SPI_saveplan(pplan);
+            if (pplan == NULL)
+                elog(ERROR, "Full Text Indexing: SPI_saveplan returned NULL"
+                     " in insert");
+
+            plan->splan = (void **) malloc(sizeof(void *));
+            if (plan->splan == NULL)
+                elog(ERROR, "Full Text Indexing: out of memory");
+            *(plan->splan) = pplan;
+            plan->nplans = 1;
+        }
+
+        /*
+         * Locate the column to index.  It is looked up in the triggered
+         * relation, so that is the relation named in the error message.
+         */
+        colnum = SPI_fnumber(tupdesc, args[1]);
+        if (colnum == SPI_ERROR_NOATTRIBUTE)
+            elog(ERROR, "Full Text Indexing: column '%s' of '%s' not found",
+                 args[1], relname);
+
+        /* Get the char* representation of the column with name args[1] */
+        column = SPI_getvalue(rettuple, tupdesc, colnum);
+
+        if (column)                 /* make sure we don't try to index NULL's */
+        {
+            char       *buff;
+            char       *string = column;
+
+            /* <ctype.h> functions need an unsigned char value */
+            while (*string != '\0')
+            {
+                *string = tolower((unsigned char) *string);
+                string++;
+            }
+
+            /*
+             * Allocate once, sized for the longest possible substring;
+             * saves lots of calls in while-loop and in breakup()
+             */
+            data = (struct varlena *) palloc(sizeof(int32) + strlen(column) + 1);
+            buff = palloc(strlen(column) + 1);
+
+            new_tuple = true;       /* make breakup() start on this value */
+            while ((substring = breakup(column, buff)))
+            {
+                int         l;
+
+                l = strlen(substring);
+
+                data->vl_len = l + sizeof(int32);
+                memcpy(VARDATA(data), substring, l);
+                values[0] = PointerGetDatum(data);
+                values[1] = oid;
+
+                ret = SPI_execp(*(plan->splan), values, NULL, 0);
+                if (ret != SPI_OK_INSERT)
+                    elog(ERROR, "Full Text Indexing: error executing plan "
+                         "in insert");
+            }
+            pfree(buff);
+            pfree(data);
+        }
+    }
+
+    SPI_finish();
+    return (rettuple);
+}
+
+/*
+ * breakup - hand out the indexable substrings of 'string', one per call.
+ *
+ * The scan runs from the end of 'string' towards its start.  For every
+ * run of alphanumeric characters it yields the suffixes of that run that
+ * are at least two characters long, shortest first (for "word": "rd",
+ * "ord", "word").  Substrings found in StopWords are skipped.
+ *
+ * string     the value to split; must be the same pointer on every call
+ *            belonging to one scan
+ * substring  caller-supplied buffer of at least strlen(string) + 1 bytes
+ *            that receives the result
+ *
+ * The scan position lives in static variables.  The caller must set the
+ * global 'new_tuple' to true before the first call for a new value; it
+ * is reset to false here.
+ *
+ * Returns 'substring', or NULL when 'string' is exhausted.
+ */
+char *breakup(char *string, char *substring)
+{
+    static char *last_start;        /* last character of the current word */
+    static char *cur_pos;           /* first character of the next substring */
+
+    if (new_tuple)
+    {
+        size_t      len = strlen(string);
+
+        /* an empty string has no last character; stay on 'string' itself */
+        cur_pos = last_start = (len > 0) ? &string[len - 1] : string;
+        new_tuple = false;          /* don't initialize this next time */
+    }
+
+    while (cur_pos > string)        /* don't read before start of 'string' */
+    {
+        /*
+         * skip pieces at the end of a string that are not alfa-numeric
+         * (ie. 'string$%^&', last_start first points to '&', and after
+         * this to 'g'
+         */
+        if (!isalnum((unsigned char) *last_start))
+        {
+            while (!isalnum((unsigned char) *last_start) &&
+                   last_start > string)
+                last_start--;
+            cur_pos = last_start;
+
+            /*
+             * The skip may have run all the way to the first character;
+             * stepping back once more would read before 'string'.
+             */
+            if (cur_pos <= string)
+                break;
+        }
+
+        cur_pos--;                  /* substrings are at minimum 2 characters
+                                     * long */
+
+        if (isalnum((unsigned char) *cur_pos))
+        {
+            /* Houston, we have a substring! :) */
+            memcpy(substring, cur_pos, last_start - cur_pos + 1);
+            substring[last_start - cur_pos + 1] = '\0';
+            if (!is_stopword(substring))
+                return substring;
+        }
+        else
+        {
+            /* word boundary: continue with the character before it */
+            if (cur_pos == string)
+                break;              /* nothing left in front of it */
+            last_start = cur_pos - 1;
+            cur_pos = last_start;
+        }
+    }
+
+    return NULL;                    /* we've processed all of 'string' */
+}
+
+/* copied from src/backend/parser/keywords.c and adjusted for our situation*/
+/*
+ * is_stopword - binary search for 'text' in the sorted StopWords list.
+ *
+ * Returns true when 'text' equals one of the entries, false otherwise.
+ */
+bool
+is_stopword(char *text)
+{
+    char      **StopLow;            /* first candidate */
+    char      **StopHigh;           /* one past the last candidate */
+    char      **StopMiddle;
+    int         difference;         /* signed: strcmp() may return < 0 */
+
+    /*
+     * Search the half-open range [StopLow, StopHigh).  endof() points one
+     * element past the array, so it is never dereferenced here.
+     */
+    StopLow = &StopWords[0];
+    StopHigh = endof(StopWords);
+
+    while (StopLow < StopHigh)
+    {
+        StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+        difference = strcmp(*StopMiddle, text);
+        if (difference == 0)
+            return (true);
+        else if (difference < 0)
+            StopLow = StopMiddle + 1;
+        else
+            StopHigh = StopMiddle;
+    }
+
+    return (false);
+}
+
+/* for caching of query plans, stolen from contrib/spi/\*.c */
+/*
+ * find_plan - look up the cache entry named 'ident', creating it if needed.
+ *
+ * ident   key of the entry
+ * eplan   address of the cache array; updated when the array is
+ *         (re)allocated
+ * nplans  address of the number of entries in *eplan; incremented when
+ *         an entry is added
+ *
+ * Returns the matching entry.  A newly created entry holds a private
+ * copy of 'ident', nplans == 0 and splan == NULL.  Running out of memory
+ * is reported through elog(ERROR); the existing cache is left intact in
+ * that case.
+ */
+static EPlan *
+find_plan(char *ident, EPlan ** eplan, int *nplans)
+{
+    EPlan      *newp;
+    EPlan      *grown;
+    int         i;
+
+    if (*nplans > 0)
+    {
+        for (i = 0; i < *nplans; i++)
+        {
+            if (strcmp((*eplan)[i].ident, ident) == 0)
+                return (*eplan + i);
+        }
+
+        /* not found: make room for one more entry */
+        grown = (EPlan *) realloc(*eplan, (*nplans + 1) * sizeof(EPlan));
+    }
+    else
+    {
+        grown = (EPlan *) malloc(sizeof(EPlan));
+        (*nplans) = 0;
+    }
+
+    /* only replace *eplan once the allocation is known to have worked */
+    if (grown == NULL)
+        elog(ERROR, "Full Text Indexing: out of memory");
+    *eplan = grown;
+    newp = grown + *nplans;
+
+    newp->ident = (char *) malloc(strlen(ident) + 1);
+    if (newp->ident == NULL)
+        elog(ERROR, "Full Text Indexing: out of memory");
+    strcpy(newp->ident, ident);
+    newp->nplans = 0;
+    newp->splan = NULL;
+    (*nplans)++;
+
+    return (newp);
+}