diff options
Diffstat (limited to 'contrib/fulltextindex/fti.c')
-rw-r--r-- | contrib/fulltextindex/fti.c | 381 |
1 files changed, 381 insertions, 0 deletions
diff --git a/contrib/fulltextindex/fti.c b/contrib/fulltextindex/fti.c new file mode 100644 index 0000000000..1131b6078b --- /dev/null +++ b/contrib/fulltextindex/fti.c @@ -0,0 +1,381 @@ +#include "executor/spi.h" +#include "commands/trigger.h" +#include "c.h" /* endof() macro */ +#include <ctype.h> /* tolower */ +#include <stdio.h> /* debugging */ + +/* + * Trigger function takes 2 arguments: + 1. relation in which to store the substrings + 2. field to extract substrings from + + The relation in which to insert *must* have the following layout: + + string varchar(#) + id oid + + Example: + +create function fti() returns opaque as +'/home/boekhold/src/postgresql-6.2/contrib/fti/fti.so' language 'c'; + +create table title_fti (string varchar(25), id oid); +create index title_fti_idx on title_fti (string); + +create trigger title_fti_trigger after update or insert or delete on product +for each row execute procedure fti(title_fti, title); + ^^^^^^^^^ + where to store index in + ^^^^^ + which column to index + +ofcourse don't forget to create an index on title_idx, column string, else +you won't notice much speedup :) + +After populating 'product', try something like: + +select p.* from product p, title_fti f1, title_fti f2 where + f1.string='slippery' and f2.string='wet' and f1.id=f2.id and p.oid=f1.id; +*/ + +/* + march 4 1998 Changed breakup() to return less substrings. Only breakup + in word parts which are in turn shortened from the start + of the word (ie. word, ord, rd) + Did allocation of substring buffer outside of breakup() + oct. 5 1997, fixed a bug in string breakup (where there are more nonalpha + characters between words then 1). + + oct 4-5 1997 implemented the thing, at least the basic functionallity + of it all.... +*/ + +/* IMPROVEMENTS: + + save a plan for deletes + create a function that will make the index *after* we have populated + the main table (probably first delete all contents to be sure there's + nothing in it, then re-populate the fti-table) + + can we do something with operator overloading or a seperate function + that can build the final query automatigally? + */ + +HeapTuple fti(void); +char *breakup(char*, char*); +bool is_stopword(char*); + +bool new_tuple = false; + + +/* THIS LIST MUST BE IN SORTED ORDER, A BINARY SEARCH IS USED!!!! */ +char *StopWords[] = { /* list of words to skip in indexing */ + "no" + "the", + "yes", +}; + +/* stuff for caching query-plans, stolen from contrib/spi/\*.c */ +typedef struct +{ + char *ident; + int nplans; + void **splan; +} EPlan; + +static EPlan *InsertPlans = NULL; +static EPlan *DeletePlans = NULL; +static int nInsertPlans = 0; +static int nDeletePlans = 0; + +static EPlan *find_plan(char *ident, EPlan ** eplan, int *nplans); + +/***********************************************************************/ +HeapTuple +fti() +{ + Trigger *trigger; /* to get trigger name */ + int nargs; /* # of arguments */ + char **args; /* arguments */ + char *relname; /* triggered relation name */ + Relation rel; /* triggered relation */ + char *indexname; /* name of table for substrings */ + HeapTuple rettuple = NULL; + TupleDesc tupdesc; /* tuple description */ + bool isinsert=false; + bool isdelete=false; + int ret; + char query[8192]; + Oid oid; + /* + FILE *debug; + */ + + /* + debug = fopen("/dev/xconsole", "w"); + fprintf(debug, "FTI: entered function\n"); + fflush(debug); + */ + + if (!CurrentTriggerData) + elog(ERROR, "Full Text Indexing: triggers are not initialized"); + if (TRIGGER_FIRED_FOR_STATEMENT(CurrentTriggerData->tg_event)) + elog(ERROR, "Full Text Indexing: can't process STATEMENT events"); + if (TRIGGER_FIRED_BEFORE(CurrentTriggerData->tg_event)) + elog(ERROR, "Full Text Indexing: must be fired AFTER event"); + + if (TRIGGER_FIRED_BY_INSERT(CurrentTriggerData->tg_event)) + isinsert=true; + if (TRIGGER_FIRED_BY_UPDATE(CurrentTriggerData->tg_event)) + { isdelete=true;isinsert=true;} + if (TRIGGER_FIRED_BY_DELETE(CurrentTriggerData->tg_event)) + isdelete=true; + + trigger = CurrentTriggerData->tg_trigger; + rel = CurrentTriggerData->tg_relation; + relname = SPI_getrelname(rel); + rettuple=CurrentTriggerData->tg_trigtuple; + if (isdelete&&isinsert) /* is an UPDATE */ + rettuple=CurrentTriggerData->tg_newtuple; + + CurrentTriggerData = NULL; /* invalidate 'normal' calls to this function */ + + if ((ret = SPI_connect()) <0) + elog(ERROR,"Full Text Indexing: SPI_connect failed, returned %d\n",ret); + + nargs = trigger->tgnargs; + if (nargs != 2) + elog(ERROR, "Full Text Indexing: trigger can only have 2 arguments"); + + args = trigger->tgargs; + indexname = args[0]; + tupdesc = rel->rd_att; /* what the tuple looks like (?) */ + + /* get oid of current tuple, needed by all, so place here */ + oid = rettuple->t_oid; + if (!OidIsValid(oid)) + elog(ERROR,"Full Text Indexing: oid of current tuple is NULL"); + + if (isdelete) { + void *pplan; + Oid *argtypes; + Datum values[1]; + EPlan *plan; + + sprintf(query, "D%s$%s", args[0], args[1]); + plan = find_plan(query, &DeletePlans, &nDeletePlans); + if (plan->nplans <= 0) { + argtypes = (Oid *)palloc(sizeof(Oid)); + + argtypes[0] = OIDOID; + + sprintf(query, "DELETE FROM %s WHERE id = $1", indexname); + pplan = SPI_prepare(query, 1, argtypes); + if (!pplan) + elog(ERROR, "Full Text Indexing: SPI_prepare returned NULL " + "in delete"); + pplan = SPI_saveplan(pplan); + if (pplan == NULL) + elog(ERROR, "Full Text Indexing: SPI_saveplan returned NULL " + "in delete"); + + plan->splan = (void **)malloc(sizeof(void*)); + *(plan->splan) = pplan; + plan->nplans = 1; + } + + values[0] = oid; + + ret = SPI_execp(*(plan->splan), values, NULL, 0); + if (ret != SPI_OK_DELETE) + elog(ERROR, "Full Text Indexing: error executing plan in delete"); + } + + if (isinsert) { + char *substring, *column; + void *pplan; + Oid *argtypes; + Datum values[2]; + int colnum; + struct varlena *data; + EPlan *plan; + + sprintf(query, "I%s$%s", args[0], args[1]); + plan = find_plan(query, &InsertPlans, &nInsertPlans); + + /* no plan yet, so allocate mem for argtypes */ + if (plan->nplans <= 0) { + argtypes = (Oid *)palloc(2*sizeof(Oid)); + + argtypes[0] = VARCHAROID; /*create table t_name + (string varchar, */ + argtypes[1] = OIDOID; /* id oid); */ + + /* prepare plan to gain speed */ + sprintf(query, "INSERT INTO %s (string, id) VALUES ($1, $2)", + indexname); + pplan = SPI_prepare(query, 2, argtypes); + if (!pplan) + elog(ERROR, "Full Text Indexing: SPI_prepare returned NULL " + "in insert"); + + pplan = SPI_saveplan(pplan); + if (pplan == NULL) + elog(ERROR, "Full Text Indexing: SPI_saveplan returned NULL" + " in insert"); + + plan->splan = (void **)malloc(sizeof(void*)); + *(plan->splan) = pplan; + plan->nplans = 1; + } + + + /* prepare plan for query */ + colnum=SPI_fnumber(tupdesc, args[1]); + if (colnum == SPI_ERROR_NOATTRIBUTE) + elog(ERROR, "Full Text Indexing: column '%s' of '%s' not found", + args[1], args[0]); + + /* Get the char* representation of the column with name args[1] */ + column = SPI_getvalue(rettuple, tupdesc, colnum); + + if (column) { /* make sure we don't try to index NULL's */ + char *buff; + char *string = column; + + while(*string != '\0') { /* placed 'really' inline. */ + *string = tolower(*string); /* some compilers will choke */ + string++; /* on 'inline' keyword */ + } + + data = (struct varlena*)palloc(sizeof(int32)+strlen(column)+1); + buff = palloc(strlen(column) + 1); + /* saves lots of calls in while-loop and in breakup()*/ + + new_tuple=true; + while ((substring = breakup(column, buff))) { + int l; + + l = strlen(substring); + + data->vl_len = l+sizeof(int32); + memcpy(VARDATA(data), substring, l); + values[0] = PointerGetDatum(data); + values[1] = oid; + + ret = SPI_execp(*(plan->splan), values, NULL, 0); + if (ret != SPI_OK_INSERT) + elog(ERROR, "Full Text Indexing: error executing plan " + "in insert"); + } + pfree(buff); + pfree(data); + } + } + + SPI_finish(); + return (rettuple); +} + +char *breakup(char *string, char *substring) +{ + static char *last_start; + static char *cur_pos; + + if (new_tuple) + { + cur_pos=last_start=&string[strlen(string)-1]; + new_tuple=false; /* don't initialize this next time */ + } + + while (cur_pos > string) /* don't read before start of 'string' */ + { + /* skip pieces at the end of a string that are not + alfa-numeric (ie. 'string$%^&', last_start first points to + '&', and after this to 'g' */ + if (!isalnum((int)*last_start)) { + while (!isalnum((int)*last_start) && + last_start > string) + last_start--; + cur_pos=last_start; + } + + cur_pos--; /* substrings are at minimum 2 characters long */ + + if (isalnum((int)*cur_pos)) + { + /* Houston, we have a substring! :) */ + memcpy(substring, cur_pos, last_start - cur_pos + 1); + substring[last_start-cur_pos+1]='\0'; + if (!is_stopword(substring)) return substring; + } + else + { + last_start=cur_pos-1; + cur_pos = last_start; + } + } + + return NULL; /* we've processed all of 'string' */ +} + +/* copied from src/backend/parser/keywords.c and adjusted for our situation*/ +bool +is_stopword(char *text) +{ + char **StopLow; /* for list of stop-words */ + char **StopHigh; + char **StopMiddle; + unsigned int difference; + + StopLow = &StopWords[0]; /* initialize stuff for binary search */ + StopHigh = endof(StopWords); + + while (StopLow <= StopHigh) + { + StopMiddle = StopLow + (StopHigh - StopLow) / 2; + difference = strcmp(*StopMiddle, text); + if (difference == 0) + return (true); + else if (difference < 0) + StopLow = StopMiddle + 1; + else + StopHigh = StopMiddle - 1; + } + + return (false); +} + +/* for caching of query plans, stolen from contrib/spi/\*.c */ +static EPlan * +find_plan(char *ident, EPlan ** eplan, int *nplans) +{ + EPlan *newp; + int i; + + if (*nplans > 0) + { + for (i = 0; i < *nplans; i++) + { + if (strcmp((*eplan)[i].ident, ident) == 0) + break; + } + if (i != *nplans) + return (*eplan + i); + *eplan = (EPlan *) realloc(*eplan, (i + 1) * sizeof(EPlan)); + newp = *eplan + i; + } + else + { + newp = *eplan = (EPlan *) malloc(sizeof(EPlan)); + (*nplans) = i = 0; + } + + newp->ident = (char *) malloc(strlen(ident) + 1); + strcpy(newp->ident, ident); + newp->nplans = 0; + newp->splan = NULL; + (*nplans)++; + + return (newp); +} |