/* -*- Mode: c; c-basic-offset: 2 -*- * * raptor_set.c - Sets for checking IDs (based on Redland memory hash) * * $Id$ * * Copyright (C) 2003-2004, David Beckett http://purl.org/net/dajobe/ * Institute for Learning and Research Technology http://www.ilrt.bristol.ac.uk/ * University of Bristol, UK http://www.bristol.ac.uk/ * * This package is Free Software and part of Redland http://librdf.org/ * * It is licensed under the following three licenses as alternatives: * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version * 2. GNU General Public License (GPL) V2 or any newer version * 3. Apache License, V2.0 or any newer version * * You may not use this file except in compliance with at least one of * the above three licenses. * * See LICENSE.html or LICENSE.txt at the top of this package for the * complete terms and further detail along with the license texts for * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. * * * */ #ifdef HAVE_CONFIG_H #include #endif #ifdef WIN32 #include #endif #include #include #include #include #ifdef HAVE_STDLIB_H #include /* for abort() as used in errors */ #endif /* Raptor includes */ #include "raptor.h" #include "raptor_internal.h" #ifndef STANDALONE /* * The only methods needed here are: * Create Set * Destroy Set * Check a (base, ID) pair present add it if not, return if added/not * */ struct raptor_id_set_node_s { struct raptor_id_set_node_s* next; char *item; size_t item_len; unsigned long hash; }; typedef struct raptor_id_set_node_s raptor_id_set_node; struct raptor_base_id_set_s { /* The base URI of this set of IDs */ raptor_uri *uri; /* neighbour ID sets */ struct raptor_base_id_set_s* prev; struct raptor_base_id_set_s* next; /* An array pointing to a list of nodes (buckets) */ raptor_id_set_node** nodes; /* this many buckets used (out of capacity) */ int size; /* this many items */ int items; /* size of the buckets array 'nodes' above */ int capacity; /* array load factor expressed out of 1000. * Always true: (size/capacity * 1000) < load_factor, * or in the code: size * 1000 < load_factor * capacity */ int load_factor; }; typedef struct raptor_base_id_set_s raptor_base_id_set; struct raptor_id_set_s { /* start of sets, 1 per base URI */ struct raptor_base_id_set_s* first; #ifdef RAPTOR_DEBUG int hits; int misses; #endif }; /* default load_factor out of 1000 */ static const int raptor_id_set_default_load_factor=750; /* starting capacity - MUST BE POWER OF 2 */ static const int raptor_id_set_initial_capacity=8; /* prototypes for local functions */ static raptor_id_set_node* raptor_base_id_set_find_node(raptor_base_id_set* set, unsigned char *item, size_t item_len, unsigned long hash); static void raptor_free_id_set_node(raptor_id_set_node* node); static int raptor_base_id_set_expand_size(raptor_base_id_set* set); /* * perldelta 5.8.0 says under *Performance Enhancements* * * Hashes now use Bob Jenkins "One-at-a-Time" hashing key algorithm * http://burtleburtle.net/bob/hash/doobs.html This algorithm is * reasonably fast while producing a much better spread of values * than the old hashing algorithm ... * * Changed here to set the string backwards to help do URIs better * */ #define ONE_AT_A_TIME_SET(set,str,len) \ do { \ register const unsigned char *c_oneat = str+len-1; \ register int i_oneat = len; \ register unsigned long set_oneat = 0; \ while (i_oneat--) { \ set_oneat += *c_oneat--; \ set_oneat += (set_oneat << 10); \ set_oneat ^= (set_oneat >> 6); \ } \ set_oneat += (set_oneat << 3); \ set_oneat ^= (set_oneat >> 11); \ (set) = (set_oneat + (set_oneat << 15)); \ } while(0) /* helper functions */ /** * raptor_id_set_find_node - Find the node for the given item with optional hash * @set: the memory set context * @item: item string * @item_len: item string length * @hash: hash value or 0 if it should be computed here * * Return value: &raptor_id_set_node of content or NULL on failure **/ static raptor_id_set_node* raptor_base_id_set_find_node(raptor_base_id_set* base, unsigned char *item, size_t item_len, unsigned long hash) { raptor_id_set_node* node; int bucket; /* empty set */ if(!base->capacity) return NULL; if(!hash) ONE_AT_A_TIME_SET(hash, (unsigned char*)item, item_len); /* find slot in table */ bucket=hash & (base->capacity - 1); /* check if there is a list present */ node=base->nodes[bucket]; if(!node) /* no list there */ return NULL; /* walk the list */ while(node) { if(item_len == node->item_len && !memcmp(item, node->item, item_len)) break; node=node->next; } return node; } static void raptor_free_id_set_node(raptor_id_set_node* node) { if(node->item) RAPTOR_FREE(cstring, node->item); RAPTOR_FREE(raptor_id_set_node, node); } static int raptor_base_id_set_expand_size(raptor_base_id_set* base) { int required_capacity=0; raptor_id_set_node **new_nodes; int i; if (base->capacity) { /* big enough */ if((1000 * base->items) < (base->load_factor * base->capacity)) return 0; /* grow set (keeping it a power of two) */ required_capacity=base->capacity << 1; } else { required_capacity=raptor_id_set_initial_capacity; } /* allocate new table */ new_nodes=(raptor_id_set_node**)RAPTOR_CALLOC(raptor_id_set_nodes, required_capacity, sizeof(raptor_id_set_node*)); if(!new_nodes) return 1; /* it is a new set empty set - we are done */ if(!base->size) { base->capacity=required_capacity; base->nodes=new_nodes; return 0; } for(i=0; icapacity; i++) { raptor_id_set_node *node=base->nodes[i]; /* walk all attached nodes */ while(node) { raptor_id_set_node *next; int bucket; next=node->next; /* find slot in new table */ bucket=node->hash & (required_capacity - 1); node->next=new_nodes[bucket]; new_nodes[bucket]=node; node=next; } } /* now free old table */ RAPTOR_FREE(raptor_id_set_nodes, base->nodes); /* attach new one */ base->capacity=required_capacity; base->nodes=new_nodes; return 0; } /* functions implementing the ID set api */ /** * raptor_new_id_set - Create a new ID set * * Return value: non 0 on failure **/ raptor_id_set* raptor_new_id_set(void) { raptor_id_set* set=(raptor_id_set*)RAPTOR_CALLOC(raptor_id_set, 1, sizeof(raptor_id_set)); if(!set) return NULL; return set; } /** * raptor_free_base_id_set - Destroy raptor_base_id_set * @set: &raptor_id_set **/ static void raptor_free_base_id_set(raptor_base_id_set *base) { if(base->nodes) { int i; for(i=0; icapacity; i++) { raptor_id_set_node *node=base->nodes[i]; /* this entry is used */ if(node) { raptor_id_set_node *next; /* free all attached nodes */ while(node) { next=node->next; raptor_free_id_set_node(node); node=next; } } } RAPTOR_FREE(raptor_id_set_nodes, base->nodes); } if(base->uri) raptor_free_uri(base->uri); RAPTOR_FREE(raptor_base_id_set, base); return; } /** * raptor_free_id_set - Destroy a set * @set: &raptor_id_set * **/ void raptor_free_id_set(raptor_id_set *set) { raptor_base_id_set *base=set->first; while(base) { raptor_base_id_set *next=base->next; raptor_free_base_id_set(base); base=next; } RAPTOR_FREE(raptor_id_set, set); } /** * raptor_id_set_add: - Add an item to the set * @set: &raptor_id_set * @base_uri: base &raptor_uri of identifier * @id: identifier name * @id_len: length of identifier * * Return value: <0 on failure, 0 on success, 1 if already present **/ int raptor_id_set_add(raptor_id_set* set, raptor_uri *base_uri, const unsigned char *id, size_t id_len) { raptor_base_id_set *base; unsigned char *base_uri_string; size_t base_uri_string_len; size_t item_len; unsigned char *item; raptor_id_set_node *node; unsigned long hash; char *new_item=NULL; int bucket= (-1); if(!base_uri || !id || !id_len) return -1; base=set->first; while(base) { if(raptor_uri_equals(base->uri, base_uri)) break; base=base->next; } if(!base) { /* a set for this base_uri not found */ base=(raptor_base_id_set*)RAPTOR_CALLOC(raptor_base_id_set, 1, sizeof(raptor_base_id_set)); if(!base) return -1; base->load_factor=raptor_id_set_default_load_factor; if(raptor_base_id_set_expand_size(base)) { RAPTOR_FREE(raptor_base_id_set, base); return -1; } base->uri=raptor_uri_copy(base_uri); /* Add to the start of the list */ if(set->first) set->first->prev=base; /* base->prev=NULL; */ base->next=set->first; set->first=base; } else { /* If not at the start of the list, move there */ if(base != set->first) { /* remove from the list */ base->prev->next=base->next; if(base->next) base->next->prev=base->prev; /* add at the start of the list */ set->first->prev=base; base->prev=NULL; base->next=set->first; } } /* ensure there is enough space in the set */ if (raptor_base_id_set_expand_size(base)) return -1; /* Storing ID + ' ' + base-uri-string + '\0' */ base_uri_string=raptor_uri_as_counted_string(base_uri, &base_uri_string_len); item_len=id_len+1+strlen((const char*)base_uri_string)+1; item=(unsigned char*)RAPTOR_MALLOC(cstring, item_len); if(!item) return 1; strcpy((char*)item, (const char*)id); item[id_len]=' '; strcpy((char*)item+id_len+1, (const char*)base_uri_string); /* strcpy for end '\0' */ ONE_AT_A_TIME_SET(hash, (unsigned char*)item, item_len); /* find node for item */ node=raptor_base_id_set_find_node(base, item, item_len, hash); /* if already there, error */ if(node) { #ifdef RAPTOR_DEBUG set->misses++; #endif return 1; } #ifdef RAPTOR_DEBUG set->hits++; #endif /* always a new node */ bucket=hash & (base->capacity - 1); /* allocate new node */ node=(raptor_id_set_node*)RAPTOR_CALLOC(raptor_id_set_node, 1, sizeof(raptor_id_set_node)); if(!node) return 1; node->hash=hash; /* allocate item for new node */ new_item=(char*)RAPTOR_MALLOC(cstring, item_len); if(!new_item) { RAPTOR_FREE(raptor_id_set_node, node); return -1; } /* copy new item */ memcpy(new_item, item, item_len); node->item=new_item; node->item_len=item_len; node->next=base->nodes[bucket]; base->nodes[bucket]=node; base->items++; /* Only increase bucket count use when previous value was NULL */ if(!node->next) base->size++; RAPTOR_FREE(cstring, item); return 0; } #ifdef RAPTOR_DEBUG void raptor_id_set_stats_print(raptor_id_set* set, FILE *stream) { fprintf(stream, "set hits: %d misses: %d\n", set->hits, set->misses); } #endif #endif #ifdef STANDALONE /* one more prototype */ int main(int argc, char *argv[]); int main(int argc, char *argv[]) { const char *program=raptor_basename(argv[0]); char *items[8] = { "ron", "amy", "jen", "bij", "jib", "daj", "jim", NULL }; raptor_id_set *set; raptor_uri *base_uri; int i=0; raptor_init(); base_uri=raptor_new_uri((const unsigned char*)"http://example.org/base#"); fprintf(stderr, "%s: Creating set\n", program); set=raptor_new_id_set(); if(!set) { fprintf(stderr, "%s: Failed to create set\n", program); exit(1); } for(i=0; items[i]; i++) { size_t len=strlen(items[i]); int rc; fprintf(stderr, "%s: Adding set item '%s'\n", program, items[i]); rc=raptor_id_set_add(set, base_uri, (const unsigned char*)items[i], len); if(rc) { fprintf(stderr, "%s: Adding set item %d '%s' failed, returning error %d\n", program, i, items[i], rc); exit(1); } } for(i=0; items[i]; i++) { size_t len=strlen(items[i]); int rc; fprintf(stderr, "%s: Adding duplicate set item '%s'\n", program, items[i]); rc=raptor_id_set_add(set, base_uri, (const unsigned char*)items[i], len); if(rc <= 0) { fprintf(stderr, "%s: Adding duplicate set item %d '%s' succeeded, should have failed, returning error %d\n", program, i, items[i], rc); exit(1); } } #ifdef RAPTOR_DEBUG raptor_id_set_stats_print(set, stderr); #endif fprintf(stderr, "%s: Freeing set\n", program); raptor_free_id_set(set); raptor_free_uri(base_uri); raptor_finish(); /* keep gcc -Wall happy */ return(0); } #endif