/* -*- Mode: c; c-basic-offset: 2 -*-
 *
 * raptor_set.c - Sets for checking IDs (based on Redland memory hash)
 *
 * $Id$
 *
 * Copyright (C) 2003-2004, David Beckett http://purl.org/net/dajobe/
 * Institute for Learning and Research Technology http://www.ilrt.bristol.ac.uk/
 * University of Bristol, UK http://www.bristol.ac.uk/
 * 
 * This package is Free Software and part of Redland http://librdf.org/
 * 
 * It is licensed under the following three licenses as alternatives:
 *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
 *   2. GNU General Public License (GPL) V2 or any newer version
 *   3. Apache License, V2.0 or any newer version
 * 
 * You may not use this file except in compliance with at least one of
 * the above three licenses.
 * 
 * See LICENSE.html or LICENSE.txt at the top of this package for the
 * complete terms and further detail along with the license texts for
 * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
 * 
 * 
 * 
 */


#ifdef HAVE_CONFIG_H
#include <raptor_config.h>
#endif

#ifdef WIN32
#include <win32_raptor_config.h>
#endif


#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#include <sys/types.h>

#ifdef HAVE_STDLIB_H
#include <stdlib.h> /* for abort() as used in errors */
#endif

/* Raptor includes */
#include "raptor.h"
#include "raptor_internal.h"


#ifndef STANDALONE

/*
 * The only methods needed here are:
 *  Create Set
 *  Destroy Set
 *  Check a (base, ID) pair present add it if not, return if added/not
 *
 */

struct raptor_id_set_node_s
{
  struct raptor_id_set_node_s* next;
  char *item;
  size_t item_len;
  unsigned long hash;
};
typedef struct raptor_id_set_node_s raptor_id_set_node;


struct raptor_base_id_set_s
{
  /* The base URI of this set of IDs */
  raptor_uri *uri;
  
  /* neighbour ID sets */
  struct raptor_base_id_set_s* prev;
  struct raptor_base_id_set_s* next;

  /* An array pointing to a list of nodes (buckets) */
  raptor_id_set_node** nodes;

  /* this many buckets used (out of capacity) */
  int size;
  /* this many items */
  int items;
  /* size of the buckets array 'nodes' above */
  int capacity;

  /* array load factor expressed out of 1000.
   * Always true: (size/capacity * 1000) < load_factor,
   * or in the code: size * 1000 < load_factor * capacity
   */
  int load_factor;
};
typedef struct raptor_base_id_set_s raptor_base_id_set;


struct raptor_id_set_s
{
  /* start of sets, 1 per base URI */
  struct raptor_base_id_set_s* first;

#ifdef RAPTOR_DEBUG
  int hits;
  int misses;
#endif
};


/* default load_factor out of 1000 */
static const int raptor_id_set_default_load_factor=750;

/* starting capacity - MUST BE POWER OF 2 */
static const int raptor_id_set_initial_capacity=8;


/* prototypes for local functions */
static raptor_id_set_node* raptor_base_id_set_find_node(raptor_base_id_set* set, unsigned char *item, size_t item_len, unsigned long hash);
static void raptor_free_id_set_node(raptor_id_set_node* node);
static int raptor_base_id_set_expand_size(raptor_base_id_set* set);


/*
 * perldelta 5.8.0 says under *Performance Enhancements*
 *
 *   Hashes now use Bob Jenkins "One-at-a-Time" hashing key algorithm
 *   http://burtleburtle.net/bob/hash/doobs.html  This algorithm is
 *   reasonably fast while producing a much better spread of values
 *   than the old hashing algorithm ...
 *
 * Changed here to set the string backwards to help do URIs better
 *
 */

#define ONE_AT_A_TIME_SET(set,str,len) \
     do { \
        register const unsigned char *c_oneat = str+len-1; \
        register int i_oneat = len; \
        register unsigned long set_oneat = 0; \
        while (i_oneat--) { \
            set_oneat += *c_oneat--; \
            set_oneat += (set_oneat << 10); \
            set_oneat ^= (set_oneat >> 6); \
        } \
        set_oneat += (set_oneat << 3); \
        set_oneat ^= (set_oneat >> 11); \
        (set) = (set_oneat + (set_oneat << 15)); \
    } while(0)


/* helper functions */


/**
 * raptor_id_set_find_node - Find the node for the given item with optional hash
 * @set: the memory set context
 * @item: item string
 * @item_len: item string length
 * @hash: hash value or 0 if it should be computed here
 * 
 * Return value: &raptor_id_set_node of content or NULL on failure
 **/
static raptor_id_set_node*
raptor_base_id_set_find_node(raptor_base_id_set* base, 
                             unsigned char *item, size_t item_len,
                             unsigned long hash) 
{
  raptor_id_set_node* node;
  int bucket;

  /* empty set */
  if(!base->capacity)
    return NULL;

  if(!hash)
    ONE_AT_A_TIME_SET(hash, (unsigned char*)item, item_len);
  

  /* find slot in table */
  bucket=hash & (base->capacity - 1);

  /* check if there is a list present */ 
  node=base->nodes[bucket];
  if(!node)
    /* no list there */
    return NULL;
    
  /* walk the list */
  while(node) {
    if(item_len == node->item_len && !memcmp(item, node->item, item_len))
      break;
    node=node->next;
  }

  return node;
}


static void
raptor_free_id_set_node(raptor_id_set_node* node) 
{
  if(node->item)
    RAPTOR_FREE(cstring, node->item);
  RAPTOR_FREE(raptor_id_set_node, node);
}


static int
raptor_base_id_set_expand_size(raptor_base_id_set* base) {
  int required_capacity=0;
  raptor_id_set_node **new_nodes;
  int i;

  if (base->capacity) {
    /* big enough */
    if((1000 * base->items) < (base->load_factor * base->capacity))
      return 0;
    /* grow set (keeping it a power of two) */
    required_capacity=base->capacity << 1;
  } else {
    required_capacity=raptor_id_set_initial_capacity;
  }

  /* allocate new table */
  new_nodes=(raptor_id_set_node**)RAPTOR_CALLOC(raptor_id_set_nodes, 
                                                required_capacity,
                                                sizeof(raptor_id_set_node*));
  if(!new_nodes)
    return 1;


  /* it is a new set empty set - we are done */
  if(!base->size) {
    base->capacity=required_capacity;
    base->nodes=new_nodes;
    return 0;
  }
  

  for(i=0; i<base->capacity; i++) {
    raptor_id_set_node *node=base->nodes[i];
      
    /* walk all attached nodes */
    while(node) {
      raptor_id_set_node *next;
      int bucket;

      next=node->next;
      /* find slot in new table */
      bucket=node->hash & (required_capacity - 1);
      node->next=new_nodes[bucket];
      new_nodes[bucket]=node;

      node=next;
    }
  }

  /* now free old table */
  RAPTOR_FREE(raptor_id_set_nodes, base->nodes);

  /* attach new one */
  base->capacity=required_capacity;
  base->nodes=new_nodes;

  return 0;
}


/* functions implementing the ID set api */

/**
 * raptor_new_id_set - Create a new ID set
 * 
 * Return value: non 0 on failure
 **/
raptor_id_set*
raptor_new_id_set(void) 
{
  raptor_id_set* set=(raptor_id_set*)RAPTOR_CALLOC(raptor_id_set, 1, 
                                                   sizeof(raptor_id_set));
  if(!set)
    return NULL;
  
  return set;
}


/**
 * raptor_free_base_id_set - Destroy raptor_base_id_set
 * @set: &raptor_id_set
 **/
static void
raptor_free_base_id_set(raptor_base_id_set *base) 
{
  if(base->nodes) {
    int i;
  
    for(i=0; i<base->capacity; i++) {
      raptor_id_set_node *node=base->nodes[i];
      
      /* this entry is used */
      if(node) {
	raptor_id_set_node *next;
	/* free all attached nodes */
	while(node) {
	  next=node->next;
	  raptor_free_id_set_node(node);
	  node=next;
	}
      }
    }
    RAPTOR_FREE(raptor_id_set_nodes, base->nodes);
  }

  if(base->uri)
    raptor_free_uri(base->uri);

  RAPTOR_FREE(raptor_base_id_set, base);

  return;
}


/**
 * raptor_free_id_set - Destroy a set
 * @set: &raptor_id_set
 *
 **/
void
raptor_free_id_set(raptor_id_set *set) 
{
  raptor_base_id_set *base=set->first;
  while(base) {
    raptor_base_id_set *next=base->next;
    raptor_free_base_id_set(base);
    base=next;
  }
  RAPTOR_FREE(raptor_id_set, set);
}


/**
 * raptor_id_set_add: - Add an item to the set
 * @set: &raptor_id_set
 * @base_uri: base &raptor_uri of identifier
 * @id: identifier name
 * @id_len: length of identifier
 * 
 * Return value: <0 on failure, 0 on success, 1 if already present
 **/
int
raptor_id_set_add(raptor_id_set* set, raptor_uri *base_uri,
                  const unsigned char *id, size_t id_len)
{
  raptor_base_id_set *base;
  unsigned char *base_uri_string;
  size_t base_uri_string_len;
  size_t item_len;
  unsigned char *item;

  raptor_id_set_node *node;
  unsigned long hash;
  char *new_item=NULL;
  int bucket= (-1);

  if(!base_uri || !id || !id_len)
    return -1;

  base=set->first;
  while(base) {
    if(raptor_uri_equals(base->uri, base_uri))
      break;
    base=base->next;
  }

  if(!base) {
    /* a set for this base_uri not found */
    base=(raptor_base_id_set*)RAPTOR_CALLOC(raptor_base_id_set, 1, 
                                            sizeof(raptor_base_id_set));
    if(!base)
      return -1;

    base->load_factor=raptor_id_set_default_load_factor;
    if(raptor_base_id_set_expand_size(base)) {
      RAPTOR_FREE(raptor_base_id_set, base);
      return -1;
    }

    base->uri=raptor_uri_copy(base_uri);

    /* Add to the start of the list */
    if(set->first)
      set->first->prev=base;
    /* base->prev=NULL; */
    base->next=set->first;

    set->first=base;
  } else {
    /* If not at the start of the list, move there */
    if(base != set->first) {
      /* remove from the list */
      base->prev->next=base->next;
      if(base->next)
        base->next->prev=base->prev;
      /* add at the start of the list */
      set->first->prev=base;
      base->prev=NULL;
      base->next=set->first;
    }
  }
  
  
  /* ensure there is enough space in the set */
  if (raptor_base_id_set_expand_size(base))
    return -1;

  /* Storing ID + ' ' + base-uri-string + '\0' */
  base_uri_string=raptor_uri_as_counted_string(base_uri, &base_uri_string_len);
  item_len=id_len+1+strlen((const char*)base_uri_string)+1;

  item=(unsigned char*)RAPTOR_MALLOC(cstring, item_len);
  if(!item)
    return 1;

  strcpy((char*)item, (const char*)id);
  item[id_len]=' ';
  strcpy((char*)item+id_len+1, (const char*)base_uri_string); /* strcpy for end '\0' */
  

  ONE_AT_A_TIME_SET(hash, (unsigned char*)item, item_len);
  
  /* find node for item */
  node=raptor_base_id_set_find_node(base, item, item_len, hash);

  /* if already there, error */
  if(node) {
#ifdef RAPTOR_DEBUG
    set->misses++;
#endif
    return 1;
  }

#ifdef RAPTOR_DEBUG
  set->hits++;
#endif
  
  /* always a new node */

  bucket=hash & (base->capacity - 1);
  
  /* allocate new node */
  node=(raptor_id_set_node*)RAPTOR_CALLOC(raptor_id_set_node, 1,
                                          sizeof(raptor_id_set_node));
  if(!node)
    return 1;
  
  node->hash=hash;
  
  /* allocate item for new node */
  new_item=(char*)RAPTOR_MALLOC(cstring, item_len);
  if(!new_item) {
    RAPTOR_FREE(raptor_id_set_node, node);
    return -1;
  }
  
  /* copy new item */
  memcpy(new_item, item, item_len);
  node->item=new_item;
  node->item_len=item_len;

  node->next=base->nodes[bucket];
  base->nodes[bucket]=node;
  
  base->items++;

  /* Only increase bucket count use when previous value was NULL */
  if(!node->next)
    base->size++;

  RAPTOR_FREE(cstring, item);

  return 0;
}


#ifdef RAPTOR_DEBUG
void
raptor_id_set_stats_print(raptor_id_set* set, FILE *stream) {
  fprintf(stream, "set hits: %d misses: %d\n", set->hits, set->misses);
}
#endif

#endif


#ifdef STANDALONE

/* one more prototype */
int main(int argc, char *argv[]);


int
main(int argc, char *argv[]) 
{
  const char *program=raptor_basename(argv[0]);
  char *items[8] = { "ron", "amy", "jen", "bij", "jib", "daj", "jim", NULL };
  raptor_id_set *set;
  raptor_uri *base_uri;
  int i=0;
  
  raptor_init();
  
  base_uri=raptor_new_uri((const unsigned char*)"http://example.org/base#");

  fprintf(stderr, "%s: Creating set\n", program);

  set=raptor_new_id_set();
  if(!set) {
    fprintf(stderr, "%s: Failed to create set\n", program);
    exit(1);
  }

  for(i=0; items[i]; i++) {
    size_t len=strlen(items[i]);
    int rc;

    fprintf(stderr, "%s: Adding set item '%s'\n", program, items[i]);
  
    rc=raptor_id_set_add(set, base_uri, (const unsigned char*)items[i], len);
if(rc) {
      fprintf(stderr, "%s: Adding set item %d '%s' failed, returning error %d\n",
              program, i, items[i], rc);
      exit(1);
    }
  }

  for(i=0; items[i]; i++) {
    size_t len=strlen(items[i]);
    int rc;

    fprintf(stderr, "%s: Adding duplicate set item '%s'\n", program, items[i]);

    rc=raptor_id_set_add(set, base_uri, (const unsigned char*)items[i], len);
    if(rc <= 0) {
      fprintf(stderr, "%s: Adding duplicate set item %d '%s' succeeded, should have failed, returning error %d\n",
              program, i, items[i], rc);
      exit(1);
    }
  }

#ifdef RAPTOR_DEBUG
  raptor_id_set_stats_print(set, stderr);
#endif

  fprintf(stderr, "%s: Freeing set\n", program);
  raptor_free_id_set(set);

  raptor_free_uri(base_uri);
  
  raptor_finish();
  
  /* keep gcc -Wall happy */
  return(0);
}

#endif