summaryrefslogtreecommitdiff
path: root/girepository/cmph/chd_ph.c
diff options
context:
space:
mode:
Diffstat (limited to 'girepository/cmph/chd_ph.c')
-rw-r--r--girepository/cmph/chd_ph.c988
1 files changed, 988 insertions, 0 deletions
diff --git a/girepository/cmph/chd_ph.c b/girepository/cmph/chd_ph.c
new file mode 100644
index 00000000..b34415b9
--- /dev/null
+++ b/girepository/cmph/chd_ph.c
@@ -0,0 +1,988 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<math.h>
+#include<time.h>
+#include<assert.h>
+#include<limits.h>
+
+#include "cmph_structs.h"
+#include "chd_structs_ph.h"
+#include "chd_ph.h"
+#include"miller_rabin.h"
+#include"bitbool.h"
+
+
+//#define DEBUG
+#include "debug.h"
+
+// NO_ELEMENT is equivalent to null pointer
+#ifndef NO_ELEMENT
+#define NO_ELEMENT UINT_MAX
+#endif
+
+// struct used to represent items at mapping, ordering and searching phases
+struct _chd_ph_item_t
+{
+ cmph_uint32 f;
+ cmph_uint32 h;
+};
+typedef struct _chd_ph_item_t chd_ph_item_t;
+
+// struct to represent the items at mapping phase only.
+struct _chd_ph_map_item_t
+{
+ cmph_uint32 f;
+ cmph_uint32 h;
+ cmph_uint32 bucket_num;
+};
+typedef struct _chd_ph_map_item_t chd_ph_map_item_t;
+
+// struct to represent a bucket
+struct _chd_ph_bucket_t
+{
+ cmph_uint32 items_list; // offset
+ union
+ {
+ cmph_uint32 size;
+ cmph_uint32 bucket_id;
+ };
+};
+
+typedef struct _chd_ph_bucket_t chd_ph_bucket_t;
+
+struct _chd_ph_sorted_list_t
+{
+ cmph_uint32 buckets_list;
+ cmph_uint32 size;
+};
+
+typedef struct _chd_ph_sorted_list_t chd_ph_sorted_list_t;
+
+
+static inline chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets);
+static inline void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets);
+static inline void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets);
+
+chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets)
+{
+ chd_ph_bucket_t * buckets = (chd_ph_bucket_t *) calloc(nbuckets, sizeof(chd_ph_bucket_t));
+ return buckets;
+}
+
+void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets)
+{
+ register cmph_uint32 i = 0;
+ assert(buckets);
+ for(i = 0; i < nbuckets; i++)
+ buckets[i].size = 0;
+}
+cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets,chd_ph_map_item_t * map_items, chd_ph_item_t * items,
+ cmph_uint32 nbuckets,cmph_uint32 item_idx)
+{
+ register cmph_uint32 i = 0;
+ register chd_ph_item_t * tmp_item;
+ register chd_ph_map_item_t * tmp_map_item = map_items + item_idx;
+ register chd_ph_bucket_t * bucket = buckets + tmp_map_item->bucket_num;
+ tmp_item = items + bucket->items_list;
+
+ for(i = 0; i < bucket->size; i++)
+ {
+ if(tmp_item->f == tmp_map_item->f && tmp_item->h == tmp_map_item->h)
+ {
+ DEBUGP("Item not added\n");
+ return 0;
+ };
+ tmp_item++;
+ };
+ tmp_item->f = tmp_map_item->f;
+ tmp_item->h = tmp_map_item->h;
+ bucket->size++;
+ return 1;
+};
+void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets)
+{
+ free(buckets);
+}
+
+static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items,
+ cmph_uint32 *max_bucket_size);
+
+static chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets,chd_ph_item_t ** items,
+ cmph_uint32 nbuckets,cmph_uint32 nitems, cmph_uint32 max_bucket_size);
+
+static cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items ,
+ cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table);
+
+static inline double chd_ph_space_lower_bound(cmph_uint32 _n, cmph_uint32 _r)
+{
+ double r = _r, n = _n;
+ return (1 + (r/n - 1.0 + 1.0/(2.0*n))*log(1 - n/r))/log(2);
+};
+
+/* computes the entropy of non empty buckets.*/
+static inline double chd_ph_get_entropy(cmph_uint32 * disp_table, cmph_uint32 n, cmph_uint32 max_probes)
+{
+ register cmph_uint32 * probe_counts = (cmph_uint32 *) calloc(max_probes, sizeof(cmph_uint32));
+ register cmph_uint32 i;
+ register double entropy = 0;
+
+ for(i = 0; i < n; i++)
+ {
+ probe_counts[disp_table[i]]++;
+ };
+
+ for(i = 0; i < max_probes; i++)
+ {
+ if(probe_counts[i] > 0)
+ entropy -= probe_counts[i]*log((double)probe_counts[i]/(double)n)/log(2);
+ };
+ free(probe_counts);
+ return entropy;
+};
+
+chd_ph_config_data_t *chd_ph_config_new()
+{
+ chd_ph_config_data_t *chd_ph;
+ chd_ph = (chd_ph_config_data_t *)malloc(sizeof(chd_ph_config_data_t));
+ assert(chd_ph);
+ memset(chd_ph, 0, sizeof(chd_ph_config_data_t));
+
+ chd_ph->hashfunc = CMPH_HASH_JENKINS;
+ chd_ph->cs = NULL;
+ chd_ph->nbuckets = 0;
+ chd_ph->n = 0;
+ chd_ph->hl = NULL;
+
+ chd_ph->m = 0;
+ chd_ph->use_h = 1;
+ chd_ph->keys_per_bin = 1;
+ chd_ph->keys_per_bucket = 4;
+ chd_ph->occup_table = 0;
+
+ return chd_ph;
+}
+
+void chd_ph_config_destroy(cmph_config_t *mph)
+{
+ chd_ph_config_data_t *data = (chd_ph_config_data_t *) mph->data;
+ DEBUGP("Destroying algorithm dependent data\n");
+ if(data->occup_table)
+ {
+ free(data->occup_table);
+ data->occup_table = NULL;
+ }
+ free(data);
+}
+
+
+void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
+{
+ chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
+ CMPH_HASH *hashptr = hashfuncs;
+ cmph_uint32 i = 0;
+ while(*hashptr != CMPH_HASH_COUNT)
+ {
+ if (i >= 1) break; //chd_ph only uses one linear hash function
+ chd_ph->hashfunc = *hashptr;
+ ++i, ++hashptr;
+ }
+}
+
+
+void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket)
+{
+ assert(mph);
+ chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
+ if(keys_per_bucket < 1 || keys_per_bucket >= 15)
+ {
+ keys_per_bucket = 4;
+ }
+ chd_ph->keys_per_bucket = keys_per_bucket;
+}
+
+
+void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin)
+{
+ assert(mph);
+ chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
+ if(keys_per_bin <= 1 || keys_per_bin >= 128)
+ {
+ keys_per_bin = 1;
+ }
+ chd_ph->keys_per_bin = keys_per_bin;
+}
+
+cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, cmph_uint32 *max_bucket_size)
+{
+ register cmph_uint32 i = 0, g = 0;
+ cmph_uint32 hl[3];
+ chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
+ char * key = NULL;
+ cmph_uint32 keylen = 0;
+ chd_ph_map_item_t * map_item;
+ chd_ph_map_item_t * map_items = malloc(chd_ph->m*sizeof(chd_ph_map_item_t));
+ register cmph_uint32 mapping_iterations = 1000;
+ *max_bucket_size = 0;
+ while(1)
+ {
+ mapping_iterations--;
+ if (chd_ph->hl) hash_state_destroy(chd_ph->hl);
+ chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m);
+
+ chd_ph_bucket_clean(buckets, chd_ph->nbuckets);
+
+ mph->key_source->rewind(mph->key_source->data);
+
+ for(i = 0; i < chd_ph->m; i++)
+ {
+ mph->key_source->read(mph->key_source->data, &key, &keylen);
+ hash_vector(chd_ph->hl, key, keylen, hl);
+
+ map_item = (map_items + i);
+
+ g = hl[0] % chd_ph->nbuckets;
+ map_item->f = hl[1] % chd_ph->n;
+ map_item->h = hl[2] % (chd_ph->n - 1) + 1;
+ map_item->bucket_num=g;
+ mph->key_source->dispose(mph->key_source->data, key, keylen);
+// if(buckets[g].size == (chd_ph->keys_per_bucket << 2))
+// {
+// DEBUGP("BUCKET = %u -- SIZE = %u -- MAXIMUM SIZE = %u\n", g, buckets[g].size, (chd_ph->keys_per_bucket << 2));
+// goto error;
+// }
+ buckets[g].size++;
+ if(buckets[g].size > *max_bucket_size)
+ {
+ *max_bucket_size = buckets[g].size;
+ }
+ }
+ buckets[0].items_list = 0;
+ for(i = 1; i < chd_ph->nbuckets; i++)
+ {
+ buckets[i].items_list = buckets[i-1].items_list + buckets[i - 1].size;
+ buckets[i - 1].size = 0;
+ };
+ buckets[i - 1].size = 0;
+ for(i = 0; i < chd_ph->m; i++)
+ {
+ map_item = (map_items + i);
+ if(!chd_ph_bucket_insert(buckets, map_items, items, chd_ph->nbuckets, i))
+ break;
+ }
+ if(i == chd_ph->m)
+ {
+ free(map_items);
+ return 1; // SUCCESS
+ }
+
+ if(mapping_iterations == 0)
+ {
+ goto error;
+ }
+ }
+error:
+ free(map_items);
+ hash_state_destroy(chd_ph->hl);
+ chd_ph->hl = NULL;
+ return 0; // FAILURE
+}
+
+chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_t ** _items,
+ cmph_uint32 nbuckets, cmph_uint32 nitems, cmph_uint32 max_bucket_size)
+{
+ chd_ph_sorted_list_t * sorted_lists = (chd_ph_sorted_list_t *) calloc(max_bucket_size + 1, sizeof(chd_ph_sorted_list_t));
+
+ chd_ph_bucket_t * input_buckets = (*_buckets);
+ chd_ph_bucket_t * output_buckets;
+ chd_ph_item_t * input_items = (*_items);
+ chd_ph_item_t * output_items;
+ register cmph_uint32 i, j, bucket_size, position, position2;
+// cmph_uint32 non_empty_buckets;
+ DEBUGP("MAX BUCKET SIZE = %u\n", max_bucket_size);
+ // Determine size of each list of buckets
+ for(i = 0; i < nbuckets; i++)
+ {
+ bucket_size = input_buckets[i].size;
+ if(bucket_size == 0)
+ continue;
+ sorted_lists[bucket_size].size++;
+ };
+ sorted_lists[1].buckets_list = 0;
+ // Determine final position of list of buckets into the contiguous array that will store all the buckets
+ for(i = 2; i <= max_bucket_size; i++)
+ {
+ sorted_lists[i].buckets_list = sorted_lists[i-1].buckets_list + sorted_lists[i-1].size;
+ sorted_lists[i-1].size = 0;
+ };
+ sorted_lists[i-1].size = 0;
+ // Store the buckets in a new array which is sorted by bucket sizes
+ output_buckets = calloc(nbuckets, sizeof(chd_ph_bucket_t)); // everything is initialized with zero
+// non_empty_buckets = nbuckets;
+
+ for(i = 0; i < nbuckets; i++)
+ {
+ bucket_size = input_buckets[i].size;
+ if(bucket_size == 0)
+ {
+// non_empty_buckets--;
+ continue;
+ };
+ position = sorted_lists[bucket_size].buckets_list + sorted_lists[bucket_size].size;
+ output_buckets[position].bucket_id = i;
+ output_buckets[position].items_list = input_buckets[i].items_list;
+ sorted_lists[bucket_size].size++;
+ };
+/* for(i = non_empty_buckets; i < nbuckets; i++)
+ output_buckets[i].size=0;*/
+ // Return the buckets sorted in new order and free the old buckets sorted in old order
+ free(input_buckets);
+ (*_buckets) = output_buckets;
+
+
+ // Store the items according to the new order of buckets.
+ output_items = (chd_ph_item_t*)calloc(nitems, sizeof(chd_ph_item_t));
+ position = 0;
+ i = 0;
+ for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++)
+ {
+ for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size + sorted_lists[bucket_size].buckets_list; i++)
+ {
+ position2 = output_buckets[i].items_list;
+ output_buckets[i].items_list = position;
+ for(j = 0; j < bucket_size; j++)
+ {
+ output_items[position].f = input_items[position2].f;
+ output_items[position].h = input_items[position2].h;
+ position++;
+ position2++;
+ };
+ };
+ };
+ //Return the items sorted in new order and free the old items sorted in old order
+ free(input_items);
+ (*_items) = output_items;
+ return sorted_lists;
+};
+
+static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets,
+ chd_ph_item_t *items, cmph_uint32 probe0_num, cmph_uint32 probe1_num,
+ cmph_uint32 bucket_num, cmph_uint32 size)
+{
+ register cmph_uint32 i;
+ register chd_ph_item_t * item;
+ register cmph_uint32 position;
+
+ item = items + buckets[bucket_num].items_list;
+ // try place bucket with probe_num
+ if(chd_ph->keys_per_bin > 1)
+ {
+ for(i = 0; i < size; i++) // placement
+ {
+ position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
+ if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin)
+ {
+ break;
+ }
+ (chd_ph->occup_table[position])++;
+ item++;
+ };
+ } else
+ {
+ for(i = 0; i < size; i++) // placement
+ {
+ position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
+ if(GETBIT32(((cmph_uint32 *)chd_ph->occup_table), position))
+ {
+ break;
+ }
+ SETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
+ item++;
+ };
+ };
+ if(i != size) // Undo the placement
+ {
+ item = items + buckets[bucket_num].items_list;
+ if(chd_ph->keys_per_bin > 1)
+ {
+ while(1)
+ {
+ if(i == 0)
+ {
+ break;
+ }
+ position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
+ (chd_ph->occup_table[position])--;
+ item++;
+ i--;
+ };
+ } else
+ {
+ while(1)
+ {
+ if(i == 0)
+ {
+ break;
+ }
+ position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
+ UNSETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
+
+// ([position/32]^=(1<<(position%32));
+ item++;
+ i--;
+ };
+ };
+ return 0;
+ }
+ return 1;
+};
+
+static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, cmph_uint32 max_probes,
+ cmph_uint32 * disp_table, cmph_uint32 bucket_num, cmph_uint32 size)
+
+{
+ register cmph_uint32 probe0_num, probe1_num, probe_num;
+ probe0_num = 0;
+ probe1_num = 0;
+ probe_num = 0;
+
+ while(1)
+ {
+ if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, bucket_num,size))
+ {
+ disp_table[buckets[bucket_num].bucket_id] = probe0_num + probe1_num * chd_ph->n;
+ return 1;
+ }
+ probe0_num++;
+ if(probe0_num >= chd_ph->n)
+ {
+ probe0_num -= chd_ph->n;
+ probe1_num++;
+ };
+ probe_num++;
+ if(probe_num >= max_probes || probe1_num >= chd_ph->n)
+ {
+ return 0;
+ };
+ };
+ return 0;
+};
+
+static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t * buckets, chd_ph_item_t *items,
+ cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
+ cmph_uint32 * disp_table)
+{
+ register cmph_uint32 i = 0;
+ register cmph_uint32 curr_bucket = 0;
+
+ for(i = max_bucket_size; i > 0; i--)
+ {
+ curr_bucket = sorted_lists[i].buckets_list;
+ while(curr_bucket < sorted_lists[i].size + sorted_lists[i].buckets_list)
+ {
+ if(!place_bucket(chd_ph, buckets, items, max_probes, disp_table, curr_bucket, i))
+ {
+ return 0;
+ }
+ curr_bucket++;
+ };
+ };
+ return 1;
+};
+
+static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items,
+ cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
+ cmph_uint32 * disp_table)
+{
+ register cmph_uint32 i,j, non_placed_bucket;
+ register cmph_uint32 curr_bucket;
+ register cmph_uint32 probe_num, probe0_num, probe1_num;
+ cmph_uint32 sorted_list_size;
+#ifdef DEBUG
+ cmph_uint32 items_list;
+ cmph_uint32 bucket_id;
+#endif
+ DEBUGP("USING HEURISTIC TO PLACE BUCKETS\n");
+ for(i = max_bucket_size; i > 0; i--)
+ {
+ probe_num = 0;
+ probe0_num = 0;
+ probe1_num = 0;
+ sorted_list_size = sorted_lists[i].size;
+ while(sorted_lists[i].size != 0)
+ {
+ curr_bucket = sorted_lists[i].buckets_list;
+ for(j = 0, non_placed_bucket = 0; j < sorted_lists[i].size; j++)
+ {
+ // if bucket is successfully placed remove it from list
+ if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, curr_bucket, i))
+ {
+ disp_table[buckets[curr_bucket].bucket_id] = probe0_num + probe1_num * chd_ph->n;
+// DEBUGP("BUCKET %u PLACED --- DISPLACEMENT = %u\n", curr_bucket, disp_table[curr_bucket]);
+ }
+ else
+ {
+// DEBUGP("BUCKET %u NOT PLACED\n", curr_bucket);
+#ifdef DEBUG
+ items_list = buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list;
+ bucket_id = buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id;
+#endif
+ buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list = buckets[curr_bucket].items_list;
+ buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id = buckets[curr_bucket].bucket_id;
+#ifdef DEBUG
+ buckets[curr_bucket].items_list=items_list;
+ buckets[curr_bucket].bucket_id=bucket_id;
+#endif
+ non_placed_bucket++;
+ }
+ curr_bucket++;
+ };
+ sorted_lists[i].size = non_placed_bucket;
+ probe0_num++;
+ if(probe0_num >= chd_ph->n)
+ {
+ probe0_num -= chd_ph->n;
+ probe1_num++;
+ };
+ probe_num++;
+ if(probe_num >= max_probes || probe1_num >= chd_ph->n)
+ {
+ sorted_lists[i].size = sorted_list_size;
+ return 0;
+ };
+ };
+ sorted_lists[i].size = sorted_list_size;
+ };
+ return 1;
+};
+
+cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items ,
+ cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
+ cmph_uint32 * disp_table)
+{
+ if(chd_ph->use_h)
+ {
+ return place_buckets2(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
+ }
+ else
+ {
+ return place_buckets1(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
+ }
+
+}
+
+static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items,
+ cmph_uint32 * disp_table, chd_ph_sorted_list_t * sorted_lists,cmph_uint32 max_bucket_size)
+{
+ register cmph_uint32 bucket_size, i, j;
+ register cmph_uint32 position, probe0_num, probe1_num;
+ register cmph_uint32 m = 0;
+ register chd_ph_item_t * item;
+ if(chd_ph->keys_per_bin > 1)
+ memset(chd_ph->occup_table, 0, chd_ph->n);
+ else
+ memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32));
+
+ for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++)
+ for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size +
+ sorted_lists[bucket_size].buckets_list; i++)
+ {
+ j = bucket_size;
+ item = items + buckets[i].items_list;
+ probe0_num = disp_table[buckets[i].bucket_id] % chd_ph->n;
+ probe1_num = disp_table[buckets[i].bucket_id] / chd_ph->n;
+ for(; j > 0; j--)
+ {
+ m++;
+ position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
+ if(chd_ph->keys_per_bin > 1)
+ {
+ if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin)
+ {
+ return 0;
+ }
+ (chd_ph->occup_table[position])++;
+ }
+ else
+ {
+ if(GETBIT32(((cmph_uint32*)chd_ph->occup_table), position))
+ {
+ return 0;
+ }
+ SETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
+ };
+ item++;
+ };
+ };
+ DEBUGP("We were able to place m = %u keys\n", m);
+ return 1;
+};
+
+
+cmph_t *chd_ph_new(cmph_config_t *mph, double c)
+{
+ cmph_t *mphf = NULL;
+ chd_ph_data_t *chd_phf = NULL;
+ chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
+
+ register double load_factor = c;
+ register cmph_uint8 searching_success = 0;
+ register cmph_uint32 max_probes = 1 << 20; // default value for max_probes
+ register cmph_uint32 iterations = 100;
+ chd_ph_bucket_t * buckets = NULL;
+ chd_ph_item_t * items = NULL;
+ register cmph_uint8 failure = 0;
+ cmph_uint32 max_bucket_size = 0;
+ chd_ph_sorted_list_t * sorted_lists = NULL;
+ cmph_uint32 * disp_table = NULL;
+ register double space_lower_bound = 0;
+ #ifdef CMPH_TIMING
+ double construction_time_begin = 0.0;
+ double construction_time = 0.0;
+ ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
+ #endif
+
+
+ chd_ph->m = mph->key_source->nkeys;
+ DEBUGP("m = %u\n", chd_ph->m);
+
+ chd_ph->nbuckets = (cmph_uint32)(chd_ph->m/chd_ph->keys_per_bucket) + 1;
+ DEBUGP("nbuckets = %u\n", chd_ph->nbuckets);
+
+ if(load_factor < 0.5 )
+ {
+ load_factor = 0.5;
+ }
+
+ if(load_factor >= 0.99)
+ {
+ load_factor = 0.99;
+ }
+
+ DEBUGP("load_factor = %.3f\n", load_factor);
+
+ chd_ph->n = (cmph_uint32)(chd_ph->m/(chd_ph->keys_per_bin * load_factor)) + 1;
+
+ //Round the number of bins to the prime immediately above
+ if(chd_ph->n % 2 == 0) chd_ph->n++;
+ for(;;)
+ {
+ if(check_primality(chd_ph->n) == 1)
+ break;
+ chd_ph->n += 2; // just odd numbers can be primes for n > 2
+
+ };
+
+ DEBUGP("n = %u \n", chd_ph->n);
+ if(chd_ph->keys_per_bin == 1)
+ {
+ space_lower_bound = chd_ph_space_lower_bound(chd_ph->m, chd_ph->n);
+ }
+
+ if(mph->verbosity)
+ {
+ fprintf(stderr, "space lower bound is %.3f bits per key\n", space_lower_bound);
+ }
+
+ // We allocate the working tables
+ buckets = chd_ph_bucket_new(chd_ph->nbuckets);
+ items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t));
+
+ max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes);
+
+ if(chd_ph->keys_per_bin == 1)
+ chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32));
+ else
+ chd_ph->occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8));
+
+ disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32));
+//
+// init_genrand(time(0));
+
+ while(1)
+ {
+ iterations --;
+ if (mph->verbosity)
+ {
+ fprintf(stderr, "Starting mapping step for mph creation of %u keys with %u bins\n", chd_ph->m, chd_ph->n);
+ }
+
+ if(!chd_ph_mapping(mph, buckets, items, &max_bucket_size))
+ {
+ if (mph->verbosity)
+ {
+ fprintf(stderr, "Failure in mapping step\n");
+ }
+ failure = 1;
+ goto cleanup;
+ }
+
+ if (mph->verbosity)
+ {
+ fprintf(stderr, "Starting ordering step\n");
+ }
+ if(sorted_lists)
+ {
+ free(sorted_lists);
+ }
+
+ sorted_lists = chd_ph_ordering(&buckets, &items, chd_ph->nbuckets, chd_ph->m, max_bucket_size);
+
+ if (mph->verbosity)
+ {
+ fprintf(stderr, "Starting searching step\n");
+ }
+
+ searching_success = chd_ph_searching(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
+ if(searching_success) break;
+
+ // reset occup_table
+ if(chd_ph->keys_per_bin > 1)
+ memset(chd_ph->occup_table, 0, chd_ph->n);
+ else
+ memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32));
+ if(iterations == 0)
+ {
+ // Cleanup memory
+ if (mph->verbosity)
+ {
+ fprintf(stderr, "Failure because the max trials was exceeded\n");
+ }
+ failure = 1;
+ goto cleanup;
+ };
+ }
+
+ #ifdef DEBUG
+ {
+ if(!chd_ph_check_bin_hashing(chd_ph, buckets, items, disp_table,sorted_lists,max_bucket_size))
+ {
+
+ DEBUGP("Error for bin packing generation");
+ failure = 1;
+ goto cleanup;
+ }
+ }
+ #endif
+
+ if (mph->verbosity)
+ {
+ fprintf(stderr, "Starting compressing step\n");
+ }
+
+ if(chd_ph->cs)
+ {
+ free(chd_ph->cs);
+ }
+ chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
+ compressed_seq_init(chd_ph->cs);
+ compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets);
+
+ #ifdef CMPH_TIMING
+ ELAPSED_TIME_IN_SECONDS(&construction_time);
+ register double entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes);
+ DEBUGP("Entropy = %.4f\n", entropy/chd_ph->m);
+ #endif
+
+cleanup:
+ chd_ph_bucket_destroy(buckets);
+ free(items);
+ free(sorted_lists);
+ free(disp_table);
+ if(failure)
+ {
+ if(chd_ph->hl)
+ {
+ hash_state_destroy(chd_ph->hl);
+ }
+ chd_ph->hl = NULL;
+ return NULL;
+ }
+
+ mphf = (cmph_t *)malloc(sizeof(cmph_t));
+ mphf->algo = mph->algo;
+ chd_phf = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
+
+ chd_phf->cs = chd_ph->cs;
+ chd_ph->cs = NULL; //transfer memory ownership
+ chd_phf->hl = chd_ph->hl;
+ chd_ph->hl = NULL; //transfer memory ownership
+ chd_phf->n = chd_ph->n;
+ chd_phf->nbuckets = chd_ph->nbuckets;
+
+ mphf->data = chd_phf;
+ mphf->size = chd_ph->n;
+
+ DEBUGP("Successfully generated minimal perfect hash\n");
+ if (mph->verbosity)
+ {
+ fprintf(stderr, "Successfully generated minimal perfect hash function\n");
+ }
+
+ #ifdef CMPH_TIMING
+ register cmph_uint32 space_usage = chd_ph_packed_size(mphf)*8;
+ construction_time = construction_time - construction_time_begin;
+ fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\t%.4f\t%.4f\n", chd_ph->m, load_factor, chd_ph->keys_per_bucket, construction_time, space_usage/(double)chd_ph->m, space_lower_bound, entropy/chd_ph->m);
+ #endif
+
+ return mphf;
+}
+
+
+
+void chd_ph_load(FILE *fd, cmph_t *mphf)
+{
+ char *buf = NULL;
+ cmph_uint32 buflen;
+ register size_t nbytes;
+ chd_ph_data_t *chd_ph = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
+
+ DEBUGP("Loading chd_ph mphf\n");
+ mphf->data = chd_ph;
+
+ nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
+ DEBUGP("Hash state has %u bytes\n", buflen);
+ buf = (char *)malloc((size_t)buflen);
+ nbytes = fread(buf, (size_t)buflen, (size_t)1, fd);
+ chd_ph->hl = hash_state_load(buf, buflen);
+ free(buf);
+
+ nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
+ DEBUGP("Compressed sequence structure has %u bytes\n", buflen);
+ buf = (char *)malloc((size_t)buflen);
+ nbytes = fread(buf, (size_t)buflen, (size_t)1, fd);
+ chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
+ compressed_seq_load(chd_ph->cs, buf, buflen);
+ free(buf);
+
+ // loading n and nbuckets
+ DEBUGP("Reading n and nbuckets\n");
+ nbytes = fread(&(chd_ph->n), sizeof(cmph_uint32), (size_t)1, fd);
+ nbytes = fread(&(chd_ph->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);
+}
+
+int chd_ph_dump(cmph_t *mphf, FILE *fd)
+{
+ char *buf = NULL;
+ cmph_uint32 buflen;
+ register size_t nbytes;
+ chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
+
+ __cmph_dump(mphf, fd);
+
+ hash_state_dump(data->hl, &buf, &buflen);
+ DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
+ nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
+ nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
+ free(buf);
+
+ compressed_seq_dump(data->cs, &buf, &buflen);
+ DEBUGP("Dumping compressed sequence structure with %u bytes to disk\n", buflen);
+ nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
+ nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
+ free(buf);
+
+ // dumping n and nbuckets
+ nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd);
+ nbytes = fwrite(&(data->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);
+ return 1;
+}
+
+void chd_ph_destroy(cmph_t *mphf)
+{
+ chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
+ compressed_seq_destroy(data->cs);
+ free(data->cs);
+ hash_state_destroy(data->hl);
+ free(data);
+ free(mphf);
+
+}
+
+cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
+{
+ register chd_ph_data_t * chd_ph = mphf->data;
+ cmph_uint32 hl[3];
+ register cmph_uint32 disp,position;
+ register cmph_uint32 probe0_num,probe1_num;
+ register cmph_uint32 f,g,h;
+ hash_vector(chd_ph->hl, key, keylen, hl);
+ g = hl[0] % chd_ph->nbuckets;
+ f = hl[1] % chd_ph->n;
+ h = hl[2] % (chd_ph->n-1) + 1;
+
+ disp = compressed_seq_query(chd_ph->cs, g);
+ probe0_num = disp % chd_ph->n;
+ probe1_num = disp/chd_ph->n;
+ position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % chd_ph->n);
+ return position;
+}
+
+void chd_ph_pack(cmph_t *mphf, void *packed_mphf)
+{
+ chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
+ cmph_uint8 * ptr = packed_mphf;
+
+ // packing hl type
+ CMPH_HASH hl_type = hash_get_type(data->hl);
+ *((cmph_uint32 *) ptr) = hl_type;
+ ptr += sizeof(cmph_uint32);
+
+ // packing hl
+ hash_state_pack(data->hl, ptr);
+ ptr += hash_state_packed_size(hl_type);
+
+ // packing n
+ *((cmph_uint32 *) ptr) = data->n;
+ ptr += sizeof(data->n);
+
+ // packing nbuckets
+ *((cmph_uint32 *) ptr) = data->nbuckets;
+ ptr += sizeof(data->nbuckets);
+
+ // packing cs
+ compressed_seq_pack(data->cs, ptr);
+ //ptr += compressed_seq_packed_size(data->cs);
+
+}
+
+cmph_uint32 chd_ph_packed_size(cmph_t *mphf)
+{
+ register chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
+ register CMPH_HASH hl_type = hash_get_type(data->hl);
+ register cmph_uint32 hash_state_pack_size = hash_state_packed_size(hl_type);
+ register cmph_uint32 cs_pack_size = compressed_seq_packed_size(data->cs);
+
+ return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_pack_size + cs_pack_size + 3*sizeof(cmph_uint32));
+
+}
+
+cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
+{
+ register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf;
+ register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4;
+
+ register cmph_uint32 * ptr = (cmph_uint32 *)(hl_ptr + hash_state_packed_size(hl_type));
+ register cmph_uint32 n = *ptr++;
+ register cmph_uint32 nbuckets = *ptr++;
+ cmph_uint32 hl[3];
+
+ register cmph_uint32 disp,position;
+ register cmph_uint32 probe0_num,probe1_num;
+ register cmph_uint32 f,g,h;
+
+ hash_vector_packed(hl_ptr, hl_type, key, keylen, hl);
+
+ g = hl[0] % nbuckets;
+ f = hl[1] % n;
+ h = hl[2] % (n-1) + 1;
+
+ disp = compressed_seq_query_packed(ptr, g);
+ probe0_num = disp % n;
+ probe1_num = disp/n;
+ position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % n);
+ return position;
+}
+
+
+