diff options
Diffstat (limited to 'girepository/cmph/chd_ph.c')
-rw-r--r-- | girepository/cmph/chd_ph.c | 988 |
1 files changed, 988 insertions, 0 deletions
diff --git a/girepository/cmph/chd_ph.c b/girepository/cmph/chd_ph.c new file mode 100644 index 00000000..b34415b9 --- /dev/null +++ b/girepository/cmph/chd_ph.c @@ -0,0 +1,988 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<math.h> +#include<time.h> +#include<assert.h> +#include<limits.h> + +#include "cmph_structs.h" +#include "chd_structs_ph.h" +#include "chd_ph.h" +#include"miller_rabin.h" +#include"bitbool.h" + + +//#define DEBUG +#include "debug.h" + +// NO_ELEMENT is equivalent to null pointer +#ifndef NO_ELEMENT +#define NO_ELEMENT UINT_MAX +#endif + +// struct used to represent items at mapping, ordering and searching phases +struct _chd_ph_item_t +{ + cmph_uint32 f; + cmph_uint32 h; +}; +typedef struct _chd_ph_item_t chd_ph_item_t; + +// struct to represent the items at mapping phase only. +struct _chd_ph_map_item_t +{ + cmph_uint32 f; + cmph_uint32 h; + cmph_uint32 bucket_num; +}; +typedef struct _chd_ph_map_item_t chd_ph_map_item_t; + +// struct to represent a bucket +struct _chd_ph_bucket_t +{ + cmph_uint32 items_list; // offset + union + { + cmph_uint32 size; + cmph_uint32 bucket_id; + }; +}; + +typedef struct _chd_ph_bucket_t chd_ph_bucket_t; + +struct _chd_ph_sorted_list_t +{ + cmph_uint32 buckets_list; + cmph_uint32 size; +}; + +typedef struct _chd_ph_sorted_list_t chd_ph_sorted_list_t; + + +static inline chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets); +static inline void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets); +static inline void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets); + +chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets) +{ + chd_ph_bucket_t * buckets = (chd_ph_bucket_t *) calloc(nbuckets, sizeof(chd_ph_bucket_t)); + return buckets; +} + +void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets) +{ + register cmph_uint32 i = 0; + assert(buckets); + for(i = 0; i < nbuckets; i++) + buckets[i].size = 0; +} +cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets,chd_ph_map_item_t * map_items, chd_ph_item_t * items, + cmph_uint32 nbuckets,cmph_uint32 item_idx) +{ + register cmph_uint32 i = 0; + register chd_ph_item_t * tmp_item; + register chd_ph_map_item_t * tmp_map_item = map_items + item_idx; + register chd_ph_bucket_t * bucket = buckets + tmp_map_item->bucket_num; + tmp_item = items + bucket->items_list; + + for(i = 0; i < bucket->size; i++) + { + if(tmp_item->f == tmp_map_item->f && tmp_item->h == tmp_map_item->h) + { + DEBUGP("Item not added\n"); + return 0; + }; + tmp_item++; + }; + tmp_item->f = tmp_map_item->f; + tmp_item->h = tmp_map_item->h; + bucket->size++; + return 1; +}; +void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets) +{ + free(buckets); +} + +static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, + cmph_uint32 *max_bucket_size); + +static chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets,chd_ph_item_t ** items, + cmph_uint32 nbuckets,cmph_uint32 nitems, cmph_uint32 max_bucket_size); + +static cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items , + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table); + +static inline double chd_ph_space_lower_bound(cmph_uint32 _n, cmph_uint32 _r) +{ + double r = _r, n = _n; + return (1 + (r/n - 1.0 + 1.0/(2.0*n))*log(1 - n/r))/log(2); +}; + +/* computes the entropy of non empty buckets.*/ +static inline double chd_ph_get_entropy(cmph_uint32 * disp_table, cmph_uint32 n, cmph_uint32 max_probes) +{ + register cmph_uint32 * probe_counts = (cmph_uint32 *) calloc(max_probes, sizeof(cmph_uint32)); + register cmph_uint32 i; + register double entropy = 0; + + for(i = 0; i < n; i++) + { + probe_counts[disp_table[i]]++; + }; + + for(i = 0; i < max_probes; i++) + { + if(probe_counts[i] > 0) + entropy -= probe_counts[i]*log((double)probe_counts[i]/(double)n)/log(2); + }; + free(probe_counts); + return entropy; +}; + +chd_ph_config_data_t *chd_ph_config_new() +{ + chd_ph_config_data_t *chd_ph; + chd_ph = (chd_ph_config_data_t *)malloc(sizeof(chd_ph_config_data_t)); + assert(chd_ph); + memset(chd_ph, 0, sizeof(chd_ph_config_data_t)); + + chd_ph->hashfunc = CMPH_HASH_JENKINS; + chd_ph->cs = NULL; + chd_ph->nbuckets = 0; + chd_ph->n = 0; + chd_ph->hl = NULL; + + chd_ph->m = 0; + chd_ph->use_h = 1; + chd_ph->keys_per_bin = 1; + chd_ph->keys_per_bucket = 4; + chd_ph->occup_table = 0; + + return chd_ph; +} + +void chd_ph_config_destroy(cmph_config_t *mph) +{ + chd_ph_config_data_t *data = (chd_ph_config_data_t *) mph->data; + DEBUGP("Destroying algorithm dependent data\n"); + if(data->occup_table) + { + free(data->occup_table); + data->occup_table = NULL; + } + free(data); +} + + +void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + CMPH_HASH *hashptr = hashfuncs; + cmph_uint32 i = 0; + while(*hashptr != CMPH_HASH_COUNT) + { + if (i >= 1) break; //chd_ph only uses one linear hash function + chd_ph->hashfunc = *hashptr; + ++i, ++hashptr; + } +} + + +void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket) +{ + assert(mph); + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + if(keys_per_bucket < 1 || keys_per_bucket >= 15) + { + keys_per_bucket = 4; + } + chd_ph->keys_per_bucket = keys_per_bucket; +} + + +void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin) +{ + assert(mph); + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + if(keys_per_bin <= 1 || keys_per_bin >= 128) + { + keys_per_bin = 1; + } + chd_ph->keys_per_bin = keys_per_bin; +} + +cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, cmph_uint32 *max_bucket_size) +{ + register cmph_uint32 i = 0, g = 0; + cmph_uint32 hl[3]; + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + char * key = NULL; + cmph_uint32 keylen = 0; + chd_ph_map_item_t * map_item; + chd_ph_map_item_t * map_items = malloc(chd_ph->m*sizeof(chd_ph_map_item_t)); + register cmph_uint32 mapping_iterations = 1000; + *max_bucket_size = 0; + while(1) + { + mapping_iterations--; + if (chd_ph->hl) hash_state_destroy(chd_ph->hl); + chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m); + + chd_ph_bucket_clean(buckets, chd_ph->nbuckets); + + mph->key_source->rewind(mph->key_source->data); + + for(i = 0; i < chd_ph->m; i++) + { + mph->key_source->read(mph->key_source->data, &key, &keylen); + hash_vector(chd_ph->hl, key, keylen, hl); + + map_item = (map_items + i); + + g = hl[0] % chd_ph->nbuckets; + map_item->f = hl[1] % chd_ph->n; + map_item->h = hl[2] % (chd_ph->n - 1) + 1; + map_item->bucket_num=g; + mph->key_source->dispose(mph->key_source->data, key, keylen); +// if(buckets[g].size == (chd_ph->keys_per_bucket << 2)) +// { +// DEBUGP("BUCKET = %u -- SIZE = %u -- MAXIMUM SIZE = %u\n", g, buckets[g].size, (chd_ph->keys_per_bucket << 2)); +// goto error; +// } + buckets[g].size++; + if(buckets[g].size > *max_bucket_size) + { + *max_bucket_size = buckets[g].size; + } + } + buckets[0].items_list = 0; + for(i = 1; i < chd_ph->nbuckets; i++) + { + buckets[i].items_list = buckets[i-1].items_list + buckets[i - 1].size; + buckets[i - 1].size = 0; + }; + buckets[i - 1].size = 0; + for(i = 0; i < chd_ph->m; i++) + { + map_item = (map_items + i); + if(!chd_ph_bucket_insert(buckets, map_items, items, chd_ph->nbuckets, i)) + break; + } + if(i == chd_ph->m) + { + free(map_items); + return 1; // SUCCESS + } + + if(mapping_iterations == 0) + { + goto error; + } + } +error: + free(map_items); + hash_state_destroy(chd_ph->hl); + chd_ph->hl = NULL; + return 0; // FAILURE +} + +chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_t ** _items, + cmph_uint32 nbuckets, cmph_uint32 nitems, cmph_uint32 max_bucket_size) +{ + chd_ph_sorted_list_t * sorted_lists = (chd_ph_sorted_list_t *) calloc(max_bucket_size + 1, sizeof(chd_ph_sorted_list_t)); + + chd_ph_bucket_t * input_buckets = (*_buckets); + chd_ph_bucket_t * output_buckets; + chd_ph_item_t * input_items = (*_items); + chd_ph_item_t * output_items; + register cmph_uint32 i, j, bucket_size, position, position2; +// cmph_uint32 non_empty_buckets; + DEBUGP("MAX BUCKET SIZE = %u\n", max_bucket_size); + // Determine size of each list of buckets + for(i = 0; i < nbuckets; i++) + { + bucket_size = input_buckets[i].size; + if(bucket_size == 0) + continue; + sorted_lists[bucket_size].size++; + }; + sorted_lists[1].buckets_list = 0; + // Determine final position of list of buckets into the contiguous array that will store all the buckets + for(i = 2; i <= max_bucket_size; i++) + { + sorted_lists[i].buckets_list = sorted_lists[i-1].buckets_list + sorted_lists[i-1].size; + sorted_lists[i-1].size = 0; + }; + sorted_lists[i-1].size = 0; + // Store the buckets in a new array which is sorted by bucket sizes + output_buckets = calloc(nbuckets, sizeof(chd_ph_bucket_t)); // everything is initialized with zero +// non_empty_buckets = nbuckets; + + for(i = 0; i < nbuckets; i++) + { + bucket_size = input_buckets[i].size; + if(bucket_size == 0) + { +// non_empty_buckets--; + continue; + }; + position = sorted_lists[bucket_size].buckets_list + sorted_lists[bucket_size].size; + output_buckets[position].bucket_id = i; + output_buckets[position].items_list = input_buckets[i].items_list; + sorted_lists[bucket_size].size++; + }; +/* for(i = non_empty_buckets; i < nbuckets; i++) + output_buckets[i].size=0;*/ + // Return the buckets sorted in new order and free the old buckets sorted in old order + free(input_buckets); + (*_buckets) = output_buckets; + + + // Store the items according to the new order of buckets. + output_items = (chd_ph_item_t*)calloc(nitems, sizeof(chd_ph_item_t)); + position = 0; + i = 0; + for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++) + { + for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size + sorted_lists[bucket_size].buckets_list; i++) + { + position2 = output_buckets[i].items_list; + output_buckets[i].items_list = position; + for(j = 0; j < bucket_size; j++) + { + output_items[position].f = input_items[position2].f; + output_items[position].h = input_items[position2].h; + position++; + position2++; + }; + }; + }; + //Return the items sorted in new order and free the old items sorted in old order + free(input_items); + (*_items) = output_items; + return sorted_lists; +}; + +static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, + chd_ph_item_t *items, cmph_uint32 probe0_num, cmph_uint32 probe1_num, + cmph_uint32 bucket_num, cmph_uint32 size) +{ + register cmph_uint32 i; + register chd_ph_item_t * item; + register cmph_uint32 position; + + item = items + buckets[bucket_num].items_list; + // try place bucket with probe_num + if(chd_ph->keys_per_bin > 1) + { + for(i = 0; i < size; i++) // placement + { + position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n); + if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin) + { + break; + } + (chd_ph->occup_table[position])++; + item++; + }; + } else + { + for(i = 0; i < size; i++) // placement + { + position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n); + if(GETBIT32(((cmph_uint32 *)chd_ph->occup_table), position)) + { + break; + } + SETBIT32(((cmph_uint32*)chd_ph->occup_table), position); + item++; + }; + }; + if(i != size) // Undo the placement + { + item = items + buckets[bucket_num].items_list; + if(chd_ph->keys_per_bin > 1) + { + while(1) + { + if(i == 0) + { + break; + } + position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n); + (chd_ph->occup_table[position])--; + item++; + i--; + }; + } else + { + while(1) + { + if(i == 0) + { + break; + } + position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n); + UNSETBIT32(((cmph_uint32*)chd_ph->occup_table), position); + +// ([position/32]^=(1<<(position%32)); + item++; + i--; + }; + }; + return 0; + } + return 1; +}; + +static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, cmph_uint32 max_probes, + cmph_uint32 * disp_table, cmph_uint32 bucket_num, cmph_uint32 size) + +{ + register cmph_uint32 probe0_num, probe1_num, probe_num; + probe0_num = 0; + probe1_num = 0; + probe_num = 0; + + while(1) + { + if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, bucket_num,size)) + { + disp_table[buckets[bucket_num].bucket_id] = probe0_num + probe1_num * chd_ph->n; + return 1; + } + probe0_num++; + if(probe0_num >= chd_ph->n) + { + probe0_num -= chd_ph->n; + probe1_num++; + }; + probe_num++; + if(probe_num >= max_probes || probe1_num >= chd_ph->n) + { + return 0; + }; + }; + return 0; +}; + +static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t * buckets, chd_ph_item_t *items, + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, + cmph_uint32 * disp_table) +{ + register cmph_uint32 i = 0; + register cmph_uint32 curr_bucket = 0; + + for(i = max_bucket_size; i > 0; i--) + { + curr_bucket = sorted_lists[i].buckets_list; + while(curr_bucket < sorted_lists[i].size + sorted_lists[i].buckets_list) + { + if(!place_bucket(chd_ph, buckets, items, max_probes, disp_table, curr_bucket, i)) + { + return 0; + } + curr_bucket++; + }; + }; + return 1; +}; + +static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, + cmph_uint32 * disp_table) +{ + register cmph_uint32 i,j, non_placed_bucket; + register cmph_uint32 curr_bucket; + register cmph_uint32 probe_num, probe0_num, probe1_num; + cmph_uint32 sorted_list_size; +#ifdef DEBUG + cmph_uint32 items_list; + cmph_uint32 bucket_id; +#endif + DEBUGP("USING HEURISTIC TO PLACE BUCKETS\n"); + for(i = max_bucket_size; i > 0; i--) + { + probe_num = 0; + probe0_num = 0; + probe1_num = 0; + sorted_list_size = sorted_lists[i].size; + while(sorted_lists[i].size != 0) + { + curr_bucket = sorted_lists[i].buckets_list; + for(j = 0, non_placed_bucket = 0; j < sorted_lists[i].size; j++) + { + // if bucket is successfully placed remove it from list + if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, curr_bucket, i)) + { + disp_table[buckets[curr_bucket].bucket_id] = probe0_num + probe1_num * chd_ph->n; +// DEBUGP("BUCKET %u PLACED --- DISPLACEMENT = %u\n", curr_bucket, disp_table[curr_bucket]); + } + else + { +// DEBUGP("BUCKET %u NOT PLACED\n", curr_bucket); +#ifdef DEBUG + items_list = buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list; + bucket_id = buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id; +#endif + buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list = buckets[curr_bucket].items_list; + buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id = buckets[curr_bucket].bucket_id; +#ifdef DEBUG + buckets[curr_bucket].items_list=items_list; + buckets[curr_bucket].bucket_id=bucket_id; +#endif + non_placed_bucket++; + } + curr_bucket++; + }; + sorted_lists[i].size = non_placed_bucket; + probe0_num++; + if(probe0_num >= chd_ph->n) + { + probe0_num -= chd_ph->n; + probe1_num++; + }; + probe_num++; + if(probe_num >= max_probes || probe1_num >= chd_ph->n) + { + sorted_lists[i].size = sorted_list_size; + return 0; + }; + }; + sorted_lists[i].size = sorted_list_size; + }; + return 1; +}; + +cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items , + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, + cmph_uint32 * disp_table) +{ + if(chd_ph->use_h) + { + return place_buckets2(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table); + } + else + { + return place_buckets1(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table); + } + +} + +static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items, + cmph_uint32 * disp_table, chd_ph_sorted_list_t * sorted_lists,cmph_uint32 max_bucket_size) +{ + register cmph_uint32 bucket_size, i, j; + register cmph_uint32 position, probe0_num, probe1_num; + register cmph_uint32 m = 0; + register chd_ph_item_t * item; + if(chd_ph->keys_per_bin > 1) + memset(chd_ph->occup_table, 0, chd_ph->n); + else + memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32)); + + for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++) + for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size + + sorted_lists[bucket_size].buckets_list; i++) + { + j = bucket_size; + item = items + buckets[i].items_list; + probe0_num = disp_table[buckets[i].bucket_id] % chd_ph->n; + probe1_num = disp_table[buckets[i].bucket_id] / chd_ph->n; + for(; j > 0; j--) + { + m++; + position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n); + if(chd_ph->keys_per_bin > 1) + { + if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin) + { + return 0; + } + (chd_ph->occup_table[position])++; + } + else + { + if(GETBIT32(((cmph_uint32*)chd_ph->occup_table), position)) + { + return 0; + } + SETBIT32(((cmph_uint32*)chd_ph->occup_table), position); + }; + item++; + }; + }; + DEBUGP("We were able to place m = %u keys\n", m); + return 1; +}; + + +cmph_t *chd_ph_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + chd_ph_data_t *chd_phf = NULL; + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + + register double load_factor = c; + register cmph_uint8 searching_success = 0; + register cmph_uint32 max_probes = 1 << 20; // default value for max_probes + register cmph_uint32 iterations = 100; + chd_ph_bucket_t * buckets = NULL; + chd_ph_item_t * items = NULL; + register cmph_uint8 failure = 0; + cmph_uint32 max_bucket_size = 0; + chd_ph_sorted_list_t * sorted_lists = NULL; + cmph_uint32 * disp_table = NULL; + register double space_lower_bound = 0; + #ifdef CMPH_TIMING + double construction_time_begin = 0.0; + double construction_time = 0.0; + ELAPSED_TIME_IN_SECONDS(&construction_time_begin); + #endif + + + chd_ph->m = mph->key_source->nkeys; + DEBUGP("m = %u\n", chd_ph->m); + + chd_ph->nbuckets = (cmph_uint32)(chd_ph->m/chd_ph->keys_per_bucket) + 1; + DEBUGP("nbuckets = %u\n", chd_ph->nbuckets); + + if(load_factor < 0.5 ) + { + load_factor = 0.5; + } + + if(load_factor >= 0.99) + { + load_factor = 0.99; + } + + DEBUGP("load_factor = %.3f\n", load_factor); + + chd_ph->n = (cmph_uint32)(chd_ph->m/(chd_ph->keys_per_bin * load_factor)) + 1; + + //Round the number of bins to the prime immediately above + if(chd_ph->n % 2 == 0) chd_ph->n++; + for(;;) + { + if(check_primality(chd_ph->n) == 1) + break; + chd_ph->n += 2; // just odd numbers can be primes for n > 2 + + }; + + DEBUGP("n = %u \n", chd_ph->n); + if(chd_ph->keys_per_bin == 1) + { + space_lower_bound = chd_ph_space_lower_bound(chd_ph->m, chd_ph->n); + } + + if(mph->verbosity) + { + fprintf(stderr, "space lower bound is %.3f bits per key\n", space_lower_bound); + } + + // We allocate the working tables + buckets = chd_ph_bucket_new(chd_ph->nbuckets); + items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t)); + + max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes); + + if(chd_ph->keys_per_bin == 1) + chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32)); + else + chd_ph->occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8)); + + disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32)); +// +// init_genrand(time(0)); + + while(1) + { + iterations --; + if (mph->verbosity) + { + fprintf(stderr, "Starting mapping step for mph creation of %u keys with %u bins\n", chd_ph->m, chd_ph->n); + } + + if(!chd_ph_mapping(mph, buckets, items, &max_bucket_size)) + { + if (mph->verbosity) + { + fprintf(stderr, "Failure in mapping step\n"); + } + failure = 1; + goto cleanup; + } + + if (mph->verbosity) + { + fprintf(stderr, "Starting ordering step\n"); + } + if(sorted_lists) + { + free(sorted_lists); + } + + sorted_lists = chd_ph_ordering(&buckets, &items, chd_ph->nbuckets, chd_ph->m, max_bucket_size); + + if (mph->verbosity) + { + fprintf(stderr, "Starting searching step\n"); + } + + searching_success = chd_ph_searching(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table); + if(searching_success) break; + + // reset occup_table + if(chd_ph->keys_per_bin > 1) + memset(chd_ph->occup_table, 0, chd_ph->n); + else + memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32)); + if(iterations == 0) + { + // Cleanup memory + if (mph->verbosity) + { + fprintf(stderr, "Failure because the max trials was exceeded\n"); + } + failure = 1; + goto cleanup; + }; + } + + #ifdef DEBUG + { + if(!chd_ph_check_bin_hashing(chd_ph, buckets, items, disp_table,sorted_lists,max_bucket_size)) + { + + DEBUGP("Error for bin packing generation"); + failure = 1; + goto cleanup; + } + } + #endif + + if (mph->verbosity) + { + fprintf(stderr, "Starting compressing step\n"); + } + + if(chd_ph->cs) + { + free(chd_ph->cs); + } + chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); + compressed_seq_init(chd_ph->cs); + compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets); + + #ifdef CMPH_TIMING + ELAPSED_TIME_IN_SECONDS(&construction_time); + register double entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes); + DEBUGP("Entropy = %.4f\n", entropy/chd_ph->m); + #endif + +cleanup: + chd_ph_bucket_destroy(buckets); + free(items); + free(sorted_lists); + free(disp_table); + if(failure) + { + if(chd_ph->hl) + { + hash_state_destroy(chd_ph->hl); + } + chd_ph->hl = NULL; + return NULL; + } + + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + chd_phf = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t)); + + chd_phf->cs = chd_ph->cs; + chd_ph->cs = NULL; //transfer memory ownership + chd_phf->hl = chd_ph->hl; + chd_ph->hl = NULL; //transfer memory ownership + chd_phf->n = chd_ph->n; + chd_phf->nbuckets = chd_ph->nbuckets; + + mphf->data = chd_phf; + mphf->size = chd_ph->n; + + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + + #ifdef CMPH_TIMING + register cmph_uint32 space_usage = chd_ph_packed_size(mphf)*8; + construction_time = construction_time - construction_time_begin; + fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\t%.4f\t%.4f\n", chd_ph->m, load_factor, chd_ph->keys_per_bucket, construction_time, space_usage/(double)chd_ph->m, space_lower_bound, entropy/chd_ph->m); + #endif + + return mphf; +} + + + +void chd_ph_load(FILE *fd, cmph_t *mphf) +{ + char *buf = NULL; + cmph_uint32 buflen; + register size_t nbytes; + chd_ph_data_t *chd_ph = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t)); + + DEBUGP("Loading chd_ph mphf\n"); + mphf->data = chd_ph; + + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + DEBUGP("Hash state has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, fd); + chd_ph->hl = hash_state_load(buf, buflen); + free(buf); + + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + DEBUGP("Compressed sequence structure has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, fd); + chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); + compressed_seq_load(chd_ph->cs, buf, buflen); + free(buf); + + // loading n and nbuckets + DEBUGP("Reading n and nbuckets\n"); + nbytes = fread(&(chd_ph->n), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fread(&(chd_ph->nbuckets), sizeof(cmph_uint32), (size_t)1, fd); +} + +int chd_ph_dump(cmph_t *mphf, FILE *fd) +{ + char *buf = NULL; + cmph_uint32 buflen; + register size_t nbytes; + chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; + + __cmph_dump(mphf, fd); + + hash_state_dump(data->hl, &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + compressed_seq_dump(data->cs, &buf, &buflen); + DEBUGP("Dumping compressed sequence structure with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + // dumping n and nbuckets + nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->nbuckets), sizeof(cmph_uint32), (size_t)1, fd); + return 1; +} + +void chd_ph_destroy(cmph_t *mphf) +{ + chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; + compressed_seq_destroy(data->cs); + free(data->cs); + hash_state_destroy(data->hl); + free(data); + free(mphf); + +} + +cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + register chd_ph_data_t * chd_ph = mphf->data; + cmph_uint32 hl[3]; + register cmph_uint32 disp,position; + register cmph_uint32 probe0_num,probe1_num; + register cmph_uint32 f,g,h; + hash_vector(chd_ph->hl, key, keylen, hl); + g = hl[0] % chd_ph->nbuckets; + f = hl[1] % chd_ph->n; + h = hl[2] % (chd_ph->n-1) + 1; + + disp = compressed_seq_query(chd_ph->cs, g); + probe0_num = disp % chd_ph->n; + probe1_num = disp/chd_ph->n; + position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % chd_ph->n); + return position; +} + +void chd_ph_pack(cmph_t *mphf, void *packed_mphf) +{ + chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; + cmph_uint8 * ptr = packed_mphf; + + // packing hl type + CMPH_HASH hl_type = hash_get_type(data->hl); + *((cmph_uint32 *) ptr) = hl_type; + ptr += sizeof(cmph_uint32); + + // packing hl + hash_state_pack(data->hl, ptr); + ptr += hash_state_packed_size(hl_type); + + // packing n + *((cmph_uint32 *) ptr) = data->n; + ptr += sizeof(data->n); + + // packing nbuckets + *((cmph_uint32 *) ptr) = data->nbuckets; + ptr += sizeof(data->nbuckets); + + // packing cs + compressed_seq_pack(data->cs, ptr); + //ptr += compressed_seq_packed_size(data->cs); + +} + +cmph_uint32 chd_ph_packed_size(cmph_t *mphf) +{ + register chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; + register CMPH_HASH hl_type = hash_get_type(data->hl); + register cmph_uint32 hash_state_pack_size = hash_state_packed_size(hl_type); + register cmph_uint32 cs_pack_size = compressed_seq_packed_size(data->cs); + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_pack_size + cs_pack_size + 3*sizeof(cmph_uint32)); + +} + +cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf; + register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4; + + register cmph_uint32 * ptr = (cmph_uint32 *)(hl_ptr + hash_state_packed_size(hl_type)); + register cmph_uint32 n = *ptr++; + register cmph_uint32 nbuckets = *ptr++; + cmph_uint32 hl[3]; + + register cmph_uint32 disp,position; + register cmph_uint32 probe0_num,probe1_num; + register cmph_uint32 f,g,h; + + hash_vector_packed(hl_ptr, hl_type, key, keylen, hl); + + g = hl[0] % nbuckets; + f = hl[1] % n; + h = hl[2] % (n-1) + 1; + + disp = compressed_seq_query_packed(ptr, g); + probe0_num = disp % n; + probe1_num = disp/n; + position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % n); + return position; +} + + + |