diff options
Diffstat (limited to 'girepository/cmph/brz.c')
-rwxr-xr-x | girepository/cmph/brz.c | 985 |
1 files changed, 985 insertions, 0 deletions
diff --git a/girepository/cmph/brz.c b/girepository/cmph/brz.c new file mode 100755 index 00000000..eb89ac06 --- /dev/null +++ b/girepository/cmph/brz.c @@ -0,0 +1,985 @@ +#include "graph.h" +#include "fch.h" +#include "fch_structs.h" +#include "bmz8.h" +#include "bmz8_structs.h" +#include "brz.h" +#include "cmph_structs.h" +#include "brz_structs.h" +#include "buffer_manager.h" +#include "cmph.h" +#include "hash.h" +#include "bitbool.h" +#include <math.h> +#include <stdlib.h> +#include <stdio.h> +#include <assert.h> +#include <string.h> +#define MAX_BUCKET_SIZE 255 +//#define DEBUG +#include "debug.h" + +static int brz_gen_mphf(cmph_config_t *mph); +static cmph_uint32 brz_min_index(cmph_uint32 * vector, cmph_uint32 n); +static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint32 nkeys); +static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen); +static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen); +brz_config_data_t *brz_config_new() +{ + brz_config_data_t *brz = NULL; + brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t)); + brz->algo = CMPH_FCH; + brz->b = 128; + brz->hashfuncs[0] = CMPH_HASH_JENKINS; + brz->hashfuncs[1] = CMPH_HASH_JENKINS; + brz->hashfuncs[2] = CMPH_HASH_JENKINS; + brz->size = NULL; + brz->offset = NULL; + brz->g = NULL; + brz->h1 = NULL; + brz->h2 = NULL; + brz->h0 = NULL; + brz->memory_availability = 1024*1024; + brz->tmp_dir = (cmph_uint8 *)calloc((size_t)10, sizeof(cmph_uint8)); + brz->mphf_fd = NULL; + strcpy((char *)(brz->tmp_dir), "/var/tmp/"); + assert(brz); + return brz; +} + +void brz_config_destroy(cmph_config_t *mph) +{ + brz_config_data_t *data = (brz_config_data_t *)mph->data; + free(data->tmp_dir); + DEBUGP("Destroying algorithm dependent data\n"); + free(data); +} + +void brz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + CMPH_HASH *hashptr = hashfuncs; + cmph_uint32 i = 0; + while(*hashptr != CMPH_HASH_COUNT) + { + if (i >= 3) break; //brz only uses three hash functions + brz->hashfuncs[i] = *hashptr; + ++i, ++hashptr; + } +} + +void brz_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability) +{ + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + if(memory_availability > 0) brz->memory_availability = memory_availability*1024*1024; +} + +void brz_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir) +{ + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + if(tmp_dir) + { + size_t len = strlen((char *)tmp_dir); + free(brz->tmp_dir); + if(tmp_dir[len-1] != '/') + { + brz->tmp_dir = (cmph_uint8 *)calloc((size_t)len+2, sizeof(cmph_uint8)); + sprintf((char *)(brz->tmp_dir), "%s/", (char *)tmp_dir); + } + else + { + brz->tmp_dir = (cmph_uint8 *)calloc((size_t)len+1, sizeof(cmph_uint8)); + sprintf((char *)(brz->tmp_dir), "%s", (char *)tmp_dir); + } + + } +} + +void brz_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd) +{ + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + brz->mphf_fd = mphf_fd; + assert(brz->mphf_fd); +} + +void brz_config_set_b(cmph_config_t *mph, cmph_uint32 b) +{ + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + if(b <= 64 || b >= 175) + { + b = 128; + } + brz->b = (cmph_uint8)b; +} + +void brz_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) +{ + if (algo == CMPH_BMZ8 || algo == CMPH_FCH) // supported algorithms + { + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + brz->algo = algo; + } +} + +cmph_t *brz_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + brz_data_t *brzf = NULL; + cmph_uint32 i; + cmph_uint32 iterations = 20; + + DEBUGP("c: %f\n", c); + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + switch(brz->algo) // validating restrictions over parameter c. + { + case CMPH_BMZ8: + if (c == 0 || c >= 2.0) c = 1; + break; + case CMPH_FCH: + if (c <= 2.0) c = 2.6; + break; + default: + assert(0); + } + brz->c = c; + brz->m = mph->key_source->nkeys; + DEBUGP("m: %u\n", brz->m); + brz->k = (cmph_uint32)ceil(brz->m/((double)brz->b)); + DEBUGP("k: %u\n", brz->k); + brz->size = (cmph_uint8 *) calloc((size_t)brz->k, sizeof(cmph_uint8)); + + // Clustering the keys by graph id. + if (mph->verbosity) + { + fprintf(stderr, "Partioning the set of keys.\n"); + } + + while(1) + { + int ok; + DEBUGP("hash function 3\n"); + brz->h0 = hash_state_new(brz->hashfuncs[2], brz->k); + DEBUGP("Generating graphs\n"); + ok = brz_gen_mphf(mph); + if (!ok) + { + --iterations; + hash_state_destroy(brz->h0); + brz->h0 = NULL; + DEBUGP("%u iterations remaining to create the graphs in a external file\n", iterations); + if (mph->verbosity) + { + fprintf(stderr, "Failure: A graph with more than 255 keys was created - %u iterations remaining\n", iterations); + } + if (iterations == 0) break; + } + else break; + } + if (iterations == 0) + { + DEBUGP("Graphs with more than 255 keys were created in all 20 iterations\n"); + free(brz->size); + return NULL; + } + DEBUGP("Graphs generated\n"); + + brz->offset = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32)); + for (i = 1; i < brz->k; ++i) + { + brz->offset[i] = brz->size[i-1] + brz->offset[i-1]; + } + // Generating a mphf + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + brzf = (brz_data_t *)malloc(sizeof(brz_data_t)); + brzf->g = brz->g; + brz->g = NULL; //transfer memory ownership + brzf->h1 = brz->h1; + brz->h1 = NULL; //transfer memory ownership + brzf->h2 = brz->h2; + brz->h2 = NULL; //transfer memory ownership + brzf->h0 = brz->h0; + brz->h0 = NULL; //transfer memory ownership + brzf->size = brz->size; + brz->size = NULL; //transfer memory ownership + brzf->offset = brz->offset; + brz->offset = NULL; //transfer memory ownership + brzf->k = brz->k; + brzf->c = brz->c; + brzf->m = brz->m; + brzf->algo = brz->algo; + mphf->data = brzf; + mphf->size = brz->m; + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + return mphf; +} + +static int brz_gen_mphf(cmph_config_t *mph) +{ + cmph_uint32 i, e, error; + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + cmph_uint32 memory_usage = 0; + cmph_uint32 nkeys_in_buffer = 0; + cmph_uint8 *buffer = (cmph_uint8 *)malloc((size_t)brz->memory_availability); + cmph_uint32 *buckets_size = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32)); + cmph_uint32 *keys_index = NULL; + cmph_uint8 **buffer_merge = NULL; + cmph_uint32 *buffer_h0 = NULL; + cmph_uint32 nflushes = 0; + cmph_uint32 h0; + register size_t nbytes; + FILE * tmp_fd = NULL; + buffer_manager_t * buff_manager = NULL; + char *filename = NULL; + char *key = NULL; + cmph_uint32 keylen; + cmph_uint32 cur_bucket = 0; + cmph_uint8 nkeys_vd = 0; + cmph_uint8 ** keys_vd = NULL; + + mph->key_source->rewind(mph->key_source->data); + DEBUGP("Generating graphs from %u keys\n", brz->m); + // Partitioning + for (e = 0; e < brz->m; ++e) + { + mph->key_source->read(mph->key_source->data, &key, &keylen); + + /* Buffers management */ + if (memory_usage + keylen + sizeof(keylen) > brz->memory_availability) // flush buffers + { + if(mph->verbosity) + { + fprintf(stderr, "Flushing %u\n", nkeys_in_buffer); + } + cmph_uint32 value = buckets_size[0]; + cmph_uint32 sum = 0; + cmph_uint32 keylen1 = 0; + buckets_size[0] = 0; + for(i = 1; i < brz->k; i++) + { + if(buckets_size[i] == 0) continue; + sum += value; + value = buckets_size[i]; + buckets_size[i] = sum; + + } + memory_usage = 0; + keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32)); + for(i = 0; i < nkeys_in_buffer; i++) + { + memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1)); + h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k; + keys_index[buckets_size[h0]] = memory_usage; + buckets_size[h0]++; + memory_usage += keylen1 + (cmph_uint32)sizeof(keylen1); + } + filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char)); + sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes); + tmp_fd = fopen(filename, "wb"); + free(filename); + filename = NULL; + for(i = 0; i < nkeys_in_buffer; i++) + { + memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1)); + nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), tmp_fd); + } + nkeys_in_buffer = 0; + memory_usage = 0; + memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32)); + nflushes++; + free(keys_index); + fclose(tmp_fd); + } + memcpy(buffer + memory_usage, &keylen, sizeof(keylen)); + memcpy(buffer + memory_usage + sizeof(keylen), key, (size_t)keylen); + memory_usage += keylen + (cmph_uint32)sizeof(keylen); + h0 = hash(brz->h0, key, keylen) % brz->k; + + if ((brz->size[h0] == MAX_BUCKET_SIZE) || (brz->algo == CMPH_BMZ8 && ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0]))) + { + free(buffer); + free(buckets_size); + return 0; + } + brz->size[h0] = (cmph_uint8)(brz->size[h0] + 1U); + buckets_size[h0] ++; + nkeys_in_buffer++; + mph->key_source->dispose(mph->key_source->data, key, keylen); + } + if (memory_usage != 0) // flush buffers + { + if(mph->verbosity) + { + fprintf(stderr, "Flushing %u\n", nkeys_in_buffer); + } + cmph_uint32 value = buckets_size[0]; + cmph_uint32 sum = 0; + cmph_uint32 keylen1 = 0; + buckets_size[0] = 0; + for(i = 1; i < brz->k; i++) + { + if(buckets_size[i] == 0) continue; + sum += value; + value = buckets_size[i]; + buckets_size[i] = sum; + } + memory_usage = 0; + keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32)); + for(i = 0; i < nkeys_in_buffer; i++) + { + memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1)); + h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k; + keys_index[buckets_size[h0]] = memory_usage; + buckets_size[h0]++; + memory_usage += keylen1 + (cmph_uint32)sizeof(keylen1); + } + filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char)); + sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes); + tmp_fd = fopen(filename, "wb"); + free(filename); + filename = NULL; + for(i = 0; i < nkeys_in_buffer; i++) + { + memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1)); + nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), tmp_fd); + } + nkeys_in_buffer = 0; + memory_usage = 0; + memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32)); + nflushes++; + free(keys_index); + fclose(tmp_fd); + } + + free(buffer); + free(buckets_size); + if(nflushes > 1024) return 0; // Too many files generated. + // mphf generation + if(mph->verbosity) + { + fprintf(stderr, "\nMPHF generation \n"); + } + /* Starting to dump to disk the resultant MPHF: __cmph_dump function */ + nbytes = fwrite(cmph_names[CMPH_BRZ], (size_t)(strlen(cmph_names[CMPH_BRZ]) + 1), (size_t)1, brz->mphf_fd); + nbytes = fwrite(&(brz->m), sizeof(brz->m), (size_t)1, brz->mphf_fd); + nbytes = fwrite(&(brz->c), sizeof(double), (size_t)1, brz->mphf_fd); + nbytes = fwrite(&(brz->algo), sizeof(brz->algo), (size_t)1, brz->mphf_fd); + nbytes = fwrite(&(brz->k), sizeof(cmph_uint32), (size_t)1, brz->mphf_fd); // number of MPHFs + nbytes = fwrite(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, brz->mphf_fd); + + //tmp_fds = (FILE **)calloc(nflushes, sizeof(FILE *)); + buff_manager = buffer_manager_new(brz->memory_availability, nflushes); + buffer_merge = (cmph_uint8 **)calloc((size_t)nflushes, sizeof(cmph_uint8 *)); + buffer_h0 = (cmph_uint32 *)calloc((size_t)nflushes, sizeof(cmph_uint32)); + + memory_usage = 0; + for(i = 0; i < nflushes; i++) + { + filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char)); + sprintf(filename, "%s%u.cmph",brz->tmp_dir, i); + buffer_manager_open(buff_manager, i, filename); + free(filename); + filename = NULL; + key = (char *)buffer_manager_read_key(buff_manager, i, &keylen); + h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k; + buffer_h0[i] = h0; + buffer_merge[i] = (cmph_uint8 *)key; + key = NULL; //transfer memory ownership + } + e = 0; + keys_vd = (cmph_uint8 **)calloc((size_t)MAX_BUCKET_SIZE, sizeof(cmph_uint8 *)); + nkeys_vd = 0; + error = 0; + while(e < brz->m) + { + i = brz_min_index(buffer_h0, nflushes); + cur_bucket = buffer_h0[i]; + key = (char *)buffer_manager_read_key(buff_manager, i, &keylen); + if(key) + { + while(key) + { + //keylen = strlen(key); + h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k; + if (h0 != buffer_h0[i]) break; + keys_vd[nkeys_vd++] = (cmph_uint8 *)key; + key = NULL; //transfer memory ownership + e++; + key = (char *)buffer_manager_read_key(buff_manager, i, &keylen); + } + if (key) + { + assert(nkeys_vd < brz->size[cur_bucket]); + keys_vd[nkeys_vd++] = buffer_merge[i]; + buffer_merge[i] = NULL; //transfer memory ownership + e++; + buffer_h0[i] = h0; + buffer_merge[i] = (cmph_uint8 *)key; + } + } + if(!key) + { + assert(nkeys_vd < brz->size[cur_bucket]); + keys_vd[nkeys_vd++] = buffer_merge[i]; + buffer_merge[i] = NULL; //transfer memory ownership + e++; + buffer_h0[i] = UINT_MAX; + } + + if(nkeys_vd == brz->size[cur_bucket]) // Generating mphf for each bucket. + { + cmph_io_adapter_t *source = NULL; + cmph_config_t *config = NULL; + cmph_t *mphf_tmp = NULL; + char *bufmphf = NULL; + cmph_uint32 buflenmphf = 0; + // Source of keys + source = cmph_io_byte_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd); + config = cmph_config_new(source); + cmph_config_set_algo(config, brz->algo); + //cmph_config_set_algo(config, CMPH_BMZ8); + cmph_config_set_graphsize(config, brz->c); + mphf_tmp = cmph_new(config); + if (mphf_tmp == NULL) + { + if(mph->verbosity) fprintf(stderr, "ERROR: Can't generate MPHF for bucket %u out of %u\n", cur_bucket + 1, brz->k); + error = 1; + cmph_config_destroy(config); + brz_destroy_keys_vd(keys_vd, nkeys_vd); + cmph_io_byte_vector_adapter_destroy(source); + break; + } + if(mph->verbosity) + { + if (cur_bucket % 1000 == 0) + { + fprintf(stderr, "MPHF for bucket %u out of %u was generated.\n", cur_bucket + 1, brz->k); + } + } + switch(brz->algo) + { + case CMPH_FCH: + { + fch_data_t * fchf = NULL; + fchf = (fch_data_t *)mphf_tmp->data; + bufmphf = brz_copy_partial_fch_mphf(brz, fchf, cur_bucket, &buflenmphf); + } + break; + case CMPH_BMZ8: + { + bmz8_data_t * bmzf = NULL; + bmzf = (bmz8_data_t *)mphf_tmp->data; + bufmphf = brz_copy_partial_bmz8_mphf(brz, bmzf, cur_bucket, &buflenmphf); + } + break; + default: assert(0); + } + nbytes = fwrite(bufmphf, (size_t)buflenmphf, (size_t)1, brz->mphf_fd); + free(bufmphf); + bufmphf = NULL; + cmph_config_destroy(config); + brz_destroy_keys_vd(keys_vd, nkeys_vd); + cmph_destroy(mphf_tmp); + cmph_io_byte_vector_adapter_destroy(source); + nkeys_vd = 0; + } + } + buffer_manager_destroy(buff_manager); + free(keys_vd); + free(buffer_merge); + free(buffer_h0); + if (error) return 0; + return 1; +} + +static cmph_uint32 brz_min_index(cmph_uint32 * vector, cmph_uint32 n) +{ + cmph_uint32 i, min_index = 0; + for(i = 1; i < n; i++) + { + if(vector[i] < vector[min_index]) min_index = i; + } + return min_index; +} + +static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint32 nkeys) +{ + cmph_uint8 i; + for(i = 0; i < nkeys; i++) { free(keys_vd[i]); keys_vd[i] = NULL;} +} + +static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen) +{ + cmph_uint32 i = 0; + cmph_uint32 buflenh1 = 0; + cmph_uint32 buflenh2 = 0; + char * bufh1 = NULL; + char * bufh2 = NULL; + char * buf = NULL; + cmph_uint32 n = fchf->b;//brz->size[index]; + hash_state_dump(fchf->h1, &bufh1, &buflenh1); + hash_state_dump(fchf->h2, &bufh2, &buflenh2); + *buflen = buflenh1 + buflenh2 + n + 2U * (cmph_uint32)sizeof(cmph_uint32); + buf = (char *)malloc((size_t)(*buflen)); + memcpy(buf, &buflenh1, sizeof(cmph_uint32)); + memcpy(buf+sizeof(cmph_uint32), bufh1, (size_t)buflenh1); + memcpy(buf+sizeof(cmph_uint32)+buflenh1, &buflenh2, sizeof(cmph_uint32)); + memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, (size_t)buflenh2); + for (i = 0; i < n; i++) memcpy(buf+2*sizeof(cmph_uint32)+buflenh1+buflenh2+i,(fchf->g + i), (size_t)1); + free(bufh1); + free(bufh2); + return buf; +} +static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen) +{ + cmph_uint32 buflenh1 = 0; + cmph_uint32 buflenh2 = 0; + char * bufh1 = NULL; + char * bufh2 = NULL; + char * buf = NULL; + cmph_uint32 n = (cmph_uint32)ceil(brz->c * brz->size[index]); + hash_state_dump(bmzf->hashes[0], &bufh1, &buflenh1); + hash_state_dump(bmzf->hashes[1], &bufh2, &buflenh2); + *buflen = buflenh1 + buflenh2 + n + 2U * (cmph_uint32)sizeof(cmph_uint32); + buf = (char *)malloc((size_t)(*buflen)); + memcpy(buf, &buflenh1, sizeof(cmph_uint32)); + memcpy(buf+sizeof(cmph_uint32), bufh1, (size_t)buflenh1); + memcpy(buf+sizeof(cmph_uint32)+buflenh1, &buflenh2, sizeof(cmph_uint32)); + memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, (size_t)buflenh2); + memcpy(buf+2*sizeof(cmph_uint32)+buflenh1+buflenh2,bmzf->g, (size_t)n); + free(bufh1); + free(bufh2); + return buf; +} + + +int brz_dump(cmph_t *mphf, FILE *fd) +{ + brz_data_t *data = (brz_data_t *)mphf->data; + char *buf = NULL; + cmph_uint32 buflen; + register size_t nbytes; + DEBUGP("Dumping brzf\n"); + // The initial part of the MPHF have already been dumped to disk during construction + // Dumping h0 + hash_state_dump(data->h0, &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + // Dumping m and the vector offset. + nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(data->offset, sizeof(cmph_uint32)*(data->k), (size_t)1, fd); + return 1; +} + +void brz_load(FILE *f, cmph_t *mphf) +{ + char *buf = NULL; + cmph_uint32 buflen; + register size_t nbytes; + cmph_uint32 i, n; + brz_data_t *brz = (brz_data_t *)malloc(sizeof(brz_data_t)); + + DEBUGP("Loading brz mphf\n"); + mphf->data = brz; + nbytes = fread(&(brz->c), sizeof(double), (size_t)1, f); + nbytes = fread(&(brz->algo), sizeof(brz->algo), (size_t)1, f); // Reading algo. + nbytes = fread(&(brz->k), sizeof(cmph_uint32), (size_t)1, f); + brz->size = (cmph_uint8 *) malloc(sizeof(cmph_uint8)*brz->k); + nbytes = fread(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, f); + brz->h1 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); + brz->h2 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); + brz->g = (cmph_uint8 **) calloc((size_t)brz->k, sizeof(cmph_uint8 *)); + DEBUGP("Reading c = %f k = %u algo = %u \n", brz->c, brz->k, brz->algo); + //loading h_i1, h_i2 and g_i. + for(i = 0; i < brz->k; i++) + { + // h1 + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f); + DEBUGP("Hash state 1 has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, f); + brz->h1[i] = hash_state_load(buf, buflen); + free(buf); + //h2 + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f); + DEBUGP("Hash state 2 has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, f); + brz->h2[i] = hash_state_load(buf, buflen); + free(buf); + switch(brz->algo) + { + case CMPH_FCH: + n = fch_calc_b(brz->c, brz->size[i]); + break; + case CMPH_BMZ8: + n = (cmph_uint32)ceil(brz->c * brz->size[i]); + break; + default: assert(0); + } + DEBUGP("g_i has %u bytes\n", n); + brz->g[i] = (cmph_uint8 *)calloc((size_t)n, sizeof(cmph_uint8)); + nbytes = fread(brz->g[i], sizeof(cmph_uint8)*n, (size_t)1, f); + } + //loading h0 + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f); + DEBUGP("Hash state has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, f); + brz->h0 = hash_state_load(buf, buflen); + free(buf); + + //loading c, m, and the vector offset. + nbytes = fread(&(brz->m), sizeof(cmph_uint32), (size_t)1, f); + brz->offset = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*brz->k); + nbytes = fread(brz->offset, sizeof(cmph_uint32)*(brz->k), (size_t)1, f); + return; +} + +static cmph_uint32 brz_bmz8_search(brz_data_t *brz, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + register cmph_uint32 h0; + + hash_vector(brz->h0, key, keylen, fingerprint); + h0 = fingerprint[2] % brz->k; + + register cmph_uint32 m = brz->size[h0]; + register cmph_uint32 n = (cmph_uint32)ceil(brz->c * m); + register cmph_uint32 h1 = hash(brz->h1[h0], key, keylen) % n; + register cmph_uint32 h2 = hash(brz->h2[h0], key, keylen) % n; + register cmph_uint8 mphf_bucket; + + if (h1 == h2 && ++h2 >= n) h2 = 0; + mphf_bucket = (cmph_uint8)(brz->g[h0][h1] + brz->g[h0][h2]); + DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0); + DEBUGP("key: %s g[h1]: %u g[h2]: %u offset[h0]: %u edges: %u\n", key, brz->g[h0][h1], brz->g[h0][h2], brz->offset[h0], brz->m); + DEBUGP("Address: %u\n", mphf_bucket + brz->offset[h0]); + return (mphf_bucket + brz->offset[h0]); +} + +static cmph_uint32 brz_fch_search(brz_data_t *brz, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + register cmph_uint32 h0; + + hash_vector(brz->h0, key, keylen, fingerprint); + h0 = fingerprint[2] % brz->k; + + register cmph_uint32 m = brz->size[h0]; + register cmph_uint32 b = fch_calc_b(brz->c, m); + register double p1 = fch_calc_p1(m); + register double p2 = fch_calc_p2(b); + register cmph_uint32 h1 = hash(brz->h1[h0], key, keylen) % m; + register cmph_uint32 h2 = hash(brz->h2[h0], key, keylen) % m; + register cmph_uint8 mphf_bucket = 0; + h1 = mixh10h11h12(b, p1, p2, h1); + mphf_bucket = (cmph_uint8)((h2 + brz->g[h0][h1]) % m); + return (mphf_bucket + brz->offset[h0]); +} + +cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + brz_data_t *brz = mphf->data; + cmph_uint32 fingerprint[3]; + switch(brz->algo) + { + case CMPH_FCH: + return brz_fch_search(brz, key, keylen, fingerprint); + case CMPH_BMZ8: + return brz_bmz8_search(brz, key, keylen, fingerprint); + default: assert(0); + } + return 0; +} +void brz_destroy(cmph_t *mphf) +{ + cmph_uint32 i; + brz_data_t *data = (brz_data_t *)mphf->data; + if(data->g) + { + for(i = 0; i < data->k; i++) + { + free(data->g[i]); + hash_state_destroy(data->h1[i]); + hash_state_destroy(data->h2[i]); + } + free(data->g); + free(data->h1); + free(data->h2); + } + hash_state_destroy(data->h0); + free(data->size); + free(data->offset); + free(data); + free(mphf); +} + +/** \fn void brz_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void brz_pack(cmph_t *mphf, void *packed_mphf) +{ + brz_data_t *data = (brz_data_t *)mphf->data; + cmph_uint8 * ptr = packed_mphf; + cmph_uint32 i,n; + + // packing internal algo type + memcpy(ptr, &(data->algo), sizeof(data->algo)); + ptr += sizeof(data->algo); + + // packing h0 type + CMPH_HASH h0_type = hash_get_type(data->h0); + memcpy(ptr, &h0_type, sizeof(h0_type)); + ptr += sizeof(h0_type); + + // packing h0 + hash_state_pack(data->h0, ptr); + ptr += hash_state_packed_size(h0_type); + + // packing k + memcpy(ptr, &(data->k), sizeof(data->k)); + ptr += sizeof(data->k); + + // packing c + *((cmph_uint64 *)ptr) = (cmph_uint64)data->c; + ptr += sizeof(data->c); + + // packing h1 type + CMPH_HASH h1_type = hash_get_type(data->h1[0]); + memcpy(ptr, &h1_type, sizeof(h1_type)); + ptr += sizeof(h1_type); + + // packing h2 type + CMPH_HASH h2_type = hash_get_type(data->h2[0]); + memcpy(ptr, &h2_type, sizeof(h2_type)); + ptr += sizeof(h2_type); + + // packing size + memcpy(ptr, data->size, sizeof(cmph_uint8)*data->k); + ptr += data->k; + + // packing offset + memcpy(ptr, data->offset, sizeof(cmph_uint32)*data->k); + ptr += sizeof(cmph_uint32)*data->k; + + #if defined (__ia64) || defined (__x86_64__) + cmph_uint64 * g_is_ptr = (cmph_uint64 *)ptr; + #else + cmph_uint32 * g_is_ptr = (cmph_uint32 *)ptr; + #endif + + cmph_uint8 * g_i = (cmph_uint8 *) (g_is_ptr + data->k); + + for(i = 0; i < data->k; i++) + { + #if defined (__ia64) || defined (__x86_64__) + *g_is_ptr++ = (cmph_uint64)g_i; + #else + *g_is_ptr++ = (cmph_uint32)g_i; + #endif + // packing h1[i] + hash_state_pack(data->h1[i], g_i); + g_i += hash_state_packed_size(h1_type); + + // packing h2[i] + hash_state_pack(data->h2[i], g_i); + g_i += hash_state_packed_size(h2_type); + + // packing g_i + switch(data->algo) + { + case CMPH_FCH: + n = fch_calc_b(data->c, data->size[i]); + break; + case CMPH_BMZ8: + n = (cmph_uint32)ceil(data->c * data->size[i]); + break; + default: assert(0); + } + memcpy(g_i, data->g[i], sizeof(cmph_uint8)*n); + g_i += n; + + } + +} + +/** \fn cmph_uint32 brz_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 brz_packed_size(cmph_t *mphf) +{ + cmph_uint32 i; + cmph_uint32 size = 0; + brz_data_t *data = (brz_data_t *)mphf->data; + CMPH_HASH h0_type = hash_get_type(data->h0); + CMPH_HASH h1_type = hash_get_type(data->h1[0]); + CMPH_HASH h2_type = hash_get_type(data->h2[0]); + size = (cmph_uint32)(2*sizeof(CMPH_ALGO) + 3*sizeof(CMPH_HASH) + hash_state_packed_size(h0_type) + sizeof(cmph_uint32) + + sizeof(double) + sizeof(cmph_uint8)*data->k + sizeof(cmph_uint32)*data->k); + // pointers to g_is + #if defined (__ia64) || defined (__x86_64__) + size += (cmph_uint32) sizeof(cmph_uint64)*data->k; + #else + size += (cmph_uint32) sizeof(cmph_uint32)*data->k; + #endif + + size += hash_state_packed_size(h1_type) * data->k; + size += hash_state_packed_size(h2_type) * data->k; + + cmph_uint32 n = 0; + for(i = 0; i < data->k; i++) + { + switch(data->algo) + { + case CMPH_FCH: + n = fch_calc_b(data->c, data->size[i]); + break; + case CMPH_BMZ8: + n = (cmph_uint32)ceil(data->c * data->size[i]); + break; + default: assert(0); + } + size += n; + } + return size; +} + + + +static cmph_uint32 brz_bmz8_search_packed(cmph_uint32 *packed_mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + register CMPH_HASH h0_type = *packed_mphf++; + register cmph_uint32 *h0_ptr = packed_mphf; + packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type)); + + register cmph_uint32 k = *packed_mphf++; + + register double c = (double)(*((cmph_uint64*)packed_mphf)); + packed_mphf += 2; + + register CMPH_HASH h1_type = *packed_mphf++; + + register CMPH_HASH h2_type = *packed_mphf++; + + register cmph_uint8 * size = (cmph_uint8 *) packed_mphf; + packed_mphf = (cmph_uint32 *)(size + k); + + register cmph_uint32 * offset = packed_mphf; + packed_mphf += k; + + register cmph_uint32 h0; + + hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint); + h0 = fingerprint[2] % k; + + register cmph_uint32 m = size[h0]; + register cmph_uint32 n = (cmph_uint32)ceil(c * m); + + #if defined (__ia64) || defined (__x86_64__) + register cmph_uint64 * g_is_ptr = (cmph_uint64 *)packed_mphf; + #else + register cmph_uint32 * g_is_ptr = packed_mphf; + #endif + + register cmph_uint8 * h1_ptr = (cmph_uint8 *) g_is_ptr[h0]; + + register cmph_uint8 * h2_ptr = h1_ptr + hash_state_packed_size(h1_type); + + register cmph_uint8 * g = h2_ptr + hash_state_packed_size(h2_type); + + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; + register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; + + register cmph_uint8 mphf_bucket; + + if (h1 == h2 && ++h2 >= n) h2 = 0; + mphf_bucket = (cmph_uint8)(g[h1] + g[h2]); + DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0); + DEBUGP("Address: %u\n", mphf_bucket + offset[h0]); + return (mphf_bucket + offset[h0]); +} + +static cmph_uint32 brz_fch_search_packed(cmph_uint32 *packed_mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + register CMPH_HASH h0_type = *packed_mphf++; + + register cmph_uint32 *h0_ptr = packed_mphf; + packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type)); + + register cmph_uint32 k = *packed_mphf++; + + register double c = (double)(*((cmph_uint64*)packed_mphf)); + packed_mphf += 2; + + register CMPH_HASH h1_type = *packed_mphf++; + + register CMPH_HASH h2_type = *packed_mphf++; + + register cmph_uint8 * size = (cmph_uint8 *) packed_mphf; + packed_mphf = (cmph_uint32 *)(size + k); + + register cmph_uint32 * offset = packed_mphf; + packed_mphf += k; + + register cmph_uint32 h0; + + hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint); + h0 = fingerprint[2] % k; + + register cmph_uint32 m = size[h0]; + register cmph_uint32 b = fch_calc_b(c, m); + register double p1 = fch_calc_p1(m); + register double p2 = fch_calc_p2(b); + + #if defined (__ia64) || defined (__x86_64__) + register cmph_uint64 * g_is_ptr = (cmph_uint64 *)packed_mphf; + #else + register cmph_uint32 * g_is_ptr = packed_mphf; + #endif + + register cmph_uint8 * h1_ptr = (cmph_uint8 *) g_is_ptr[h0]; + + register cmph_uint8 * h2_ptr = h1_ptr + hash_state_packed_size(h1_type); + + register cmph_uint8 * g = h2_ptr + hash_state_packed_size(h2_type); + + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m; + register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % m; + + register cmph_uint8 mphf_bucket = 0; + h1 = mixh10h11h12(b, p1, p2, h1); + mphf_bucket = (cmph_uint8)((h2 + g[h1]) % m); + return (mphf_bucket + offset[h0]); +} + +/** cmph_uint32 brz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + register cmph_uint32 *ptr = (cmph_uint32 *)packed_mphf; + register CMPH_ALGO algo = *ptr++; + cmph_uint32 fingerprint[3]; + switch(algo) + { + case CMPH_FCH: + return brz_fch_search_packed(ptr, key, keylen, fingerprint); + case CMPH_BMZ8: + return brz_bmz8_search_packed(ptr, key, keylen, fingerprint); + default: assert(0); + } +} + |